2008-10-21 21:56:29
昨日、pythonのスクリプトを書くのを忘れていました。さらにちょっと検討してみたので、今日の時点でのスクリプトを掲げておきます。流れは昨日書いたとおり。
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import subprocess
import xml.etree.ElementTree as ET
from types import *
from urllib import urlopen

import MySQLdb
from boto.s3.connection import S3Connection
from boto.s3.key import Key
# Catalogue the PDF files in the current directory.
#
# For every "<PMID>.pdf" that has no row in the `pdfdata` table yet:
#   1. fetch the citation record from PubMed (one efetch request for all IDs),
#   2. extract the full text of the PDF with pdftotext,
#   3. insert citation + full text into MySQL,
#   4. upload the PDF to the S3 bucket if it is not stored there already.

# --- Connections -------------------------------------------------------------
# No credentials are passed to S3Connection here; boto has to find them itself.
s3conn = S3Connection()
bucket = s3conn.get_bucket('pdffiles-pmid')
bucket.set_acl('private')

# charset/use_unicode make the connection exchange UTF-8 so that non-ASCII
# titles, author names and full text can be sent as unicode objects.
# NOTE(review): assumes the server/table can store UTF-8 text -- confirm.
verbindung = MySQLdb.connect(host='localhost', user='tolleetlege',
                             passwd='password', db='database',
                             charset='utf8', use_unicode=True)
cursor = verbindung.cursor()

# --- Step 1: collect the PMIDs that are not in the database yet --------------
idlist = []
for pdf_file in os.listdir('./'):
    # Only "<something>.pdf"; a substring test would also match e.g. "xpdf.log".
    if not pdf_file.endswith('.pdf'):
        continue
    pmid = pdf_file[:-len('.pdf')]
    # PubMed IDs are purely numeric. This also skips the "d<PMID>.pdf"
    # verification copies that step 4 writes into this same directory.
    if not pmid.isdigit():
        continue
    # Parameterized query: the driver does the quoting.
    cursor.execute("SELECT pmid FROM pdfdata WHERE pmid = %s", (pmid,))
    if cursor.fetchall():
        print(pmid + ': This article already exixts in the database.\n ------')
    else:
        idlist.append(pmid)

# --- Step 2: fetch all missing citations in a single efetch request ----------
articles = []
if idlist:  # nothing new -> do not send a request with an empty id list
    url = ('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id='
           + ','.join(idlist) + '&retmode=xml&rettype=full')
    response = urlopen(url)
    try:
        xml_data = response.read()
    finally:
        response.close()
    articles = ET.fromstring(xml_data).findall('PubmedArticle')

# --- Step 3/4: store each article and archive its PDF ------------------------
for particle in articles:
    pmid = particle.findtext('./MedlineCitation/PMID')
    filename = pmid + '.pdf'

    # Articles without a volume are treated as "in press" and skipped. This is
    # checked first so that no text extraction is done for skipped articles.
    volume = particle.findtext('./MedlineCitation/Article/Journal/JournalIssue/Volume')
    if volume is None:
        print(pmid + ': This article is in press.\n ------')
        continue

    # Prefer the ISO abbreviation, fall back to the Medline title abbreviation.
    journal = particle.findtext('./MedlineCitation/Article/Journal/ISOAbbreviation')
    if journal is None:
        journal = particle.findtext('./MedlineCitation/MedlineJournalInfo/MedlineTA')
    journal = (journal or '').replace('.', '')

    # Let pdftotext write to stdout ("-" as the output file) and read it from
    # the pipe: no temporary .txt file, no shell, nothing to clean up later.
    pdftotext = subprocess.Popen(
        ['pdftotext', '-enc', 'UTF-8', '-nopgbrk', filename, '-'],
        stdout=subprocess.PIPE)
    raw_text = pdftotext.communicate()[0]
    if pdftotext.returncode != 0:
        print(pmid + ': pdftotext failed, article skipped.\n ------')
        continue
    # Decode once at the boundary so every value handed to MySQL is text.
    fulltext = raw_text.decode('utf-8', 'replace')

    # Any of these may be None when the element is absent from the record;
    # None is passed through to the driver, which sends it as SQL NULL.
    pubyear = particle.findtext('./MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
    pages = particle.findtext('./MedlineCitation/Article/Pagination/MedlinePgn')
    title = particle.findtext('./MedlineCitation/Article/ArticleTitle')
    abstract = particle.findtext('./MedlineCitation/Article/Abstract/AbstractText')

    # Authors are stored as "LastName_Initials, LastName_Initials, ...".
    aulist = []
    for author in particle.findall('./MedlineCitation/Article/AuthorList/Author'):
        last_name = author.findtext('./LastName')
        if last_name is None:
            # Group authors carry a CollectiveName instead of LastName/Initials.
            collective = author.findtext('./CollectiveName')
            if collective:
                aulist.append(collective)
            continue
        aulist.append(last_name + "_" + (author.findtext('./Initials') or ''))
    au = ', '.join(aulist)

    print("%s %s ( %s ) %s : %s" % (pmid, journal, pubyear, volume, pages))
    print(title)

    try:
        cursor.execute(
            "INSERT INTO pdfdata (pmid,authors,title,journal_title,journal_vol,"
            "journal_pages,year,abstract,fullcontent) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (pmid, au, title, journal, volume, pages, pubyear, abstract, fulltext))
        # Without an explicit commit the row is lost on transactional engines.
        verbindung.commit()
    except UnicodeDecodeError:
        print("UnicodeDecodeError\n ------")
        continue
    print('Data have been uploaded.\n ------')

    if bucket.lookup(filename) is None:
        k = Key(bucket)
        k.key = filename
        k.set_contents_from_filename(filename)
        # Download the object again as "d<PMID>.pdf"
        # (presumably a manual check that the upload succeeded).
        k.get_contents_to_filename("d" + filename)

cursor.close()
verbindung.close()
pdftotext の出力をいちいちテキストファイルに保存してから読み込むのは無駄なような気がして、標準出力に出せばそれを Python が受け取ってくれるんじゃないかと思ったのだ。今日初めて知ったのだが、出力ファイル名を "-" とすれば、標準出力に書き出されるのだという。PHPなら、
$text = shell_exec("pdftotext filename.pdf -")
echo $text
とすれば、ちゃんと表示されるのだが、pythonで同じように、
text = os.system("pdftotext filename.pdf -")
とやっても、textに入っているのは 0 という数値である。なぜだ。コードエラーは相変わらず。もう解らないからいい。