2008-10-21 21:56:29
昨日、pythonのスクリプトを書くのを忘れていました。さらにちょっと検討してみたので、今日の時点でのスクリプトを掲げておきます。流れは昨日書いたとおり。
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Register local PubMed PDFs in MySQL and archive them on Amazon S3.

For every ``<pmid>.pdf`` in the current directory that is not yet present in
the ``pdfdata`` table, the script fetches the citation from NCBI efetch,
extracts the full text with ``pdftotext``, inserts one row into ``pdfdata``
and uploads the PDF to the ``pdffiles-pmid`` bucket.

This is Python 2 code (``urllib.urlopen``, boto); the ``print(...)`` calls
use a single argument so they behave the same as the print statement.
"""
import os
import subprocess
import xml.etree.ElementTree as ET
from urllib import urlopen

import MySQLdb
from boto.s3.connection import S3Connection
from boto.s3.key import Key

EFETCH_URL = ('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
              '?db=pubmed&id=%s&retmode=xml&rettype=full')

# Placeholders are filled in by the driver, so quotes inside titles, author
# names, abstracts or the full text can no longer break the statement.
INSERT_SQL = ("INSERT INTO pdfdata (pmid,authors,title,journal_title,"
              "journal_vol,journal_pages,year,abstract,fullcontent) "
              "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")


def find_new_pmids(cursor, directory='./'):
    """Return the PMIDs of PDFs in *directory* that have no row in pdfdata."""
    idlist = []
    for pdf_file in os.listdir(directory):
        if not pdf_file.endswith('.pdf'):
            continue
        pmid = pdf_file[:-len('.pdf')]
        # Only all-digit stems are treated as PMIDs; this skips the
        # "d<pmid>.pdf" copies this script downloads back from S3.
        if not pmid.isdigit():
            continue
        cursor.execute("SELECT pmid FROM pdfdata WHERE pmid = %s", (pmid,))
        if cursor.fetchone() is None:
            idlist.append(pmid)
        else:
            print(pmid + ': This article already exixts in the database.\n ------')
    return idlist


def pdf_to_text(filename):
    """Return the text of *filename* as unicode, or None if pdftotext fails.

    Passing "-" as the output file makes pdftotext write to standard output,
    which is captured through a pipe instead of a temporary .txt file.
    """
    proc = subprocess.Popen(
        ['pdftotext', '-enc', 'UTF-8', '-nopgbrk', filename, '-'],
        stdout=subprocess.PIPE)
    output = proc.communicate()[0]
    if proc.returncode != 0:
        return None
    # Decode once here so the text is never mixed, as a raw byte string,
    # with the unicode values ElementTree returns for non-ASCII fields.
    return output.decode('utf-8', 'replace')


def format_authors(particle):
    """Return the article's authors as 'LastName_Initials, ...'."""
    names = []
    for author in particle.findall('./MedlineCitation/Article/AuthorList/Author'):
        last_name = author.findtext('./LastName')
        if last_name is None:
            # Group authors carry a CollectiveName instead of a LastName.
            collective = author.findtext('./CollectiveName')
            if collective:
                names.append(collective)
            continue
        names.append(last_name + "_" + (author.findtext('./Initials') or ''))
    return ', '.join(names)


def main():
    """Run the whole import: scan, fetch metadata, insert rows, upload PDFs."""
    s3conn = S3Connection()
    bucket = s3conn.get_bucket('pdffiles-pmid')
    bucket.set_acl('private')

    # charset/use_unicode make the driver encode unicode parameters as UTF-8.
    verbindung = MySQLdb.connect(host='localhost', user='tolleetlege',
                                 passwd='password', db='database',
                                 charset='utf8', use_unicode=True)
    try:
        cursor = verbindung.cursor()
        idlist = find_new_pmids(cursor)
        if not idlist:
            # Nothing new: do not call efetch with an empty id list.
            return

        xml = urlopen(EFETCH_URL % ','.join(idlist)).read()
        dom = ET.fromstring(xml)

        for particle in dom.findall('PubmedArticle'):
            pmid = particle.findtext('./MedlineCitation/PMID')
            filename = pmid + '.pdf'

            journal = particle.findtext('./MedlineCitation/Article/Journal/ISOAbbreviation')
            if journal is None:
                journal = particle.findtext('./MedlineCitation/MedlineJournalInfo/MedlineTA')
            if journal is not None:
                journal = journal.replace('.', '')

            volume = particle.findtext('./MedlineCitation/Article/Journal/JournalIssue/Volume')
            if volume is None:
                print(pmid + ': This article is in press.\n ------')
                continue

            fulltext = pdf_to_text(filename)
            if fulltext is None:
                print(pmid + ': pdftotext failed.\n ------')
                continue

            # Any of these may be None; the driver stores None as NULL.
            pubyear = particle.findtext('./MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
            pages = particle.findtext('./MedlineCitation/Article/Pagination/MedlinePgn')
            title = particle.findtext('./MedlineCitation/Article/ArticleTitle')
            abstract = particle.findtext('./MedlineCitation/Article/Abstract/AbstractText')
            au = format_authors(particle)

            print('%s %s ( %s ) %s : %s' % (pmid, journal, pubyear, volume, pages))
            print(title)

            try:
                cursor.execute(INSERT_SQL, (pmid, au, title, journal, volume,
                                            pages, pubyear, abstract, fulltext))
            except UnicodeDecodeError:
                print("UnicodeDecodeError\n ------")
                continue
            # Commit per article so an uploaded PDF always has a stored row,
            # even on transactional tables.
            verbindung.commit()
            print('Data have been uploaded.\n ------')

            if bucket.lookup(filename) is None:
                k = Key(bucket)
                k.key = filename
                k.set_contents_from_filename(filename)
                # Download the object again under a "d" prefix, as the
                # original script did.
                k.get_contents_to_filename("d" + filename)
    finally:
        verbindung.close()


if __name__ == '__main__':
    main()
pdftotext をいちいちテキストファイルで保存してから読み込むのは無駄なような気がして、標準出力にしたらそれをpythonが受け取ってくれるんじゃないかと思ったのだ。今日初めて知ったのだが、出力ファイルを" - "とすれば、標準出力になるのだという。PHPなら、
$text = shell_exec("pdftotext filename.pdf -");
echo $text;
とすれば、ちゃんと表示されるのだが、pythonで同じように、
text = os.system("pdftotext filename.pdf -")
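とやっても、textに入っているのは 0 という数値である。なぜだ。（os.system() が返すのはコマンドの終了ステータスであって、標準出力の内容ではない。出力を受け取りたいなら os.popen() や subprocess.Popen(..., stdout=subprocess.PIPE) を使う必要がある。）文字コードのエラー（UnicodeDecodeError）は相変わらず。もう解らないからいい。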