"""Add PubMed abstracts to a JSON list of articles.

Usage: <script> <articles-json>

The input file must contain a JSON array of objects that each carry a
'pmid' key.  The records are requested from the NCBI efetch endpoint in
batches of 100, the text of each returned abstract is stored under the
'abstract' key of the matching article(s), and the enriched list is printed
to standard output as indented JSON.
"""
import json
import sys
# NOTE(review): datetime/date are not referenced anywhere in this file; the
# import is kept untouched in case other tooling relies on it.
from datetime import datetime, date
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

# Sent as the `tool` and `email` query parameters of every efetch request.
tool = "https://covid19.lis-lab.fr"
email = "benoit.favre@univ-amu.fr"

EFETCH_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

month_mapping = {
    'Jan': '01',
    'Feb': '02',
    'Mar': '03',
    'Apr': '04',
    'May': '05',
    'Jun': '06',
    'Jul': '07',
    'Aug': '08',
    'Sep': '09',
    'Oct': '10',
    'Nov': '11',
    'Dec': '12',
}


def map_month(text):
    """Return the two-digit month number for a month name.

    Only the first three characters of `text` are looked at, ignoring case,
    so 'Jan', 'JANUARY' and 'january' all give '01'.  When those characters
    are not a key of `month_mapping`, `text` is returned unchanged.
    """
    # str.capitalize() already lower-cases everything after the first
    # character, so no separate lower() call is needed.
    return month_mapping.get(text[:3].capitalize(), text)


def make_batches(sequence, size=100):
    """Yield consecutive slices of `sequence`, each at most `size` items long.

    `sequence` must support len() and slicing.  An empty sequence yields
    nothing.

    Raises:
        ValueError: if `size` is smaller than 1 (raised when iteration
            starts, because this is a generator).  A non-positive size
            would otherwise never advance and loop forever.
    """
    if size < 1:
        raise ValueError('size must be a positive integer, got %r' % (size,))
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]


def build_efetch_url(ids):
    """Return the efetch URL that requests the PubMed records for `ids`.

    Every id becomes its own `id=` parameter.  All values are
    percent-encoded by urlencode, so characters that are special in a query
    string cannot corrupt the request.
    """
    query = urllib.parse.urlencode(
        {
            'db': 'pubmed',
            'rettype': 'xml',
            'tool': tool,
            'email': email,
            'id': list(ids),
        },
        doseq=True,  # expand the id list into repeated id=... parameters
    )
    return '%s?%s' % (EFETCH_URL, query)


def merge_abstracts(xml_data, articles):
    """Copy abstracts from an efetch XML response into `articles` in place.

    Args:
        xml_data: XML document (str or bytes) containing PubmedArticle
            elements.
        articles: list of dicts with a 'pmid' key.  Each dict whose pmid
            (compared as a string) appears in the response gets an
            'abstract' key: the text of all Abstract/AbstractText elements
            of that record joined with single spaces, or '' when the record
            has none.  Dicts whose pmid is absent from the response are
            left untouched.
    """
    # Group by PMID so that articles sharing a PMID are all updated rather
    # than only the last one.
    by_pmid = {}
    for entry in articles:
        by_pmid.setdefault(str(entry['pmid']), []).append(entry)

    root = ET.fromstring(xml_data)
    for record in root.findall('.//PubmedArticle'):
        # findtext() returns the first PMID element in document order.
        # NOTE(review): assumes the record's own PMID precedes any PMID of
        # a referenced article -- confirm against the PubMed XML layout.
        pmid = record.findtext('.//PMID')
        targets = by_pmid.get(pmid)
        if not targets:
            continue
        # itertext() flattens inline markup inside each AbstractText.
        abstract = ' '.join(
            ''.join(section.itertext())
            for section in record.findall('.//Abstract/AbstractText')
        )
        for target in targets:
            target['abstract'] = abstract


def fetch(articles):
    """Download the abstracts for one batch of articles and store them.

    Args:
        articles: list of dicts with a 'pmid' key; modified in place (see
            merge_abstracts).  An empty list is a no-op and triggers no
            network request.

    Errors raised by urllib.request.urlopen or by XML parsing propagate to
    the caller.
    """
    if not articles:
        return
    # dict.fromkeys keeps first-seen order while dropping duplicate ids.
    ids = list(dict.fromkeys(str(entry['pmid']) for entry in articles))
    with urllib.request.urlopen(build_efetch_url(ids)) as response:
        payload = response.read()
    merge_abstracts(payload, articles)


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Args:
        argv: argument vector including the program name; defaults to
            sys.argv.

    Returns:
        0 on success, 1 when the argument count is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        # Usage goes to stderr so it never ends up in redirected JSON output.
        print('usage: %s <articles-json>' % argv[0], file=sys.stderr)
        return 1

    with open(argv[1], encoding='utf-8') as fp:
        articles = json.load(fp)

    for batch in make_batches(articles, 100):
        fetch(batch)

    print(json.dumps(articles, indent=2))
    return 0


if __name__ == '__main__':
    sys.exit(main())