diff --git a/json_add_mesh_terms.py b/json_add_mesh_terms.py
new file mode 100644
--- /dev/null
+++ b/json_add_mesh_terms.py
@@ -0,0 +1,73 @@
+import json, sys
+from datetime import datetime, date
+import urllib.request
+import xml.etree.ElementTree as ET
+import time
+
+# Enrich a JSON list of PubMed articles (path given as the sole argument) with
+# their MeSH descriptor terms, fetched in batches from the NCBI efetch endpoint.
+
+# Identification parameters requested by the NCBI E-utilities usage policy.
+tool = "https://covid19.lis-lab.fr"
+email = "benoit.favre@univ-amu.fr"
+
+month_mapping = {
+'Jan': '01',
+'Feb': '02',
+'Mar': '03',
+'Apr': '04',
+'May': '05',
+'Jun': '06',
+'Jul': '07',
+'Aug': '08',
+'Sep': '09',
+'Oct': '10',
+'Nov': '11',
+'Dec': '12',
+}
+
+def map_month(text):
+    """Map an English month name/abbreviation to its two-digit number; return the input unchanged when unknown."""
+    key = text[:3].lower().capitalize()
+    if key in month_mapping:
+        return month_mapping[key]
+    return text
+
+def make_batches(sequence, size=100):
+    """Yield successive slices of at most `size` items from `sequence`."""
+    i = 0
+    while i < len(sequence):
+        yield sequence[i: i + size]
+        i += size
+
+def fetch(articles):
+    """Fetch MeSH terms for one batch of articles and store them in place under 'mesh_terms'."""
+    ids = [article['pmid'] if 'pmid' in article else article['pubmed_id'] for article in articles]
+    by_id = {str(article['pmid'] if 'pmid' in article else article['pubmed_id']): article for article in articles}
+    # efetch expects one comma-separated `id` parameter, not repeated id= arguments.
+    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&rettype=xml&tool=%s&email=%s&id=%s' % (tool, email, ','.join(str(i) for i in ids))
+    with urllib.request.urlopen(url) as response:
+        read = response.read()
+    root = ET.fromstring(read)
+    for article in root.findall('.//PubmedArticle'):
+        pmid = article.findtext('.//PMID')
+        if pmid in by_id:
+            found = by_id[pmid]
+            mesh_terms = [''.join(item.itertext()) for item in article.findall('.//MeshHeading/DescriptorName')]
+            if len(mesh_terms) > 0:
+                found['mesh_terms'] = mesh_terms
+    time.sleep(1)  # stay within the NCBI request-rate policy
+
+if len(sys.argv) != 2:
+    print('usage: %s <articles-json>' % sys.argv[0])
+    sys.exit(1)
+
+with open(sys.argv[1]) as fp:
+    articles = json.loads(fp.read())
+
+for batch in make_batches(articles, 100):
+    fetch(batch)
+
+# Keep only the articles for which MeSH terms were found.
+articles = [article for article in articles if 'mesh_terms' in article]
+print(json.dumps(articles, indent=2))
diff --git a/pubmed_mesh_queries.py b/pubmed_mesh_queries.py
new file mode 100644
--- /dev/null
+++ b/pubmed_mesh_queries.py
@@ -0,0 +1,47 @@
+import json, sys
+from pymed import PubMed
+from datetime import datetime, date
+import time
+import collections
+
+# Query PubMed for COVID-19 related articles, once per medical-specialty MeSH
+# keyword, and emit the deduplicated articles as JSON on stdout.
+pubmed = PubMed(tool="https://covid19.lis-lab.fr", email="benoit.favre@univ-amu.fr")
+
+base_query ='"COVID-19" OR Coronavirus OR "Corona virus" OR "2019-nCoV" OR "SARS-CoV" OR "MERS-CoV" OR "Severe Acute Respiratory Syndrome" OR "Middle East Respiratory Syndrome"'
+
+count = collections.defaultdict(int)  # articles matched per keyword
+seen = {}  # pmid -> index in `data`, for deduplication across keywords
+data = []
+
+for keyword in [ 'Diagnostic', 'Therapeutics', 'Epidemiology', 'Prognosis', 'Recommendations', 'Modeling', 'Hepato-gastroenterology', 'Neurology', 'Cardiology', 'Hematology', 'Geriatrics', 'Infectiology', 'Obstetric gynecology', 'Dermatology', 'Paediatrics', 'Pulmonology', 'Psychiatry', 'Virology', 'Anesthesics', 'Radiology', 'Hygiene', 'Nephrology', 'Lockdown', 'Immunity' ]:
+    query = '"%s"[MeSH] AND (%s)' % (keyword, base_query)
+
+    results = pubmed.query(query, max_results=10000)
+    count[keyword] = 0
+
+    for result in results:
+        entry = result.toDict()
+        # pymed returns pubmed_id as a newline-separated list; keep the first one.
+        pmid = entry['pubmed_id'].split('\n')[0]
+        entry['pmid'] = entry['pubmed_id'] = pmid  # fixed: key was misspelled 'pubmid_id'
+        if pmid not in seen:
+            seen[pmid] = len(data)
+            entry['url'] = 'https://www.ncbi.nlm.nih.gov/pubmed/' + pmid
+            if 'authors' in entry:
+                entry['authors'] = '; '.join(['%s, %s' % (x['lastname'], x['firstname']) for x in entry['authors']])
+            if 'xml' in entry:
+                del entry['xml']
+            # datetime/date values are not JSON serializable; render as ISO strings.
+            for key, value in entry.items():
+                if type(value) in [datetime, date]:
+                    entry[key] = value.isoformat()
+            entry['mesh_query'] = []
+            data.append(entry)
+        data[seen[pmid]]['mesh_query'].append(keyword)
+        count[keyword] += 1
+    time.sleep(1)  # stay within the NCBI request-rate policy
+
+for keyword, value in count.items():
+    print(value, keyword, file=sys.stderr)
+print(json.dumps(data, indent=2))