add mesh scripts

27613256 · Benoit Favre · d00a9d23 · 27613256 · 27613256
Commit 27613256 authored 4 years ago by Benoit Favre
--- a/json_add_mesh_terms.py
+++ b/json_add_mesh_terms.py
+import json, sys
+from datetime import datetime, date
+import urllib.request
+import xml.etree.ElementTree as ET
+import time
+
+# Client identification passed as the `tool` and `email` query parameters of
+# every E-utilities request built in fetch().
+tool = "https://covid19.lis-lab.fr"
+email = "benoit.favre@univ-amu.fr"
+
+# Three-letter English month abbreviation -> zero-padded month number
+# ('Jan' -> '01', ..., 'Dec' -> '12'), built from the calendar order below.
+month_mapping = {
+  abbreviation: '%02d' % number
+  for number, abbreviation in enumerate(
+    ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
+     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
+    1,
+  )
+}
+
+def map_month(text):
+  """Return the two-digit month number for a month name, or `text` unchanged.
+
+  Only the first three characters of `text` are used, compared
+  case-insensitively against the keys of `month_mapping` (so 'january',
+  'JAN' and 'Jan.' all give '01'). Input that does not start with a known
+  abbreviation is returned as-is.
+  """
+  abbreviation = text[:3].lower().capitalize()
+  return month_mapping.get(abbreviation, text)
+
+def make_batches(sequence, size=100):
+  """Yield consecutive slices of `sequence`, each at most `size` items long.
+
+  sequence: any object supporting len() and slicing (list, tuple, str, ...).
+  size: maximum number of items per batch; must be >= 1.
+
+  The last batch may be shorter; an empty sequence yields nothing.
+
+  Raises ValueError if `size` is smaller than 1. Because this is a generator,
+  the error surfaces when iteration starts, not when the function is called.
+  """
+  if size < 1:
+    # A zero or negative step would never advance past the end of the
+    # sequence, so reject it instead of looping forever.
+    raise ValueError('size must be a positive integer, got %r' % (size,))
+  for start in range(0, len(sequence), size):
+    yield sequence[start: start + size]
+
+def fetch(articles):
+  """Add PubMed MeSH descriptor names to the given article dicts, in place.
+
+  articles: list of dicts, each carrying a PubMed identifier under 'pmid' or,
+    failing that, 'pubmed_id'. A dict with neither key raises KeyError.
+
+  One EFetch request is issued for the whole batch. For every returned
+  <PubmedArticle> whose PMID matches an input article and that has at least
+  one MeshHeading/DescriptorName, the list of descriptor names is stored
+  under the article's 'mesh_terms' key. Articles without MeSH headings are
+  left untouched. Nothing is returned.
+
+  Network and XML parsing errors from urllib / ElementTree propagate to the
+  caller.
+  """
+  # Imported here so the function does not depend on an extra module-level
+  # import being present.
+  from urllib.parse import urlencode
+
+  # If the same identifier appears twice, the later article wins.
+  by_id = {str(article['pmid'] if 'pmid' in article else article['pubmed_id']): article for article in articles}
+  if not by_id:
+    # Nothing to look up: avoid sending a request without any id.
+    return
+
+  # urlencode percent-escapes the tool URL ('://') and the e-mail address
+  # ('@'), which must not appear raw inside a query string value. The ids are
+  # sent as repeated `id` parameters.
+  params = [('db', 'pubmed'), ('rettype', 'xml'), ('tool', tool), ('email', email)]
+  params.extend(('id', pmid) for pmid in by_id)
+  url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + urlencode(params)
+
+  with urllib.request.urlopen(url) as response:
+    root = ET.fromstring(response.read())
+
+  for record in root.findall('.//PubmedArticle'):
+    found = by_id.get(record.findtext('.//PMID'))
+    if found is None:
+      continue
+    # itertext() flattens any nested markup inside the descriptor name.
+    mesh_terms = [''.join(item.itertext()) for item in record.findall('.//MeshHeading/DescriptorName')]
+    if mesh_terms:
+      found['mesh_terms'] = mesh_terms
+
+  # Pause between consecutive requests (presumably to respect NCBI's request
+  # rate limits).
+  time.sleep(1)
+
+# Script entry point: read a JSON list of articles, annotate it with MeSH
+# terms, and print the articles that received any as JSON on stdout.
+if len(sys.argv) != 2:
+  # stdout carries the JSON result, so the usage message goes to stderr.
+  print('usage: %s <articles-json>' % sys.argv[0], file=sys.stderr)
+  sys.exit(1)
+
+# JSON files are UTF-8; do not depend on the platform's default encoding.
+with open(sys.argv[1], encoding='utf-8') as fp:
+  articles = json.load(fp)
+
+# One EFetch request per batch of 100 articles.
+for batch in make_batches(articles, 100):
+  fetch(batch)
+
+# Keep only the articles for which MeSH terms were found.
+articles = [article for article in articles if 'mesh_terms' in article]
+print(json.dumps(articles, indent=2))
+
--- a/pubmed_mesh_queries.py
+++ b/pubmed_mesh_queries.py
+import json, sys
+from pymed import PubMed
+from datetime import datetime, date
+import time
+import collections
+
+# Query PubMed once per topic keyword (as a MeSH term) restricted to
+# coronavirus-related articles, merge the results by PMID, print per-keyword
+# hit counts on stderr and the merged article list as JSON on stdout.
+pubmed = PubMed(tool="https://covid19.lis-lab.fr", email="benoit.favre@univ-amu.fr")
+
+base_query ='"COVID-19" OR Coronavirus OR "Corona virus" OR "2019-nCoV" OR "SARS-CoV" OR "MERS-CoV" OR "Severe Acute Respiratory Syndrome" OR "Middle East Respiratory Syndrome"'
+
+mesh_keywords = [ 'Diagnostic', 'Therapeutics', 'Epidemiology', 'Prognosis', 'Recommendations', 'Modeling', 'Hepato-gastroenterology', 'Neurology', 'Cardiology', 'Hematology', 'Geriatrics', 'Infectiology', 'Obstetric gynecology', 'Dermatology', 'Paediatrics', 'Pulmonology', 'Psychiatry', 'Virology', 'Anesthesics', 'Radiology', 'Hygiene', 'Nephrology', 'Lockdown', 'Immunity' ]
+
+count = collections.defaultdict(int)  # keyword -> number of results returned
+seen = {}                             # pmid -> index of its entry in `data`
+data = []                             # merged entries, in first-seen order
+
+for keyword in mesh_keywords:
+  query = '"%s"[MeSH] AND (%s)' % (keyword, base_query)
+
+  results = pubmed.query(query, max_results=10000)
+  # Explicit zero so keywords without any result are still reported below.
+  count[keyword] = 0
+
+  for result in results:
+    entry = result.toDict()
+    # 'pubmed_id' may span several lines; the first line is taken as the
+    # article's own PMID.
+    pmid = entry['pubmed_id'].split('\n')[0]
+    if pmid not in seen:
+      seen[pmid] = len(data)
+      # Store the cleaned id under both id keys. The original code wrote it
+      # to a misspelled 'pubmid_id' key and left 'pubmed_id' un-normalised.
+      entry['pmid'] = entry['pubmed_id'] = pmid
+      # Legacy misspelled key, kept so existing consumers of the JSON output
+      # keep working.
+      entry['pubmid_id'] = pmid
+      entry['url'] = 'https://www.ncbi.nlm.nih.gov/pubmed/' + pmid
+      if 'authors' in entry:
+        entry['authors'] = '; '.join(['%s, %s' % (x['lastname'], x['firstname']) for x in entry['authors']])
+      entry.pop('xml', None)
+      # json.dumps cannot serialise date objects; datetime is a subclass of
+      # date, so one isinstance check covers both. Only existing keys are
+      # reassigned, which is safe while iterating.
+      for key, value in entry.items():
+        if isinstance(value, date):
+          entry[key] = value.isoformat()
+      entry['mesh_query'] = []
+      data.append(entry)
+    # Record every keyword whose query returned this article.
+    data[seen[pmid]]['mesh_query'].append(keyword)
+    count[keyword] += 1
+  # Pause between queries (presumably to respect NCBI's request rate limits).
+  time.sleep(1)
+
+for keyword, value in count.items():
+  print(value, keyword, file=sys.stderr)
+print(json.dumps(data, indent=2))
+