Commit 27613256 authored by Benoit Favre
Browse files

add mesh scripts

parent d00a9d23
import json, sys
from datetime import datetime, date
import urllib.request
import xml.etree.ElementTree as ET
import time
# Identification values that fetch() places in the `tool` and `email`
# parameters of every efetch request URL.
tool = "https://covid19.lis-lab.fr"
email = "benoit.favre@univ-amu.fr"

# Three-letter English month abbreviation -> zero-padded month number
# ('Jan' -> '01', ..., 'Dec' -> '12'), in calendar order.
month_mapping = {
    abbreviation: '%02d' % number
    for number, abbreviation in enumerate(
        (
            'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
        ),
        start=1,
    )
}
def map_month(text):
    """Convert a month name into its two-digit number when recognised.

    Only the first three characters of ``text`` are looked at; they are
    lower-cased and then capitalised so that 'JANUARY', 'jan' and 'Jan'
    all resolve to the 'Jan' entry of ``month_mapping``.

    Returns the mapped two-digit string, or ``text`` itself (unchanged)
    when the prefix is not a key of ``month_mapping``.
    """
    abbreviation = text[:3].lower().capitalize()
    # Fall back to the caller's original value for anything unknown.
    return month_mapping.get(abbreviation, text)
def make_batches(sequence, size=100):
    """Yield consecutive slices of ``sequence`` with at most ``size`` items.

    Args:
        sequence: any object supporting ``len()`` and slicing (list, tuple,
            str, ...).
        size: maximum number of items per batch; must be at least 1.

    Yields:
        Slices ``sequence[0:size]``, ``sequence[size:2*size]``, ... in order.
        The last slice may be shorter. Nothing is yielded for an empty
        sequence.

    Raises:
        ValueError: if ``size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A step of zero or less would never advance the start index, so the
    # generator would otherwise never terminate.
    if size < 1:
        raise ValueError('size must be a positive integer, got %r' % (size,))
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]
def fetch(articles):
    """Attach MeSH descriptor names to ``articles`` using one efetch call.

    Each article is a dict carrying its PubMed id under ``'pmid'`` or, if
    that key is absent, under ``'pubmed_id'``. The ids are sent to the NCBI
    efetch endpoint in a single request; for every ``PubmedArticle`` in the
    XML response whose PMID matches one of the input articles, the text of
    its ``MeshHeading/DescriptorName`` elements is stored in place as
    ``article['mesh_terms']``. Articles without any descriptor, or missing
    from the response, are left untouched.

    Args:
        articles: list of article dicts (mutated in place). When several
            dicts share the same id, only the last one is annotated.

    Returns:
        None.

    Raises:
        KeyError: if an article has neither ``'pmid'`` nor ``'pubmed_id'``.
        urllib.error.URLError: if the HTTP request fails.
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
    """
    from urllib.parse import urlencode

    if not articles:
        # Nothing to look up; do not send a request without any id.
        return

    by_id = {}
    for article in articles:
        raw_id = article['pmid'] if 'pmid' in article else article['pubmed_id']
        # Keep only the first line of the id, the same normalisation the
        # harvesting script applies to 'pubmed_id'. A multi-line value would
        # put a newline in the URL and could never equal the PMID text
        # returned by the service.
        by_id[str(raw_id).split('\n')[0]] = article

    # urlencode escapes the tool/email/id values instead of pasting them
    # verbatim into the query string.
    params = [
        ('db', 'pubmed'),
        ('rettype', 'xml'),
        ('tool', tool),
        ('email', email),
    ]
    params.extend(('id', pmid) for pmid in by_id)
    url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
           + urlencode(params))

    with urllib.request.urlopen(url) as response:
        payload = response.read()

    root = ET.fromstring(payload)
    for record in root.findall('.//PubmedArticle'):
        # findtext returns the text of the first PMID element found below
        # the record (or None when there is none).
        found = by_id.get(record.findtext('.//PMID'))
        if found is None:
            continue
        mesh_terms = [
            ''.join(item.itertext())
            for item in record.findall('.//MeshHeading/DescriptorName')
        ]
        if mesh_terms:
            found['mesh_terms'] = mesh_terms

    # Space out successive requests by one second.
    time.sleep(1)
# Script driver: read a JSON list of articles from the file named on the
# command line, annotate them with MeSH terms in batches of 100, and print
# (as JSON on stdout) only the articles that received 'mesh_terms'.
if len(sys.argv) != 2:
    # Diagnostics go to stderr so stdout only ever carries the JSON result.
    print('usage: %s <articles-json>' % sys.argv[0], file=sys.stderr)
    sys.exit(1)

# Decode explicitly as UTF-8 instead of relying on the locale's default
# encoding.
with open(sys.argv[1], encoding='utf-8') as fp:
    articles = json.load(fp)

for batch in make_batches(articles, 100):
    # fetch() mutates the article dicts of the batch in place.
    fetch(batch)

articles = [article for article in articles if 'mesh_terms' in article]
print(json.dumps(articles, indent=2))
import json, sys
from pymed import PubMed
from datetime import datetime, date
import time
import collections
# Script: for each MeSH keyword below, query PubMed for articles matching
# the keyword AND the coronavirus base query, merge the results by PMID
# (recording every keyword that returned the article in 'mesh_query'),
# print per-keyword hit counts on stderr and the merged list as JSON on
# stdout.
#
# NOTE(review): the original indentation of this script was lost; the loop
# nesting below is reconstructed from the logic (per-article setup only for
# unseen PMIDs, one count per result, one pause per keyword) - confirm.
pubmed = PubMed(tool="https://covid19.lis-lab.fr", email="benoit.favre@univ-amu.fr")

base_query ='"COVID-19" OR Coronavirus OR "Corona virus" OR "2019-nCoV" OR "SARS-CoV" OR "MERS-CoV" OR "Severe Acute Respiratory Syndrome" OR "Middle East Respiratory Syndrome"'
# Alternative queries kept for reference:
#query ='"COVID-19"'
#today = datetime.now().isoformat().split('T')[0]
#query = '(("%s"[Date - Publication] : "%s"[Date - Publication])) AND COVID-19[Text Word]' % (today, today)

count = collections.defaultdict(int)  # keyword -> number of results returned
seen = {}                             # pmid -> index of its entry in `data`
data = []                             # merged entries, in first-seen order

for keyword in [
    'Diagnostic', 'Therapeutics', 'Epidemiology', 'Prognosis',
    'Recommendations', 'Modeling', 'Hepato-gastroenterology', 'Neurology',
    'Cardiology', 'Hematology', 'Geriatrics', 'Infectiology',
    'Obstetric gynecology', 'Dermatology', 'Paediatrics', 'Pulmonology',
    'Psychiatry', 'Virology', 'Anesthesics', 'Radiology', 'Hygiene',
    'Nephrology', 'Lockdown', 'Immunity',
]:
    query = '"%s"[MeSH] AND (%s)' % (keyword, base_query)
    results = pubmed.query(query, max_results=10000)
    # Make sure the keyword is reported even when it returns no result.
    count[keyword] = 0
    for result in results:
        entry = result.toDict()
        # Keep only the first line of 'pubmed_id' (presumably the field can
        # hold several newline-separated ids - verify against pymed).
        pmid = entry['pubmed_id'].split('\n')[0]
        # Store the normalised id under both names. The original wrote the
        # misspelled key 'pubmid_id' here, leaving 'pubmed_id' with its raw
        # value.
        entry['pmid'] = entry['pubmed_id'] = pmid
        if pmid not in seen:
            seen[pmid] = len(data)
            entry['url'] = 'https://www.ncbi.nlm.nih.gov/pubmed/' + pmid
            if 'authors' in entry:
                entry['authors'] = '; '.join(
                    '%s, %s' % (x['lastname'], x['firstname'])
                    for x in entry['authors']
                )
            # The raw XML is not wanted in the JSON output.
            entry.pop('xml', None)
            # json.dumps cannot serialise date objects; store ISO strings.
            # Only existing keys are reassigned, so iterating is safe.
            for key, value in entry.items():
                if isinstance(value, (datetime, date)):
                    entry[key] = value.isoformat()
            entry['mesh_query'] = []
            data.append(entry)
        # Record the keyword on the stored entry, new or already seen.
        data[seen[pmid]]['mesh_query'].append(keyword)
        count[keyword] += 1
    # Space out successive keyword queries by one second.
    time.sleep(1)

for keyword, value in count.items():
    print(value, keyword, file=sys.stderr)
print(json.dumps(data, indent=2))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment