diff --git a/bibliovid_add_abstract.py b/bibliovid_add_abstract.py index ab52c1dd8ecefbe1fd6144ab074e247d631bc622..04ac5e7d58754fb9ea5daf007c38de4e5da5219d 100644 --- a/bibliovid_add_abstract.py +++ b/bibliovid_add_abstract.py @@ -45,6 +45,6 @@ for key, value in stats.items(): print(key, value, value / len(articles['results'])) with open(sys.argv[1], 'w') as fp: - fp.write(json.dumps(articles, indent=2) + fp.write(json.dumps(articles, indent=2)) diff --git a/litcovid_add_abstract.py b/litcovid_add_abstract.py new file mode 100644 index 0000000000000000000000000000000000000000..e6f1cb542794284d9244868b947b55d58ebb379c --- /dev/null +++ b/litcovid_add_abstract.py @@ -0,0 +1,60 @@ +import json, sys +from datetime import datetime, date +import urllib.request +import xml.etree.ElementTree as ET + +tool = "https://covid19.lis-lab.fr" +email = "benoit.favre@univ-amu.fr" + +month_mapping = { +'Jan': '01', +'Feb': '02', +'Mar': '03', +'Apr': '04', +'May': '05', +'Jun': '06', +'Jul': '07', +'Aug': '08', +'Sep': '09', +'Oct': '10', +'Nov': '11', +'Dec': '12', +} + +def map_month(text): + key = text[:3].lower().capitalize() + if key in month_mapping: + return month_mapping[key] + return text + +def make_batches(sequence, size=100): + i = 0 + while i < len(sequence): + yield sequence[i: i + size] + i += size + +def fetch(articles): + ids = [article['pmid'] for article in articles] + by_id = {str(article['pmid']): article for article in articles} + url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&rettype=xml&tool=%s&email=%s&%s' % (tool, email, '&'.join(['id=' + str(i) for i in ids])) + with urllib.request.urlopen(url) as response: + read = response.read() + root = ET.fromstring(read) + for article in root.findall('.//PubmedArticle'): + pmid = article.findtext('.//PMID') + if pmid in by_id: + abstract = ' '.join([''.join(line.itertext()) for line in article.findall('.//Abstract/AbstractText')]) + by_id[pmid]['abstract'] = abstract + +if len(sys.argv) != 
2: + print('usage: %s <articles-json>' % sys.argv[0]) + sys.exit(1) + +with open(sys.argv[1]) as fp: + articles = json.loads(fp.read()) + +for batch in make_batches(articles, 100): + fetch(batch) + +print(json.dumps(articles, indent=2)) + diff --git a/litcovid_scrapper.py b/litcovid_scrapper.py index da0a2af514fb696ae60195b0340754ae57423355..66821ad6370df83a3816929f0959d0e0b10df851 100644 --- a/litcovid_scrapper.py +++ b/litcovid_scrapper.py @@ -15,4 +15,4 @@ for page in range(num_pages): data = json.loads(response.read()) results.extend(data['results']) -print(json.dumps(results, indent=4)) +print(json.dumps(results, indent=2)) diff --git a/run.sh b/run.sh index fb464cd1028dddb59c6adaa2725deda243c9edae..d4cebe69e526c63057a479c12af1e64773bbd63e 100755 --- a/run.sh +++ b/run.sh @@ -13,11 +13,12 @@ out="$dir/data/"`date '+%Y%m%d'` mkdir -p "$out" # CORD-19 metadata -curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata.csv" -python "$dir/cord19_csv2json.py" "$out/cord19-metadata.csv" > "$out/cord19-metadata.json" +curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv" +python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json" # litcovid -python "$dir/litcovid_scrapper.py" > "$out/litcovid.json" +python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json" +python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json" # bibliovid count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'` @@ -26,5 +27,5 @@ python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliov python "$dir/bibliovid_add_abstract.py" "$out/bibliovid.json" < "$out/bibliovid_stage2.json" # cleanup -rm "$out/cord19-metadata.csv" "$out/bibliovid_stage1.json" "$out/bibliovid_stage2.json" +rm "$out"/cord19-metadata_stage* "$out"/litcovid_stage* "$out"/bibliovid_stage*