Commit a812e670 authored by Benoit Favre's avatar Benoit Favre
Browse files

add litcovid abstract scraping

parent 7dcdb8e7
......@@ -45,6 +45,6 @@ for key, value in stats.items():
print(key, value, value / len(articles['results']))
with open(sys.argv[1], 'w') as fp:
fp.write(json.dumps(articles, indent=2)
fp.write(json.dumps(articles, indent=2))
import json, sys
from datetime import datetime, date
import urllib.request
import xml.etree.ElementTree as ET
# Client identification embedded as the tool/email query parameters of the
# PubMed efetch request URL built in fetch() below (NCBI asks automated
# clients to identify themselves this way).
tool = "https://covid19.lis-lab.fr"
email = "benoit.favre@univ-amu.fr"
# English three-letter month abbreviations -> zero-padded month numbers,
# used to normalise month names coming from PubMed records.
month_mapping = {
    'Jan': '01',
    'Feb': '02',
    'Mar': '03',
    'Apr': '04',
    'May': '05',
    'Jun': '06',
    'Jul': '07',
    'Aug': '08',
    'Sep': '09',
    'Oct': '10',
    'Nov': '11',
    'Dec': '12',
}


def map_month(text):
    """Map a month name (e.g. 'Jan', 'JANUARY', 'dec') to its two-digit number.

    Only the first three characters are considered; when they do not form a
    recognised English month abbreviation, *text* is returned unchanged.
    """
    # str.capitalize() already lowercases the tail, so the original
    # .lower().capitalize() chain was redundant.
    key = text[:3].capitalize()
    return month_mapping.get(key, text)
def make_batches(sequence, size=100):
    """Yield successive slices of *sequence*, each holding at most *size* items.

    The final slice may be shorter; an empty sequence yields nothing.
    """
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]
def fetch(articles):
    """Download PubMed XML for a batch of articles and attach abstracts in place.

    Each article dict must carry a 'pmid' key; matched articles gain an
    'abstract' key holding the space-joined AbstractText contents.
    """
    # PMIDs come back from the XML as strings, so key the lookup on str().
    by_pmid = {str(item['pmid']): item for item in articles}
    id_query = '&'.join('id=' + str(item['pmid']) for item in articles)
    url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
           '?db=pubmed&rettype=xml&tool=%s&email=%s&%s' % (tool, email, id_query))
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    tree = ET.fromstring(payload)
    for node in tree.findall('.//PubmedArticle'):
        pmid = node.findtext('.//PMID')
        if pmid not in by_pmid:
            continue
        # itertext() flattens inline markup (<i>, <sup>, ...) inside each section.
        sections = [''.join(part.itertext())
                    for part in node.findall('.//Abstract/AbstractText')]
        by_pmid[pmid]['abstract'] = ' '.join(sections)
# ---- command-line entry point ----
# usage: litcovid_add_abstract.py <articles-json>
# Reads the scraped LitCovid article list, fetches the missing abstracts from
# PubMed in batches of 100, and prints the enriched JSON on stdout.
if len(sys.argv) != 2:
    print('usage: %s <articles-json>' % sys.argv[0])
    sys.exit(1)

with open(sys.argv[1]) as fp:
    # json.load streams from the file object instead of reading it into a
    # string first (json.loads(fp.read())).
    articles = json.load(fp)

for batch in make_batches(articles, 100):
    fetch(batch)  # mutates the article dicts in place

print(json.dumps(articles, indent=2))
......@@ -15,4 +15,4 @@ for page in range(num_pages):
data = json.loads(response.read())
results.extend(data['results'])
print(json.dumps(results, indent=4))
print(json.dumps(results, indent=2))
......@@ -13,11 +13,12 @@ out="$dir/data/"`date '+%Y%m%d'`
mkdir -p "$out"
# CORD-19 metadata
curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata.csv"
python "$dir/cord19_csv2json.py" "$out/cord19-metadata.csv" > "$out/cord19-metadata.json"
curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
# litcovid
python "$dir/litcovid_scrapper.py" > "$out/litcovid.json"
python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json"
python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json"
# bibliovid
count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'`
......@@ -26,5 +27,5 @@ python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliov
python "$dir/bibliovid_add_abstract.py" "$out/bibliovid.json" < "$out/bibliovid_stage2.json"
# cleanup
rm "$out/cord19-metadata.csv" "$out/bibliovid_stage1.json" "$out/bibliovid_stage2.json"
# Cleanup of intermediate stage files. The globs must sit OUTSIDE the double
# quotes: quoting the whole argument ("$out/..._stage*") suppresses pathname
# expansion, so rm would look for a file literally named "..._stage*" and fail.
rm "$out"/cord19-metadata_stage* "$out"/litcovid_stage* "$out"/bibliovid_stage*
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment