Skip to content
Snippets Groups Projects
Commit a812e670 authored by Benoit Favre's avatar Benoit Favre
Browse files

add litcovid abstract scraping

parent 7dcdb8e7
Branches
No related tags found
No related merge requests found
......@@ -45,6 +45,6 @@ for key, value in stats.items():
print(key, value, value / len(articles['results']))
with open(sys.argv[1], 'w') as fp:
fp.write(json.dumps(articles, indent=2)
fp.write(json.dumps(articles, indent=2))
import json, sys
from datetime import datetime, date
import urllib.request
import xml.etree.ElementTree as ET
# Identification parameters that NCBI asks clients to send with every
# E-utilities request (passed as the 'tool' and 'email' query parameters).
tool = "https://covid19.lis-lab.fr"
email = "benoit.favre@univ-amu.fr"
# Three-letter English month abbreviation -> two-digit month number.
month_mapping = {
    'Jan': '01',
    'Feb': '02',
    'Mar': '03',
    'Apr': '04',
    'May': '05',
    'Jun': '06',
    'Jul': '07',
    'Aug': '08',
    'Sep': '09',
    'Oct': '10',
    'Nov': '11',
    'Dec': '12',
}

def map_month(text):
    """Convert an English month name or abbreviation to its two-digit number.

    Matching is case-insensitive and only looks at the first three
    characters ('January', 'jan', 'JAN' all yield '01').  Input that does
    not match a known month is returned unchanged.
    """
    # str.capitalize() already lowercases everything after the first
    # character, so the original's extra .lower() was redundant.
    key = text[:3].capitalize()
    return month_mapping.get(key, text)
def make_batches(sequence, size=100):
    """Yield consecutive slices of *sequence*, each at most *size* long.

    The final batch may be shorter when len(sequence) is not a multiple
    of *size*; an empty sequence yields nothing.
    """
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]
def fetch(articles):
    """Fetch PubMed abstracts for *articles* and attach them in place.

    Each article dict must carry a 'pmid' key.  A single EFetch request is
    issued for the whole batch; for every PubmedArticle in the XML reply
    whose PMID matches one of ours, the article dict gains an 'abstract'
    key holding the concatenated AbstractText sections.
    """
    pmids = [article['pmid'] for article in articles]
    index = {str(article['pmid']): article for article in articles}
    query = '&'.join(['id=' + str(i) for i in pmids])
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&rettype=xml&tool=%s&email=%s&%s' % (tool, email, query)
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    tree = ET.fromstring(payload)
    for node in tree.findall('.//PubmedArticle'):
        pmid = node.findtext('.//PMID')
        if pmid not in index:
            continue
        # itertext() flattens markup nested inside each AbstractText section.
        sections = [''.join(part.itertext()) for part in node.findall('.//Abstract/AbstractText')]
        index[pmid]['abstract'] = ' '.join(sections)
# Entry point: read the articles JSON named on the command line, enrich
# every article with its PubMed abstract in batches of 100, and write the
# augmented collection to stdout.
if len(sys.argv) != 2:
    print('usage: %s <articles-json>' % sys.argv[0])
    sys.exit(1)

with open(sys.argv[1]) as fp:
    articles = json.load(fp)

for batch in make_batches(articles, 100):
    fetch(batch)

print(json.dumps(articles, indent=2))
......@@ -15,4 +15,4 @@ for page in range(num_pages):
data = json.loads(response.read())
results.extend(data['results'])
print(json.dumps(results, indent=4))
print(json.dumps(results, indent=2))
......@@ -13,11 +13,12 @@ out="$dir/data/"`date '+%Y%m%d'`
mkdir -p "$out"
# CORD-19 metadata
curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata.csv"
python "$dir/cord19_csv2json.py" "$out/cord19-metadata.csv" > "$out/cord19-metadata.json"
curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
# litcovid
python "$dir/litcovid_scrapper.py" > "$out/litcovid.json"
python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json"
python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json"
# bibliovid
count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'`
......@@ -26,5 +27,5 @@ python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliov
python "$dir/bibliovid_add_abstract.py" "$out/bibliovid.json" < "$out/bibliovid_stage2.json"
# cleanup
rm "$out/cord19-metadata.csv" "$out/bibliovid_stage1.json" "$out/bibliovid_stage2.json"
rm "$out/cord19-metadata_stage*" "$out/litcovid_stage*" "$out/bibliovid_stage*"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment