Commit 89946775 authored by Benoit Favre's avatar Benoit Favre
Browse files

add script to normalize bibliovid data

parent a812e670
......@@ -8,7 +8,10 @@ from datetime import datetime, date
pubmed = PubMed(tool="https://covid19.lis-lab.fr", email="benoit.favre@univ-amu.fr")
articles = json.loads(sys.stdin.read())
with open(sys.argv[1]) as fp:
articles = json.loads(fp.read())
print(len(articles['results']), file=sys.stderr)
def normalize(text):
return re.sub('[^a-zA-Z]', '', text).lower()
......@@ -38,13 +41,12 @@ for article in articles['results']:
if found:
break
if not found:
print('NOT FOUND:', title)
print('NOT FOUND:', title, file=sys.stderr)
print('TOTAL', len(articles['results']))
print('TOTAL', len(articles['results']), file=sys.stderr)
for key, value in stats.items():
print(key, value, value / len(articles['results']))
print(key, value, value / len(articles['results']), file=sys.stderr)
with open(sys.argv[1], 'w') as fp:
fp.write(json.dumps(articles, indent=2))
print(json.dumps(articles, indent=2))
import sys, json


def normalize_article(article):
    """Normalize one bibliovid article record in place and return it.

    Adds/rewrites these keys:
    - 'topics': category name followed by the specialty names
    - 'author_list': the original structured author entries
    - 'authors': comma-separated author names (replaces the list)
    - 'publication_date': 'verbose_date' (dd.mm.yyyy) rearranged to yyyy-mm-dd
    """
    article['topics'] = [article['category']['name']] + [x['name'] for x in article['specialties']]
    # Keep the structured entries before flattening 'authors' to a string.
    article['author_list'] = article['authors']
    article['authors'] = ', '.join([x['name'] for x in article['authors']])
    day, month, year = article['verbose_date'].split('.')
    article['publication_date'] = '%s-%s-%s' % (year, month, day)
    return article


def main():
    """Read a bibliovid JSON dump (path in argv[1]), normalize it, print JSON."""
    with open(sys.argv[1]) as fp:
        articles = json.loads(fp.read())
    # The raw API dump wraps the list in {'results': [...]}; accept both forms.
    # isinstance (not type(...) ==) also tolerates dict subclasses.
    if isinstance(articles, dict):
        articles = articles['results']
    for article in articles:
        normalize_article(article)
    print(json.dumps(articles, indent=2))


if __name__ == '__main__':
    main()
......@@ -43,8 +43,9 @@ def fetch(articles):
for article in root.findall('.//PubmedArticle'):
pmid = article.findtext('.//PMID')
if pmid in by_id:
found = by_id[pmid]
abstract = ' '.join([''.join(line.itertext()) for line in article.findall('.//Abstract/AbstractText')])
by_id[pmid]['abstract'] = abstract
found['abstract'] = abstract
if len(sys.argv) != 2:
print('usage: %s <articles-json>' % sys.argv[0])
......
......@@ -13,19 +13,20 @@ out="$dir/data/"`date '+%Y%m%d'`
mkdir -p "$out"
# CORD-19 metadata
curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
#curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
#python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
# litcovid
python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json"
python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json"
#python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json"
#python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json"
# bibliovid
count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'`
curl "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json"
python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json"
python "$dir/bibliovid_add_abstract.py" "$out/bibliovid.json" < "$out/bibliovid_stage2.json"
#count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'`
#curl "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json"
#python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json"
python "$dir/bibliovid_add_abstract.py" "$out/bibliovid_stage2.json" > "$out/bibliovid_stage3.json"
python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/bibliovid.json"
# cleanup
rm "$out/cord19-metadata_stage*" "$out/litcovid_stage*" "$out/bibliovid_stage*"
rm "$out"/*stage*
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment