Commit 89946775 authored by Benoit Favre

add script to normalize bibliovid data

parent a812e670
@@ -8,7 +8,10 @@ from datetime import datetime, date
 pubmed = PubMed(tool="https://covid19.lis-lab.fr", email="benoit.favre@univ-amu.fr")
-articles = json.loads(sys.stdin.read())
+with open(sys.argv[1]) as fp:
+    articles = json.loads(fp.read())
+print(len(articles['results']), file=sys.stderr)
 
 def normalize(text):
     return re.sub('[^a-zA-Z]', '', text).lower()
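
Note: normalize() reduces a title to its letters only, lowercased, so the lookup against PubMed survives differences in punctuation, spacing, and case. A minimal sketch of the matching key (the sample titles are invented):

import re

def normalize(text):
    return re.sub('[^a-zA-Z]', '', text).lower()

# Two renderings of the same made-up title normalize to the same key:
assert normalize('COVID-19: A Review') == normalize('covid 19 -- a review')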
@@ -38,13 +41,12 @@ for article in articles['results']:
         if found:
             break
     if not found:
-        print('NOT FOUND:', title)
+        print('NOT FOUND:', title, file=sys.stderr)
-print('TOTAL', len(articles['results']))
+print('TOTAL', len(articles['results']), file=sys.stderr)
 for key, value in stats.items():
-    print(key, value, value / len(articles['results']))
+    print(key, value, value / len(articles['results']), file=sys.stderr)
-with open(sys.argv[1], 'w') as fp:
-    fp.write(json.dumps(articles, indent=2))
+print(json.dumps(articles, indent=2))
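
After this change the script (apparently bibliovid_add_abstract.py, given how the pipeline script below invokes it) takes its input file as an argument, writes the enriched JSON to stdout, and sends every diagnostic (misses, totals, stats) to stderr, so nothing stray can corrupt the JSON stream captured by a shell redirect. A small illustration of the convention (the values are placeholders):

import json, sys

print('TOTAL', 123, file=sys.stderr)          # diagnostic: stays on the terminal
print(json.dumps({'results': []}, indent=2))  # payload: captured by '> out.json'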
bibliovid_normalize.py (new file; referenced by the pipeline script below):

import sys, json

with open(sys.argv[1]) as fp:
    articles = json.loads(fp.read())
if isinstance(articles, dict):
    articles = articles['results']
for article in articles:
    article['topics'] = [article['category']['name']] + [x['name'] for x in article['specialties']]
    article['author_list'] = article['authors']
    article['authors'] = ', '.join([x['name'] for x in article['authors']])
    day, month, year = article['verbose_date'].split('.')
    article['publication_date'] = '%s-%s-%s' % (year, month, day)
print(json.dumps(articles, indent=2))
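
For reference, a sketch of what the normalizer does to a single record; the field values below are invented, but the field names match the script:

record = {
    'category': {'name': 'Therapeutics'},
    'specialties': [{'name': 'Virology'}, {'name': 'Intensive care'}],
    'authors': [{'name': 'A. Author'}, {'name': 'B. Author'}],
    'verbose_date': '17.04.2020',  # Bibliovid dates are day.month.year
}
# After the loop body runs, the record gains flat, display-ready fields:
#   topics           == ['Therapeutics', 'Virology', 'Intensive care']
#   author_list      == the original list of author dicts
#   authors          == 'A. Author, B. Author'
#   publication_date == '2020-04-17'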
@@ -43,8 +43,9 @@ def fetch(articles):
     for article in root.findall('.//PubmedArticle'):
         pmid = article.findtext('.//PMID')
         if pmid in by_id:
+            found = by_id[pmid]
             abstract = ' '.join([''.join(line.itertext()) for line in article.findall('.//Abstract/AbstractText')])
-            by_id[pmid]['abstract'] = abstract
+            found['abstract'] = abstract
 
 if len(sys.argv) != 2:
     print('usage: %s <articles-json>' % sys.argv[0])
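
The found = by_id[pmid] alias simply avoids repeating the dictionary lookup when attaching the abstract. The ''.join(line.itertext()) in the context line matters because PubMed AbstractText nodes can carry inline markup, which .text alone would truncate; a small demonstration (the sample XML is made up):

import xml.etree.ElementTree as ET

node = ET.fromstring('<AbstractText>Binds <i>ACE2</i> receptors.</AbstractText>')
print(node.text)                 # 'Binds '  (stops at the first child element)
print(''.join(node.itertext()))  # 'Binds ACE2 receptors.'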
@@ -13,19 +13,20 @@ out="$dir/data/"`date '+%Y%m%d'`
 mkdir -p "$out"
 
 # CORD-19 metadata
-curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
-python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
+#curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
+#python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
 
 # litcovid
-python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json"
-python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json"
+#python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json"
+#python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json"
 
 # bibliovid
-count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'`
-curl "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json"
-python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json"
-python "$dir/bibliovid_add_abstract.py" "$out/bibliovid.json" < "$out/bibliovid_stage2.json"
+#count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'`
+#curl "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json"
+#python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json"
+python "$dir/bibliovid_add_abstract.py" "$out/bibliovid_stage2.json" > "$out/bibliovid_stage3.json"
+python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/bibliovid.json"
 
 # cleanup
-rm "$out/cord19-metadata_stage*" "$out/litcovid_stage*" "$out/bibliovid_stage*"
+rm "$out/"*stage*