Skip to content
Snippets Groups Projects
Commit a82050a9 authored by Benoit Favre
Browse files

fix bugs in bibliovid scrapper

parent 89946775
No related branches found
No related tags found
No related merge requests found
...@@ -43,10 +43,11 @@ for article in articles['results']: ...@@ -43,10 +43,11 @@ for article in articles['results']:
if not found: if not found:
print('NOT FOUND:', title, file=sys.stderr) print('NOT FOUND:', title, file=sys.stderr)
print(json.dumps(articles, indent=2))
print('TOTAL', len(articles['results']), file=sys.stderr) print('TOTAL', len(articles['results']), file=sys.stderr)
for key, value in stats.items(): for key, value in stats.items():
print(key, value, value / len(articles['results'], file=sys.stderr)) print(key, value, value / len(articles['results']), file=sys.stderr)
print(json.dumps(articles, indent=2))
...@@ -13,20 +13,20 @@ out="$dir/data/"`date '+%Y%m%d'` ...@@ -13,20 +13,20 @@ out="$dir/data/"`date '+%Y%m%d'`
mkdir -p "$out" mkdir -p "$out"
# CORD-19 metadata # CORD-19 metadata
#curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv" curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
#python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json" python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
# litcovid # litcovid
#python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json" python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json"
#python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json" python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json"
# bibliovid # bibliovid
#count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'` count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'`
#curl "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json" curl "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json"
#python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json" python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json"
python "$dir/bibliovid_add_abstract.py" "$out/bibliovid_stage2.json" > "$out/bibliovid_stage3.json" python "$dir/bibliovid_add_abstract.py" "$out/bibliovid_stage2.json" > "$out/bibliovid_stage3.json"
python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/bibliovid.json" python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/bibliovid.json"
# cleanup # cleanup
rm "$out/*stage*" rm "$out/"*_stage*
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment