Commit 8b828fc0 authored by Benoit Favre
Browse files

add bibliovid fulltext from cord19

parent 27613256
import json
import csv
import sys
import shutil
import os
def _load_dois(bibliovid_json):
    """Return the set of non-empty DOIs found in the bibliovid JSON file.

    The file is expected to hold a JSON list of article objects; articles
    without a "doi" key, or with an empty/null one, are ignored so that they
    cannot match CORD-19 rows whose doi column is blank.
    """
    with open(bibliovid_json, encoding='utf-8') as fp:
        articles = json.load(fp)
    return {article['doi'] for article in articles if article.get('doi')}


def _select_cord19_rows(cord19_meta_csv, dois):
    """Return (metadata, files) for the CORD-19 rows whose DOI is in *dois*.

    *metadata* is a list with one {column name: value} dict per matching row,
    and *files* is the list of entries found in the ';'-separated
    'pdf_json_files' column of those rows (not yet stripped or filtered).
    """
    metadata = []
    files = []
    # newline='' lets the csv module handle line breaks inside quoted fields.
    with open(cord19_meta_csv, encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp)
        # cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
        headers = {name: index for index, name in enumerate(next(reader))}
        doi_column = headers['doi']
        files_column = headers['pdf_json_files']
        for row in reader:
            if row[doi_column] in dois:
                metadata.append({name: row[index] for name, index in headers.items()})
                files.extend(row[files_column].split(';'))
    return metadata, files


def _copy_fulltext(names, fulltext_dir, output_dir):
    """Copy each non-blank relative path in *names* from *fulltext_dir* to *output_dir*.

    Files are flattened: only the base name is kept in the destination.
    The destination directory is created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)
    for name in names:
        name = name.strip()
        if name:
            shutil.copyfile(os.path.join(fulltext_dir, name), os.path.join(output_dir, os.path.basename(name)))


def main(argv=None):
    """Copy CORD-19 fulltext for bibliovid articles and print their metadata.

    argv: command line as a list (program name first); defaults to sys.argv.
    Expects four arguments: the bibliovid JSON file, the CORD-19 metadata
    CSV, the CORD-19 fulltext directory and the destination directory.
    The metadata of matching CORD-19 rows is written to stdout as JSON.
    Returns the process exit status (1 on a usage error, 0 otherwise).
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 5:
        print('usage: %s <bibliovid.json> <cord19-metadata.csv> <cord19-fulltextdir> <dest-dir>' % argv[0], file=sys.stderr)
        return 1
    bibliovid_json, cord19_meta_csv, fulltext_dir, output_dir = argv[1:]
    dois = _load_dois(bibliovid_json)
    metadata, files = _select_cord19_rows(cord19_meta_csv, dois)
    _copy_fulltext(files, fulltext_dir, output_dir)
    print(json.dumps(metadata))
    return 0


if __name__ == '__main__':
    sys.exit(main())
......@@ -31,6 +31,14 @@ python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/biblio
# split both corpora into random folds
python "$dir/split_json_random.py" "$out/folds/bibliovid" 5 .1 .1 < "$out/bibliovid.json"
python "$dir/split_json_random.py" "$out/folds/litcovid" 5 .1 .1 < "$out/litcovid.json"
# collect fulltext from cord-19 for bibliovid papers
# TODO: also download fulltext
# location of the local cord-19 snapshot (metadata.csv plus fulltext files)
cord19_dir=../../cord-19/2021-05-24
if [ ! -d "$cord19_dir" ]; then
echo "ERROR: could not find cord-19 fulltext in $cord19_dir/" >&2
exit 1
fi
# make sure the destination for the copied fulltext files exists
mkdir -p "$out/bibliovid_fulltext"
python "$dir/bibliovid_add_fulltext.py" "$out/bibliovid.json" "$cord19_dir/metadata.csv" "$cord19_dir/" "$out/bibliovid_fulltext" > "$out/bibliovid_meta.json"
# cleanup: remove intermediate stage files
rm "$out/"*_stage*
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment