Skip to content
Snippets Groups Projects
Commit 8b828fc0 authored by Benoit Favre's avatar Benoit Favre
Browse files

add bibliovid fulltext from cord19

parent 27613256
No related branches found
No related tags found
No related merge requests found
import json
import csv
import sys
import shutil
import os
def load_bibliovid_dois(bibliovid_json):
    """Return the set of usable DOIs listed in the Bibliovid JSON file.

    The file is expected to hold a JSON list of article objects.  Articles
    without a "doi" key, or whose DOI is empty / not a string, are ignored:
    an empty DOI would otherwise match every CORD-19 row that has no DOI.
    """
    with open(bibliovid_json, encoding='utf-8') as fp:
        articles = json.load(fp)
    dois = set()
    for article in articles:
        doi = article.get("doi")
        if isinstance(doi, str) and doi.strip():
            dois.add(doi.strip())
    return dois


def select_cord19_rows(cord19_meta_csv, dois):
    """Collect the CORD-19 metadata rows whose DOI is in *dois*.

    Returns a ``(metadata, files)`` pair: *metadata* is a list of dicts
    mapping each CSV column name to the row's value, and *files* is the list
    of raw entries found in the ``pdf_json_files`` column of the matching
    rows (entries are ';'-separated in the CSV and are not stripped here).

    Raises KeyError if the CSV has no "doi" or "pdf_json_files" column.
    """
    metadata = []
    files = []
    # newline='' lets the csv module handle quoted fields that contain
    # line breaks, as recommended by the csv documentation.
    with open(cord19_meta_csv, newline='', encoding='utf-8') as fp:
        reader = csv.reader(fp)
        # cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
        headers = {name: index for index, name in enumerate(next(reader))}
        doi_column = headers["doi"]
        files_column = headers["pdf_json_files"]
        for row in reader:
            if not row:
                # csv.reader yields an empty list for blank lines
                continue
            if row[doi_column].strip() in dois:
                metadata.append({name: row[index] for name, index in headers.items()})
                files.extend(row[files_column].split(';'))
    return metadata, files


def copy_fulltext(files, fulltext_dir, output_dir):
    """Copy each named full-text file from *fulltext_dir* into *output_dir*.

    Names are relative to *fulltext_dir*; surrounding whitespace is removed,
    empty names are skipped and duplicates are copied only once.  Files are
    flattened into *output_dir* under their base name.  The destination
    directory is created if needed.

    Returns the list of destination paths, in copy order.
    Raises FileNotFoundError if a listed source file does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    copied = []
    # dict.fromkeys removes duplicates while keeping first-seen order
    for name in dict.fromkeys(entry.strip() for entry in files):
        if name == '':
            continue
        destination = os.path.join(output_dir, os.path.basename(name))
        shutil.copyfile(os.path.join(fulltext_dir, name), destination)
        copied.append(destination)
    return copied


def main(argv=None):
    """Command-line entry point.

    Copies the CORD-19 full-text files of the papers listed in the Bibliovid
    JSON file to the destination directory, and prints the matching CORD-19
    metadata rows as a JSON list on stdout.  Exits with status 1 after
    printing a usage message when the argument count is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 5:
        print('usage: %s <bibliovid.json> <cord19-metadata.csv> <cord19-fulltextdir> <dest-dir>' % argv[0], file=sys.stderr)
        sys.exit(1)
    bibliovid_json, cord19_meta_csv, fulltext_dir, output_dir = argv[1:]

    dois = load_bibliovid_dois(bibliovid_json)
    metadata, files = select_cord19_rows(cord19_meta_csv, dois)
    copy_fulltext(files, fulltext_dir, output_dir)
    print(json.dumps(metadata))


if __name__ == '__main__':
    main()
......@@ -31,6 +31,14 @@ python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/biblio
# Split each corpus with split_json_random.py: the corpus JSON is fed on stdin
# and the first argument is the output prefix under "$out/folds/".
# NOTE(review): "5 .1 .1" is presumably the fold count followed by two held-out
# fractions -- confirm against split_json_random.py.
python "$dir/split_json_random.py" "$out/folds/bibliovid" 5 .1 .1 < "$out/bibliovid.json"
python "$dir/split_json_random.py" "$out/folds/litcovid" 5 .1 .1 < "$out/litcovid.json"
# collect fulltext from cord-19 for bibliovid papers
# TODO: also download fulltext
# CORD-19 snapshot directory: holds metadata.csv and the full-text files that
# metadata.csv references by relative path.
cord19_dir=../../cord-19/2021-05-24
if [ ! -d "$cord19_dir/" ]; then
  echo "ERROR: could not find cord-19 fulltext in $cord19_dir/" >&2
  exit 1
fi
# The destination directory must exist before files are copied into it.
mkdir -p "$out/bibliovid_fulltext"
# Copies the matching full-text files and writes the matching CORD-19 metadata
# rows (JSON on stdout) to bibliovid_meta.json.
python "$dir/bibliovid_add_fulltext.py" "$out/bibliovid.json" "$cord19_dir/metadata.csv" "$cord19_dir/" "$out/bibliovid_fulltext" > "$out/bibliovid_meta.json"
# cleanup: delete every file directly under "$out" whose name contains "_stage"
# (e.g. bibliovid_stage3.json); the glob sits outside the quotes so it expands.
rm "$out/"*_stage*
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment