From 8b828fc011fe1bc159aaa5dfce27696c5edd14b5 Mon Sep 17 00:00:00 2001
From: Benoit Favre <benoit.favre@lis-lab.fr>
Date: Mon, 31 May 2021 14:00:52 +0200
Subject: [PATCH] add bibliovid fulltext from cord19

---
 bibliovid_add_fulltext.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 run.sh                    |  9 +++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 bibliovid_add_fulltext.py

diff --git a/bibliovid_add_fulltext.py b/bibliovid_add_fulltext.py
new file mode 100644
index 0000000..6ac3fb5
--- /dev/null
+++ b/bibliovid_add_fulltext.py
@@ -0,0 +1,51 @@
+"""Add CORD-19 fulltext and metadata to bibliovid articles.
+
+Bibliovid articles are matched against the CORD-19 metadata table by DOI.
+The files listed in the ";"-separated "pdf_json_files" column of every
+matching row are copied into the destination directory, and the matching
+metadata rows are printed to stdout as a JSON list of objects.
+"""
+
+import json
+import csv
+import sys
+import shutil
+import os
+
+if len(sys.argv) != 5:
+    print('usage: %s <bibliovid.json> <cord19-metadata.csv> <cord19-fulltextdir> <dest-dir>' % sys.argv[0], file=sys.stderr)
+    sys.exit(1)
+
+bibliovid_json, cord19_meta_csv, fulltext_dir, output_dir = sys.argv[1:]
+
+with open(bibliovid_json) as fp:
+    articles = json.load(fp)
+
+# Only membership is tested below, so a set of DOIs is sufficient. Articles
+# with a missing or empty DOI are skipped: an empty DOI would otherwise match
+# every metadata row whose "doi" field is empty.
+known_dois = {article["doi"] for article in articles if article.get("doi")}
+
+# newline='' lets the csv module handle line breaks inside quoted fields itself.
+with open(cord19_meta_csv, newline='') as fp:
+    reader = csv.reader(fp)
+    # cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
+    headers = {name: index for index, name in enumerate(next(reader))}
+    metadata = []
+    files = []
+    for row in reader:
+        doi = row[headers["doi"]]
+        if doi in known_dois:
+            metadata.append({name: row[index] for name, index in headers.items()})
+            files.extend(row[headers['pdf_json_files']].split(';'))
+
+# shutil.copyfile() does not create missing directories, so do it up front.
+os.makedirs(output_dir, exist_ok=True)
+
+for name in files:
+    name = name.strip()
+    if name:
+        # Only the basename is kept: the copies end up flat in output_dir.
+        shutil.copyfile(os.path.join(fulltext_dir, name), os.path.join(output_dir, os.path.basename(name)))
+
+print(json.dumps(metadata))
diff --git a/run.sh b/run.sh
index 0f786ba..6eedc69 100755
--- a/run.sh
+++ b/run.sh
@@ -31,6 +31,15 @@ python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/biblio
 python "$dir/split_json_random.py" "$out/folds/bibliovid" 5 .1 .1 < "$out/bibliovid.json"
 python "$dir/split_json_random.py" "$out/folds/litcovid" 5 .1 .1 < "$out/litcovid.json"
 
+# collect fulltext from cord-19 for bibliovid papers
+# TODO: also download fulltext
+cord19=../../cord-19/2021-05-24
+if [ ! -d "$cord19" ]; then
+    echo "ERROR: could not find cord-19 fulltext in $cord19/" >&2
+    exit 1
+fi
+python "$dir/bibliovid_add_fulltext.py" "$out/bibliovid.json" "$cord19/metadata.csv" "$cord19/" "$out/bibliovid_fulltext" > "$out/bibliovid_meta.json"
+
 # cleanup
 rm "$out/"*_stage*
 
-- 
GitLab