diff --git a/bibliovid_add_fulltext.py b/bibliovid_add_fulltext.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ac3fb5d6f51ea04b459591f011d16deae7e298a
--- /dev/null
+++ b/bibliovid_add_fulltext.py
@@ -0,0 +1,69 @@
+"""Collect CORD-19 metadata and full text for Bibliovid articles.
+
+Selects the CORD-19 metadata rows whose DOI matches a Bibliovid article,
+copies the PDF-derived JSON files listed for those rows into <dest-dir>
+and prints the selected rows to stdout as a JSON list.
+"""
+import json
+import csv
+import sys
+import shutil
+import os
+
+
+def load_dois(bibliovid_json):
+    """Return the set of non-empty DOIs found in the Bibliovid JSON file."""
+    with open(bibliovid_json, encoding='utf-8') as fp:
+        articles = json.load(fp)
+    # Skip missing/empty DOIs: a metadata row with an empty doi column yields
+    # an empty string, so an empty key would match every one of those rows.
+    return {article['doi'] for article in articles if article.get('doi')}
+
+
+def select_rows(cord19_meta_csv, dois):
+    """Return (rows, files) for the CORD-19 rows whose DOI is in dois.
+
+    rows is a list of {column name: value} dicts in file order; files is
+    the de-duplicated list of relative paths named in pdf_json_files.
+    """
+    rows = []
+    files = {}
+    # newline='' lets the csv module handle line breaks inside quoted fields.
+    with open(cord19_meta_csv, encoding='utf-8', newline='') as fp:
+        # cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
+        for row in csv.DictReader(fp):
+            if row['doi'] not in dois:
+                continue
+            rows.append(row)
+            # A row may list several ';'-separated files. A dict is used as
+            # an ordered set so that each file is copied only once.
+            for name in (row['pdf_json_files'] or '').split(';'):
+                name = name.strip()
+                if name:
+                    files[name] = None
+    return rows, list(files)
+
+
+def copy_fulltext(files, fulltext_dir, output_dir):
+    """Copy files (relative to fulltext_dir) flat into output_dir."""
+    # The destination may not exist yet on a fresh run.
+    os.makedirs(output_dir, exist_ok=True)
+    for name in files:
+        shutil.copyfile(os.path.join(fulltext_dir, name),
+                        os.path.join(output_dir, os.path.basename(name)))
+
+
+def main(argv):
+    """Run the script on argv; return the process exit status."""
+    if len(argv) != 5:
+        print('usage: %s <bibliovid.json> <cord19-metadata.csv> <cord19-fulltextdir> <dest-dir>' % argv[0], file=sys.stderr)
+        return 1
+    bibliovid_json, cord19_meta_csv, fulltext_dir, output_dir = argv[1:]
+    rows, files = select_rows(cord19_meta_csv, load_dois(bibliovid_json))
+    copy_fulltext(files, fulltext_dir, output_dir)
+    print(json.dumps(rows))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
diff --git a/run.sh b/run.sh
index 0f786baaea754239fd58e79ac14436af43eda6d8..6eedc695871419be1c0dae0c49a71ea72de7e2ab 100755
--- a/run.sh
+++ b/run.sh
@@ -31,6 +31,14 @@ python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/biblio
 python "$dir/split_json_random.py" "$out/folds/bibliovid" 5 .1 .1 < "$out/bibliovid.json"
 python "$dir/split_json_random.py" "$out/folds/litcovid" 5 .1 .1 < "$out/litcovid.json"
 
+# collect fulltext from cord-19 for bibliovid papers
+# TODO: also download fulltext
+if [ ! -d ../../cord-19/2021-05-24/ ]; then
+    echo "ERROR: could not find cord-19 fulltext in ../../cord-19/2021-05-24/" >&2
+    exit 1
+fi
+python "$dir/bibliovid_add_fulltext.py" "$out/bibliovid.json" ../../cord-19/2021-05-24/metadata.csv ../../cord-19/2021-05-24/ "$out/bibliovid_fulltext" > "$out/bibliovid_meta.json"
+
 # cleanup
 rm "$out/"*_stage*
 