From 8b828fc011fe1bc159aaa5dfce27696c5edd14b5 Mon Sep 17 00:00:00 2001
From: Benoit Favre <benoit.favre@lis-lab.fr>
Date: Mon, 31 May 2021 14:00:52 +0200
Subject: [PATCH] add bibliovid fulltext from cord19

---
 bibliovid_add_fulltext.py | 38 ++++++++++++++++++++++++++++++++++++++
 run.sh                    |  8 ++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 bibliovid_add_fulltext.py

diff --git a/bibliovid_add_fulltext.py b/bibliovid_add_fulltext.py
new file mode 100644
index 0000000..6ac3fb5
--- /dev/null
+++ b/bibliovid_add_fulltext.py
@@ -0,0 +1,38 @@
+import json
+import csv
+import sys
+import shutil
+import os
+
+if len(sys.argv) != 5:
+    print('usage: %s <bibliovid.json> <cord19-metadata.csv> <cord19-fulltextdir> <dest-dir>' % sys.argv[0], file=sys.stderr)
+    sys.exit(1)
+
+bibliovid_json, cord19_meta_csv, fulltext_dir, output_dir = sys.argv[1:]
+
+with open(bibliovid_json, encoding='utf-8') as fp:
+    articles = json.load(fp)
+
+# Skip missing/empty DOIs: an empty key would match every cord19 row whose doi column is empty.
+by_doi = {article["doi"]: i for i, article in enumerate(articles) if article.get("doi")}
+
+# newline='' lets the csv module handle newlines embedded in quoted fields.
+with open(cord19_meta_csv, newline='', encoding='utf-8') as fp:
+    reader = csv.reader(fp)
+    # cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
+    headers = {name: index for index, name in enumerate(next(reader))}
+    metadata, files = [], []
+    for row in reader:
+        if row[headers["doi"]] in by_doi:
+            metadata.append({name: row[index] for name, index in headers.items()})
+            files.extend(row[headers['pdf_json_files']].split(';'))
+
+# shutil.copyfile does not create the destination directory, so make sure it exists first.
+os.makedirs(output_dir, exist_ok=True)
+for name in files:
+    name = name.strip()
+    if name != '':
+        shutil.copyfile(os.path.join(fulltext_dir, name), os.path.join(output_dir, os.path.basename(name)))
+
+print(json.dumps(metadata))
diff --git a/run.sh b/run.sh
index 0f786ba..6eedc69 100755
--- a/run.sh
+++ b/run.sh
@@ -31,6 +31,14 @@ python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/biblio
 python "$dir/split_json_random.py" "$out/folds/bibliovid" 5 .1 .1 < "$out/bibliovid.json"
 python "$dir/split_json_random.py" "$out/folds/litcovid" 5 .1 .1 < "$out/litcovid.json"
 
+# collect cord-19 metadata and fulltext json for bibliovid papers (matched by doi)
+# TODO: also download fulltext
+if [ ! -d ../../cord-19/2021-05-24/ ]; then
+  echo "ERROR: could not find cord-19 fulltext in ../../cord-19/2021-05-24/" >&2
+  exit 1
+fi
+python "$dir/bibliovid_add_fulltext.py" "$out/bibliovid.json" ../../cord-19/2021-05-24/metadata.csv ../../cord-19/2021-05-24/ "$out/bibliovid_fulltext" > "$out/bibliovid_meta.json"
+
 # cleanup
 rm "$out/"*_stage*
 
-- 
GitLab