bibliovid_add_fulltext.py 1.23 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import json
import csv
import sys
import shutil
import os

def _load_dois(bibliovid_json):
    """Return the set of non-empty DOIs listed in the Bibliovid JSON file.

    The file is expected to hold a JSON array of article objects; articles
    without a "doi" key, or with an empty/null one, are ignored.
    """
    with open(bibliovid_json, encoding="utf-8") as fp:
        articles = json.load(fp)
    # A blank DOI must not become a lookup key: it would match every
    # metadata row that has no DOI at all.
    return {article["doi"] for article in articles if article.get("doi")}


def _select_rows(cord19_meta_csv, dois):
    """Collect the metadata rows whose DOI is in *dois*.

    Returns a ``(metadata, files)`` pair: *metadata* is a list with one
    ``{column name: value}`` dict per matching row, in file order; *files*
    is the de-duplicated, order-preserving list of the relative paths found
    in the ``pdf_json_files`` column (``;``-separated) of those rows.

    Raises KeyError if the header lacks a ``doi`` or ``pdf_json_files``
    column.
    """
    metadata = []
    files = []
    seen = set()
    # newline="" is required by the csv module so that line breaks inside
    # quoted fields are parsed correctly.
    with open(cord19_meta_csv, encoding="utf-8", newline="") as fp:
        reader = csv.reader(fp)
        # cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to match.
            return metadata, files
        headers = {name: index for index, name in enumerate(header)}
        doi_index = headers["doi"]
        files_index = headers["pdf_json_files"]
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            if row[doi_index] not in dois:
                continue
            metadata.append({name: row[index] for name, index in headers.items()})
            for name in row[files_index].split(';'):
                name = name.strip()
                if name != '' and name not in seen:
                    seen.add(name)
                    files.append(name)
    return metadata, files


def _copy_fulltexts(files, fulltext_dir, output_dir):
    """Copy each relative path in *files* from *fulltext_dir* into *output_dir*.

    Files are flattened: only the base name is kept in the destination.
    """
    # An empty string means "current directory" for os.path.join but is
    # rejected by os.makedirs, so only create a real path.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    for name in files:
        shutil.copyfile(os.path.join(fulltext_dir, name), os.path.join(output_dir, os.path.basename(name)))


def main(argv):
    """Run the script with command-line vector *argv* (program name first).

    Selects the CORD-19 metadata rows whose DOI appears in the Bibliovid
    JSON file, copies their PDF-parse JSON files into the destination
    directory, and prints the selected rows as a JSON array on stdout.

    Returns the process exit status: 1 on a usage error, 0 otherwise.
    """
    if len(argv) != 5:
        print('usage: %s <bibliovid.json> <cord19-metadata.csv> <cord19-fulltextdir> <dest-dir>' % argv[0], file=sys.stderr)
        return 1

    bibliovid_json, cord19_meta_csv, fulltext_dir, output_dir = argv[1:]

    dois = _load_dois(bibliovid_json)
    metadata, files = _select_rows(cord19_meta_csv, dois)
    _copy_fulltexts(files, fulltext_dir, output_dir)

    print(json.dumps(metadata))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))