run.sh 1.74 KB
Newer Older
Benoit Favre's avatar
Benoit Favre committed
1
2
3
4
5
6
7
8
9
10
11
12
#!/bin/bash

# Directory holding this script; the helper scripts, the virtualenv and the
# data/ tree are all addressed relative to it.
dir=$(dirname "$0")

# Enter the project's virtualenv. This is done before strict mode is
# switched on below.
source "${dir}/env/bin/activate"

# Strict mode: abort on the first failing command (-e), on any reference to
# an unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Output directory for this run, named after today's date (YYYYMMDD).
out="${dir}/data/$(date '+%Y%m%d')"
Benoit Favre's avatar
Benoit Favre committed
13
# Create the output directory and its folds/ subdirectory in one call;
# -p also creates missing parents and does not fail if they already exist.
mkdir -p "${out}/folds"
Benoit Favre's avatar
Benoit Favre committed
14
15

# CORD-19 metadata
Benoit Favre's avatar
Benoit Favre committed
16
17
# Download the latest CORD-19 metadata table, then convert it to JSON.
# --fail makes curl exit non-zero on an HTTP error (e.g. 403/404) instead of
# saving the server's error page as if it were the CSV, so `set -e` stops the
# run before the converter is fed a bogus file.
curl --fail https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
# cord19_csv2json.py receives the CSV path as its argument; its stdout becomes
# the final cord19-metadata.json.
python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
Benoit Favre's avatar
Benoit Favre committed
18
19

# litcovid
Benoit Favre's avatar
Benoit Favre committed
20
21
# Stage 1: capture the stdout of litcovid_scrapper.py in an intermediate file.
python "${dir}/litcovid_scrapper.py" \
  > "${out}/litcovid_stage1.json"
# Stage 2: litcovid_add_abstract.py is given the stage-1 file as its argument;
# its stdout becomes the final litcovid.json.
python "${dir}/litcovid_add_abstract.py" "${out}/litcovid_stage1.json" \
  > "${out}/litcovid.json"
Benoit Favre's avatar
Benoit Favre committed
22
23

# bibliovid
Benoit Favre's avatar
Benoit Favre committed
24
25
26
# Ask the bibliovid API how many posts exist, then request all of them in a
# single page.
# --fail: curl exits non-zero on HTTP errors instead of passing an error body
# down the pipeline.
# The "count" field is read with a real JSON parser rather than grep, so a
# nested "count" key or a change in pretty-printing cannot corrupt the value.
count=$(
  curl --fail 'https://bibliovid.org/api/v1/posts?format=json' \
    | python -c 'import json, sys; print(json.load(sys.stdin)["count"])'
)
# Refuse to build the next URL from anything that is not a plain integer.
if [[ ! "$count" =~ ^[0-9]+$ ]]; then
  echo "ERROR: unexpected post count from bibliovid API: '$count'" >&2
  exit 1
fi
# Fetch every post and store the pretty-printed JSON as the stage-1 file.
curl --fail "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" \
  | python -mjson.tool > "$out/bibliovid_stage1.json"
# bibliovid_scrapper.py receives the stage-1 file as its argument; its stdout
# becomes the stage-2 file.
python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json"
Benoit Favre's avatar
Benoit Favre committed
27
# Unlike the other stages, no stdout redirection is used here: both the
# stage-2 and the stage-3 paths are passed as arguments (presumably the script
# writes the second one itself -- confirm in bibliovid_add_abstract.py).
python "${dir}/bibliovid_add_abstract.py" \
  "${out}/bibliovid_stage2.json" \
  "${out}/bibliovid_stage3.json"
28
# bibliovid_normalize.py receives the stage-3 file as its argument; its stdout
# becomes the final bibliovid.json.
python "${dir}/bibliovid_normalize.py" "${out}/bibliovid_stage3.json" \
  > "${out}/bibliovid.json"
Benoit Favre's avatar
Benoit Favre committed
29

Benoit Favre's avatar
Benoit Favre committed
30
31
32
33
# generate folds
# split_json_random.py is run once per corpus with that corpus' JSON on stdin
# and "$out/folds/<corpus>" as its first argument. The remaining arguments
# (5 .1 .1) are passed unchanged; presumably fold count and held-out
# fractions -- confirm in split_json_random.py.
for corpus in bibliovid litcovid; do
  python "${dir}/split_json_random.py" "${out}/folds/${corpus}" 5 .1 .1 < "${out}/${corpus}.json"
done

34
35
36
37
38
39
40
41
# collect fulltext from cord-19 for bibliovid papers
# TODO: also download fulltext
# Location of the local CORD-19 snapshot. Defaults to the historical
# hard-coded path (relative to the current working directory); set CORD19_DIR
# in the environment to point at a different snapshot.
cord19_dir="${CORD19_DIR:-../../cord-19/2021-05-24}"
# Drop one trailing slash so the paths built below never contain "//".
cord19_dir="${cord19_dir%/}"
if [[ ! -d "$cord19_dir/" ]]; then
  echo "ERROR: could not find cord-19 fulltext in $cord19_dir/" >&2
  exit 1
fi
# Arguments: bibliovid JSON, the snapshot's metadata.csv, the snapshot
# directory (with trailing slash, as before), and "$out/bibliovid_fulltext";
# stdout is captured as bibliovid_meta.json.
python "$dir/bibliovid_add_fulltext.py" "$out/bibliovid.json" "$cord19_dir/metadata.csv" "$cord19_dir/" "$out/bibliovid_fulltext" > "$out/bibliovid_meta.json"

Benoit Favre's avatar
Benoit Favre committed
42
# cleanup
Benoit Favre's avatar
Benoit Favre committed
43
# Delete every intermediate file in the output directory whose name
# contains "_stage"; only the final outputs remain.
rm "${out}"/*_stage*
Benoit Favre's avatar
Benoit Favre committed
44