diff --git a/run.sh b/run.sh index a64ca9a90e0b1bcaeb462a8f4e23282da915c25b..0f786baaea754239fd58e79ac14436af43eda6d8 100755 --- a/run.sh +++ b/run.sh @@ -10,7 +10,7 @@ set -e -u -o pipefail # output location out="$dir/data/"`date '+%Y%m%d'` -mkdir -p "$out" +mkdir -p "$out/folds" # CORD-19 metadata curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv" @@ -27,6 +27,10 @@ python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliov python "$dir/bibliovid_add_abstract.py" "$out/bibliovid_stage2.json" "$out/bibliovid_stage3.json" python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/bibliovid.json" +# generate folds +python "$dir/split_json_random.py" "$out/folds/bibliovid" 5 .1 .1 < "$out/bibliovid.json" +python "$dir/split_json_random.py" "$out/folds/litcovid" 5 .1 .1 < "$out/litcovid.json" + # cleanup rm "$out/"*_stage* diff --git a/split_json_random.py b/split_json_random.py new file mode 100644 index 0000000000000000000000000000000000000000..abddc83f2591c2534f0be3b24a567a702190017e --- /dev/null +++ b/split_json_random.py @@ -0,0 +1,27 @@ +import json +import sys +import random + +if len(sys.argv) != 5: + print('usage: %s <output-stem> <n-folds> <test-percent> <valid-percent>' % sys.argv[0]) + sys.exit(1) + +output_stem = sys.argv[1] +num_folds = int(sys.argv[2]) + +items = json.loads(sys.stdin.read()) + +num_test = int(float(sys.argv[3]) * len(items)) +num_valid = int(float(sys.argv[4]) * len(items)) + +for n in range(num_folds): + random.shuffle(items) + + with open(output_stem + '-%d.test' % n, 'w') as fp: + fp.write(json.dumps(items[:num_test])) + + with open(output_stem + '-%d.valid' % n, 'w') as fp: + fp.write(json.dumps(items[num_test: num_test + num_valid])) + + with open(output_stem + '-%d.train' % n, 'w') as fp: + fp.write(json.dumps(items[num_test + num_valid:]))