From 89946775c65b7eac345e384de7319ae4734712e4 Mon Sep 17 00:00:00 2001 From: Benoit Favre <benoit.favre@lis-lab.fr> Date: Mon, 1 Jun 2020 23:07:01 +0200 Subject: [PATCH] add script to normalize bibliovid data --- bibliovid_add_abstract.py | 14 ++++++++------ bibliovid_normalize.py | 17 +++++++++++++++++ litcovid_add_abstract.py | 3 ++- run.sh | 19 ++++++++++--------- 4 files changed, 37 insertions(+), 16 deletions(-) create mode 100644 bibliovid_normalize.py diff --git a/bibliovid_add_abstract.py b/bibliovid_add_abstract.py index 04ac5e7..d3ffc90 100644 --- a/bibliovid_add_abstract.py +++ b/bibliovid_add_abstract.py @@ -8,7 +8,10 @@ from datetime import datetime, date pubmed = PubMed(tool="https://covid19.lis-lab.fr", email="benoit.favre@univ-amu.fr") -articles = json.loads(sys.stdin.read()) +with open(sys.argv[1]) as fp: + articles = json.loads(fp.read()) + +print(len(articles['results']), file=sys.stderr) def normalize(text): return re.sub('[^a-zA-Z]', '', text).lower() @@ -38,13 +41,12 @@ for article in articles['results']: if found: break if not found: - print('NOT FOUND:', title) + print('NOT FOUND:', title, file=sys.stderr) -print('TOTAL', len(articles['results'])) +print('TOTAL', len(articles['results']), file=sys.stderr) for key, value in stats.items(): - print(key, value, value / len(articles['results'])) + print(key, value, value / len(articles['results']), file=sys.stderr) -with open(sys.argv[1], 'w') as fp: - fp.write(json.dumps(articles, indent=2)) +print(json.dumps(articles, indent=2)) diff --git a/bibliovid_normalize.py b/bibliovid_normalize.py new file mode 100644 index 0000000..407b2c8 --- /dev/null +++ b/bibliovid_normalize.py @@ -0,0 +1,17 @@ +import sys, json + +with open(sys.argv[1]) as fp: + articles = json.loads(fp.read()) + +if type(articles) == dict: + articles = articles['results'] + +for article in articles: + article['topics'] = [article['category']['name']] + [x['name'] for x in article['specialties']] + article['author_list'] = 
article['authors'] + article['authors'] = ', '.join([x['name'] for x in article['authors']]) + day, month, year = article['verbose_date'].split('.') + article['publication_date'] = '%s-%s-%s' % (year, month, day) + +print(json.dumps(articles, indent=2)) + diff --git a/litcovid_add_abstract.py b/litcovid_add_abstract.py index e6f1cb5..d74bf74 100644 --- a/litcovid_add_abstract.py +++ b/litcovid_add_abstract.py @@ -43,8 +43,9 @@ def fetch(articles): for article in root.findall('.//PubmedArticle'): pmid = article.findtext('.//PMID') if pmid in by_id: + found = by_id[pmid] abstract = ' '.join([''.join(line.itertext()) for line in article.findall('.//Abstract/AbstractText')]) - by_id[pmid]['abstract'] = abstract + found['abstract'] = abstract if len(sys.argv) != 2: print('usage: %s <articles-json>' % sys.argv[0]) diff --git a/run.sh b/run.sh index d4cebe6..7ca228d 100755 --- a/run.sh +++ b/run.sh @@ -13,19 +13,20 @@ out="$dir/data/"`date '+%Y%m%d'` mkdir -p "$out" # CORD-19 metadata -curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv" -python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json" +#curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv" +#python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json" # litcovid -python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json" -python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json" +#python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json" +#python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json" # bibliovid -count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'` -curl "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool 
> "$out/bibliovid_stage1.json" -python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json" -python "$dir/bibliovid_add_abstract.py" "$out/bibliovid.json" < "$out/bibliovid_stage2.json" +#count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'` +#curl "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json" +#python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json" +python "$dir/bibliovid_add_abstract.py" "$out/bibliovid_stage2.json" > "$out/bibliovid_stage3.json" +python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/bibliovid.json" # cleanup -rm "$out/cord19-metadata_stage*" "$out/litcovid_stage*" "$out/bibliovid_stage*" +rm "$out"/*stage* -- GitLab