From a812e670d9ce533ac96b7aa46f23a1b94145a56d Mon Sep 17 00:00:00 2001
From: Benoit Favre <benoit.favre@lis-lab.fr>
Date: Fri, 29 May 2020 18:06:38 +0200
Subject: [PATCH] add litcovid abstract scraping

---
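Note for reviewers: litcovid_add_abstract.py reads the litcovid scrapper
output, queries NCBI efetch in batches of 100 PMIDs to attach abstracts,
and prints the augmented JSON on stdout. A minimal stand-alone invocation,
mirroring what run.sh does below:

  python litcovid_add_abstract.py litcovid_stage1.json > litcovid.json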
 bibliovid_add_abstract.py |  2 +-
 litcovid_add_abstract.py  | 60 +++++++++++++++++++++++++++++++++++++++
 litcovid_scrapper.py      |  2 +-
 run.sh                    |  9 +++---
 4 files changed, 67 insertions(+), 6 deletions(-)
 create mode 100644 litcovid_add_abstract.py

diff --git a/bibliovid_add_abstract.py b/bibliovid_add_abstract.py
index ab52c1d..04ac5e7 100644
--- a/bibliovid_add_abstract.py
+++ b/bibliovid_add_abstract.py
@@ -45,6 +45,6 @@ for key, value in stats.items():
   print(key, value, value / len(articles['results']))
 
 with open(sys.argv[1], 'w') as fp:
-  fp.write(json.dumps(articles, indent=2)
+  fp.write(json.dumps(articles, indent=2))
 
 
diff --git a/litcovid_add_abstract.py b/litcovid_add_abstract.py
new file mode 100644
index 0000000..e6f1cb5
--- /dev/null
+++ b/litcovid_add_abstract.py
@@ -0,0 +1,60 @@
+import json, sys
+from datetime import datetime, date
+import urllib.request
+import xml.etree.ElementTree as ET
+
+tool = "https://covid19.lis-lab.fr"
+email = "benoit.favre@univ-amu.fr"
+
+month_mapping = {
+'Jan': '01',
+'Feb': '02',
+'Mar': '03',
+'Apr': '04',
+'May': '05',
+'Jun': '06',
+'Jul': '07',
+'Aug': '08',
+'Sep': '09',
+'Oct': '10',
+'Nov': '11',
+'Dec': '12',
+}
+
+def map_month(text):
+  key = text[:3].lower().capitalize()
+  if key in month_mapping:
+    return month_mapping[key]
+  return text
+
+def make_batches(sequence, size=100):
+  i = 0
+  while i < len(sequence):
+    yield sequence[i: i + size]
+    i += size
+
+def fetch(articles):
+  ids = [article['pmid'] for article in articles]
+  by_id = {str(article['pmid']): article for article in articles}
+  url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&rettype=xml&tool=%s&email=%s&%s' % (tool, email, '&'.join(['id=' + str(i) for i in ids]))
+  with urllib.request.urlopen(url) as response:
+    read = response.read()
+    root = ET.fromstring(read)
+    for article in root.findall('.//PubmedArticle'):
+      pmid = article.findtext('.//PMID')
+      if pmid in by_id:
+        abstract = ' '.join([''.join(line.itertext()) for line in article.findall('.//Abstract/AbstractText')])
+        by_id[pmid]['abstract'] = abstract
+
+if len(sys.argv) != 2:
+  print('usage: %s <articles-json>' % sys.argv[0])
+  sys.exit(1)
+
+with open(sys.argv[1]) as fp:
+  articles = json.loads(fp.read())
+
+for batch in make_batches(articles, 100):
+  fetch(batch)
+
+print(json.dumps(articles, indent=2))
+
diff --git a/litcovid_scrapper.py b/litcovid_scrapper.py
index da0a2af..66821ad 100644
--- a/litcovid_scrapper.py
+++ b/litcovid_scrapper.py
@@ -15,4 +15,4 @@ for page in range(num_pages):
     data = json.loads(response.read())
   results.extend(data['results'])
 
-print(json.dumps(results, indent=4))
+print(json.dumps(results, indent=2))
diff --git a/run.sh b/run.sh
index fb464cd..d4cebe6 100755
--- a/run.sh
+++ b/run.sh
@@ -13,11 +13,12 @@ out="$dir/data/"`date '+%Y%m%d'`
 mkdir -p "$out"
 
 # CORD-19 metadata
-curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata.csv"
-python "$dir/cord19_csv2json.py" "$out/cord19-metadata.csv" > "$out/cord19-metadata.json"
+curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
+python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
 
 # litcovid
-python "$dir/litcovid_scrapper.py" > "$out/litcovid.json"
+python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json"
+python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json"
 
 # bibliovid
 count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'`
@@ -26,5 +27,5 @@ python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliov
 python "$dir/bibliovid_add_abstract.py" "$out/bibliovid.json" < "$out/bibliovid_stage2.json"
 
 # cleanup
-rm "$out/cord19-metadata.csv" "$out/bibliovid_stage1.json" "$out/bibliovid_stage2.json"
+rm "$out/cord19-metadata_stage*" "$out/litcovid_stage*" "$out/bibliovid_stage*"
 
-- 
GitLab