repackage for release

7dcdb8e7 · Benoit Favre · 7dcdb8e7 · 7dcdb8e7 · 7dcdb8e7 · 7dcdb8e7
Commit 7dcdb8e7 authored 4 years ago by Benoit Favre
--- a/LICENSE
+++ b/LICENSE
+Copyright 2020 Benoit Favre
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
--- a/README.md
+++ b/README.md
+COVID-19 data scraper
+=====================
+
+Install
+-------
+
+```
+virtualenv -ppython3 env
+source env/bin/activate
+pip install -r requirements.txt
+```
+
+Running
+-------
+
+This creates a directory in ./data with the latest dumps in JSON format.
+Designed to be run at most once a day.
+
+```
+./run.sh
+```
--- a/bibliovid_add_abstract.py
+++ b/bibliovid_add_abstract.py
+# Download abstracts from pubmed when title/authors can be found.
+# Note that we might get an incorrect match.
+
+import sys, json, re, collections
+
+from pymed import PubMed
+from datetime import datetime, date
+
+# PubMed client, identified by a tool name and a contact email.
+pubmed = PubMed(
+  tool="https://covid19.lis-lab.fr",
+  email="benoit.favre@univ-amu.fr",
+)
+
+# The article dump (a JSON document with a 'results' list) is read from stdin.
+articles = json.load(sys.stdin)
+
+def normalize(text):
+  # Keep only ASCII letters, lowercased, so that two titles compare equal
+  # regardless of case, spacing, punctuation, digits or accented characters.
+  kept = [ch for ch in text if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z']
+  return ''.join(kept).lower()
+
+def preprocess(term):
+  # Replace each parenthesis and square bracket with a space, so that text
+  # inserted into a query cannot clash with the query's own grouping
+  # parentheses and [Field] tags.
+  return term.translate(str.maketrans('()[]', '    '))
+
+# Number of articles for which each pubmed field could be filled in.
+stats = collections.defaultdict(int)
+
+for article in articles['results']:
+  title = article['title']
+  authors = ' '.join(x['name'] for x in article['authors'])
+  clean_title = preprocess(title)
+  wanted = normalize(title)
+
+  # Queries go from most to least specific; the search stops at the first
+  # result whose normalized title is identical to ours.
+  queries = [
+    '(%s[Title]) AND (%s[Author])' % (clean_title, preprocess(authors)),
+    '%s[Title]' % clean_title,
+    clean_title,
+  ]
+
+  found = False
+  for query in queries:
+    for result in pubmed.query(query, max_results=30):
+      entry = result.toDict()
+      # An entry without a title can never match; treat it as empty
+      # instead of crashing in normalize().
+      if normalize(entry.get('title') or '') == wanted:
+        found = True
+        for field in ['pubmed_id', 'doi', 'abstract']:
+          if field in entry:
+            article[field] = entry[field]
+            stats[field] += 1
+        break
+    if found:
+      break
+  if not found:
+    print('NOT FOUND:', title)
+
+# Summary: per-field count and fraction of articles covered.
+total = len(articles['results'])
+print('TOTAL', total)
+for key, value in stats.items():
+  print(key, value, value / total)
+
+# The augmented dump is written to the path given as first argument.
+with open(sys.argv[1], 'w') as fp:
+  fp.write(json.dumps(articles, indent=2))
+
+
--- a/bibliovid_scrapper.py
+++ b/bibliovid_scrapper.py
+# Add article syntheses to bibliovid dump using the slug field.
+# Note that we are very dependent on the html structure of the site.
+
+import urllib.request, json, sys
+from bs4 import BeautifulSoup
+import bs4
+
+def safe_text(node):
+  # Stripped text content of a tag; '' for anything that is not a tag
+  # (e.g. None when a lookup found nothing).
+  if isinstance(node, bs4.element.Tag):
+    return node.get_text().strip()
+  return ''
+
+# The article dump to augment is given as first argument.
+with open(sys.argv[1]) as fp:
+  articles = json.load(fp)
+
+for article in articles['results']:
+  url = 'https://bibliovid.org/' + article['slug'] + '-' + str(article['id'])
+  print(url, file=sys.stderr)
+  with urllib.request.urlopen(url) as response:
+    data = response.read()
+
+  html = BeautifulSoup(data, 'html.parser')
+  main = html.find(class_='bg-white rounded-lg p-2 md:p-6')
+  if main is None:
+    # Fail with the offending url rather than an opaque AttributeError.
+    raise RuntimeError('main container not found, page structure may have changed: ' + url)
+  divs = main.contents
+
+  # Sections are located by their position among the children of the main
+  # container. If the site layout changes, print enumerate(divs) to find
+  # the new indices.
+  article['link'] = divs[8].find('a').attrs['href']
+  article['findings'] = safe_text(divs[12].find('div'))
+  article['take_away'] = safe_text(divs[14].contents[0].find('div'))
+  article['relevance_level'] = safe_text(divs[16].contents[0].find('div'))
+  article['objectives'] = safe_text(divs[18].contents[0].find('div'))
+  article['methods'] = safe_text(divs[20].contents[0].find('div'))
+
+print(json.dumps(articles, indent=2))
+
--- a/cord19_csv2json.py
+++ b/cord19_csv2json.py
+# Convert CORD-19 metadata csv to json while normalizing a few fields.
+
+import sys
+import csv
+import json
+
+data = []
+
+# newline='' lets the csv module do its own line-ending handling, which is
+# needed for quoted fields that contain line breaks. The encoding is made
+# explicit so the result does not depend on the locale.
+# NOTE(review): assumes the metadata file is UTF-8 -- confirm.
+with open(sys.argv[1], newline='', encoding='utf-8') as fp:
+  reader = csv.reader(fp)
+  # Column names are lowercased once; an empty file yields no headers and
+  # therefore an empty output list.
+  headers = [name.lower() for name in next(reader, [])]
+  for row in reader:
+    entry = {name: row[i] for i, name in enumerate(headers)}
+    # normalize a few fields
+    if 'doi' in entry and 'url' not in entry:
+      entry['url'] = 'https://www.doi.org/' + entry['doi']
+    if 'publish_time' in entry and 'publication_date' not in entry:
+      entry['publication_date'] = entry['publish_time']
+    data.append(entry)
+
+print(json.dumps(data, indent=2))
+
--- a/litcovid_scrapper.py
+++ b/litcovid_scrapper.py
+# Download litcovid json data from django API. 
+# Unfortunately, we have to do it page by page.
+
+import urllib.request, json, sys
+
+url = 'https://www.ncbi.nlm.nih.gov/research/coronavirus-api/search/'
+
+# A first request without a page number is only used to learn the page count.
+with urllib.request.urlopen(url) as response:
+  num_pages = json.loads(response.read())['total_pages']
+
+results = []
+# Pages are requested as 1..num_pages; the url is built once so that the
+# logged url is exactly the one being fetched.
+for page in range(1, num_pages + 1):
+  page_url = url + '?page=%d' % page
+  print(page_url, file=sys.stderr)
+  with urllib.request.urlopen(page_url) as response:
+    data = json.loads(response.read())
+  results.extend(data['results'])
+
+print(json.dumps(results, indent=4))
--- a/requirements.txt
+++ b/requirements.txt
+pymed
+beautifulsoup4
--- a/run.sh
+++ b/run.sh
+#!/bin/bash
+
+dir=$(dirname "$0")
+
+# activate virtualenv
+# NOTE(review): this runs before `set -u` below, presumably because the
+# activate script may reference unset variables -- confirm before reordering.
+source "$dir/env/bin/activate"
+
+# bail on error
+set -e -u -o pipefail
+
+# output location (one directory per day)
+out="$dir/data/$(date '+%Y%m%d')"
+mkdir -p "$out"
+
+# CORD-19 metadata
+# --fail makes curl exit non-zero on HTTP errors, so that `set -e` stops the
+# run instead of an error page being saved as data.
+curl --fail https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata.csv"
+python "$dir/cord19_csv2json.py" "$out/cord19-metadata.csv" > "$out/cord19-metadata.json"
+
+# litcovid
+python "$dir/litcovid_scrapper.py" > "$out/litcovid.json"
+
+# bibliovid
+# stage1: raw API dump (the first request only reads the total post count),
+# stage2: adds the syntheses scraped from the site, final: adds pubmed abstracts
+count=$(curl --fail 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*')
+curl --fail "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json"
+python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json"
+python "$dir/bibliovid_add_abstract.py" "$out/bibliovid.json" < "$out/bibliovid_stage2.json"
+
+# cleanup
+rm "$out/cord19-metadata.csv" "$out/bibliovid_stage1.json" "$out/bibliovid_stage2.json"
+