Commit 7dcdb8e7 authored by Benoit Favre
Browse files

repackage for release

parents
Copyright 2020 Benoit Favre
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
COVID-19 data scraper
=====================
Install
-------
```
virtualenv -ppython3 env
source env/bin/activate
pip install -r requirements.txt
```
Running
-------
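Running the script creates a dated directory (YYYYMMDD) under ./data containing the latest dumps in JSON format.
It is designed to be run at most once a day: a second run on the same day overwrites that day's dumps.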
```
./run.sh
```
# Download abstracts from pubmed when title/authors can be found.
# Note that we might get an incorrect match.
import sys, json, re, collections
from pymed import PubMed
from datetime import datetime, date
# PubMed client, identified to the service by a tool name and a contact email.
pubmed = PubMed(
    tool="https://covid19.lis-lab.fr",
    email="benoit.favre@univ-amu.fr",
)

# The bibliovid dump to enrich is read as JSON from standard input.
articles = json.load(sys.stdin)
def normalize(text):
    """Return *text* reduced to its ASCII letters, lowercased.

    Digits, punctuation, whitespace and non-ASCII characters are all dropped,
    so two titles that differ only in those respects compare equal.
    """
    return re.sub('[^a-zA-Z]', '', text).lower()
def preprocess(term):
    """Return *term* with every parenthesis and square bracket replaced by a space.

    The result is interpolated into PubMed query strings that use ``(...)``
    and ``[Field]`` syntax, so those characters are blanked out of the term.
    """
    return re.sub(r'[()\[\]]', ' ', term)
# For every article of the dump, query PubMed and copy the identifiers and
# abstract of the first result whose normalized title equals the article's.
stats = collections.defaultdict(int)  # field name -> number of articles that received it
for article in articles['results']:
    title = article['title']
    authors = ' '.join(x['name'] for x in article['authors'])
    # Queries are tried from most to least specific; the first one that
    # yields an exact (normalized) title match wins.
    queries = [
        '(%s[Title]) AND (%s[Author])' % (preprocess(title), preprocess(authors)),
        '%s[Title]' % preprocess(title),
        preprocess(title),
    ]
    found = False
    for query in queries:
        for result in pubmed.query(query, max_results=30):
            entry = result.toDict()
            # NOTE(review): guards against results without a title, which
            # would make normalize() fail — confirm whether pymed can return one.
            entry_title = entry.get('title')
            if not entry_title or normalize(title) != normalize(entry_title):
                continue
            found = True
            for field in ['pubmed_id', 'doi', 'abstract']:
                if field in entry:
                    article[field] = entry[field]
                    stats[field] += 1
            break
        if found:
            break
    if not found:
        print('NOT FOUND:', title)

# Report how many articles received each field, as a count and as a ratio.
print('TOTAL', len(articles['results']))
for key, value in stats.items():
    print(key, value, value / len(articles['results']))

# Write the enriched dump to the path given as first command-line argument.
with open(sys.argv[1], 'w') as fp:
    json.dump(articles, fp, indent=2)
# Add article syntheses to bibliovid dump using the slug field.
# Note that we are very dependent on the html structure of the site.
import urllib.request, json, sys
from bs4 import BeautifulSoup
import bs4
def safe_text(node):
    """Return the stripped text of a BeautifulSoup tag, or '' for anything else.

    Lookups below may yield something that is not a tag (for instance when
    nothing matches); those are mapped to an empty string instead of failing.
    """
    if isinstance(node, bs4.element.Tag):
        return node.get_text().strip()
    return ''


# Load the bibliovid API dump given as first command-line argument.
with open(sys.argv[1]) as fp:
    articles = json.load(fp)

for article in articles['results']:
    # Each article page lives at <slug>-<id>.
    url = 'https://bibliovid.org/' + article['slug'] + '-' + str(article['id'])
    print(url, file=sys.stderr)
    with urllib.request.urlopen(url) as response:
        data = response.read()
    html = BeautifulSoup(data, 'html.parser')
    main = html.find(class_='bg-white rounded-lg p-2 md:p-6')
    if main is None:
        # The page layout changed (or this is not an article page): fail with
        # the offending URL rather than an opaque AttributeError.
        raise RuntimeError('article container not found in ' + url)
    divs = main.contents
    # Uncomment to inspect the children when the positions below stop matching:
    #for i, div in enumerate(divs):
    #    print('%d [%s]' % (i, div))

    # The sections are located purely by their position among the children
    # of the main container, so this breaks whenever the site layout changes.
    article['link'] = divs[8].find('a').attrs['href']
    article['findings'] = safe_text(divs[12].find('div'))
    for field, index in (
        ('take_away', 14),
        ('relevance_level', 16),
        ('objectives', 18),
        ('methods', 20),
    ):
        article[field] = safe_text(divs[index].contents[0].find('div'))

# Emit the enriched dump on standard output.
print(json.dumps(articles, indent=2))
# Convert CORD-19 metadata csv to json while normalizing a few fields.
import sys
import csv
import json
def convert_row(headers, row):
    """Build one record from a CSV row.

    Column names are lowercased. When the record has a non-empty ``doi`` and
    no ``url`` column, a doi.org URL is added; when it has ``publish_time``
    and no ``publication_date``, the former is copied to the latter. A row
    shorter than the header simply lacks the trailing columns.
    """
    entry = {name.lower(): value for name, value in zip(headers, row)}
    # normalize a few fields
    if entry.get('doi') and 'url' not in entry:
        # An empty DOI would only produce the bare resolver address.
        entry['url'] = 'https://www.doi.org/' + entry['doi']
    if 'publish_time' in entry and 'publication_date' not in entry:
        entry['publication_date'] = entry['publish_time']
    return entry


def load_metadata(path):
    """Read the CSV file at *path* and return its rows as a list of records.

    The first row is taken as the header. Blank lines are skipped, and an
    empty file yields an empty list.
    """
    # newline='' lets the csv module handle line breaks inside quoted fields.
    with open(path, newline='') as fp:
        reader = csv.reader(fp)
        headers = next(reader, None)
        if headers is None:
            return []
        return [convert_row(headers, row) for row in reader if row]


if __name__ == '__main__':
    data = load_metadata(sys.argv[1])
    print(json.dumps(data, indent=2))
# Download litcovid json data from django API.
# Unfortunately, we have to do it page by page.
import urllib.request, json, sys
url = 'https://www.ncbi.nlm.nih.gov/research/coronavirus-api/search/'

# The first request is only used to learn how many pages there are.
with urllib.request.urlopen(url) as response:
    data = json.load(response)
num_pages = data['total_pages']

results = []
# Pages are requested as 1..num_pages; log exactly the URL that is fetched.
for page in range(1, num_pages + 1):
    page_url = url + '?page=%d' % page
    print(page_url, file=sys.stderr)
    with urllib.request.urlopen(page_url) as response:
        data = json.load(response)
    results.extend(data['results'])

# Emit the concatenated results of all pages on standard output.
print(json.dumps(results, indent=4))
#!/bin/bash
# Fetch the CORD-19, litcovid and bibliovid dumps into data/YYYYMMDD next to
# this script, converting each of them to JSON.

dir=$(dirname "$0")

# activate virtualenv
# NOTE(review): kept before strict mode as in the original ordering; presumably
# the activate script is not safe under `set -u` — confirm before moving it.
source "$dir/env/bin/activate"

# bail on error
set -e -u -o pipefail

# output location
out="$dir/data/$(date '+%Y%m%d')"
mkdir -p "$out"

# CORD-19 metadata
# --fail makes curl exit non-zero on HTTP errors instead of saving the error page.
curl --fail 'https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv' > "$out/cord19-metadata.csv"
python "$dir/cord19_csv2json.py" "$out/cord19-metadata.csv" > "$out/cord19-metadata.json"

# litcovid
python "$dir/litcovid_scrapper.py" > "$out/litcovid.json"

# bibliovid
# First ask the API how many posts exist, then fetch them all in one request.
count=$(curl --fail 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*')
curl --fail "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json"
python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json"
python "$dir/bibliovid_add_abstract.py" "$out/bibliovid.json" < "$out/bibliovid_stage2.json"

# cleanup
rm "$out/cord19-metadata.csv" "$out/bibliovid_stage1.json" "$out/bibliovid_stage2.json"
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment