Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bibliovid
scrappers
Commits
a82050a9
Commit
a82050a9
authored
Jun 02, 2020
by
Benoit Favre
Browse files
fix bugs in bibliovid scrapper
parent
89946775
Changes
2
Hide whitespace changes
Inline
Side-by-side
bibliovid_add_abstract.py
View file @
a82050a9
...
...
@@ -43,10 +43,11 @@ for article in articles['results']:
if
not
found
:
print
(
'NOT FOUND:'
,
title
,
file
=
sys
.
stderr
)
print
(
json
.
dumps
(
articles
,
indent
=
2
))
print
(
'TOTAL'
,
len
(
articles
[
'results'
]),
file
=
sys
.
stderr
)
for
key
,
value
in
stats
.
items
():
print
(
key
,
value
,
value
/
len
(
articles
[
'results'
],
file
=
sys
.
stderr
)
)
print
(
key
,
value
,
value
/
len
(
articles
[
'results'
]
)
,
file
=
sys
.
stderr
)
print
(
json
.
dumps
(
articles
,
indent
=
2
))
run.sh
View file @
a82050a9
...
...
@@ -13,20 +13,20 @@ out="$dir/data/"`date '+%Y%m%d'`
mkdir
-p
"
$out
"
# CORD-19 metadata
#
curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv > "$out/cord19-metadata_stage1.csv"
#
python "$dir/cord19_csv2json.py" "$out/cord19-metadata_stage1.csv" > "$out/cord19-metadata.json"
curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv
>
"
$out
/cord19-metadata_stage1.csv"
python
"
$dir
/cord19_csv2json.py"
"
$out
/cord19-metadata_stage1.csv"
>
"
$out
/cord19-metadata.json"
# litcovid
#
python "$dir/litcovid_scrapper.py" > "$out/litcovid_stage1.json"
#
python "$dir/litcovid_add_abstract.py" "$out/litcovid_stage1.json" > "$out/litcovid.json"
python
"
$dir
/litcovid_scrapper.py"
>
"
$out
/litcovid_stage1.json"
python
"
$dir
/litcovid_add_abstract.py"
"
$out
/litcovid_stage1.json"
>
"
$out
/litcovid.json"
# bibliovid
#
count=`curl 'https://bibliovid.org/api/v1/posts?format=json' | python -mjson.tool | grep '"count":' | grep -o '[0-9]*'`
#
curl "https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=$count" | python -mjson.tool > "$out/bibliovid_stage1.json"
#
python "$dir/bibliovid_scrapper.py" "$out/bibliovid_stage1.json" > "$out/bibliovid_stage2.json"
count
=
`
curl
'https://bibliovid.org/api/v1/posts?format=json'
| python
-mjson
.tool |
grep
'"count":'
|
grep
-o
'[0-9]*'
`
curl
"https://bibliovid.org/api/v1/posts?format=json&offset=0&limit=
$count
"
| python
-mjson
.tool
>
"
$out
/bibliovid_stage1.json"
python
"
$dir
/bibliovid_scrapper.py"
"
$out
/bibliovid_stage1.json"
>
"
$out
/bibliovid_stage2.json"
python
"
$dir
/bibliovid_add_abstract.py"
"
$out
/bibliovid_stage2.json"
>
"
$out
/bibliovid_stage3.json"
python
"
$dir
/bibliovid_normalize.py"
"
$out
/bibliovid_stage3.json"
>
"
$out
/bibliovid.json"
# cleanup
rm
"
$out
/
*
stage*
"
rm
"
$out
/
"
*
_
stage
*
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment