Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bibliovid
scrappers
Commits
8b828fc0
Commit
8b828fc0
authored
May 31, 2021
by
Benoit Favre
Browse files
add bibliovid fulltext from cord19
parent
27613256
Changes
2
Hide whitespace changes
Inline
Side-by-side
bibliovid_add_fulltext.py
0 → 100644
View file @
8b828fc0
import
json
import
csv
import
sys
import
shutil
import
os
def _load_bibliovid_dois(bibliovid_json):
    """Return the set of non-empty DOIs listed in the Bibliovid JSON file.

    Articles without a "doi" key, or whose DOI is empty/null, are ignored:
    an empty DOI would otherwise match every CORD-19 row that has no DOI.
    """
    with open(bibliovid_json, encoding='utf-8') as fp:
        articles = json.load(fp)
    return {article['doi'] for article in articles if article.get('doi')}


def _collect_fulltext(bibliovid_json, cord19_meta_csv, fulltext_dir, output_dir):
    """Copy the CORD-19 fulltext files of Bibliovid articles to output_dir.

    Rows of the CORD-19 metadata CSV are matched to Bibliovid articles by
    exact DOI equality. For each matching row, every path listed in its
    ';'-separated "pdf_json_files" column is resolved against fulltext_dir
    and copied (flattened to its basename) into output_dir, which is created
    if it does not exist yet.

    Returns the list of matching metadata rows, each as a dict mapping
    column name to cell value.

    Raises whatever open() / shutil.copyfile() raise when an input file or a
    referenced fulltext file is missing, and KeyError when the CSV header
    lacks the "doi" or "pdf_json_files" column.
    """
    wanted_dois = _load_bibliovid_dois(bibliovid_json)

    metadata = []
    files = []
    # newline='' lets the csv module handle quoted fields that contain
    # line breaks; the explicit encoding avoids depending on the locale.
    with open(cord19_meta_csv, newline='', encoding='utf-8') as fp:
        reader = csv.reader(fp)
        # cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
        headers = {name: index for index, name in enumerate(next(reader))}
        doi_column = headers['doi']
        files_column = headers['pdf_json_files']
        for row in reader:
            if row[doi_column] in wanted_dois:
                metadata.append({name: row[index] for name, index in headers.items()})
                files.extend(row[files_column].split(';'))

    # copyfile() does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)
    for name in files:
        name = name.strip()
        if name:
            shutil.copyfile(os.path.join(fulltext_dir, name),
                            os.path.join(output_dir, os.path.basename(name)))
    return metadata


def main(argv=None):
    """Command-line entry point.

    argv defaults to sys.argv and must hold the program name followed by
    exactly four arguments: <bibliovid.json> <cord19-metadata.csv>
    <cord19-fulltextdir> <dest-dir>. The matching CORD-19 metadata rows are
    printed to stdout as a JSON list.

    Returns the process exit status: 1 after printing the usage message to
    stderr when the argument count is wrong, 0 otherwise.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 5:
        print('usage: %s <bibliovid.json> <cord19-metadata.csv> <cord19-fulltextdir> <dest-dir>' % argv[0], file=sys.stderr)
        return 1

    bibliovid_json, cord19_meta_csv, fulltext_dir, output_dir = argv[1:]
    metadata = _collect_fulltext(bibliovid_json, cord19_meta_csv, fulltext_dir, output_dir)
    print(json.dumps(metadata))
    return 0


if __name__ == '__main__':
    sys.exit(main())
run.sh
View file @
8b828fc0
...
...
@@ -31,6 +31,14 @@ python "$dir/bibliovid_normalize.py" "$out/bibliovid_stage3.json" > "$out/biblio
# split both corpora into random folds
python "$dir/split_json_random.py" "$out/folds/bibliovid" 5 .1 .1 < "$out/bibliovid.json"
python "$dir/split_json_random.py" "$out/folds/litcovid" 5 .1 .1 < "$out/litcovid.json"

# collect fulltext from cord-19 for bibliovid papers
# TODO: also download fulltext
# Location of the cord-19 snapshot; set CORD19_DIR to use another copy.
cord19_dir="${CORD19_DIR:-../../cord-19/2021-05-24}"
if [ ! -d "$cord19_dir/" ]; then
  echo "ERROR: could not find cord-19 fulltext in $cord19_dir/" >&2
  exit 1
fi
# make sure the destination for the copied fulltext files exists
mkdir -p "$out/bibliovid_fulltext"
python "$dir/bibliovid_add_fulltext.py" "$out/bibliovid.json" "$cord19_dir/metadata.csv" "$cord19_dir/" "$out/bibliovid_fulltext" > "$out/bibliovid_meta.json"

# cleanup
rm "$out/"*_stage*
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment