Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bibliovid
scrappers
Commits
27613256
Commit
27613256
authored
May 04, 2021
by
Benoit Favre
Browse files
add mesh scripts
parent
d00a9d23
Changes
2
Hide whitespace changes
Inline
Side-by-side
json_add_mesh_terms.py
0 → 100644
View file @
27613256
import
json
,
sys
from
datetime
import
datetime
,
date
import
urllib.request
import
xml.etree.ElementTree
as
ET
import
time
# Values sent as the `tool` and `email` query parameters of the E-utilities
# request built in fetch().
tool = "https://covid19.lis-lab.fr"
email = "benoit.favre@univ-amu.fr"

# Three-letter English month abbreviation -> zero-padded two-digit month
# number, generated from the abbreviations listed in calendar order.
month_mapping = {
    abbreviation: '%02d' % number
    for number, abbreviation in enumerate(
        (
            'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
        ),
        start=1,
    )
}
def map_month(text):
    """Translate a month name into its two-digit month number.

    Only the first three characters of ``text`` are considered; they are
    normalised to "Xxx" capitalisation and looked up in ``month_mapping``.

    Returns:
        The mapped two-digit string, or ``text`` itself, unchanged, when the
        normalised prefix is not a key of ``month_mapping``.
    """
    abbreviation = text[:3].lower().capitalize()
    return month_mapping.get(abbreviation, text)
def make_batches(sequence, size=100):
    """Yield consecutive slices of ``sequence`` holding at most ``size`` items.

    Args:
        sequence: Any object that supports ``len()`` and slicing.
        size: Maximum number of items per batch; must be positive.

    Yields:
        ``sequence[start:start + size]`` for ``start`` = 0, ``size``,
        2 * ``size``, ...  The final batch may be shorter, and an empty
        sequence yields nothing.

    Raises:
        ValueError: If ``size`` is zero or negative. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive step would never advance past the end of the sequence
    # and the generator would yield forever, so reject it up front.
    if size <= 0:
        raise ValueError('size must be positive, got %r' % (size,))
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]
def _pubmed_id(article):
    """Return the PubMed identifier of an article dict as a string.

    The 'pmid' key is preferred, with 'pubmed_id' as the fallback; a missing
    'pubmed_id' raises KeyError. Only the first line of the value is kept:
    the query script in this project splits 'pubmed_id' on newlines, so the
    field presumably can hold several identifiers (to be confirmed).
    """
    raw = article['pmid'] if 'pmid' in article else article['pubmed_id']
    return str(raw).split('\n')[0]


def fetch(articles):
    """Attach MeSH descriptor names from PubMed to ``articles``, in place.

    A single EFetch request is issued for all the given articles. Each
    ``PubmedArticle`` in the response whose PMID matches one of the articles
    gets its ``MeshHeading/DescriptorName`` texts stored under the
    'mesh_terms' key of that article. Articles with no MeSH heading in the
    response are left untouched.

    Args:
        articles: Sequence of dicts, each holding a 'pmid' or 'pubmed_id'
            key. When two articles share an identifier, only the last one
            is updated.

    Returns:
        None. The dicts in ``articles`` are modified in place.

    Raises:
        KeyError: If an article has neither 'pmid' nor 'pubmed_id'.
        urllib.error.URLError: If the HTTP request fails.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    # Imported locally so the block does not depend on `urllib.parse` being
    # loaded as a side effect of `import urllib.request`.
    from urllib.parse import urlencode

    if not articles:
        # Nothing to look up: avoid sending a request without any id.
        return

    ids = [_pubmed_id(article) for article in articles]
    by_id = dict(zip(ids, articles))

    # urlencode escapes every value, so unusual characters in the tool name,
    # the e-mail address or an id cannot corrupt the query string.
    params = [
        ('db', 'pubmed'),
        ('rettype', 'xml'),
        ('tool', tool),
        ('email', email),
    ]
    params.extend(('id', pmid) for pmid in ids)
    url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
           + urlencode(params))

    with urllib.request.urlopen(url) as response:
        payload = response.read()

    root = ET.fromstring(payload)
    for record in root.findall('.//PubmedArticle'):
        found = by_id.get(record.findtext('.//PMID'))
        if found is None:
            continue
        mesh_terms = [
            ''.join(item.itertext())
            for item in record.findall('.//MeshHeading/DescriptorName')
        ]
        if mesh_terms:
            found['mesh_terms'] = mesh_terms

    # One-second pause after each request (presumably to stay within the
    # NCBI request-rate limits).
    time.sleep(1)
# Script body: read a JSON list of articles, add MeSH terms batch by batch,
# then print the articles that received some as indented JSON on stdout.
if len(sys.argv) != 2:
    # Usage goes to stderr so that stdout only ever carries the JSON result.
    print('usage: %s <articles-json>' % sys.argv[0], file=sys.stderr)
    sys.exit(1)

# Read as UTF-8 explicitly instead of relying on the locale's default
# encoding.
with open(sys.argv[1], encoding='utf-8') as fp:
    articles = json.load(fp)

for batch in make_batches(articles, 100):
    # fetch() mutates the article dicts of the batch in place.
    fetch(batch)

# Keep only the articles for which MeSH terms were found.
articles = [article for article in articles if 'mesh_terms' in article]
print(json.dumps(articles, indent=2))
pubmed_mesh_queries.py
0 → 100644
View file @
27613256
import
json
,
sys
from
pymed
import
PubMed
from
datetime
import
datetime
,
date
import
time
import
collections
# PubMed client, identified by this project's URL and contact address.
pubmed = PubMed(
    tool="https://covid19.lis-lab.fr",
    email="benoit.favre@univ-amu.fr",
)

# Coronavirus-related search terms, OR-ed together. Each per-keyword query
# built further down is AND-ed with this expression.
base_query = ' OR '.join([
    '"COVID-19"',
    'Coronavirus',
    '"Corona virus"',
    '"2019-nCoV"',
    '"SARS-CoV"',
    '"MERS-CoV"',
    '"Severe Acute Respiratory Syndrome"',
    '"Middle East Respiratory Syndrome"',
])
# Alternative queries, currently disabled:
#query ='"COVID-19"'
#today = datetime.now().isoformat().split('T')[0]
#query = '(("%s"[Date - Publication] : "%s"[Date - Publication])) AND COVID-19[Text Word]' % (today, today)

count = collections.defaultdict(int)  # keyword -> number of results returned
seen = dict()  # pmid -> index of the corresponding entry in `data`
data = list()  # one entry dict per distinct pmid, in order of first appearance
# Run one PubMed search per topic keyword and merge the results into `data`:
# each article is stored once, and its 'mesh_query' list records every
# keyword whose search returned it.
for keyword in [
    'Diagnostic',
    'Therapeutics',
    'Epidemiology',
    'Prognosis',
    'Recommendations',
    'Modeling',
    'Hepato-gastroenterology',
    'Neurology',
    'Cardiology',
    'Hematology',
    'Geriatrics',
    'Infectiology',
    'Obstetric gynecology',
    'Dermatology',
    'Paediatrics',
    'Pulmonology',
    'Psychiatry',
    'Virology',
    'Anesthesics',
    'Radiology',
    'Hygiene',
    'Nephrology',
    'Lockdown',
    'Immunity',
]:
    query = '"%s"[MeSH] AND (%s)' % (keyword, base_query)
    results = pubmed.query(query, max_results=10000)
    count[keyword] = 0
    for result in results:
        entry = result.toDict()
        # Keep only the first line of the identifier field.
        pmid = entry['pubmed_id'].split('\n')[0]
        # Store the cleaned identifier under both 'pmid' and 'pubmed_id',
        # the two keys json_add_mesh_terms.py looks up. The second key used
        # to be misspelled 'pubmid_id', which left the raw value in place.
        entry['pmid'] = entry['pubmed_id'] = pmid
        if pmid not in seen:
            seen[pmid] = len(data)
            entry['url'] = 'https://www.ncbi.nlm.nih.gov/pubmed/' + pmid
            if 'authors' in entry:
                # Flatten the list of author dicts into "Last, First; ...".
                entry['authors'] = '; '.join(
                    '%s, %s' % (x['lastname'], x['firstname'])
                    for x in entry['authors']
                )
            # The 'xml' value is dropped from the exported entry.
            entry.pop('xml', None)
            # json.dumps cannot serialise date objects: store ISO strings.
            # Only values of existing keys are replaced, which is safe while
            # iterating over the dict.
            for key, value in entry.items():
                if isinstance(value, (datetime, date)):
                    entry[key] = value.isoformat()
            entry['mesh_query'] = []
            data.append(entry)
        data[seen[pmid]]['mesh_query'].append(keyword)
        count[keyword] += 1
    # One-second pause between keyword searches (presumably to stay within
    # the NCBI request-rate limits).
    time.sleep(1)
# Per-keyword result counts go to stderr; stdout receives only the merged
# entries as indented JSON.
for keyword, total in count.items():
    print(total, keyword, file=sys.stderr)

serialized = json.dumps(data, indent=2)
print(serialized)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment