Carlos Ramisch / PSTAL étudiant.e.s

Commit 7ea2bf6d
authored 11 months ago by Carlos Ramisch

Update accuracy to evaluate NER with P/R/F

parent d0769a46
No related tags found
No related merge requests found
Showing 3 changed files with 291 additions and 78 deletions:

  lib/accuracy.py                  +141 −70
  lib/conllulib.py                 +143 −7
  sequoia/bin/simplify_sequoia.py  +7 −1
lib/accuracy.py (+141 −70) · View file @ 7ea2bf6d
 #!/usr/bin/env python3
 import sys
 import argparse
-import collections
+from collections import defaultdict, Counter
 import pdb
 from conllulib import CoNLLUReader, Util

@@ -23,10 +24,10 @@ parser.add_argument('-g', "--gold", metavar="FILENAME.conllu", required=True,\
 parser.add_argument('-t', "--train", metavar="FILENAME.conllu", required=False,\
         dest="train_filename", type=argparse.FileType('r', encoding='UTF-8'),\
         help="""Training corpus in CoNLL-U, from which tagger was learnt.""")
-parser.add_argument('-c', "--tagcolumn", metavar="NAME", dest="col_name_tag",
+parser.add_argument('-c', "--tagcolumn", metavar="NAME", dest="name_tag",
         required=False, type=str, default="upos", help="""Column name of tags,\
         as defined in header. Use lowercase""")
-parser.add_argument('-f', "--featcolumn", metavar="NAME", dest="col_name_feat",
+parser.add_argument('-f', "--featcolumn", metavar="NAME", dest="name_feat",
         required=False, type=str, default="form", help="""Column name of input
         feature, as defined in header. Use lowercase.""")
 parser.add_argument('-u', "--upos-filter", metavar="NAME", dest="upos_filter",

@@ -34,13 +35,20 @@ parser.add_argument('-u', "--upos-filter", metavar="NAME", dest="upos_filter",
         help="""Only calculate accuracy for words with UPOS in this list.\
         Empty list = no filter.""")
 ################################################################################
 def process_args(parser):
+  """
+  Show (in debug mode) and process all command line options. Checks tag and feat
+  columns appear in corpora. Create training corpus vocabulary if option present
+  for OOV status check. Input is an instance of `argparse.ArgumentParser`,
+  returns list of `args`, `gold_corpus` and `pred_corpus` as `CoNLLUReader`,
+  `train_vocab` dictionary.
+  """
   args = parser.parse_args()
   Util.DEBUG_FLAG = args.DEBUG_FLAG
-  args.col_name_tag = args.col_name_tag.lower()
-  args.col_name_feat = args.col_name_feat.lower()
+  args.name_tag = args.name_tag.lower()
+  args.name_feat = args.name_feat.lower()
   Util.debug("Command-line arguments and defaults:")
   for (k, v) in vars(args).items():
     Util.debug("* {}: {}", k, v)

@@ -49,37 +57,21 @@ def process_args(parser):
   train_vocab = None
   if args.train_filename:
     train_corpus = CoNLLUReader(args.train_filename)
-    ignoreme, train_vocab = train_corpus.to_int_and_vocab({args.col_name_feat:[]})
-  if args.col_name_tag not in gold_corpus.header or \
-     args.col_name_feat not in gold_corpus.header:
+    ignoreme, train_vocab = train_corpus.to_int_and_vocab({args.name_feat:[]})
+  if args.name_tag not in gold_corpus.header or \
+     args.name_feat not in gold_corpus.header:
     Util.error("-c and -f names must be valid conllu column among:\n{}",
                gold_corpus.header)
   return args, gold_corpus, pred_corpus, train_vocab
 ################################################################################
-if __name__ == "__main__":
-  args, gold_corpus, pred_corpus, train_vocab = process_args(parser)
-  prf = collections.defaultdict(lambda:{'tp': 0, 't': 0, 'p': 0})
-  total_tokens = correct_tokens = 0
-  total_oov = correct_oov = 0
-  for (sent_gold, sent_pred) in zip(gold_corpus.readConllu(),
-                                    pred_corpus.readConllu()):
-    for (tok_gold, tok_pred) in zip(sent_gold, sent_pred):
-      if not args.upos_filter or tok_gold['upos'] in args.upos_filter:
-        if train_vocab:
-          train_vocab_feat = train_vocab[args.col_name_feat].keys()
-          if tok_gold[args.col_name_feat] not in train_vocab_feat:
-            total_oov = total_oov + 1
-            oov = True
-          else:
-            oov = False
-        if tok_gold[args.col_name_tag] == tok_pred[args.col_name_tag]:
-          correct_tokens = correct_tokens + 1
-          if train_vocab and oov:
-            correct_oov = correct_oov + 1
-        total_tokens += 1
-        if args.col_name_tag == 'feats':
-          pred_feats = tok_pred['feats'] if tok_pred['feats'] else {}
-          gold_feats = tok_gold['feats'] if tok_gold['feats'] else {}
-          for key in pred_feats.keys():
+def tp_count_feats(tok_pred, tok_gold, prf):
+  """
+  Increment number of true positives, trues and positives for morph feature eval
+  Compares all features of `tok_pred` with thos of `tok_gold`
+  Result is modification of `prf` dict, function does not return anything
+  """
+  pred_feats = tok_pred['feats'] if tok_pred['feats'] else {}
+  gold_feats = tok_gold['feats'] if tok_gold['feats'] else {}
+  for key in pred_feats.keys():

@@ -93,34 +85,113 @@ if __name__ == "__main__":
     t_inc = int(gold_feats.get(key, None) != None)
     prf[key]['t'] = prf[key]['t'] + t_inc
     prf['micro-average']['t'] = prf['micro-average']['t'] + t_inc
-  print("Pred file: {}".format(pred_corpus.name()))
+################################################################################
+def parseme_cat_in(ent, ent_list):
+  """
+  Verify if `ent` is present in `ent_list` by comparing both span AND category.
+  Default cuptlib implementation ignores category
+  """
+  for ent_cand in ent_list:
+    if ent.span == ent_cand.span and ent.cat == ent_cand.cat:
+      return True
+  return False
+################################################################################
+def tp_count_parseme(s_pred, s_gold, name_tag, prf):
+  try:
+    import parseme.cupt as cupt
+  except ImportError:
+    print("""Please install cuptlib before running this script\n\n  git clone \
+https://gitlab.com/parseme/cuptlib.git\n  cd cuptlib\n  pip install .""")
+    sys.exit(-1)
+  ents_pred = cupt.retrieve_mwes(s_pred, column_name=name_tag)
+  ents_gold = cupt.retrieve_mwes(s_gold, column_name=name_tag)
+  prf['Exact-nocat']['p'] += len(ents_pred)
+  prf['Exact-nocat']['t'] += len(ents_gold)
+  for e_pred in ents_pred.values():
+    if e_pred in ents_gold.values():
+      #pdb.set_trace()
+      prf['Exact-nocat']['tp'] += 1
+    if parseme_cat_in(e_pred, ents_gold.values()):
+      prf['Exact-' + e_pred.cat]['tp'] += 1
+    prf['Exact-' + e_pred.cat]['p'] += 1
+  for e_pred in ents_gold.values():
+    prf['Exact-' + e_pred.cat]['t'] += 1
+  # Token-based evaluation - categories always ignored here
+  span_pred = sum([list(ep.int_span()) for ep in ents_pred.values()], start=[])
+  span_gold = sum([list(eg.int_span()) for eg in ents_gold.values()], start=[])
+  prf['Token-nocat']['p'] += len(span_pred)
+  prf['Token-nocat']['t'] += len(span_gold)
+  for e_pred in span_pred:
+    if e_pred in span_gold:
+      prf['Token-nocat']['tp'] += 1
+################################################################################
+def print_results(pred_corpus_name, args, acc, prf):
+  """
+  Calculate and print accuracies, precision, recall, f-score, etc.
+  """
+  print("Predictions file: {}".format(pred_corpus_name))
   if args.upos_filter:
-    print("Results focus only on following UPOS: {}".format(" ".join(args.upos_filter)))
+    print("Results concern only some UPOS: {}".format(" ".join(args.upos_filter)))
-  accuracy = (correct_tokens / total_tokens) * 100
+  accuracy = (acc['correct_tokens'] / acc['total_tokens']) * 100
-  print("Accuracy on all {}: {:0.2f} ({}/{})".format(args.col_name_tag, accuracy,
-                                                     correct_tokens, total_tokens))
+  print("Accuracy on all {}: {:0.2f} ({:5}/{:5})".format(args.name_tag, accuracy,
+                                  acc['correct_tokens'], acc['total_tokens']))
-  if train_vocab:
+  if args.train_filename:
-    accuracy_oov = (correct_oov / total_oov) * 100
+    accuracy_oov = (acc['correct_oov'] / acc['total_oov']) * 100
-    print("Accuracy on OOV {}: {:0.2f} ({}/{})".format(args.col_name_tag,
-                                                       accuracy_oov,
-                                                       correct_oov, total_oov))
+    print("Accuracy on OOV {}: {:0.2f} ({:5}/{:5})".format(args.name_tag, accuracy_oov,
+                                  acc['correct_oov'], acc['total_oov']))
   if prf:
-    print("Metrics per feature:")
+    print("\nPrecision, recall, and F-score for {}:".format(args.name_tag))
     macro = {"precis": 0.0, "recall": 0.0}
-    for key in sorted(prf.keys()):
-      # max prevents zero-division in P and R
-      precis = prf[key]['tp'] / max(1, prf[key]['p'])
-      recall = prf[key]['tp'] / max(1, prf[key]['t'])
-      fscore = (2 * precis * recall) / max(1, precis + recall)
+    for key in sorted(prf):
+      # max prevents zero-division
+      precis = (prf[key]['tp'] / max(1, prf[key]['p'])) * 100
+      recall = (prf[key]['tp'] / max(1, prf[key]['t'])) * 100
+      fscore = ((2 * precis * recall) / max(1, precis + recall))
       if key != 'micro-average':
         macro['precis'] = macro['precis'] + precis
         macro['recall'] = macro['recall'] + recall
       else:
         print()
       templ = "{:13}: P={:6.2f} ({:5}/{:5}) / R={:6.2f} ({:5}/{:5}) / F={:6.2f}"
-      print(templ.format(key, precis * 100, prf[key]['tp'], prf[key]['p'], recall * 100,
-                         prf[key]['tp'], prf[key]['t'], fscore * 100))
+      print(templ.format(key, precis, prf[key]['tp'], prf[key]['p'], recall,
+                         prf[key]['tp'], prf[key]['t'], fscore))
-    templ = "{:13}: P={:6.2f} / R={:6.2f} / F={:6.2f}"
+    templ = "{:13}: P={:6.2f}" + " " * 15 + "/ R={:6.2f}" + " " * 15 + "/ F={:6.2f}"
-    ma_precis = macro['precis'] / (len(prf.keys()) - 1)
-    ma_recall = macro['recall'] / (len(prf.keys()) - 1)
-    ma_fscore = (2 * ma_precis * ma_recall) / max(1, ma_precis + ma_recall)
-    print(templ.format("macro-average", ma_precis * 100, ma_recall * 100, ma_fscore * 100))
+    if len(prf) > 1: # Calculate macro-precision
+      nb_scores = len(prf) - 1 if "micro-average" in prf else len(prf)
+      ma_precis = (macro['precis'] / (nb_scores))
+      ma_recall = (macro['recall'] / (nb_scores))
+      ma_fscore = ((2 * ma_precis * ma_recall) / max(1, ma_precis + ma_recall))
+      print(templ.format("macro-average", ma_precis, ma_recall, ma_fscore))
+################################################################################
+if __name__ == "__main__":
+  args, gold_corpus, pred_corpus, train_vocab = process_args(parser)
+  prf = defaultdict(lambda:{'tp': 0, 't': 0, 'p': 0}) # used for feats, NEs and MWEs
+  acc = Counter() # store correct and total for all and OOV
+  for (s_gold, s_pred) in zip(gold_corpus.readConllu(), pred_corpus.readConllu()):
+    if args.name_tag.startswith("parseme"):
+      tp_count_parseme(s_pred, s_gold, args.name_tag, prf)
+    for (tok_gold, tok_pred) in zip(s_gold, s_pred):
+      if not args.upos_filter or tok_gold['upos'] in args.upos_filter:
+        if train_vocab:
+          train_vocab_feat = train_vocab[args.name_feat].keys()
+          if tok_gold[args.name_feat] not in train_vocab_feat:
+            acc['total_oov'] += 1
+            oov = True
+          else:
+            oov = False
+        if tok_gold[args.name_tag] == tok_pred[args.name_tag]:
+          acc['correct_tokens'] += 1
+          if train_vocab and oov:
+            acc['correct_oov'] += 1
+        acc['total_tokens'] += 1
+        if args.name_tag == 'feats':
+          tp_count_feats(tok_gold, tok_pred, prf)
+  print_results(pred_corpus.name(), args, acc, prf)
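For reference, the new prf bookkeeping pairs each label with three counters: 'tp' (true positives), 'p' (predicted items) and 't' (gold items), from which print_results derives precision = tp/p, recall = tp/t and F = 2PR/(P+R). The snippet below is a minimal standalone sketch of that arithmetic; it is not part of the commit and the counts are invented toy values:

import collections

# Toy counts shaped like the script's prf dictionary (hypothetical values)
prf = collections.defaultdict(lambda: {'tp': 0, 't': 0, 'p': 0})
prf['Exact-nocat'] = {'tp': 7, 'p': 10, 't': 14}

for key, c in prf.items():
    precis = c['tp'] / max(1, c['p'])   # precision = TP / number of predictions
    recall = c['tp'] / max(1, c['t'])   # recall    = TP / number of gold items
    fscore = (2 * precis * recall) / max(1e-9, precis + recall)  # harmonic mean, guarded against 0/0
    print("{:13}: P={:.2f} R={:.2f} F={:.2f}".format(key, precis, recall, fscore))
# prints: Exact-nocat  : P=0.70 R=0.50 F=0.58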
lib/conllulib.py (+143 −7) · View file @ 7ea2bf6d
@@ -6,6 +6,7 @@ import collections
 from torch.utils.data import TensorDataset, DataLoader
 import torch
 import random
+import numpy as np
 import pdb
 ################################################################################

@@ -15,6 +16,7 @@ import pdb
 class Util(object):
   DEBUG_FLAG = False
+  PSEUDO_INF = 9999.0
   ###############################

@@ -64,6 +66,23 @@ class Util(object):
     random.seed(seed)
     torch.manual_seed(seed)
+  ###############################
+  @staticmethod
+  def log_cap(number):
+    """
+    Returns the base-10 logarithm of `number`.
+    If `number` is negative, stops the program with an error message.
+    If `number` is zero returns -9999.0 representing negative pseudo infinity
+    This is more convenient than -np.inf returned by np.log10 because :
+    inf + a = inf (no difference in sum) but 9999.0 + a != 9999.0
+    """
+    if number < 0:
+      Util.error("Cannot get logarithm of negative number {}".format(number))
+    elif number == 0:
+      return -Util.PSEUDO_INF
+    else:
+      return np.log10(number)
 ################################################################################
 # CONLLU FUNCTIONS
 ################################################################################

@@ -72,6 +91,8 @@ class CoNLLUReader(object):
   ###############################
+  start_tag = "<s>"
   def __init__(self, infile):
     self.infile = infile
   DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " + \

@@ -129,14 +150,14 @@ class CoNLLUReader(object):
   ###############################
-  def to_int_from_vocab(self, col_name_dict, unk_token, vocab={}):
+  def to_int_from_vocab(self, col_names, unk_token, vocab={}):
     int_list = {}
     unk_toks = {}
-    for col_name, special_tokens in col_name_dict.items():
+    for col_name in col_names:
       int_list[col_name] = []
       unk_toks[col_name] = vocab[col_name].get(unk_token, None)
     for s in self.readConllu():
-      for col_name in col_name_dict.keys():
+      for col_name in col_names:
        id_getter = lambda v, t: v[col_name].get(t[col_name], unk_toks[col_name])
        int_list[col_name].append([id_getter(vocab, tok) for tok in s])
     return int_list

@@ -144,11 +165,126 @@ class CoNLLUReader(object):
   ###############################
   @staticmethod
-  def to_int_from_vocab_sent(sent, col_name_dict, unk_token, vocab={}):
+  def to_int_from_vocab_sent(sent, col_names, unk_token, vocab={}, lowercase=False):
     int_list = {}
-    for col_name in col_name_dict.keys():
+    for col_name in col_names:
       unk_tok_id = vocab[col_name].get(unk_token, None)
-      id_getter = lambda v, t: v[col_name].get(t[col_name], unk_tok_id)
+      low_or_not = lambda w: w.lower() if lowercase else w
+      id_getter = lambda v, t: v[col_name].get(low_or_not(t[col_name]), unk_tok_id)
       int_list[col_name] = [id_getter(vocab, tok) for tok in sent]
     return int_list
+  ###############################
+  @staticmethod
+  def to_bio(sent, bio_style='bio', name_tag='parseme:ne'):
+    bio_enc = []
+    neindex = 0
+    for tok in sent:
+      netag = tok[name_tag]
+      if netag == '*':
+        cur_tag = 'O'
+      elif netag == neindex:
+        cur_tag = 'I' + necat
+      else:
+        neindex, necat = netag.split(":")
+        necat = '-' + necat
+        if bio_style == 'io':
+          cur_tag = 'I' + necat
+        else:
+          cur_tag = 'B' + necat
+      bio_enc.append(cur_tag)
+    return bio_enc
+  ###############################
+  @staticmethod
+  def from_bio(bio_enc, bio_style='bio', stop_on_error=False):
+    """
+    Converst BIO-encoded annotations into Sequoia/parseme format.
+    Input `bio_enc` is a list of strings, each corresponding to one BIO tag.
+    `bio_style` can be "bio" (default) or "io". Will try to recover encoding
+    errors by replacing wrong tags when `stop_on_error` equals False (default),
+    otherwise stops execution and shows an error message.
+    Only works for BIO-cat & IO-cat, with -cat appended to both B and I tags.
+    Requires adaptations for BIOES, and encoding schemes without "-cat.
+    Examples:
+    >>> from_bio(["B-PERS", "I-PERS", "I-PERS", "O", "B-LOC", "I-LOC"], bio_style='bio')
+    ['1:PERS', '1', '1', '*', '2:LOC', '2']
+    >>> from_bio(["B-PERS", "I-PERS", "I-PERS", "O", "B-LOC", "I-LOC"], bio_style='io')
+    WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
+    WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
+    ['1:PERS', '1', '1', '*', '2:LOC', '2']
+    >>> from_bio(["I-PERS", "B-PERS", "I-PERS", "O", "I-LOC"], bio_style='io')
+    WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
+    ['1:PERS', '1', '1', '*', '2:LOC']
+    >>> from_bio(["I-PERS", "I-PERS", "I-PERS", "O", "I-LOC"], bio_style='bio')
+    WARNING: Invalid I-initial tag I-PERS converted to B
+    WARNING: Invalid I-initial tag I-LOC converted to B
+    ['1:PERS', '1', '1', '*', '2:LOC']
+    >>> from_bio(["I-PERS", "B-PERS", "I-PERS", "O", "I-LOC"], bio_style='bio')
+    WARNING: Invalid I-initial tag I-PERS converted to B
+    WARNING: Invalid I-initial tag I-LOC converted to B
+    ['1:PERS', '2:PERS', '2', '*', '3:LOC']
+    >>> from_bio(["I-PERS", "B-PERS", "I-EVE", "O", "I-PERS"], bio_style='io')
+    ['1:PERS', '2:PERS', '3:EVE', '*', '4:PERS']
+    >>> from_bio(["I-PERS", "B-PERS", "I-EVE", "O", "I-PERS"], bio_style='bio')
+    WARNING: Invalid I-initial tag I-PERS converted to B
+    WARNING: Invalid I-initial tag I-EVE converted to B
+    WARNING: Invalid I-initial tag I-PERS converted to B
+    ['1:PERS', '2:PERS', '3:EVE', '*', '4:PERS']
+    """
+    # TODO: warning if I-cat != previous I-cat or B-cat
+    result = []
+    neindex = 0
+    prev_bio_tag = 'O'
+    prev_cat = None
+    for bio_tag in bio_enc:
+      if bio_tag == 'O':
+        seq_tag = '*'
+      elif bio_tag[0] in ['B', 'I'] and bio_tag[1] == '-':
+        necat = bio_tag.split("-")[1]
+        if bio_tag[0] == 'B' and bio_style == 'bio':
+          neindex += 1 # Begining of an entity
+          seq_tag = str(neindex) + ":" + necat
+        elif bio_tag[0] == 'B': # bio_style = 'io'
+          if stop_on_error:
+            Util.error("B tag not allowed with 'io'")
+          else:
+            bio_tag = bio_tag.replace("B-", "I-")
+            Util.warn("Got B tag in spite of 'io' bio_style: interpreted as I")
+        if bio_tag[0] == "I" and bio_style == "io":
+          if necat != prev_cat:
+            neindex += 1 # Begining of an entity
+            seq_tag = str(neindex) + ":" + necat
+          else:
+            seq_tag = str(neindex) # is a continuation
+        elif bio_tag[0] == "I": # tag is "I" and bio_style is "bio"
+          if bio_style == 'bio' and prev_bio_tag != 'O' and necat == prev_cat:
+            seq_tag = str(neindex) # is a continuation
+          elif stop_on_error:
+            Util.error("Invalid I-initial tag in BIO format: {}".format(bio_tag))
+          else:
+            neindex += 1 # Begining of an entity
+            seq_tag = str(neindex) + ":" + necat
+            Util.warn("Invalid I-initial tag {} converted to B".format(bio_tag))
+        prev_cat = necat
+      else:
+        if stop_on_error:
+          Util.error("Invalid BIO tag: {}".format(bio_tag))
+        else:
+          Util.warn("Invalid BIO tag {} converted to O".format(bio_tag))
+          result.append("*")
+      result.append(seq_tag)
+      prev_bio_tag = bio_tag
+    return result
 ################################################################################
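To make the new BIO helpers concrete: in the parseme:ne column the first token of an entity carries "index:CATEGORY" (e.g. "1:PERS"), its continuation tokens carry just the index ("1"), and tokens outside any entity carry "*"; to_bio converts this to B-/I-/O tags and from_bio decodes them back, as the doctests above show. Below is a small round-trip sketch, assuming the conllulib.py from this commit is importable; the toy sentence (plain dicts carrying only the relevant column) is invented for illustration:

from conllulib import CoNLLUReader

toy_sent = [{'parseme:ne': '1:PERS'},   # first token of entity 1 (a person)
            {'parseme:ne': '1'},        # continuation of entity 1
            {'parseme:ne': '*'},        # outside any entity
            {'parseme:ne': '2:LOC'}]    # first token of entity 2 (a location)

bio = CoNLLUReader.to_bio(toy_sent, bio_style='bio', name_tag='parseme:ne')
print(bio)                                           # ['B-PERS', 'I-PERS', 'O', 'B-LOC']
print(CoNLLUReader.from_bio(bio, bio_style='bio'))   # ['1:PERS', '1', '*', '2:LOC']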
sequoia/bin/simplify_sequoia.py (+7 −1) · View file @ 7ea2bf6d
@@ -35,7 +35,13 @@ import sys
 import conllu
 import re
 import pdb
+import subprocess
+try:
-import parseme.cupt as cupt
+  import parseme.cupt as cupt
+except ImportError:
+  print("""Please install cuptlib before running this script\n\n  git clone \
+https://gitlab.com/parseme/cuptlib.git\n  cd cuptlib\n  pip install .""")
+  sys.exit(-1)
 #########################################