Commit e76bd7c4, authored 4 years ago by Franck Dary
Added compute error to eval script
Parent: 915f6237
Showing 2 changed files with 536 additions and 468 deletions:

  UD_any/evaluate.sh            +3    −3
  scripts/conll18_ud_eval.py  +533  −465
UD_any/evaluate.sh  +3  −3

@@ -72,16 +72,16 @@ then
   MCD=$EXPPATH"/data/*\.mcd"
 fi
 
-EVALCONLL="../scripts/conll18_ud_eval.py"
+EVALCONLL="../scripts/evaluate.py"
 OUTPUT=$EXPPATH"/predicted_eval.tsv"
 
 if [ "$MODE" = "tsv" ]; then
-  macaon decode --model $EXPPATH --mcd $MCD --inputTSV $REF $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1
+  macaon decode --model $EXPPATH --mcd $MCD --inputTSV $REF $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1
   exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-  macaon decode --model $EXPPATH --mcd $MCD --inputTXT $REFRAW $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1
+  macaon decode --model $EXPPATH --mcd $MCD --inputTXT $REFRAW $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1
   exit 0
 fi
scripts/conll18_ud_eval.py  +533  −465
@@ -21,14 +21,14 @@
 #                     just ASCII space.
 # - [25 Jun 2018] Version 1.2: Use python3 in the she-bang (instead of python).
 #                     In Python2, make the whole computation use `unicode` strings.
 #
+# Updated by Franck Dary for Macaon
 # Command line usage
 # ------------------
-# conll18_ud_eval.py [-v] gold_conllu_file system_conllu_file
+# conll18_ud_eval.py gold_conllu_file system_conllu_file
 #
-# - if no -v is given, only the official CoNLL18 UD Shared Task evaluation metrics
-#   are printed
-# - if -v is given, more metrics are printed (as precision, recall, F1 score,
+# Metrics printed (as precision, recall, F1 score,
 #   and in case the metric is computed on aligned words also accuracy on these):
 # - Tokens: how well do the gold tokens match system tokens
 # - Sentences: how well do the gold sentences match system sentences
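For reference, the usage line above corresponds to an invocation such as `python3 conll18_ud_eval.py gold.conllu system.conllu`, optionally with `--verbose` or `--counts` (both defined further down in main()); the file names here are only placeholders.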
@@ -119,17 +119,28 @@ UNIVERSAL_FEATURES = {
     "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite"
 }
 
+################################################################################
 # UD Error is used when raising exceptions in this module
 class UDError(Exception):
     pass
+################################################################################
 
+################################################################################
 # Conversion methods handling `str` <-> `unicode` conversions in Python2
 def _decode(text):
     return text if sys.version_info[0] >= 3 or not isinstance(text, str) else text.decode("utf-8")
+################################################################################
 
+################################################################################
 def _encode(text):
     return text if sys.version_info[0] >= 3 or not isinstance(text, unicode) else text.encode("utf-8")
+################################################################################
 
+################################################################################
 # Load given CoNLL-U file into internal representation
 def load_conllu(file):
     # Internal representation classes
@@ -144,6 +155,8 @@ def load_conllu(file):
             self.words = []
             # List of UDSpan instances with start&end indices into `characters`.
             self.sentences = []
+            # List of UDSpan instances with start&end indices into `words`.
+            self.sentences_words = []
 
     class UDSpan:
         def __init__(self, start, end):
             self.start = start
@@ -152,6 +165,8 @@ def load_conllu(file):
             self.end = end
 
     class UDWord:
         def __init__(self, span, columns, is_multiword):
+            # Index of the sentence this word is part of, within ud_representation.sentences.
+            self.sentence = None
             # Span of this word (or MWT, see below) within ud_representation.characters.
             self.span = span
             # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
@@ -164,6 +179,7 @@ def load_conllu(file):
             # List of references to UDWord instances representing functional-deprel children.
             self.functional_children = []
             # Only consider universal FEATS.
+            # TODO consider all feats
             self.columns[FEATS] = "|".join(sorted(feat for feat in columns[FEATS].split("|")
                                                   if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
             # Let's ignore language-specific deprel subtypes.
@@ -188,8 +204,9 @@ def load_conllu(file):
             if line.startswith("#"):
                 continue
             # Start a new sentence
-            ud.sentences.append(UDSpan(index, 0))
             sentence_start = len(ud.words)
+            ud.sentences.append(UDSpan(index, 0))
+            ud.sentences_words.append(UDSpan(sentence_start, 0))
         if not line:
             # Add parent and children UDWord links and check there are no cycles
             def process_word(word):
@@ -219,6 +236,7 @@ def load_conllu(file):
             # End the sentence
             ud.sentences[-1].end = index
+            ud.sentences_words[-1].end = len(ud.words)
             sentence_start = None
             continue
@@ -256,6 +274,7 @@ def load_conllu(file):
             if len(word_columns) < 10:
                 raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(word_line)))
 
             ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
+            ud.words[-1].sentence = len(ud.sentences) - 1
         # Basic tokens/words
         else:
             try:
@@ -274,12 +293,16 @@ def load_conllu(file):
                 raise UDError("HEAD cannot be negative")
 
             ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))
+            ud.words[-1].sentence = len(ud.sentences) - 1
 
     if sentence_start is not None:
         raise UDError("The CoNLL-U file does not end with empty line")
 
     return ud
+################################################################################
 
+################################################################################
 # Evaluate the gold and system treebanks (loaded using load_conllu).
 def evaluate(gold_ud, system_ud):
     class Score:
@@ -318,7 +341,7 @@ def evaluate(gold_ud, system_ud):
                 si += 1
                 gi += 1
 
-        return Score(len(gold_spans), len(system_spans), correct)
+        return [Score(len(gold_spans), len(system_spans), correct)]
 
     def alignment_score(alignment, key_fn=None, filter_fn=None):
         if filter_fn is not None:
@@ -332,19 +355,22 @@ def evaluate(gold_ud, system_ud):
         if key_fn is None:
             # Return score for whole aligned words
-            return Score(gold, system, aligned)
+            return [Score(gold, system, aligned)]
 
         def gold_aligned_gold(word):
             return word
         def gold_aligned_system(word):
             return alignment.matched_words_map.get(word, "NotAligned") if word is not None else None
 
         correct = 0
+        errors = []
         for words in alignment.matched_words:
             if filter_fn is None or filter_fn(words.gold_word):
                 if key_fn(words.gold_word, gold_aligned_gold) == key_fn(words.system_word, gold_aligned_system):
                     correct += 1
+                else:
+                    errors.append(words)
 
-        return Score(gold, system, correct, aligned)
+        return [Score(gold, system, correct, aligned), errors]
 
     def beyond_end(words, i, multiword_span_end):
         if i >= len(words):
@@ -471,18 +497,54 @@ def evaluate(gold_ud, system_ud):
                                               w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
                                filter_fn=lambda w: w.is_content_deprel),
     }
+################################################################################
 
+################################################################################
 def load_conllu_file(path):
     _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}))
     return load_conllu(_file)
+################################################################################
 
+################################################################################
 def evaluate_wrapper(args):
     # Load CoNLL-U files
     gold_ud = load_conllu_file(args.gold_file)
     system_ud = load_conllu_file(args.system_file)
-    return evaluate(gold_ud, system_ud)
+    if args.system_file2 is not None:
+        print("TODO") #TODO
+    return evaluate(gold_ud, system_ud), [gold_ud, system_ud]
+################################################################################
 
+################################################################################
+def compute_errors(gold_file, system_file, evaluation, metric):
+    errors = {}
+    for alignment_word in evaluation[metric][1]:
+        gold = alignment_word.gold_word
+        pred = alignment_word.system_word
+        error_type = gold.columns[UPOS] + "->" + pred.columns[UPOS]
+        gold_sentence_start = gold_file.sentences_words[gold.sentence].start
+        gold_sentence_end = gold_file.sentences_words[gold.sentence].end
+        pred_sentence_start = system_file.sentences_words[pred.sentence].start
+        pred_sentence_end = system_file.sentences_words[pred.sentence].end
+        error = [gold, pred, gold_file.words[gold_sentence_start:gold_sentence_end], system_file.words[pred_sentence_start:pred_sentence_end]]
+        if error_type not in errors:
+            errors[error_type] = []
+        errors[error_type].append(error)
+    return errors
+################################################################################
 
+################################################################################
 def main():
     # Parse arguments
     parser = argparse.ArgumentParser()
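For orientation only, and not part of the commit: the new compute_errors() groups each misaligned word pair by its gold and predicted UPOS tags, and stores for every case the two words together with the words of their respective sentences (via the new sentences_words spans). A minimal sketch of how that dictionary could be inspected, with gold_ud, system_ud and evaluation as placeholder variables obtained from load_conllu_file() and evaluate():

    # Illustrative sketch, not part of the commit.
    # Assumes gold_ud and system_ud come from load_conllu_file() and
    # evaluation comes from evaluate(gold_ud, system_ud).
    errors = compute_errors(gold_ud, system_ud, evaluation, "UPOS")

    # Keys are "GOLD_UPOS->PRED_UPOS" strings; each value is a list of
    # [gold_word, pred_word, gold_sentence_words, pred_sentence_words].
    for error_type, cases in sorted(errors.items(), key=lambda kv: -len(kv[1])):
        gold_word, pred_word, gold_sentence, pred_sentence = cases[0]
        print("{:12} {:4d} cases, e.g. '{}' in a {}-word sentence".format(
            error_type, len(cases), gold_word.columns[FORM], len(gold_sentence)))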
@@ -490,21 +552,19 @@ def main():
                         help="Name of the CoNLL-U file with the gold data.")
     parser.add_argument("system_file", type=str,
                         help="Name of the CoNLL-U file with the predicted data.")
     parser.add_argument("--verbose", "-v", default=False, action="store_true",
                         help="Print all metrics.")
     parser.add_argument("--counts", "-c", default=False, action="store_true",
                         help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
+    parser.add_argument("--system_file2",
+                        help="Name of another CoNLL-U file with predicted data, for error comparison.")
     args = parser.parse_args()
 
     # Evaluate
-    evaluation = evaluate_wrapper(args)
+    evaluation, files = evaluate_wrapper(args)
+    # Compute errors
+    errors = compute_errors(files[0], files[1], evaluation, "UPOS")
 
     # Print the evaluation
     if not args.verbose and not args.counts:
         print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
         print("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1))
         print("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1))
     else:
         if args.counts:
             print("Metric     | Correct   |      Gold | Predicted | Aligned")
         else:
@@ -514,23 +574,29 @@ def main():
             if args.counts:
                 print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
                     metric,
-                    evaluation[metric].correct,
-                    evaluation[metric].gold_total,
-                    evaluation[metric].system_total,
-                    evaluation[metric].aligned_total or (evaluation[metric].correct if metric == "Words" else "")
+                    evaluation[metric][0].correct,
+                    evaluation[metric][0].gold_total,
+                    evaluation[metric][0].system_total,
+                    evaluation[metric][0].aligned_total or (evaluation[metric][0].correct if metric == "Words" else "")
                 ))
             else:
                 print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
                     metric,
-                    100 * evaluation[metric].precision,
-                    100 * evaluation[metric].recall,
-                    100 * evaluation[metric].f1,
-                    "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
+                    100 * evaluation[metric][0].precision,
+                    100 * evaluation[metric][0].recall,
+                    100 * evaluation[metric][0].f1,
+                    "{:10.2f}".format(100 * evaluation[metric][0].aligned_accuracy) if evaluation[metric][0].aligned_accuracy is not None else ""
                 ))
+################################################################################
 
+################################################################################
 if __name__ == "__main__":
     main()
+################################################################################
 
+################################################################################
 # Tests, which can be executed with `python -m unittest conll18_ud_eval`.
 class TestAlignment(unittest.TestCase):
     @staticmethod
@@ -580,3 +646,5 @@ class TestAlignment(unittest.TestCase):
         self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
         self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
         self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
+
+################################################################################
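As the comment in the tests hunk notes, the unit tests can still be run with `python -m unittest conll18_ud_eval`; the script itself is invoked as described in the updated usage comment at the top of the file (`conll18_ud_eval.py gold_conllu_file system_conllu_file`).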