Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
PSTAL étudiant.e.s
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Carlos Ramisch
PSTAL étudiant.e.s
Commits
58a4a244
Commit
58a4a244
authored
10 months ago
by
ceramisch
Browse files
Options
Downloads
Patches
Plain Diff
Add first version of library scripts
parent
23f6b043
No related branches found
No related tags found
No related merge requests found
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
lib/__init__.py
+0
-0
0 additions, 0 deletions
lib/__init__.py
lib/accuracy.py
+93
-0
93 additions, 0 deletions
lib/accuracy.py
lib/conllulib.py
+109
-0
109 additions, 0 deletions
lib/conllulib.py
with
202 additions
and
0 deletions
lib/__init__.py
0 → 100644
+
0
−
0
View file @
58a4a244
This diff is collapsed.
Click to expand it.
lib/accuracy.py
0 → 100755
+
93
−
0
View file @
58a4a244
#!/usr/bin/env python3
import
sys
import
argparse
import
collections
import
pdb
from
conllulib
import
CoNLLUReader
,
Util
################################################################################
# Command-line interface of the accuracy script.
# Help and description texts are re-wrapped by argparse when displayed, and
# ArgumentDefaultsHelpFormatter appends each option's default to its help.
parser = argparse.ArgumentParser(
    description=(
        "Calculates the accuracy of a "
        "prediction with respect to the gold file. "
        "By default, uses UPOS, but this can "
        "be configured with option -c."
    ),
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# -D/--debug: boolean switch, stored as args.DEBUG_FLAG.
parser.add_argument(
    "-D", "--debug",
    dest="DEBUG_FLAG",
    action="store_true",
    help="Print debug information (grep it or pipe into `less -SR`)",
)

# -p/--pred: mandatory file with predicted tags, opened for reading as UTF-8.
parser.add_argument(
    "-p", "--pred",
    dest="pred_filename",
    metavar="FILENAME.conllu",
    type=argparse.FileType("r", encoding="UTF-8"),
    required=True,
    help="Test corpus in CoNLLU with *predicted* tags. (Required)",
)

# -g/--gold: mandatory file with gold tags, opened for reading as UTF-8.
parser.add_argument(
    "-g", "--gold",
    dest="gold_filename",
    metavar="FILENAME.conllu",
    type=argparse.FileType("r", encoding="UTF-8"),
    required=True,
    help="Test corpus in CoNLLU with *gold* tags. (Required)",
)

# -t/--train: optional training corpus; args.train_filename is None if absent.
parser.add_argument(
    "-t", "--train",
    dest="train_filename",
    metavar="FILENAME.conllu",
    type=argparse.FileType("r", encoding="UTF-8"),
    help="Training corpus in CoNLL-U, from which tagger was learnt.",
)

# -c/--tagcolumn: name of the column holding the tags to compare.
parser.add_argument(
    "-c", "--tagcolumn",
    dest="col_name_tag",
    metavar="NAME",
    type=str,
    default="upos",
    help="Column name of tags, as defined in header. Use lowercase",
)

# -f/--featcolumn: name of the column holding the tagger's input feature.
parser.add_argument(
    "-f", "--featcolumn",
    dest="col_name_feat",
    metavar="NAME",
    type=str,
    default="form",
    help="Column name of input\nfeature, as defined in header. Use lowercase.",
)

# -u/--upos-filter: one or more UPOS values; the default empty list means
# that no filtering is requested.
parser.add_argument(
    "-u", "--upos-filter",
    dest="upos_filter",
    metavar="NAME",
    type=str,
    nargs="+",
    default=[],
    help=(
        "Only calculate accuracy for words with UPOS in this list. "
        "Empty list = no filter."
    ),
)
########################################################################
def process_args(parser):
    """Parse the command line and open the corpora needed for evaluation.

    The tag and feature column names are lowercased, the debug switch is
    copied to ``Util.DEBUG_FLAG`` and every parsed option is echoed through
    ``Util.debug``. The gold and predicted files are wrapped in
    ``CoNLLUReader`` objects. When a training corpus is given, its vocabulary
    for the feature column is built with ``to_int_and_vocab``.

    Args:
        parser: the ``argparse.ArgumentParser`` describing the options.

    Returns:
        A tuple ``(args, gold_corpus, pred_corpus, train_vocab)``;
        ``train_vocab`` is ``None`` when no training corpus was given.

    The column names are checked against the header of the gold corpus;
    ``Util.error`` is called when either of them is not in it.
    """
    args = parser.parse_args()
    Util.DEBUG_FLAG = args.DEBUG_FLAG
    args.col_name_tag = args.col_name_tag.lower()
    args.col_name_feat = args.col_name_feat.lower()

    Util.debug("Command-line arguments and defaults:")
    for (k, v) in vars(args).items():
        Util.debug("* {}: {}", k, v)

    gold_corpus = CoNLLUReader(args.gold_filename)
    pred_corpus = CoNLLUReader(args.pred_filename)

    # Validate the column names BEFORE reading the training corpus: building
    # the vocabulary indexes every token by the feature column, so an invalid
    # name would otherwise fail there with a raw KeyError (after a full pass
    # over the corpus) instead of this explicit message.
    if args.col_name_tag not in gold_corpus.header or \
       args.col_name_feat not in gold_corpus.header:
        Util.error("-c and -f names must be valid conllu column among:\n{}",
                   gold_corpus.header)

    train_vocab = None
    if args.train_filename:
        train_corpus = CoNLLUReader(args.train_filename)
        # Only the vocabulary is needed; the integer-encoded corpus is dropped.
        _, train_vocab = train_corpus.to_int_and_vocab({args.col_name_feat: []})

    return args, gold_corpus, pred_corpus, train_vocab
########################################################################
def accuracy_percent(correct, total):
    """Return ``correct / total`` as a percentage.

    An empty denominator yields ``0.0`` instead of raising
    ``ZeroDivisionError``; the ``(0/0)`` counts printed next to the value
    make that situation visible in the report.
    """
    if total == 0:
        return 0.0
    return (correct / total) * 100


def tally_tokens(sentence_pairs, col_name_tag, col_name_feat="form",
                 upos_filter=(), train_forms=None):
    """Count evaluated and correctly tagged tokens, overall and for OOVs.

    Args:
        sentence_pairs: iterable of ``(gold_sentence, pred_sentence)`` pairs;
            each sentence is an iterable of tokens indexable by column name.
            Tokens are paired with ``zip``, so extra tokens of the longer
            sentence are ignored.
        col_name_tag: column whose gold and predicted values are compared.
        col_name_feat: column looked up in ``train_forms`` to detect OOVs.
        upos_filter: when non-empty, only tokens whose gold ``'upos'`` value
            is in this collection are counted.
        train_forms: container of the feature values seen in training, or
            ``None`` to skip the OOV counts.

    Returns:
        ``(total_tokens, correct_tokens, total_oov, correct_oov)``.
    """
    total_tokens = correct_tokens = 0
    total_oov = correct_oov = 0
    for sent_gold, sent_pred in sentence_pairs:
        for tok_gold, tok_pred in zip(sent_gold, sent_pred):
            if upos_filter and tok_gold['upos'] not in upos_filter:
                continue
            oov = (train_forms is not None
                   and tok_gold[col_name_feat] not in train_forms)
            correct = tok_gold[col_name_tag] == tok_pred[col_name_tag]
            total_tokens += 1
            if correct:
                correct_tokens += 1
            if oov:
                total_oov += 1
                if correct:
                    correct_oov += 1
    return total_tokens, correct_tokens, total_oov, correct_oov


if __name__ == "__main__":
    args, gold_corpus, pred_corpus, train_vocab = process_args(parser)

    # Look the training vocabulary up once instead of once per token.
    train_forms = train_vocab[args.col_name_feat] if train_vocab else None

    total_tokens, correct_tokens, total_oov, correct_oov = tally_tokens(
        zip(gold_corpus.readConllu(), pred_corpus.readConllu()),
        args.col_name_tag,
        args.col_name_feat,
        args.upos_filter,
        train_forms,
    )

    print("Pred file: {}".format(pred_corpus.name()))
    if args.upos_filter:
        print("Results focus only on following UPOS: {}".format(
            " ".join(args.upos_filter)))

    accuracy = accuracy_percent(correct_tokens, total_tokens)
    print("Accuracy on all {}: {:0.2f} ({}/{})".format(
        args.col_name_tag, accuracy, correct_tokens, total_tokens))

    if train_vocab:
        # total_oov is 0 whenever every evaluated token was seen in training.
        accuracy_oov = accuracy_percent(correct_oov, total_oov)
        print("Accuracy on OOV {}: {:0.2f} ({}/{})".format(
            args.col_name_tag, accuracy_oov, correct_oov, total_oov))
This diff is collapsed.
Click to expand it.
lib/conllulib.py
0 → 100644
+
109
−
0
View file @
58a4a244
#!/usr/bin/env python3
import
sys
import
conllu
import
collections
import
pdb
########################################################################
# UTILITY FUNCTIONS
########################################################################
class Util:
    """Namespace of static helpers for messages and vocabulary handling."""

    # Debug messages are printed only while this class-level flag is true.
    DEBUG_FLAG = False

    ########################################################################

    @staticmethod
    def error(msg, *fmt_args):
        """Print ``msg`` formatted with ``fmt_args`` on stderr and exit.

        The message is prefixed with ``ERROR:`` and the process is then
        terminated through ``sys.exit(-1)``.
        """
        text = msg.format(*fmt_args)
        print("ERROR:", text, file=sys.stderr)
        sys.exit(-1)

    ########################################################################

    @staticmethod
    def debug(msg, *fmt_args):
        """Print ``msg`` formatted with ``fmt_args`` on stderr in debug mode.

        Nothing is formatted or printed while ``Util.DEBUG_FLAG`` is false.
        """
        if not Util.DEBUG_FLAG:
            return
        print(msg.format(*fmt_args), file=sys.stderr)

    ########################################################################

    @staticmethod
    def rev_vocab(vocab):
        """Return the keys of ``vocab`` as a list indexed by their values.

        ``vocab`` maps keys to integer IDs; position ``i`` of the result
        holds the key whose ID is ``i``. A ``KeyError`` is raised when some
        ID in ``range(number of distinct IDs)`` is missing.
        """
        key_of_id = {}
        for key, index in vocab.items():
            key_of_id[index] = key
        return [key_of_id[index] for index in range(len(key_of_id))]
########################################################################
# CONLLU FUNCTIONS
########################################################################
class CoNLLUReader(object):
    """Reader of an already opened CoNLL-U file.

    ``header`` holds the lowercased column names: those declared by a
    ``global.columns`` metadata entry on the first line of the file, or
    those of ``DEFAULT_HEADER`` otherwise.
    """

    # Column names assumed when the first line declares no `global.columns`.
    DEFAULT_HEADER = ("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC "
                      "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE")

    ###########################################

    def __init__(self, infile):
        """Store ``infile`` and work out the column header from its first line.

        Args:
            infile: open text file object containing CoNLL-U data.
        """
        self.infile = infile
        first = self.infile.readline().strip()  # First line in the file
        try:
            globalcolumns = conllu.parse(first)[0].metadata['global.columns']
        except (KeyError, IndexError):
            # KeyError: the first line carries no `global.columns` entry.
            # IndexError: nothing was parsed out of the first line.
            globalcolumns = self.DEFAULT_HEADER
        # Both the declared and the default header are lowercased, so that
        # lookups by lowercase column name work in either case.
        self.header = globalcolumns.lower().split(" ")
        # Rewind in every case: the line consumed above must not be missing
        # from the sentences read later.
        self._rewind()

    ###########################################

    def _rewind(self):
        """Move ``infile`` back to its beginning, unless it is not seekable."""
        seekable = getattr(self.infile, "seekable", None)
        if seekable is None or seekable():
            self.infile.seek(0)

    ###########################################

    def readConllu(self):
        """Yield the sentences produced by ``conllu.parse_incr`` on the file."""
        yield from conllu.parse_incr(self.infile)

    ###########################################

    def name(self):
        """Return the ``name`` attribute of the underlying file object."""
        return self.infile.name

    ###########################################

    def to_int_and_vocab(self, col_name_dict):
        """Encode the requested columns as integers, building the vocabularies.

        Args:
            col_name_dict: maps each column name to a list of special tokens,
                which receive the first IDs of that column's vocabulary.

        Returns:
            A tuple ``(int_list, vocab)``. ``int_list[col]`` holds one list of
            IDs per sentence and ``vocab[col]`` maps each value to its ID.
            IDs are assigned in order of first appearance.
        """
        int_list = {}
        vocab = {}
        for col_name, special_tokens in col_name_dict.items():
            int_list[col_name] = []
            col_vocab = collections.defaultdict()
            # A missing key receives the current size of the dict as its ID.
            # Binding the dict's own __len__ avoids a closure over the loop
            # variable.
            col_vocab.default_factory = col_vocab.__len__
            for special_token in special_tokens:
                # Simple access to undefined dict key creates new ID
                col_vocab[special_token]
            vocab[col_name] = col_vocab
        for s in self.readConllu():
            for col_name in col_name_dict:
                col_vocab = vocab[col_name]
                int_list[col_name].append(
                    [col_vocab[tok[col_name]] for tok in s])
        # Erase default_factory: from now on unknown keys raise KeyError
        # instead of silently growing the vocabulary.
        for col_vocab in vocab.values():
            col_vocab.default_factory = None
        return int_list, vocab

    ###########################################

    def to_int_from_vocab(self, col_name_dict, unk_token, vocab=None):
        """Encode the requested columns with existing vocabularies.

        Args:
            col_name_dict: mapping whose keys are the column names to encode.
            unk_token: vocabulary entry whose ID replaces unknown values;
                ``None`` is used when the vocabulary has no such entry.
            vocab: maps each column name to its value-to-ID dictionary.

        Returns:
            A dict mapping each column name to one list of IDs per sentence.
        """
        vocab = {} if vocab is None else vocab
        int_list = {}
        unk_toks = {}
        for col_name in col_name_dict:
            int_list[col_name] = []
            unk_toks[col_name] = vocab[col_name].get(unk_token, None)
        for s in self.readConllu():
            for col_name in col_name_dict:
                col_vocab = vocab[col_name]
                unk_tok_id = unk_toks[col_name]
                int_list[col_name].append(
                    [col_vocab.get(tok[col_name], unk_tok_id) for tok in s])
        return int_list

    ###########################################

    @staticmethod
    def to_int_from_vocab_sent(sent, col_name_dict, unk_token, vocab=None):
        """Encode the requested columns of one sentence.

        Args:
            sent: iterable of tokens indexable by column name.
            col_name_dict: mapping whose keys are the column names to encode.
            unk_token: vocabulary entry whose ID replaces unknown values;
                ``None`` is used when the vocabulary has no such entry.
            vocab: maps each column name to its value-to-ID dictionary.

        Returns:
            A dict mapping each column name to the list of IDs of ``sent``.
        """
        vocab = {} if vocab is None else vocab
        int_list = {}
        for col_name in col_name_dict:
            col_vocab = vocab[col_name]
            unk_tok_id = col_vocab.get(unk_token, None)
            int_list[col_name] = [col_vocab.get(tok[col_name], unk_tok_id)
                                  for tok in sent]
        return int_list
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment