Commit 6d6ecda4 authored by Carlos Ramisch

Update documentation of conllulib

parent 13cc0ebe
@@ -80,14 +80,14 @@ def tp_count_feats(tok_pred, tok_gold, prf):
   for key in pred_feats.keys():
     tp_inc = int(gold_feats.get(key,None) == pred_feats[key])
     prf[key]['tp'] = prf[key]['tp'] + tp_inc
-    prf['micro-average']['tp'] = prf['micro-average']['tp'] + tp_inc
+    prf['micro-avg']['tp'] = prf['micro-avg']['tp'] + tp_inc
     p_inc = int(pred_feats.get(key,None) != None)
     prf[key]['p'] = prf[key]['p'] + p_inc
-    prf['micro-average']['p'] = prf['micro-average']['p'] + p_inc
+    prf['micro-avg']['p'] = prf['micro-avg']['p'] + p_inc
   for key in gold_feats.keys():
     t_inc = int(gold_feats.get(key,None) != None)
     prf[key]['t'] = prf[key]['t'] + t_inc
-    prf['micro-average']['t'] = prf['micro-average']['t'] + t_inc
+    prf['micro-avg']['t'] = prf['micro-avg']['t'] + t_inc
 ################################################################################
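
For context, `prf` is a nested dictionary of counters: for each feature key, and for the pooled 'micro-avg' entry, 'tp' counts correct predictions, 'p' counts predictions made, and 't' counts gold occurrences. A minimal stand-alone sketch of that structure on toy morphological features (the defaultdict setup and the example values are illustrative, not taken from this repository):

from collections import defaultdict

# Nested counters: prf[key] -> {'tp': ..., 'p': ..., 't': ...}
prf = defaultdict(lambda: {'tp': 0, 'p': 0, 't': 0})

# Toy gold/predicted morphological features for one token
gold_feats = {"Number": "Sing", "Gender": "Masc"}
pred_feats = {"Number": "Sing", "Gender": "Fem"}

for key in pred_feats:                      # predictions: count 'p' and 'tp'
  hit = int(gold_feats.get(key) == pred_feats[key])
  for k in (key, 'micro-avg'):
    prf[k]['tp'] += hit
    prf[k]['p'] += 1
for key in gold_feats:                      # gold annotations: count 't'
  for k in (key, 'micro-avg'):
    prf[k]['t'] += 1

print(dict(prf['Gender']))     # {'tp': 0, 'p': 1, 't': 1}
print(dict(prf['micro-avg']))  # {'tp': 1, 'p': 2, 't': 2}
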
@@ -128,14 +128,14 @@ https://gitlab.com/parseme/cuptlib.git\n cd cuptlib\n pip install .""")
     prf['Exact-'+e_pred.cat]['p'] += 1
   for e_pred in ents_gold.values() :
     prf['Exact-'+e_pred.cat]['t'] += 1
-  # Token-based evaluation - categories always ignored here
+  # Fuzzy (token-based) evaluation - categories always ignored here
   span_pred = sum([list(ep.int_span()) for ep in ents_pred.values()], start=[])
   span_gold = sum([list(eg.int_span()) for eg in ents_gold.values()], start=[])
-  prf['Token-nocat']['p'] += len(span_pred)
-  prf['Token-nocat']['t'] += len(span_gold)
+  prf['Fuzzy-nocat']['p'] += len(span_pred)
+  prf['Fuzzy-nocat']['t'] += len(span_gold)
   for e_pred in span_pred :
     if e_pred in span_gold :
-      prf['Token-nocat']['tp'] += 1
+      prf['Fuzzy-nocat']['tp'] += 1
 ################################################################################
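
The distinction between the 'Exact' and 'Fuzzy' counters can be illustrated on a toy span pair; the integer token positions below are made up and do not come from cuptlib:

# Predicted entity covers tokens 3-5, gold entity covers tokens 4-5.
span_pred = [3, 4, 5]
span_gold = [4, 5]

# Exact matching: the spans must be identical to count as a true positive.
exact_tp = int(span_pred == span_gold)            # 0 -> no exact-match credit

# Fuzzy (token-based) matching: every shared token counts.
fuzzy = {'tp': sum(1 for tok in span_pred if tok in span_gold),
         'p': len(span_pred), 't': len(span_gold)}
print(exact_tp, fuzzy)  # 0 {'tp': 2, 'p': 3, 't': 2} -> P=2/3, R=2/2
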
@@ -172,21 +172,21 @@ def print_results(pred_corpus_name, args, acc, prf, parsing=False):
     precis = (prf[key]['tp'] / max(1, prf[key]['p'])) * 100
     recall = (prf[key]['tp'] / max(1, prf[key]['t'])) * 100
     fscore = ((2 * precis * recall) / max(1, precis + recall))
-    if key != 'micro-average':
+    if key != 'micro-avg':
       macro['precis'] = macro['precis'] + precis
       macro['recall'] = macro['recall'] + recall
     else:
       print()
-    templ = "{:13}: P={:6.2f} ({:5}/{:5}) / R={:6.2f} ({:5}/{:5}) / F={:6.2f}"
+    templ = "{:11}: P={:6.2f} ({:5}/{:5}) / R={:6.2f} ({:5}/{:5}) / F={:6.2f}"
     print(templ.format(key, precis, prf[key]['tp'], prf[key]['p'], recall,
                        prf[key]['tp'], prf[key]['t'], fscore))
-  templ = "{:13}: P={:6.2f}" + " "*15 + "/ R={:6.2f}" + " "*15 + "/ F={:6.2f}"
+  templ = "{:11}: P={:6.2f}" + " "*15 + "/ R={:6.2f}" + " "*15 + "/ F={:6.2f}"
   if len(prf) > 1 : # Calculate macro-precision
-    nb_scores = len(prf)-1 if "micro-average" in prf else len(prf)
+    nb_scores = len(prf)-1 if "micro-avg" in prf else len(prf)
     ma_precis = (macro['precis'] / (nb_scores))
     ma_recall = (macro['recall'] / (nb_scores))
     ma_fscore = ((2*ma_precis*ma_recall)/max(1,ma_precis+ma_recall))
-    print(templ.format("macro-average", ma_precis, ma_recall, ma_fscore))
+    print(templ.format("macro-avg", ma_precis, ma_recall, ma_fscore))
 ################################################################################
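
A worked example of the formulas above, with made-up counts; the max(1, ...) guards simply avoid division by zero when a label is never predicted or never occurs in the gold data. Micro-averaging pools the tp/p/t counts across labels before applying the formulas, whereas macro-averaging (computed at the end of the function) averages the per-label precision and recall, which is why the 'micro-avg' entry is excluded from the macro sums:

tp, p, t = 8, 10, 16                      # illustrative counts for one label
precis = (tp / max(1, p)) * 100           # 80.00
recall = (tp / max(1, t)) * 100           # 50.00
fscore = (2 * precis * recall) / max(1, precis + recall)
print("P=%.2f R=%.2f F=%.2f" % (precis, recall, fscore))  # P=80.00 R=50.00 F=61.54
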
@@ -131,29 +131,48 @@ class CoNLLUReader(object):
   ###############################
   start_tag = "<s>"
   def __init__(self, infile):
+    """
+    Initialise a CoNLL-U reader object from an open `infile` handler (read mode,
+    UTF-8 encoding). Tries to automatically get the names of all columns from
+    first line "# global.columns" meta-data.
+    """
     self.infile = infile
-    DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " +\
-                     "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE"
-    try:
+    try: # guess the header (names of columns) from first line
       first = self.infile.readline().strip() # First line in the file
       globalcolumns = conllu.parse(first)[0].metadata['global.columns']
       self.header = globalcolumns.lower().split(" ")
       self.infile.seek(0) # Rewind open file
-    except KeyError:
-      self.header = DEFAULT_HEADER.split(" ")
+    except KeyError: # if first line absent (wrong format), try to set a default
+      DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " +\
+                       "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE"
+      self.header = DEFAULT_HEADER.lower().split(" ")
   ###############################
   def readConllu(self):
+    """
+    Yields sentences as `TokenList` from open CoNLL-U file given to constructor
+    """
     for sent in conllu.parse_incr(self.infile):
       yield sent
   ###############################
   @staticmethod
   def readConlluStr(conllustring):
+    """
+    Yields sentences as `TokenList` from CoNLL-U text given as a string
+    """
     for sent in conllu.parse(conllustring):
       yield sent
   ###############################
   def name(self):
+    """
+    Returns the CoNLL-U filename
+    """
     return self.infile.name
   ###############################
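
A possible way to use the reader documented above; the filename and the `from conllulib import ...` line are assumptions about how the module is deployed, not something shown in this commit:

from conllulib import CoNLLUReader  # assumption: conllulib.py is on the Python path

with open("corpus.conllu", "r", encoding="utf-8") as infile:   # hypothetical file
  reader = CoNLLUReader(infile)
  print(reader.name(), reader.header)          # filename and lower-cased column names
  for sent in reader.readConllu():             # each sent is a conllu TokenList
    print([tok["form"] for tok in sent])
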
@@ -177,6 +196,25 @@ class CoNLLUReader(object):
   ###############################
   def to_int_and_vocab(self, col_name_dict, extra_cols_dict={}):
+    """
+    Transforms open `self.infile` into lists of integer indices and associated
+    vocabularies. Vocabularies are created on the fly, according to the file
+    contents. Parameter `col_name_dict` is a dictionary with column names to
+    encode as keys and, as values, a list of special tokens for each
+    column, for instance:
+    col_name_dict = {"form":["<PAD>", "<UNK>"], "upos":["<PAD>"]}
+    means that 2 columns will be encoded, "form" and "upos", with the
+    corresponding special symbols in their respective vocabularies. Parameter
+    `extra_cols_dict` is similar, but instead of a list of special tokens, the
+    value is a function to be applied to each column value, for instance:
+    extra_cols_dict = {"head":int}
+    means that column "head" will also be encoded, but with no vocabulary
+    associated. Instead, column values are directly encoded with function int.
+    Returns a tuple of 2 dicts, `int_list` and `vocab`, with the same keys as
+    those in `col_name_dict` and `extra_cols_dict`, and results as values
+    (lists of integers and vocabulary dicts, respectively).
+    Useful to encode **training** corpora.
+    """
     int_list = {};
     vocab = {}
     for col_name, special_tokens in col_name_dict.items():
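
Following the docstring just added, a small training-time sketch; the toy CoNLL-U string and the io.StringIO wrapper are illustrative, and the exact integer indices depend on the vocabulary-building order (not shown in this hunk), so the commented values are only what one would typically expect with special tokens inserted first:

import io
from conllulib import CoNLLUReader  # assumption: conllulib.py is on the Python path

train_text = """# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
1\tLe\tle\tDET\t_\t_\t2\tdet\t_\t_
2\tchat\tchat\tNOUN\t_\t_\t3\tnsubj\t_\t_
3\tdort\tdormir\tVERB\t_\t_\t0\troot\t_\t_

"""
reader = CoNLLUReader(io.StringIO(train_text))
col_name_dict = {"form": ["<PAD>", "<UNK>"], "upos": ["<PAD>"]}
int_list, vocab = reader.to_int_and_vocab(col_name_dict, extra_cols_dict={"head": int})
print(vocab["upos"])        # expected: {'<PAD>': 0, 'DET': 1, 'NOUN': 2, 'VERB': 3}
print(int_list["form"][0])  # expected: [2, 3, 4] (one list of indices per sentence)
print(int_list["head"][0])  # expected: [2, 3, 0] via the int() conversion
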
@@ -201,6 +239,16 @@ class CoNLLUReader(object):
   ###############################
   def to_int_from_vocab(self, col_names, unk_token, vocab={}, extra_cols_dict={}):
+    """
+    Transforms open `self.infile` into lists of integer indices according to
+    provided `vocab` dictionaries (different from `to_int_and_vocab`, where
+    vocabs are also built). Values not found in `vocab` will be replaced by
+    `vocab[unk_token]`. Parameters `col_names` and `extra_cols_dict` play the
+    same roles as `col_name_dict` and `extra_cols_dict` in `to_int_and_vocab`,
+    see above. Returns a dict, `int_list`, with one key per column name in
+    `col_names` and `extra_cols_dict`, and results as values (lists of integers).
+    Useful to encode **test/dev** corpora.
+    """
     int_list = {}
     unk_toks = {}
     for col_name in col_names:
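
And a matching dev/test-time sketch, reusing a training vocabulary; the toy vocabulary below stands in for the `vocab` returned by `to_int_and_vocab`, and unseen forms are expected to map to the `<UNK>` index:

import io
from conllulib import CoNLLUReader  # assumption: conllulib.py is on the Python path

dev_text = """# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
1\tUn\tun\tDET\t_\t_\t2\tdet\t_\t_
2\tchien\tchien\tNOUN\t_\t_\t3\tnsubj\t_\t_
3\tdort\tdormir\tVERB\t_\t_\t0\troot\t_\t_

"""
train_vocab = {"form": {"<PAD>": 0, "<UNK>": 1, "Le": 2, "chat": 3, "dort": 4}}
dev_reader = CoNLLUReader(io.StringIO(dev_text))
dev_ints = dev_reader.to_int_from_vocab(["form"], unk_token="<UNK>", vocab=train_vocab)
print(dev_ints["form"][0])  # expected: [1, 1, 4] - "Un" and "chien" were never seen
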
@@ -221,6 +269,11 @@ class CoNLLUReader(object):
   @staticmethod
   def to_int_from_vocab_sent(sent, col_names, unk_token, vocab={},
                              lowercase=False):
+    """
+    Similar to `to_int_from_vocab` above, but applies to a single `sent`
+    represented as a `TokenList`. Extra possibility to `lowercase` sentence
+    elements before looking them up in `vocab`.
+    """
     int_list = {}
     for col_name in col_names:
       unk_tok_id = vocab[col_name].get(unk_token, None)
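
The per-sentence variant can be sketched the same way on a single `TokenList`; the sentence string, vocabulary, and expected output below are illustrative:

from conllulib import CoNLLUReader  # assumption: conllulib.py is on the Python path

one_sent = "1\tLE\tle\tDET\t_\t_\t2\tdet\t_\t_\n2\tchat\tchat\tNOUN\t_\t_\t0\troot\t_\t_\n\n"
sent = next(CoNLLUReader.readConlluStr(one_sent))
ids = CoNLLUReader.to_int_from_vocab_sent(sent, ["form"], "<UNK>",
                                          vocab={"form": {"<UNK>": 0, "le": 1, "chat": 2}},
                                          lowercase=True)
print(ids["form"])  # expected: [1, 2] - "LE" is lower-cased to "le" before lookup
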
@@ -239,6 +292,13 @@ class CoNLLUReader(object):
     belonging to the same NE get the same int + first gets ":category" suffix).
     The output has category appended to 'B' and 'I' tags. The `bio_style` can
     be 'bio' or 'io', the latter has only 'I-category' tags, no 'B's.
+    Example:
+    >>> test=\"\"\"# global.columns = ID FORM parseme:ne\n1\tLe\t1:PROD\n2\tPetit\t1\n3\tPrince\t1\n4\tde\t*\n5\tSaint-Exupéry\t2:PERS\n6\test\t*\n7\tentré\t*\n8\tà\t*\n9\tl'\t*\n10\tÉcole\t3:ORG\n11\tJules-Romains\t3\"\"\"
+    >>> for sent in CoNLLUReader.readConlluStr(test):
+    ...   print(CoNLLUReader.to_bio(sent))
+    ['B-PROD', 'I-PROD', 'I-PROD', 'O', 'B-PERS', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG']
     """
     bio_enc = []
     neindex = 0
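
For comparison with the docstring example, the 'io' style is described above as keeping only 'I-category' tags; the small sketch below infers what the same sentence would look like in that style from the documentation alone (the way `bio_style` is selected is not visible in this hunk):

# BIO tags as in the docstring example, and their IO-style counterpart
bio_tags = ['B-PROD', 'I-PROD', 'I-PROD', 'O', 'B-PERS',
            'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG']
io_tags = ['I' + t[1:] if t != 'O' else t for t in bio_tags]  # drop the B/I distinction
print(io_tags)  # ['I-PROD', 'I-PROD', 'I-PROD', 'O', 'I-PERS', 'O', 'O', 'O', 'O', 'I-ORG', 'I-ORG']
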