diff --git a/lib/accuracy.py b/lib/accuracy.py index 3c6186c5c8a4caf3a05d24781dfe111cc780fba3..a9038ba96bdb0bdb80ce2afcd2fbf130db2439e1 100755 --- a/lib/accuracy.py +++ b/lib/accuracy.py @@ -80,14 +80,14 @@ def tp_count_feats(tok_pred, tok_gold, prf): for key in pred_feats.keys(): tp_inc = int(gold_feats.get(key,None) == pred_feats[key]) prf[key]['tp'] = prf[key]['tp'] + tp_inc - prf['micro-average']['tp'] = prf['micro-average']['tp'] + tp_inc + prf['micro-avg']['tp'] = prf['micro-avg']['tp'] + tp_inc p_inc = int(pred_feats.get(key,None) != None) prf[key]['p'] = prf[key]['p'] + p_inc - prf['micro-average']['p'] = prf['micro-average']['p'] + p_inc + prf['micro-avg']['p'] = prf['micro-avg']['p'] + p_inc for key in gold_feats.keys(): t_inc = int(gold_feats.get(key,None) != None) prf[key]['t'] = prf[key]['t'] + t_inc - prf['micro-average']['t'] = prf['micro-average']['t'] + t_inc + prf['micro-avg']['t'] = prf['micro-avg']['t'] + t_inc ################################################################################ @@ -128,14 +128,14 @@ https://gitlab.com/parseme/cuptlib.git\n cd cuptlib\n pip install .""") prf['Exact-'+e_pred.cat]['p'] += 1 for e_pred in ents_gold.values() : prf['Exact-'+e_pred.cat]['t'] += 1 - # Token-based evaluation - categories always ignored here + # Fuzzy (token-based) evaluation - categories always ignored here span_pred = sum([list(ep.int_span()) for ep in ents_pred.values()], start=[]) span_gold = sum([list(eg.int_span()) for eg in ents_gold.values()], start=[]) - prf['Token-nocat']['p'] += len(span_pred) - prf['Token-nocat']['t'] += len(span_gold) + prf['Fuzzy-nocat']['p'] += len(span_pred) + prf['Fuzzy-nocat']['t'] += len(span_gold) for e_pred in span_pred : if e_pred in span_gold : - prf['Token-nocat']['tp'] += 1 + prf['Fuzzy-nocat']['tp'] += 1 ################################################################################ @@ -172,21 +172,21 @@ def print_results(pred_corpus_name, args, acc, prf, parsing=False): precis = (prf[key]['tp'] / 
max(1, prf[key]['p'])) * 100 recall = (prf[key]['tp'] / max(1, prf[key]['t'])) * 100 fscore = ((2 * precis * recall) / max(1, precis + recall)) - if key != 'micro-average': + if key != 'micro-avg': macro['precis'] = macro['precis'] + precis macro['recall'] = macro['recall'] + recall else: print() - templ = "{:13}: P={:6.2f} ({:5}/{:5}) / R={:6.2f} ({:5}/{:5}) / F={:6.2f}" + templ = "{:11}: P={:6.2f} ({:5}/{:5}) / R={:6.2f} ({:5}/{:5}) / F={:6.2f}" print(templ.format(key, precis, prf[key]['tp'], prf[key]['p'], recall, prf[key]['tp'], prf[key]['t'], fscore)) - templ = "{:13}: P={:6.2f}" + " "*15 + "/ R={:6.2f}" + " "*15 + "/ F={:6.2f}" + templ = "{:11}: P={:6.2f}" + " "*15 + "/ R={:6.2f}" + " "*15 + "/ F={:6.2f}" if len(prf) > 1 : # Calculate macro-precision - nb_scores = len(prf)-1 if "micro-average" in prf else len(prf) + nb_scores = len(prf)-1 if "micro-avg" in prf else len(prf) ma_precis = (macro['precis'] / (nb_scores)) ma_recall = (macro['recall'] / (nb_scores)) ma_fscore = ((2*ma_precis*ma_recall)/max(1,ma_precis+ma_recall)) - print(templ.format("macro-average", ma_precis, ma_recall, ma_fscore)) + print(templ.format("macro-avg", ma_precis, ma_recall, ma_fscore)) ################################################################################ diff --git a/lib/conllulib.py b/lib/conllulib.py index 08dbc14c81d0fdfd61ea039e101c55537447f3e5..516c7cc8dc5ab7aebf6c68a586950cfafd5ac004 100644 --- a/lib/conllulib.py +++ b/lib/conllulib.py @@ -131,29 +131,48 @@ class CoNLLUReader(object): ############################### - start_tag = "<s>" - - def __init__(self, infile): + def __init__(self, infile): + """ + Initialise a CoNLL-U reader object from an open `infile` handler (read mode, + UTF-8 encoding). Tries to automatically get the names of all columns from + first line "# global.columns" meta-data. 
+ """ self.infile = infile - DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " +\ - "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE" - try: + try: # guess the header (names of columns) from first line first = self.infile.readline().strip() # First line in the file globalcolumns = conllu.parse(first)[0].metadata['global.columns'] self.header = globalcolumns.lower().split(" ") self.infile.seek(0) # Rewind open file - except KeyError: - self.header = DEFAULT_HEADER.split(" ") + except KeyError: # if first line absent (wrong format), try to set a default + DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " +\ + "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE" + self.header = DEFAULT_HEADER.lower().split(" ") ############################### def readConllu(self): + """ + Yields sentences as `TokenList` from open CoNLL-U file given to constructor + """ for sent in conllu.parse_incr(self.infile): yield sent + + ############################### + + @staticmethod + def readConlluStr(conllustring): + """ + Yields sentences as `TokenList` from CoNLL-U text given as a string + """ + for sent in conllu.parse(conllustring): + yield sent ############################### def name(self): + """ + Returns the CoNLL-U filename + """ return self.infile.name ############################### @@ -176,7 +195,26 @@ class CoNLLUReader(object): ############################### - def to_int_and_vocab(self, col_name_dict, extra_cols_dict={}): + def to_int_and_vocab(self, col_name_dict, extra_cols_dict={}): + """ + Transforms open `self.infile` into lists of integer indices and associated + vocabularies. Vocabularies are created on the fly, according to the file + contents. 
Parameter `col_name_dict` is a dictionary with column names to + encode as keys, and containing as values a list of special tokens for each + column, for instance: + col_name_dict = {"form":["<PAD>", "<UNK>"], "upos":["<PAD>"]} + means that 2 columns will be encoded, "form" and "upos", with the + corresponding special symbols in respective vocabularies. Parameter + `extra_cols_dict` is similar, but instead of list of special tokens, value + is a function to be applied to each column value, for instance: + extra_cols_dict = {"head":int} + means that column "head" will also be encoded, but with no vocabulary + associated. Instead, column values are directly encoded with function int. + Returns a tuple of 2 dicts, `int_list` and `vocab`, with same keys as those + in `col_name_dict` and `extra_cols_dict`, and results as values (list of + integers and vocabulary dict, respectively). + Useful to encode **training** corpora. + """ int_list = {}; vocab = {} for col_name, special_tokens in col_name_dict.items(): @@ -201,6 +239,16 @@ class CoNLLUReader(object): ############################### def to_int_from_vocab(self, col_names, unk_token, vocab={}, extra_cols_dict={}): + """ + Transforms open `self.infile` into lists of integer indices according to + provided `vocab` dictionaries (different from `to_int_and_vocab`, where + vocabs are also built). Values not found in `vocab` will be replaced by + `vocab[unk_token]`. Parameters `col_names` and `extra_cols_dict` are + the same as in `to_int_and_vocab`, see above. Returns a dict, `int_list`, + with same keys as those in `col_names` and `extra_cols_dict`, and + results as values (list of integers). + Useful to encode **test/dev** corpora. 
+ """ int_list = {} unk_toks = {} for col_name in col_names: @@ -221,6 +269,11 @@ class CoNLLUReader(object): @staticmethod def to_int_from_vocab_sent(sent, col_names, unk_token, vocab={}, lowercase=False): + """ + Similar to `to_int_from_vocab` above, but applies to a single `sent` + represented as a `TokenList`. Extra possibility to `lowercase` sentence + elements before looking them up in `vocab`. + """ int_list = {} for col_name in col_names: unk_tok_id = vocab[col_name].get(unk_token, None) @@ -239,6 +292,13 @@ class CoNLLUReader(object): belonging to the same NE get the same int + first gets ":category" suffix). The output has category appended to 'B' and 'I' tags. The `bio_style` can be 'bio' or 'io', the latter has only 'I-category' tags, no 'B's. + Example: + >>> test=\"\"\"# global.columns = ID FORM parseme:ne\n1\tLe\t1:PROD\n2\tPetit\t1\n3\tPrince\t1\n4\tde\t*\n5\tSaint-Exupéry\t2:PERS\n6\test\t*\n7\tentré\t*\n8\tà\t*\n9\tl'\t*\n10\tÉcole\t3:ORG\n11\tJules-Romains\t3\"\"\" + >>> for sent in readConlluString(test): + >>> print(CoNLLUReader.to_bio(sent)) + ['B-PROD', 'I-PROD', 'I-PROD', 'O', 'B-PERS', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG'] + + """ bio_enc = [] neindex = 0