diff --git a/lib/accuracy.py b/lib/accuracy.py index 078e7077b7c94d60fd3367171d6e5263cdb1e229..db33f8403ebb402933cebe6cf8460ef54effa169 100755 --- a/lib/accuracy.py +++ b/lib/accuracy.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 + import sys import argparse -import collections +from collections import defaultdict, Counter import pdb from conllulib import CoNLLUReader, Util @@ -23,10 +24,10 @@ parser.add_argument('-g', "--gold", metavar="FILENAME.conllu", required=True,\ parser.add_argument('-t', "--train", metavar="FILENAME.conllu", required=False,\ dest="train_filename", type=argparse.FileType('r', encoding='UTF-8'), \ help="""Training corpus in CoNLL-U, from which tagger was learnt.""") -parser.add_argument('-c', "--tagcolumn", metavar="NAME", dest="col_name_tag", +parser.add_argument('-c', "--tagcolumn", metavar="NAME", dest="name_tag", required=False, type=str, default="upos", help="""Column name of tags, \ as defined in header. Use lowercase""") -parser.add_argument('-f', "--featcolumn", metavar="NAME", dest="col_name_feat", +parser.add_argument('-f', "--featcolumn", metavar="NAME", dest="name_feat", required=False, type=str, default="form", help="""Column name of input feature, as defined in header. Use lowercase.""") parser.add_argument('-u', "--upos-filter", metavar="NAME", dest="upos_filter", @@ -34,13 +35,20 @@ parser.add_argument('-u', "--upos-filter", metavar="NAME", dest="upos_filter", help="""Only calculate accuracy for words with UPOS in this list. \ Empty list = no filter.""") -######################################################################## +################################################################################ -def process_args(parser): +def process_args(parser): + """ + Show (in debug mode) and process all command line options. Checks tag and feat + columns appear in corpora. Create training corpus vocabulary if option present + for OOV status check. 
Input is an instance of `argparse.ArgumentParser`, + returns list of `args`, `gold_corpus` and `pred_corpus` as `CoNLLUReader`, + `train_vocab` dictionary. + """ args = parser.parse_args() Util.DEBUG_FLAG = args.DEBUG_FLAG - args.col_name_tag = args.col_name_tag.lower() - args.col_name_feat = args.col_name_feat.lower() + args.name_tag = args.name_tag.lower() + args.name_feat = args.name_feat.lower() Util.debug("Command-line arguments and defaults:") for (k,v) in vars(args).items(): Util.debug(" * {}: {}",k,v) @@ -49,78 +57,141 @@ def process_args(parser): train_vocab = None if args.train_filename: train_corpus = CoNLLUReader(args.train_filename) - ignoreme, train_vocab = train_corpus.to_int_and_vocab({args.col_name_feat:[]}) - if args.col_name_tag not in gold_corpus.header or \ - args.col_name_feat not in gold_corpus.header: + ignoreme, train_vocab = train_corpus.to_int_and_vocab({args.name_feat:[]}) + if args.name_tag not in gold_corpus.header or \ + args.name_feat not in gold_corpus.header: Util.error("-c and -f names must be valid conllu column among:\n{}", gold_corpus.header) return args, gold_corpus, pred_corpus, train_vocab -######################################################################## - -if __name__ == "__main__": - args, gold_corpus, pred_corpus, train_vocab = process_args(parser) - prf = collections.defaultdict(lambda:{'tp':0,'t':0, 'p':0}) - total_tokens = correct_tokens = 0 - total_oov = correct_oov = 0 - for (sent_gold, sent_pred) in zip(gold_corpus.readConllu(), - pred_corpus.readConllu()): - for (tok_gold, tok_pred) in zip (sent_gold, sent_pred): - if not args.upos_filter or tok_gold['upos'] in args.upos_filter : - if train_vocab : - train_vocab_feat = train_vocab[args.col_name_feat].keys() - if tok_gold[args.col_name_feat] not in train_vocab_feat: - total_oov = total_oov + 1 - oov = True - else: - oov = False - if tok_gold[args.col_name_tag] == tok_pred[args.col_name_tag]: - correct_tokens = correct_tokens + 1 - if train_vocab and oov : - 
def tp_count_feats(tok_pred, tok_gold, prf):
    """
    Update true-positive / true / positive counts for morphological features.

    Compares every feature of `tok_pred` with those of `tok_gold`. Both tokens
    are indexed with the key 'feats', whose value is either a mapping
    feature-name -> value or something falsy (treated as "no features").

    `prf` maps a feature name to a dict with the integer keys 'tp', 'p' and 't'
    and must create missing entries on access (a `defaultdict` in this script).
    Counts are added both under the feature name and under 'micro-average'.
    The function modifies `prf` in place and returns nothing.

    Features whose value is None are ignored on both sides: they count neither
    as predicted, nor as gold, nor as a match.
    """
    pred_feats = tok_pred['feats'] or {}
    gold_feats = tok_gold['feats'] or {}
    micro = prf['micro-average'] if (pred_feats or gold_feats) else None
    for key, pred_value in pred_feats.items():
        if pred_value is None:
            # Without this guard a None prediction for a feature absent from
            # gold would compare equal to the missing gold value (None == None)
            # and be counted as a true positive with no predicted positive.
            continue
        prf[key]['p'] += 1
        micro['p'] += 1
        if gold_feats.get(key) == pred_value:
            prf[key]['tp'] += 1
            micro['tp'] += 1
    for key, gold_value in gold_feats.items():
        if gold_value is None:
            continue
        prf[key]['t'] += 1
        micro['t'] += 1
def parseme_cat_in(ent, ent_list):
    """
    Tell whether `ent` is present in `ent_list`, comparing both span AND category.

    The default cuptlib comparison ignores the category, hence this helper.
    `ent` and the members of `ent_list` only need `span` and `cat` attributes.
    Returns True on the first match, False otherwise (including for an empty
    `ent_list`).
    """
    return any(ent.span == cand.span and ent.cat == cand.cat
               for cand in ent_list)


def tp_count_parseme(s_pred, s_gold, name_tag, prf):
    """
    Update entity-level and token-level counts for one sentence pair.

    Entities are read with cuptlib's `retrieve_mwes` from column `name_tag` of
    the predicted sentence `s_pred` and of the gold sentence `s_gold`.
    `prf` maps a score name to a dict with integer keys 'tp', 'p' and 't' and
    must create missing entries on access; it is modified in place:

    * 'Exact-nocat': entity matches using cuptlib's own equality
      (which, per `parseme_cat_in`, ignores the category);
    * 'Exact-<cat>': entity matches where span and category both agree;
    * 'Token-nocat': overlap of the token positions covered by entities,
      categories ignored.

    Exits the program with status -1 if cuptlib is not installed.
    Returns nothing.
    """
    try:
        import parseme.cupt as cupt
    except ImportError:
        # Installation hint goes to stderr so it never mixes with the scores.
        print("Please install cuptlib before running this script\n\n"
              "  git clone https://gitlab.com/parseme/cuptlib.git\n"
              "  cd cuptlib\n"
              "  pip install .", file=sys.stderr)
        sys.exit(-1)
    from itertools import chain

    ents_pred = list(cupt.retrieve_mwes(s_pred, column_name=name_tag).values())
    ents_gold = list(cupt.retrieve_mwes(s_gold, column_name=name_tag).values())

    # Entity-based evaluation
    exact = prf['Exact-nocat']
    exact['p'] += len(ents_pred)
    exact['t'] += len(ents_gold)
    for ent in ents_pred:
        if ent in ents_gold:                 # cuptlib equality
            exact['tp'] += 1
        per_cat = prf['Exact-' + ent.cat]
        if parseme_cat_in(ent, ents_gold):   # span and category must agree
            per_cat['tp'] += 1
        per_cat['p'] += 1
    for ent in ents_gold:
        prf['Exact-' + ent.cat]['t'] += 1

    # Token-based evaluation - categories always ignored here
    tokens_pred = list(chain.from_iterable(e.int_span() for e in ents_pred))
    tokens_gold = list(chain.from_iterable(e.int_span() for e in ents_gold))
    token = prf['Token-nocat']
    token['p'] += len(tokens_pred)
    token['t'] += len(tokens_gold)
    gold_positions = set(tokens_gold)        # O(1) membership per predicted token
    token['tp'] += sum(1 for pos in tokens_pred if pos in gold_positions)
def print_results(pred_corpus_name, args, acc, prf):
    """
    Calculate and print accuracies, precision, recall, F-score, etc.

    `pred_corpus_name` is the name printed for the predictions file.
    `args` must provide `upos_filter`, `name_tag` and `train_filename`.
    `acc` is indexed with 'correct_tokens', 'total_tokens', 'correct_oov' and
    'total_oov' (a `Counter` in this script, so missing keys read as 0).
    `prf` maps a score name to a dict with the keys 'tp', 'p' and 't'; when it
    is empty, only accuracies are printed.

    All scores are percentages. A ratio with a zero denominator is reported as
    0.00 instead of raising ZeroDivisionError (e.g. no OOV token, or an UPOS
    filter matching nothing). Returns nothing.
    """
    def percent(part, whole):
        return (part / whole) * 100 if whole else 0.0

    def f_measure(precis, recall):
        # Harmonic mean; clamping the denominator to 1 would distort the
        # result whenever precis + recall < 1.
        total = precis + recall
        return (2 * precis * recall) / total if total > 0 else 0.0

    print("Predictions file: {}".format(pred_corpus_name))
    if args.upos_filter:
        print("Results concern only some UPOS: {}".format(" ".join(args.upos_filter)))
    print("Accuracy on all {}: {:0.2f} ({:5}/{:5})".format(
        args.name_tag, percent(acc['correct_tokens'], acc['total_tokens']),
        acc['correct_tokens'], acc['total_tokens']))
    if args.train_filename:
        print("Accuracy on OOV {}: {:0.2f} ({:5}/{:5})".format(
            args.name_tag, percent(acc['correct_oov'], acc['total_oov']),
            acc['correct_oov'], acc['total_oov']))
    if not prf:
        return

    print("\nPrecision, recall, and F-score for {}:".format(args.name_tag))
    row = "{:13}: P={:6.2f} ({:5}/{:5}) / R={:6.2f} ({:5}/{:5}) / F={:6.2f}"
    macro_precis = macro_recall = 0.0
    for key in sorted(prf):
        counts = prf[key]
        precis = percent(counts['tp'], counts['p'])
        recall = percent(counts['tp'], counts['t'])
        if key != 'micro-average':
            macro_precis += precis
            macro_recall += recall
        else:
            print()  # blank line separates the micro-average row
        print(row.format(key, precis, counts['tp'], counts['p'],
                         recall, counts['tp'], counts['t'],
                         f_measure(precis, recall)))
    if len(prf) > 1:  # macro-average over every score except 'micro-average'
        nb_scores = len(prf) - 1 if "micro-average" in prf else len(prf)
        ma_precis = macro_precis / nb_scores
        ma_recall = macro_recall / nb_scores
        macro_row = "{:13}: P={:6.2f}" + " " * 15 + "/ R={:6.2f}" + " " * 15 + "/ F={:6.2f}"
        print(macro_row.format("macro-average", ma_precis, ma_recall,
                               f_measure(ma_precis, ma_recall)))
if __name__ == "__main__":
    # Script entry point: compare gold and predicted corpora sentence by
    # sentence, token by token, then print the scores.
    args, gold_corpus, pred_corpus, train_vocab = process_args(parser)
    prf = defaultdict(lambda: {'tp': 0, 't': 0, 'p': 0})  # used for feats, NEs and MWEs
    acc = Counter()  # stores correct and total counts, for all tokens and for OOV
    # Training vocabulary of the input feature, looked up once rather than
    # once per token; None when no training corpus was given.
    known_feats = train_vocab[args.name_feat] if train_vocab else None
    eval_parseme = args.name_tag.startswith("parseme")
    for (s_gold, s_pred) in zip(gold_corpus.readConllu(), pred_corpus.readConllu()):
        if eval_parseme:
            tp_count_parseme(s_pred, s_gold, args.name_tag, prf)
        for (tok_gold, tok_pred) in zip(s_gold, s_pred):
            if args.upos_filter and tok_gold['upos'] not in args.upos_filter:
                continue  # token filtered out by -u
            acc['total_tokens'] += 1
            oov = (known_feats is not None
                   and tok_gold[args.name_feat] not in known_feats)
            if oov:
                acc['total_oov'] += 1
            if tok_gold[args.name_tag] == tok_pred[args.name_tag]:
                acc['correct_tokens'] += 1
                if oov:
                    acc['correct_oov'] += 1
            if args.name_tag == 'feats':
                # Argument order follows the signature (tok_pred, tok_gold, prf);
                # swapping them would exchange precision and recall.
                tp_count_feats(tok_pred, tok_gold, prf)
    print_results(pred_corpus.name(), args, acc, prf)
@staticmethod
def log_cap(number):
    """Return the base-10 logarithm of `number`, capped for zero.

    Static method of `Util`.
    For a negative `number`, reports the problem through `Util.error`.
    For zero, returns `-Util.PSEUDO_INF`, a finite stand-in for negative
    infinity: unlike the -inf produced by `np.log10(0)`, adding a value to it
    still changes the sum.
    Otherwise returns `np.log10(number)`.
    """
    if number < 0:
        Util.error("Cannot get logarithm of negative number {}".format(number))
        return None
    if number == 0:
        return -Util.PSEUDO_INF
    return np.log10(number)


def to_int_from_vocab(self, col_names, unk_token, vocab={}):
    """Encode the columns `col_names` of the whole corpus as integer ids.

    Method of `CoNLLUReader`. `vocab` maps each column name to a dict from
    value to integer id. Values missing from that dict get the id of
    `unk_token` in the same dict, or None if `unk_token` itself is missing.
    Returns a dict mapping each column name to a list with one list of ids
    per sentence yielded by `self.readConllu()`.
    """
    encoded = {}
    fallback_ids = {}
    for name in col_names:
        encoded[name] = []
        fallback_ids[name] = vocab[name].get(unk_token, None)
    for sentence in self.readConllu():
        for name in col_names:
            column_vocab = vocab[name]
            fallback = fallback_ids[name]
            ids = [column_vocab.get(tok[name], fallback) for tok in sentence]
            encoded[name].append(ids)
    return encoded
@staticmethod
def to_int_from_vocab_sent(sent, col_names, unk_token, vocab=None,
                           lowercase=False):
    """Encode the columns `col_names` of a single sentence as integer ids.

    Static method of `CoNLLUReader`.
    `sent` is a sequence of tokens indexable by column name. `vocab` maps each
    column name to a dict from value to integer id. Values missing from that
    dict get the id of `unk_token`, or None if `unk_token` is missing too.
    With `lowercase=True`, values are lowercased before the lookup.
    Returns a dict mapping each column name to the list of ids of the sentence.
    Raises KeyError if a column of `col_names` is absent from `vocab`.
    """
    vocab = {} if vocab is None else vocab
    int_list = {}
    for col_name in col_names:
        col_vocab = vocab[col_name]
        unk_tok_id = col_vocab.get(unk_token, None)
        values = (tok[col_name] for tok in sent)
        if lowercase:
            values = (value.lower() for value in values)
        int_list[col_name] = [col_vocab.get(value, unk_tok_id) for value in values]
    return int_list


@staticmethod
def to_bio(sent, bio_style='bio', name_tag='parseme:ne'):
    """Convert the Sequoia/parseme annotations of a sentence into BIO tags.

    Static method of `CoNLLUReader`.
    `sent` is a sequence of tokens; `tok[name_tag]` is '*' for a token outside
    any entity, 'INDEX:CAT' for the first token of an entity and 'INDEX' for a
    token continuing the most recently opened entity.
    With `bio_style='io'` the first token of an entity is tagged 'I-CAT';
    with any other value it is tagged 'B-CAT'. Continuations are 'I-CAT' and
    tokens outside entities are 'O'.
    Returns the list of tags, one per token.
    Raises ValueError for a tag that neither continues the current entity nor
    has the form 'INDEX:CAT'.
    """
    first_prefix = 'I-' if bio_style == 'io' else 'B-'
    bio_enc = []
    cur_index = None  # index of the most recently opened entity
    cur_cat = None    # category of that entity
    for tok in sent:
        netag = tok[name_tag]
        if netag == '*':
            bio_enc.append('O')
        elif netag == cur_index:
            bio_enc.append('I-' + cur_cat)
        else:
            # Split on the first ':' only, so a category may itself contain ':'.
            index, sep, cat = netag.partition(':')
            if not sep:
                raise ValueError(
                    "Cannot convert tag {!r} to BIO: expected '*', 'INDEX:CAT' "
                    "or the index of the current entity".format(netag))
            cur_index, cur_cat = index, cat
            bio_enc.append(first_prefix + cat)
    return bio_enc
@staticmethod
def from_bio(bio_enc, bio_style='bio', stop_on_error=False):
    """Convert BIO-encoded annotations into Sequoia/parseme format.

    Static method of `CoNLLUReader`.
    Input `bio_enc` is a list of strings, each corresponding to one BIO tag.
    `bio_style` can be "bio" (default) or "io". Will try to recover encoding
    errors by replacing wrong tags when `stop_on_error` equals False (default),
    otherwise reports the problem through `Util.error`.
    Only works for BIO-cat & IO-cat, with -cat appended to both B and I tags.
    Requires adaptations for BIOES, and encoding schemes without "-cat".
    Returns a list with exactly one output tag per input tag: '*' outside
    entities (and for invalid tags), 'INDEX:CAT' on the first token of an
    entity, 'INDEX' on its continuations.

    Examples:
    >>> CoNLLUReader.from_bio(["B-PERS", "I-PERS", "I-PERS", "O", "B-LOC", "I-LOC"], bio_style='bio')
    ['1:PERS', '1', '1', '*', '2:LOC', '2']

    >>> CoNLLUReader.from_bio(["B-PERS", "I-PERS", "I-PERS", "O", "B-LOC", "I-LOC"], bio_style='io')
    WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
    WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
    ['1:PERS', '1', '1', '*', '2:LOC', '2']

    >>> CoNLLUReader.from_bio(["I-PERS", "B-PERS", "I-PERS", "O", "I-LOC"], bio_style='io')
    WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
    ['1:PERS', '1', '1', '*', '2:LOC']

    >>> CoNLLUReader.from_bio(["I-PERS", "I-PERS", "I-PERS", "O", "I-LOC"], bio_style='bio')
    WARNING: Invalid I-initial tag I-PERS converted to B
    WARNING: Invalid I-initial tag I-LOC converted to B
    ['1:PERS', '1', '1', '*', '2:LOC']

    >>> CoNLLUReader.from_bio(["I-PERS", "B-PERS", "I-PERS", "O", "I-LOC"], bio_style='bio')
    WARNING: Invalid I-initial tag I-PERS converted to B
    WARNING: Invalid I-initial tag I-LOC converted to B
    ['1:PERS', '2:PERS', '2', '*', '3:LOC']

    >>> CoNLLUReader.from_bio(["I-PERS", "B-PERS", "I-EVE", "O", "I-PERS"], bio_style='io')
    WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
    ['1:PERS', '1', '2:EVE', '*', '3:PERS']

    >>> CoNLLUReader.from_bio(["I-PERS", "O", "I-PERS"], bio_style='io')
    ['1:PERS', '*', '2:PERS']

    >>> CoNLLUReader.from_bio(["I-PERS", "B-PERS", "I-EVE", "O", "I-PERS"], bio_style='bio')
    WARNING: Invalid I-initial tag I-PERS converted to B
    WARNING: Invalid I-initial tag I-EVE converted to B
    WARNING: Invalid I-initial tag I-PERS converted to B
    ['1:PERS', '2:PERS', '3:EVE', '*', '4:PERS']
    """
    result = []
    neindex = 0          # index of the most recently opened entity
    prev_bio_tag = 'O'   # previous tag, 'O' also standing for invalid tags
    prev_cat = None      # category of the previous tag, None outside entities
    for bio_tag in bio_enc:
        seq_tag = '*'    # default output, overwritten inside entities
        if bio_tag == 'O':
            prev_bio_tag, prev_cat = 'O', None
        elif bio_tag[:2] in ('B-', 'I-'):
            # Everything after the prefix is the category, hyphens included.
            necat = bio_tag[2:]
            continues = prev_bio_tag != 'O' and necat == prev_cat
            if bio_tag[0] == 'B':
                if bio_style == 'bio':
                    neindex += 1  # Beginning of an entity
                    seq_tag = str(neindex) + ":" + necat
                elif stop_on_error:
                    Util.error("B tag not allowed with 'io'")
                else:
                    bio_tag = "I-" + necat
                    Util.warn("Got B tag in spite of 'io' bio_style: interpreted as I")
            if bio_tag[0] == 'I':
                if bio_style == 'io':
                    if continues:
                        seq_tag = str(neindex)  # is a continuation
                    else:
                        neindex += 1  # Beginning of an entity
                        seq_tag = str(neindex) + ":" + necat
                elif bio_style == 'bio' and continues:
                    seq_tag = str(neindex)  # is a continuation
                elif stop_on_error:
                    Util.error("Invalid I-initial tag in BIO format: {}".format(bio_tag))
                else:
                    neindex += 1  # Beginning of an entity
                    seq_tag = str(neindex) + ":" + necat
                    Util.warn("Invalid I-initial tag {} converted to B".format(bio_tag))
            prev_bio_tag, prev_cat = bio_tag, necat
        else:
            if stop_on_error:
                Util.error("Invalid BIO tag: {}".format(bio_tag))
            else:
                Util.warn("Invalid BIO tag {} converted to O".format(bio_tag))
            # An invalid tag is output as '*' and breaks any ongoing entity.
            prev_bio_tag, prev_cat = 'O', None
        result.append(seq_tag)  # exactly one output tag per input tag
    return result