diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/lib/accuracy.py b/lib/accuracy.py
new file mode 100755
index 0000000000000000000000000000000000000000..063050737f8d5ea52a46f20fb04a55c7a927db5d
--- /dev/null
+++ b/lib/accuracy.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+"""Compute the tagging accuracy of a predicted CoNLL-U file against gold."""
+
+import sys
+import argparse
+import collections
+import pdb
+from conllulib import CoNLLUReader, Util
+
+################################################################################
+
+parser = argparse.ArgumentParser(
+  description="Calculates the accuracy of a prediction with respect to the "
+              "gold file. By default, uses UPOS, but this can be configured "
+              "with option -c.",
+  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+parser.add_argument('-D', "--debug", action="store_true", dest="DEBUG_FLAG",
+  help="Print debug information (grep it or pipe into `less -SR`)")
+parser.add_argument('-p', "--pred", metavar="FILENAME.conllu", required=True,
+  dest="pred_filename", type=argparse.FileType('r', encoding='UTF-8'),
+  help="Test corpus in CoNLLU with *predicted* tags. (Required)")
+parser.add_argument('-g', "--gold", metavar="FILENAME.conllu", required=True,
+  dest="gold_filename", type=argparse.FileType('r', encoding='UTF-8'),
+  help="Test corpus in CoNLLU with *gold* tags. (Required)")
+parser.add_argument('-t', "--train", metavar="FILENAME.conllu", required=False,
+  dest="train_filename", type=argparse.FileType('r', encoding='UTF-8'),
+  help="Training corpus in CoNLL-U, from which tagger was learnt.")
+parser.add_argument('-c', "--tagcolumn", metavar="NAME", dest="col_name_tag",
+  required=False, type=str, default="upos",
+  help="Column name of tags, as defined in header. Use lowercase")
+parser.add_argument('-f', "--featcolumn", metavar="NAME", dest="col_name_feat",
+  required=False, type=str, default="form",
+  help="Column name of input feature, as defined in header. Use lowercase.")
+parser.add_argument('-u', "--upos-filter", metavar="NAME", dest="upos_filter",
+  required=False, type=str, nargs='+', default=[],
+  help="Only calculate accuracy for words with UPOS in this list. "
+       "Empty list = no filter.")
+
+########################################################################
+
+def process_args(parser):
+  """Parse the command line and open the corpora it names.
+
+  Args:
+    parser: the configured `argparse.ArgumentParser`.
+
+  Returns:
+    A tuple `(args, gold_corpus, pred_corpus, train_vocab)`. `train_vocab`
+    is None unless a training corpus was given with -t.
+  """
+  args = parser.parse_args()
+  Util.DEBUG_FLAG = args.DEBUG_FLAG
+  args.col_name_tag = args.col_name_tag.lower()
+  args.col_name_feat = args.col_name_feat.lower()
+  Util.debug("Command-line arguments and defaults:")
+  for (k, v) in vars(args).items():
+    Util.debug(" * {}: {}", k, v)
+  gold_corpus = CoNLLUReader(args.gold_filename)
+  pred_corpus = CoNLLUReader(args.pred_filename)
+  # Check the column names before reading the training corpus, so that a
+  # wrong -f is reported here instead of failing while building the vocab.
+  if args.col_name_tag not in gold_corpus.header or \
+     args.col_name_feat not in gold_corpus.header:
+    Util.error("-c and -f names must be valid conllu column among:\n{}",
+               gold_corpus.header)
+  train_vocab = None
+  if args.train_filename:
+    train_corpus = CoNLLUReader(args.train_filename)
+    _, train_vocab = train_corpus.to_int_and_vocab({args.col_name_feat: []})
+  return args, gold_corpus, pred_corpus, train_vocab
+
+########################################################################
+
+def percentage(correct, total):
+  """Return `correct / total` as a percentage, or 0.0 when `total` is 0."""
+  return (correct / total) * 100 if total else 0.0
+
+########################################################################
+
+if __name__ == "__main__":
+  args, gold_corpus, pred_corpus, train_vocab = process_args(parser)
+  total_tokens = correct_tokens = 0
+  total_oov = correct_oov = 0
+  # The training vocabulary does not change: look it up once, not per token
+  train_vocab_feat = train_vocab[args.col_name_feat] if train_vocab else None
+  for (sent_gold, sent_pred) in zip(gold_corpus.readConllu(),
+                                    pred_corpus.readConllu()):
+    for (tok_gold, tok_pred) in zip(sent_gold, sent_pred):
+      if args.upos_filter and tok_gold['upos'] not in args.upos_filter:
+        continue
+      correct = tok_gold[args.col_name_tag] == tok_pred[args.col_name_tag]
+      oov = train_vocab_feat is not None and \
+            tok_gold[args.col_name_feat] not in train_vocab_feat
+      total_tokens += 1
+      if correct:
+        correct_tokens += 1
+      if oov:
+        total_oov += 1
+        if correct:
+          correct_oov += 1
+
+  print("Pred file: {}".format(pred_corpus.name()))
+  if args.upos_filter:
+    print("Results focus only on following UPOS: {}".format(
+          " ".join(args.upos_filter)))
+  # `percentage` keeps empty selections (0/0) from raising ZeroDivisionError
+  accuracy = percentage(correct_tokens, total_tokens)
+  print("Accuracy on all {}: {:0.2f} ({}/{})".format(args.col_name_tag,
+        accuracy, correct_tokens, total_tokens))
+  if train_vocab:
+    accuracy_oov = percentage(correct_oov, total_oov)
+    print("Accuracy on OOV {}: {:0.2f} ({}/{})".format(args.col_name_tag,
+          accuracy_oov, correct_oov, total_oov))
diff --git a/lib/conllulib.py b/lib/conllulib.py
new file mode 100644
index 0000000000000000000000000000000000000000..a796733da46a16ed7f7c69b19c973722b14910ba
--- /dev/null
+++ b/lib/conllulib.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""Helpers to read CoNLL-U files and map column values to integer IDs."""
+
+import sys
+import conllu
+import collections
+import pdb
+
+########################################################################
+# UTILITY FUNCTIONS
+########################################################################
+
+class Util(object):
+  """Namespace for message helpers and vocabulary utilities."""
+
+  DEBUG_FLAG = False  # When True, `debug` messages are printed
+
+########################################################################
+
+  @staticmethod
+  def error(msg, *args):
+    """Print `msg` formatted with `args` on stderr and exit with status -1."""
+    print("ERROR:", msg.format(*args), file=sys.stderr)
+    sys.exit(-1)
+
+########################################################################
+
+  @staticmethod
+  def debug(msg, *args):
+    """Print `msg` formatted with `args` on stderr if `DEBUG_FLAG` is set."""
+    if Util.DEBUG_FLAG:
+      print(msg.format(*args), file=sys.stderr)
+
+########################################################################
+
+  @staticmethod
+  def rev_vocab(vocab):
+    """Turn an `{item: ID}` vocabulary into a list indexed by ID.
+
+    IDs are expected to be 0, 1, ..., n-1; a missing ID raises KeyError.
+    """
+    rev_dict = {y: x for x, y in vocab.items()}
+    return [rev_dict[k] for k in range(len(rev_dict))]
+
+########################################################################
+# CONLLU FUNCTIONS
+########################################################################
+
+class CoNLLUReader(object):
+  """Reader for an already opened CoNLL-U file.
+
+  Attributes:
+    infile: the open file object given to the constructor.
+    header: list of lowercase column names.
+  """
+
+  ###########################################
+
+  def __init__(self, infile):
+    """Store `infile` and work out its column names.
+
+    The names come from the `global.columns` comment on the first line if
+    there is one, and from a built-in default otherwise.
+    """
+    self.infile = infile
+    DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " +\
+                     "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE"
+    try:
+      first = self.infile.readline().strip() # First line in the file
+      globalcolumns = conllu.parse(first)[0].metadata['global.columns']
+      self.header = globalcolumns.lower().split(" ")
+    except (KeyError, IndexError):
+      # KeyError: no `global.columns` on the first line. IndexError: nothing
+      # was parsed from it (presumably an empty first line or file).
+      # Lowercase the default so both branches yield the same kind of names.
+      self.header = DEFAULT_HEADER.lower().split(" ")
+    # Rewind in both cases: the line just read must not be lost to readers
+    self.infile.seek(0)
+
+  ###########################################
+
+  def readConllu(self):
+    """Yield the sentences parsed from `infile`, one at a time."""
+    yield from conllu.parse_incr(self.infile)
+
+  ###########################################
+
+  def name(self):
+    """Return the name of the underlying file."""
+    return self.infile.name
+
+  ###########################################
+
+  def to_int_and_vocab(self, col_name_dict):
+    """Encode columns as integer IDs, building one vocabulary per column.
+
+    Args:
+      col_name_dict: maps each column name to a list of special tokens,
+        which get the first IDs of that column's vocabulary.
+
+    Returns:
+      A pair `(int_list, vocab)`: `int_list[col]` holds one list of IDs per
+      sentence, `vocab[col]` maps every value seen in `col` to its ID.
+    """
+    int_list = {}
+    vocab = {}
+    for col_name, special_tokens in col_name_dict.items():
+      int_list[col_name] = []
+      # No default factory: IDs are assigned explicitly with `setdefault`,
+      # which leaves the returned vocabularies free of lambdas.
+      vocab[col_name] = collections.defaultdict()
+      for special_token in special_tokens:
+        vocab[col_name].setdefault(special_token, len(vocab[col_name]))
+    for s in self.readConllu():
+      for col_name, col_vocab in vocab.items():
+        # A value not seen before gets the next free ID (current length)
+        int_list[col_name].append(
+          [col_vocab.setdefault(tok[col_name], len(col_vocab)) for tok in s])
+    return int_list, vocab
+
+  ###########################################
+
+  def to_int_from_vocab(self, col_name_dict, unk_token, vocab=None):
+    """Encode columns as integer IDs using existing vocabularies.
+
+    Args:
+      col_name_dict: its keys are the column names to encode.
+      unk_token: vocabulary entry whose ID stands for unknown values.
+      vocab: maps each column name to its `{value: ID}` vocabulary.
+
+    Returns:
+      A dict mapping each column name to one list of IDs per sentence. An
+      unknown value is encoded as None if `unk_token` is not in the vocab.
+    """
+    vocab = {} if vocab is None else vocab
+    int_list = {col_name: [] for col_name in col_name_dict}
+    unk_toks = {col_name: vocab[col_name].get(unk_token, None)
+                for col_name in col_name_dict}
+    for s in self.readConllu():
+      for col_name in col_name_dict:
+        col_vocab, unk_id = vocab[col_name], unk_toks[col_name]
+        int_list[col_name].append(
+          [col_vocab.get(tok[col_name], unk_id) for tok in s])
+    return int_list
+
+  ###########################################
+
+  @staticmethod
+  def to_int_from_vocab_sent(sent, col_name_dict, unk_token, vocab=None):
+    """Encode the columns of a single sentence using existing vocabularies.
+
+    Same conventions as `to_int_from_vocab`, but for one sentence `sent`:
+    returns a dict mapping each column name to a list of IDs.
+    """
+    vocab = {} if vocab is None else vocab
+    int_list = {}
+    for col_name in col_name_dict:
+      col_vocab = vocab[col_name]
+      unk_id = col_vocab.get(unk_token, None)
+      int_list[col_name] = [col_vocab.get(tok[col_name], unk_id)
+                            for tok in sent]
+    return int_list