Commit 58a4a244 authored by ceramisch

Add first version of library scripts

parent 23f6b043
#!/usr/bin/env python3
import sys
import argparse
import collections
import pdb
from conllulib import CoNLLUReader, Util
################################################################################
parser = argparse.ArgumentParser(description="Calculates the accuracy of a \
prediction with respect to the gold file. By default, uses UPOS, but this can \
be configured with option -c.",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-D', "--debug", action="store_true", dest="DEBUG_FLAG",
    help="""Print debug information (grep it or pipe into `less -SR`)""")
parser.add_argument('-p', "--pred", metavar="FILENAME.conllu", required=True,
    dest="pred_filename", type=argparse.FileType('r', encoding='UTF-8'),
    help="""Test corpus in CoNLL-U with *predicted* tags. (Required)""")
parser.add_argument('-g', "--gold", metavar="FILENAME.conllu", required=True,
    dest="gold_filename", type=argparse.FileType('r', encoding='UTF-8'),
    help="""Test corpus in CoNLL-U with *gold* tags. (Required)""")
parser.add_argument('-t', "--train", metavar="FILENAME.conllu", required=False,
    dest="train_filename", type=argparse.FileType('r', encoding='UTF-8'),
    help="""Training corpus in CoNLL-U, from which the tagger was learnt.""")
parser.add_argument('-c', "--tagcolumn", metavar="NAME", dest="col_name_tag",
    required=False, type=str, default="upos", help="""Column name of tags, \
as defined in the header. Use lowercase.""")
parser.add_argument('-f', "--featcolumn", metavar="NAME", dest="col_name_feat",
    required=False, type=str, default="form", help="""Column name of the input \
feature, as defined in the header. Use lowercase.""")
parser.add_argument('-u', "--upos-filter", metavar="NAME", dest="upos_filter",
    required=False, type=str, nargs='+', default=[],
    help="""Only calculate accuracy for words whose UPOS is in this list. \
Empty list = no filter.""")
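# Illustrative invocation (script and file names below are hypothetical, shown
# only to demonstrate how the options above combine):
#   ./accuracy.py -p pred.conllu -g gold.conllu -t train.conllu -c upos -u NOUN VERB
# When -t is given, the script additionally reports accuracy restricted to
# out-of-vocabulary tokens, i.e. forms absent from the training corpus.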
########################################################################
def process_args(parser):
  args = parser.parse_args()
  Util.DEBUG_FLAG = args.DEBUG_FLAG
  args.col_name_tag = args.col_name_tag.lower()
  args.col_name_feat = args.col_name_feat.lower()
  Util.debug("Command-line arguments and defaults:")
  for (k, v) in vars(args).items():
    Util.debug(" * {}: {}", k, v)
  gold_corpus = CoNLLUReader(args.gold_filename)
  pred_corpus = CoNLLUReader(args.pred_filename)
  train_vocab = None
  if args.train_filename:
    train_corpus = CoNLLUReader(args.train_filename)
    ignoreme, train_vocab = train_corpus.to_int_and_vocab({args.col_name_feat: []})
  if args.col_name_tag not in gold_corpus.header or \
     args.col_name_feat not in gold_corpus.header:
    Util.error("-c and -f names must be valid conllu column among:\n{}",
               gold_corpus.header)
  return args, gold_corpus, pred_corpus, train_vocab
########################################################################
if __name__ == "__main__":
  args, gold_corpus, pred_corpus, train_vocab = process_args(parser)
  total_tokens = correct_tokens = 0
  total_oov = correct_oov = 0
  for (sent_gold, sent_pred) in zip(gold_corpus.readConllu(),
                                    pred_corpus.readConllu()):
    for (tok_gold, tok_pred) in zip(sent_gold, sent_pred):
      if not args.upos_filter or tok_gold['upos'] in args.upos_filter:
        if train_vocab:
          train_vocab_feat = train_vocab[args.col_name_feat].keys()
          if tok_gold[args.col_name_feat] not in train_vocab_feat:
            total_oov = total_oov + 1
            oov = True
          else:
            oov = False
        if tok_gold[args.col_name_tag] == tok_pred[args.col_name_tag]:
          correct_tokens = correct_tokens + 1
          if train_vocab and oov:
            correct_oov = correct_oov + 1
        total_tokens += 1
  print("Pred file: {}".format(pred_corpus.name()))
  if args.upos_filter:
    print("Results only concern the following UPOS: {}".format(" ".join(args.upos_filter)))
  accuracy = (correct_tokens / total_tokens) * 100
  print("Accuracy on all {}: {:0.2f} ({}/{})".format(args.col_name_tag, accuracy,
                                                     correct_tokens, total_tokens))
  if train_vocab:
    accuracy_oov = (correct_oov / total_oov) * 100
    print("Accuracy on OOV {}: {:0.2f} ({}/{})".format(args.col_name_tag,
                                                       accuracy_oov,
                                                       correct_oov, total_oov))
#!/usr/bin/env python3
import sys
import conllu
import collections
import pdb
########################################################################
# UTILITY FUNCTIONS
########################################################################
class Util(object):
  DEBUG_FLAG = False
  ########################################################################
  @staticmethod
  def error(msg, *kwargs):
    print("ERROR:", msg.format(*kwargs), file=sys.stderr)
    sys.exit(-1)
  ########################################################################
  @staticmethod
  def debug(msg, *kwargs):
    if Util.DEBUG_FLAG:
      print(msg.format(*kwargs), file=sys.stderr)
  ########################################################################
  @staticmethod
  def rev_vocab(vocab):
    rev_dict = {y: x for x, y in vocab.items()}
    return [rev_dict[k] for k in range(len(rev_dict))]
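  # Illustrative sketch (not part of the library): given a vocabulary such as
  # {"<unk>": 0, "the": 1, "cat": 2} built by CoNLLUReader.to_int_and_vocab,
  # Util.rev_vocab returns the ID-indexed list ["<unk>", "the", "cat"], which
  # is handy for mapping predicted integer IDs back to their string labels.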
########################################################################
# CONLLU FUNCTIONS
########################################################################
class CoNLLUReader(object):
  ###########################################
  def __init__(self, infile):
    self.infile = infile
    DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " +\
                     "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE"
    try:
      first = self.infile.readline().strip() # First line in the file
      globalcolumns = conllu.parse(first)[0].metadata['global.columns']
      self.header = globalcolumns.lower().split(" ")
      self.infile.seek(0) # Rewind open file
    except KeyError:
      self.header = DEFAULT_HEADER.split(" ")
  ###########################################
  def readConllu(self):
    for sent in conllu.parse_incr(self.infile):
      yield sent
  ###########################################
  def name(self):
    return self.infile.name
  ###########################################
  def to_int_and_vocab(self, col_name_dict):
    int_list = {}
    vocab = {}
    for col_name, special_tokens in col_name_dict.items():
      int_list[col_name] = []
      vocab[col_name] = collections.defaultdict(lambda: len(vocab[col_name]))
      for special_token in special_tokens:
        # Simple access to undefined dict key creates new ID (dict length)
        vocab[col_name][special_token]
    for s in self.readConllu():
      for col_name in col_name_dict.keys():
        int_list[col_name].append([vocab[col_name][tok[col_name]] for tok in s])
    # vocabs cannot be saved if they have lambda function: erase default_factory
    for col_name in col_name_dict.keys():
      vocab[col_name].default_factory = None
    return int_list, vocab
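  # Illustrative sketch (not part of the library): encode the FORM column of a
  # corpus and build its vocabulary, reserving an ID for a hypothetical "<unk>"
  # special token (file name is also hypothetical):
  #   reader = CoNLLUReader(open("train.conllu", encoding="UTF-8"))
  #   ints, vocab = reader.to_int_and_vocab({"form": ["<unk>"]})
  #   # ints["form"] is a list of sentences, each a list of integer IDs;
  #   # vocab["form"] maps every form (and "<unk>") to its ID.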
  ###########################################
  def to_int_from_vocab(self, col_name_dict, unk_token, vocab={}):
    int_list = {}
    unk_toks = {}
    for col_name, special_tokens in col_name_dict.items():
      int_list[col_name] = []
      unk_toks[col_name] = vocab[col_name].get(unk_token, None)
    for s in self.readConllu():
      for col_name in col_name_dict.keys():
        id_getter = lambda v, t: v[col_name].get(t[col_name], unk_toks[col_name])
        int_list[col_name].append([id_getter(vocab, tok) for tok in s])
    return int_list
  ###########################################
  @staticmethod
  def to_int_from_vocab_sent(sent, col_name_dict, unk_token, vocab={}):
    int_list = {}
    for col_name in col_name_dict.keys():
      unk_tok_id = vocab[col_name].get(unk_token, None)
      id_getter = lambda v, t: v[col_name].get(t[col_name], unk_tok_id)
      int_list[col_name] = [id_getter(vocab, tok) for tok in sent]
    return int_list
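  # Illustrative sketch (not part of the library): at prediction time, a single
  # parsed sentence can be encoded with a previously built vocabulary, falling
  # back on the ID of the hypothetical "<unk>" token for unseen forms:
  #   ids = CoNLLUReader.to_int_from_vocab_sent(sent, {"form": []}, "<unk>",
  #                                             vocab=vocab)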