"Code/MonoMultiViewClassifiers/Monoview/ExecPlot.py" did not exist on "3c25f5755fb8e4c1481bf3463cb1e06cd8e3637d"
Select Git revision
ExecClassifMonoView.py
conllulib.py 4.65 KiB
#!/usr/bin/env python3
import sys
import conllu
import collections
from torch.utils.data import TensorDataset, DataLoader
import torch
import random
import pdb
########################################################################
# UTILITY FUNCTIONS
########################################################################
class Util(object):
    """Namespace of small static helpers: console messages, vocabulary
    inversion and thin wrappers around torch utilities."""

    # Util.debug() only prints while this flag is true.
    DEBUG_FLAG = False

    ###############################

    @staticmethod
    def error(msg, *kwargs):
        """Print ``ERROR: <msg>`` on stderr, then call ``sys.exit(-1)``.

        Despite its name, ``kwargs`` collects the *positional* values that
        are substituted into ``msg`` through ``str.format``.
        """
        text = msg.format(*kwargs)
        print("ERROR:", text, file=sys.stderr)
        sys.exit(-1)

    ###############################

    @staticmethod
    def warn(msg, *kwargs):
        """Print ``WARNING: <msg>`` on stderr; execution continues.

        ``kwargs`` are positional values substituted with ``str.format``.
        """
        text = msg.format(*kwargs)
        print("WARNING:", text, file=sys.stderr)

    ###############################

    @staticmethod
    def debug(msg, *kwargs):
        """Print the formatted ``msg`` on stderr when ``Util.DEBUG_FLAG`` is set.

        The message is not even formatted while the flag is off.
        """
        if not Util.DEBUG_FLAG:
            return
        print(msg.format(*kwargs), file=sys.stderr)

    ###############################

    @staticmethod
    def rev_vocab(vocab):
        """Invert a ``{token: id}`` mapping into a list indexed by id.

        Position ``i`` of the result holds the token whose id is ``i``; a
        ``KeyError`` is raised if some id in ``0..n-1`` is absent.
        """
        token_of = {}
        for token, index in vocab.items():
            token_of[index] = token
        return list(map(token_of.__getitem__, range(len(token_of))))

    ###############################

    @staticmethod
    def dataloader(inputs, outputs, batch_size=16, shuffle=True):
        """Bundle the ``inputs`` and ``outputs`` tensors into a ``DataLoader``.

        All tensors (inputs first, then outputs) are wrapped in a single
        ``TensorDataset`` which is served in batches of ``batch_size``.
        """
        tensors = (*inputs, *outputs)
        return DataLoader(TensorDataset(*tensors), batch_size, shuffle=shuffle)

    ###############################

    @staticmethod
    def count_params(model):
        """Return the total number of elements over the parameters of
        ``model`` that have ``requires_grad`` set."""
        total = 0
        for param in model.parameters():
            if param.requires_grad:
                total += param.numel()
        return total

    ###############################

    @staticmethod
    def init_seed(seed):
        """Seed Python's ``random`` and torch's generator with ``seed``.

        Nothing is seeded unless ``seed >= 0``.
        """
        if not seed >= 0:
            return
        random.seed(seed)
        torch.manual_seed(seed)
########################################################################
# CONLLU FUNCTIONS
########################################################################
class CoNLLUReader(object):
    """Read sentences from an open CoNLL-U file and map columns to integer IDs.

    Parsing is delegated to the ``conllu`` package imported at module level.
    The reader never opens or closes the file itself; it only reads from (and
    occasionally rewinds) the file object it was given.
    """

    ###############################

    def __init__(self, infile):
        """Wrap ``infile`` and work out the column header.

        Args:
            infile: open text file object positioned at its beginning.

        The first line is inspected for a ``global.columns`` metadata entry.
        If present, ``self.header`` is that entry lower-cased and split on
        spaces; otherwise ``self.header`` is a built-in default column list.
        NOTE(review): the default list is upper-case whereas a detected header
        is lower-cased -- confirm which form callers of ``header`` expect.
        """
        self.infile = infile
        DEFAULT_HEADER = ("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC "
                          "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE")
        try:
            first = self.infile.readline().strip()  # First line in the file
            globalcolumns = conllu.parse(first)[0].metadata['global.columns']
            self.header = globalcolumns.lower().split(" ")
        except (KeyError, IndexError):
            # KeyError: the first line has no "global.columns" metadata.
            # IndexError: nothing was parsed from the first line (presumably
            # an empty file or a blank first line).
            self.header = DEFAULT_HEADER.split(" ")
        # Rewind on BOTH paths: the line read above must be seen again by the
        # parser, otherwise it is silently lost when the default header is
        # used. Streams that declare themselves non-seekable are left as is.
        can_seek = getattr(self.infile, "seekable", None)
        if can_seek is None or can_seek():
            self.infile.seek(0)

    ###############################

    def readConllu(self):
        """Yield the sentences parsed incrementally from the wrapped file.

        The file is consumed from its current position and is not rewound.
        """
        yield from conllu.parse_incr(self.infile)

    ###############################

    def name(self):
        """Return the ``name`` attribute of the wrapped file object."""
        return self.infile.name

    ###############################

    def morph_feats(self):
        """Return the morphological feature names found in the file.

        Every token with a non-empty ``feats`` entry contributes its keys.
        The file is rewound afterwards. The result is sorted so that its
        order is the same from one run to the next (assumes the feature keys
        are strings, as produced by the ``conllu`` parser).
        """
        morph_feats_set = set()
        for sent in conllu.parse_incr(self.infile):
            for tok in sent:
                if tok["feats"]:
                    morph_feats_set.update(tok["feats"].keys())
        self.infile.seek(0)  # Rewind open file
        return sorted(morph_feats_set)

    ###############################

    def to_int_and_vocab(self, col_name_dict):
        """Encode the requested columns as integers, building the vocabularies.

        Args:
            col_name_dict: mapping ``{column name: [special tokens]}``. The
                special tokens of a column receive the first IDs, in order.

        Returns:
            ``(int_list, vocab)`` where ``int_list[col]`` holds one list of
            IDs per sentence and ``vocab[col]`` maps each value to its ID.
            IDs are assigned in order of first appearance. The vocabularies
            are ``defaultdict`` objects whose ``default_factory`` has been
            reset to ``None``, so unknown keys raise ``KeyError`` afterwards.
        """
        int_list = {}
        vocab = {}
        for col_name, special_tokens in col_name_dict.items():
            int_list[col_name] = []
            col_vocab = collections.defaultdict()
            # A missing key gets the current size as its ID. Using the dict's
            # own bound __len__ avoids a closure over the loop variable.
            col_vocab.default_factory = col_vocab.__len__
            for special_token in special_tokens:
                # Simple access to an undefined key creates a new ID
                col_vocab[special_token]
            vocab[col_name] = col_vocab
        for s in self.readConllu():
            for col_name, col_vocab in vocab.items():
                int_list[col_name].append([col_vocab[tok[col_name]] for tok in s])
        # Erase default_factory so the vocabularies hold no callable when
        # saved and no longer grow on lookup.
        for col_vocab in vocab.values():
            col_vocab.default_factory = None
        return int_list, vocab

    ###############################

    def to_int_from_vocab(self, col_name_dict, unk_token, vocab=None):
        """Encode the requested columns of the file with existing vocabularies.

        Args:
            col_name_dict: mapping whose keys are the column names to encode
                (the values are not used here).
            unk_token: vocabulary entry whose ID replaces unknown values.
            vocab: mapping ``{column name: {value: ID}}``. It must contain
                every requested column, otherwise ``KeyError`` is raised.

        Returns:
            ``{column name: [[ID, ...] per sentence]}``. If ``unk_token`` is
            itself absent from a vocabulary, unknown values map to ``None``.
        """
        vocab = {} if vocab is None else vocab
        int_list = {}
        unk_toks = {}
        for col_name in col_name_dict:
            int_list[col_name] = []
            unk_toks[col_name] = vocab[col_name].get(unk_token)
        for s in self.readConllu():
            for col_name in col_name_dict:
                col_vocab = vocab[col_name]
                unk_id = unk_toks[col_name]
                int_list[col_name].append(
                    [col_vocab.get(tok[col_name], unk_id) for tok in s])
        return int_list

    ###############################

    @staticmethod
    def to_int_from_vocab_sent(sent, col_name_dict, unk_token, vocab=None):
        """Encode the requested columns of one sentence with existing vocabularies.

        Args:
            sent: iterable of tokens, each indexable by column name.
            col_name_dict: mapping whose keys are the column names to encode.
            unk_token: vocabulary entry whose ID replaces unknown values.
            vocab: mapping ``{column name: {value: ID}}``. It must contain
                every requested column, otherwise ``KeyError`` is raised.

        Returns:
            ``{column name: [ID, ...]}``. If ``unk_token`` is itself absent
            from a vocabulary, unknown values map to ``None``.
        """
        vocab = {} if vocab is None else vocab
        int_list = {}
        for col_name in col_name_dict:
            col_vocab = vocab[col_name]
            unk_tok_id = col_vocab.get(unk_token)
            int_list[col_name] = [col_vocab.get(tok[col_name], unk_tok_id)
                                  for tok in sent]
        return int_list