diff --git a/lib/accuracy.py b/lib/accuracy.py
index 3d4203af2603b8422c7d05b29ba2d00083c1f72f..3c6186c5c8a4caf3a05d24781dfe111cc780fba3 100755
--- a/lib/accuracy.py
+++ b/lib/accuracy.py
@@ -5,6 +5,7 @@ import argparse
 from collections import defaultdict, Counter
 import pdb
 from conllulib import CoNLLUReader, Util
+import re
 
 ################################################################################
 
@@ -197,7 +198,7 @@ if __name__ == "__main__":
   for (s_gold,s_pred) in zip(gold_corpus.readConllu(),pred_corpus.readConllu()):
     if args.name_tag.startswith("parseme"):
       tp_count_parseme(s_pred, s_gold, args.name_tag, prf)
-    if args.name_tag in ["head", "deprel"]:
+    if args.name_tag in ["head", "deprel"]: # Either one triggers LAS/UAS evaluation
       args.name_tag = "head"
       parsing = True
     for (tok_gold, tok_pred) in zip (s_gold, s_pred):
@@ -213,8 +214,11 @@ if __name__ == "__main__":
         acc['correct_tokens'] += 1
         if train_vocab and oov :
           acc['correct_oov'] += 1
+      # LAS ignores subrelations, as in the CoNLL 2017/2018 evaluation scripts
+      gold_deprel = re.sub(':.*', '', tok_gold["deprel"])
+      pred_deprel = re.sub(':.*', '', tok_pred["deprel"])
       if parsing and tok_gold["head"] == tok_pred["head"] and \
-         tok_gold["deprel"] == tok_pred["deprel"]:
+         gold_deprel == pred_deprel:
         acc['correct_tokens_las'] += 1
         if train_vocab and oov :
           acc['correct_oov_las'] += 1
diff --git a/lib/conllulib.py b/lib/conllulib.py
index ed8144c06933420bb7f65b6ad266341bdaa13633..08dbc14c81d0fdfd61ea039e101c55537447f3e5 100644
--- a/lib/conllulib.py
+++ b/lib/conllulib.py
@@ -147,7 +147,7 @@ class CoNLLUReader(object):
 
   ###############################
 
-  def readConllu(self): 
+  def readConllu(self):
     for sent in conllu.parse_incr(self.infile):
       yield sent
 
@@ -159,6 +159,12 @@ class CoNLLUReader(object):
   ###############################
 
   def morph_feats(self):
+    """
+    Extract the list of morphological features from the FEATS field of the
+    CoNLL-U file. At the end, the file is rewound so that it can be read
+    through again. The result is a list of unique strings corresponding to
+    the keys appearing in the FEATS column of the corpus (before the "=" sign).
+    """
     morph_feats_list = set([])
     for sent in conllu.parse_incr(self.infile):
       for tok in sent :
@@ -170,7 +176,7 @@ class CoNLLUReader(object):
 
   ###############################
 
-  def to_int_and_vocab(self, col_name_dict):
+  def to_int_and_vocab(self, col_name_dict, extra_cols_dict={}):
     int_list = {};
     vocab = {}
     for col_name, special_tokens in col_name_dict.items():
@@ -179,10 +185,14 @@ class CoNLLUReader(object):
       for special_token in special_tokens:
         # Simple access to undefined dict key creates new ID (dict length)
         vocab[col_name][special_token]
+    for col_name in extra_cols_dict.keys() :
+      int_list[col_name] = []
     for s in self.readConllu():
       # IMPORTANT : only works if "col_name" is the same as in lambda function definition!
       for col_name in col_name_dict.keys():
-        int_list[col_name].append([vocab[col_name][tok[col_name]] for tok in s]) 
+        int_list[col_name].append([vocab[col_name][tok[col_name]] for tok in s])
+      for col_name, col_fct in extra_cols_dict.items():
+        int_list[col_name].append(list(map(col_fct, [tok[col_name] for tok in s])))
     # vocabs cannot be saved if they have lambda function: erase default_factory
     for col_name in col_name_dict.keys():
       vocab[col_name].default_factory = None
@@ -190,16 +200,20 @@ class CoNLLUReader(object):
 
   ###############################
 
-  def to_int_from_vocab(self, col_names, unk_token, vocab={}):
+  def to_int_from_vocab(self, col_names, unk_token, vocab={}, extra_cols_dict={}):
     int_list = {}
     unk_toks = {}
     for col_name in col_names:
       int_list[col_name] = []
       unk_toks[col_name] = vocab[col_name].get(unk_token,None)
+    for col_name in extra_cols_dict.keys() :
+      int_list[col_name] = []
     for s in self.readConllu():
       for col_name in col_names:
         id_getter = lambda v,t: v[col_name].get(t[col_name],unk_toks[col_name])
-        int_list[col_name].append([id_getter(vocab,tok) for tok in s]) 
+        int_list[col_name].append([id_getter(vocab,tok) for tok in s])
+      for col_name, col_fct in extra_cols_dict.items():
+        int_list[col_name].append(list(map(col_fct, [tok[col_name] for tok in s])))
     return int_list
 
   ###############################
diff --git a/sequoia/README.md b/sequoia/README.md
index b04e964e6a28850651bf36a91c277dae629dffeb..fc831962e0ac515a15a535da35dc2d789724de85 100644
--- a/sequoia/README.md
+++ b/sequoia/README.md
@@ -7,7 +7,6 @@
 We obtained the file `trunk/sequoia-ud.parseme.frsemcor` from commit number `ea7`
 The file is the result of the conversion from Sequoia's source as described on the [documentation](https://deep-sequoia.inria.fr/process/)
 We keep the original file in `src` folder to make command line completion faster
-The file `tiny.conllu` was manually extracted and simplified, it is used in parsing exercises.
 
 ### Simplification
 
@@ -51,6 +50,8 @@
 CUTLINE=`grep -n ${LASTID} ${CORPUS} | sed 's/:.*//g'`
 head -n $((CUTLINE-1)) ${CORPUS} > sequoia-ud.parseme.frsemcor.simple.small
 ```
 
+The file `tiny.conllu` was manually extracted and simplified; it is used in parsing exercises.
+
 Finally, we also split the non-simplified version of the corpus into train, dev and test (before simplification). These files should not be used in your experiments.
 ```
diff --git a/sequoia/bin/simplify_sequoia.py b/sequoia/bin/simplify_sequoia.py
index 517e7d0b5c8096d8b27e87e111b42dcc220edc83..a6c48829d6370ba2927fd636350c8bc4bcd39426 100755
--- a/sequoia/bin/simplify_sequoia.py
+++ b/sequoia/bin/simplify_sequoia.py
@@ -23,6 +23,7 @@ Simplify Sequoia corpus for pedagogical purposes:
   - Columns HEAD and DEPREL (TP5 and TP6)
 - Remove non-projective sentences => Non-projective parse trees are not
   straightforward to handle in the dependency parsing models we implement
+- [EXPERIMENTAL] Remove all deprel subrelations (after the colon) to simplify the tagset
 
 This script depends on the `cuptlib` library.
 You can install it with:
@@ -125,13 +126,21 @@ def is_projective(sent):
 
 #########################################
 
+def remove_subrelations(sent):
+  subrel_counter = sum([1 if ':' in t['deprel'] else 0 for t in sent])
+  for token in sent :
+    token['deprel'] = re.sub(':.*', '', token['deprel'])
+  return subrel_counter
+
+#########################################
+
 if len(sys.argv) != 2:
   print('Usage: {} <input_corpus.conllu>'.format(sys.argv[0]), file=sys.stderr)
   exit(-1)
 
 with open(sys.argv[1], "r", encoding="UTF=8") as f:
   np_counter = range_counter = del_ne_counter = 0
-  del_ssense_counter = mod_ssense_counter = 0
+  del_ssense_counter = mod_ssense_counter = 0 #subrel_counter = 0
   np_ids = []
   for sent in conllu.parse_incr(f):
     range_counter = range_counter + remove_range_tokens(sent)
@@ -139,10 +148,11 @@ with open(sys.argv[1], "r", encoding="UTF=8") as f:
     del_ssense_counter = del_ssense_counter + del_ssense_ci
     mod_ssense_counter = mod_ssense_counter + mod_ssense_ci
     del_ne_counter = del_ne_counter + simplify_mwe_ne(sent)
+#    subrel_counter = subrel_counter + remove_subrelations(sent)
     if is_projective(sent) : # Returns false to remove sentence
       if sent.metadata.get("global.columns", None): # Add header for new column
         sent.metadata["global.columns"] += " PARSEME:NE"
-      print(sent.serialize(),end="")
+      print(sent.serialize(), end="")
     else:
       np_counter += 1
       np_ids.append(sent.metadata["sent_id"])
@@ -154,6 +164,6 @@
 print( "{} discontinuous and overlapping NEs removed.\n".format(del_ne_counter), file=sys.stderr)
 print( "{} supersense tags removed (on MWEs or strange POS).".format(del_ssense_counter), file=sys.stderr)
 print( "{} supersense tags modified (complex operators).\n".format(mod_ssense_counter), file=sys.stderr)
-
+#print( "{} subrelations removed from deprel.".format(subrel_counter), file=sys.stderr)
 print( "{} non-projective sentences removed:".format(np_counter), file=sys.stderr)
 print(", ".join(np_ids), file=sys.stderr)
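
Taken together, the patch applies the same subrelation-stripping rule in two places: LAS evaluation in `accuracy.py` and, experimentally, corpus simplification in `simplify_sequoia.py`. A minimal standalone sketch of that rule; the helper name `strip_subrel` is hypothetical (the patch inlines the regex instead):

```python
import re

def strip_subrel(deprel):
  # Same regex as in the patch: drop everything from the first colon on,
  # so Universal Dependencies subrelations like 'acl:relcl' reduce to 'acl'.
  return re.sub(':.*', '', deprel)

assert strip_subrel('acl:relcl') == 'acl'   # subrelation stripped
assert strip_subrel('nsubj') == 'nsubj'     # plain deprel unchanged
```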
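The new `extra_cols_dict` parameter of `to_int_and_vocab` and `to_int_from_vocab` maps a column name to a plain conversion function applied token by token, for columns like HEAD that are numeric indices rather than vocabulary items. A hedged usage sketch, assuming `CoNLLUReader` wraps an open file handle (as `self.infile` suggests) and that `vocab` was built earlier on the training set; the file name and the fallback value for empty heads are illustrative only:

```python
from conllulib import CoNLLUReader

# 'vocab' is assumed to come from a previous to_int_and_vocab() call on train data.
with open("dev.conllu", encoding="UTF-8") as infile:  # hypothetical file name
  reader = CoNLLUReader(infile)
  int_list = reader.to_int_from_vocab(
      ["form"], unk_token="<unk>", vocab=vocab,
      # HEAD is an index, not a vocabulary entry: convert it directly instead
      extra_cols_dict={"head": lambda h: int(h) if h is not None else 0})

# int_list["form"][i] -> vocabulary IDs of the tokens of sentence i
# int_list["head"][i] -> gold head indices of the tokens of sentence i
```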