Skip to content
Snippets Groups Projects
Commit 13cc0ebe authored by Carlos Ramisch's avatar Carlos Ramisch
Browse files

Merge branch 'master' of gitlab.lis-lab.fr:carlos.ramisch/pstal-etu

parents 5e9e9d8c 53054e6c
No related branches found
No related tags found
No related merge requests found
......@@ -5,6 +5,7 @@ import argparse
from collections import defaultdict, Counter
import pdb
from conllulib import CoNLLUReader, Util
import re
################################################################################
......@@ -197,7 +198,7 @@ if __name__ == "__main__":
for (s_gold,s_pred) in zip(gold_corpus.readConllu(),pred_corpus.readConllu()):
if args.name_tag.startswith("parseme"):
tp_count_parseme(s_pred, s_gold, args.name_tag, prf)
if args.name_tag in ["head", "deprel"]:
if args.name_tag in ["head", "deprel"]: # Any of both is considered LAS/UAS eval
args.name_tag = "head"
parsing = True
for (tok_gold, tok_pred) in zip (s_gold, s_pred):
......@@ -213,8 +214,11 @@ if __name__ == "__main__":
acc['correct_tokens'] += 1
if train_vocab and oov :
acc['correct_oov'] += 1
# LAS ignores subrelations, as usual in CoNLL17/18 eval scripts
gold_deprel = re.sub(':.*', '', tok_gold["deprel"])
pred_deprel = re.sub(':.*', '', tok_pred["deprel"])
if parsing and tok_gold["head"] == tok_pred["head"] and \
tok_gold["deprel"] == tok_pred["deprel"]:
gold_deprel == pred_deprel:
acc['correct_tokens_las'] += 1
if train_vocab and oov :
acc['correct_oov_las'] += 1
......
......@@ -159,6 +159,12 @@ class CoNLLUReader(object):
###############################
def morph_feats(self):
"""
Extract the list of morphological features from the "FEATS" field of the
CoNLL-U file. At the end, rewinds the file so that it can be read through
again. The result is a list of unique strings corresponding to the keys
appearing in the FEATS column of the corpus (before the = sign)
"""
morph_feats_list = set([])
for sent in conllu.parse_incr(self.infile):
for tok in sent :
......@@ -170,7 +176,7 @@ class CoNLLUReader(object):
###############################
def to_int_and_vocab(self, col_name_dict):
def to_int_and_vocab(self, col_name_dict, extra_cols_dict={}):
int_list = {};
vocab = {}
for col_name, special_tokens in col_name_dict.items():
......@@ -179,10 +185,14 @@ class CoNLLUReader(object):
for special_token in special_tokens:
# Simple access to undefined dict key creates new ID (dict length)
vocab[col_name][special_token]
for col_name in extra_cols_dict.keys() :
int_list[col_name] = []
for s in self.readConllu():
# IMPORTANT : only works if "col_name" is the same as in lambda function definition!
for col_name in col_name_dict.keys():
int_list[col_name].append([vocab[col_name][tok[col_name]] for tok in s])
for col_name, col_fct in extra_cols_dict.items():
int_list[col_name].append(list(map(col_fct, [tok[col_name] for tok in s])))
# vocabs cannot be saved if they have lambda function: erase default_factory
for col_name in col_name_dict.keys():
vocab[col_name].default_factory = None
......@@ -190,16 +200,20 @@ class CoNLLUReader(object):
###############################
def to_int_from_vocab(self, col_names, unk_token, vocab={}):
def to_int_from_vocab(self, col_names, unk_token, vocab=None, extra_cols_dict=None):
    """
    Encode the corpus columns as integer IDs using pre-built vocabularies.

    Parameters:
    - `col_names`: iterable of CoNLL-U column names to encode via `vocab`.
    - `unk_token`: key looked up in each column's vocab to obtain the ID used
      for out-of-vocabulary tokens (`None` if the vocab has no such entry).
    - `vocab`: dict mapping column name -> {token: int ID}.
    - `extra_cols_dict`: dict mapping column name -> conversion function,
      applied directly to each token's value (no vocabulary lookup), e.g.
      `{"head": int}` for parsing.

    Returns a dict mapping each requested column name to a list of
    per-sentence lists of IDs. Reads the corpus via `self.readConllu()`.
    """
    # Avoid shared mutable default arguments (classic Python pitfall):
    # callers that pass nothing get a fresh empty dict each call.
    vocab = {} if vocab is None else vocab
    extra_cols_dict = {} if extra_cols_dict is None else extra_cols_dict
    int_list = {}
    unk_toks = {}
    for col_name in col_names:
        int_list[col_name] = []
        # ID assigned to out-of-vocabulary tokens for this column
        unk_toks[col_name] = vocab[col_name].get(unk_token, None)
    for col_name in extra_cols_dict.keys():
        int_list[col_name] = []
    for s in self.readConllu():
        for col_name in col_names:
            # Bind lookups to locals instead of a per-iteration lambda that
            # relied on late binding of `col_name`.
            col_vocab, unk_id = vocab[col_name], unk_toks[col_name]
            int_list[col_name].append([col_vocab.get(tok[col_name], unk_id)
                                       for tok in s])
        for col_name, col_fct in extra_cols_dict.items():
            int_list[col_name].append([col_fct(tok[col_name]) for tok in s])
    return int_list
###############################
......
......@@ -7,7 +7,6 @@ We obtained the file `trunk/sequoia-ud.parseme.frsemcor` from commit number `ea7
The file is the result of the conversion from Sequoia's source as described on the [documentation](https://deep-sequoia.inria.fr/process/)
We keep the original file in `src` folder to make command line completion faster
The file `tiny.conllu` was manually extracted and simplified, it is used in parsing exercises.
### Simplification
......@@ -51,6 +50,8 @@ CUTLINE=`grep -n ${LASTID} ${CORPUS} | sed 's/:.*//g'`
head -n $((CUTLINE-1)) ${CORPUS} > sequoia-ud.parseme.frsemcor.simple.small
```
The file `tiny.conllu` was manually extracted and simplified, it is used in parsing exercises.
Finally, we also split the non-simplified version of the corpus into train, dev and test (before simplification).
These files should not be used in your experiments.
```
......
......@@ -23,6 +23,7 @@ Simplify Sequoia corpus for pedagogical purposes:
- Columns HEAD and DEPREL (TP5 and TP6)
- Remove non-projective sentences
=> Non-projective parse trees are not straightforward to handle in the dependency parsing models we implement
- [EXPERIMENTAL] Remove all deprel subrelations (after colon) to simplify the tagset
This script depends on the `cuptlib` library. You can install it with:
......@@ -125,13 +126,21 @@ def is_projective(sent):
#########################################
def remove_subrelations(sent):
    """
    Strip deprel subrelations in place: everything from the first ':' to the
    end of each token's "deprel" value is removed.
    Returns the number of tokens whose deprel contained a subrelation.
    """
    modified = 0
    for tok in sent:
        plain = re.sub(':.*', '', tok['deprel'])
        # re.sub changes the value iff a ':' was present, so comparing
        # before/after counts exactly the tokens that had a subrelation.
        if plain != tok['deprel']:
            modified += 1
        tok['deprel'] = plain
    return modified
#########################################
if len(sys.argv) != 2:
print('Usage: {} <input_corpus.conllu>'.format(sys.argv[0]), file=sys.stderr)
exit(-1)
with open(sys.argv[1], "r", encoding="UTF=8") as f:
np_counter = range_counter = del_ne_counter = 0
del_ssense_counter = mod_ssense_counter = 0
del_ssense_counter = mod_ssense_counter = 0 #subrel_counter = 0
np_ids = []
for sent in conllu.parse_incr(f):
range_counter = range_counter + remove_range_tokens(sent)
......@@ -139,6 +148,7 @@ with open(sys.argv[1], "r", encoding="UTF=8") as f:
del_ssense_counter = del_ssense_counter + del_ssense_ci
mod_ssense_counter = mod_ssense_counter + mod_ssense_ci
del_ne_counter = del_ne_counter + simplify_mwe_ne(sent)
# subrel_counter = subrel_counter + remove_subrelations(sent)
if is_projective(sent) : # Returns false to remove sentence
if sent.metadata.get("global.columns", None): # Add header for new column
sent.metadata["global.columns"] += " PARSEME:NE"
......@@ -154,6 +164,6 @@ print( "{} discontinuous and overlapping NEs removed.\n".format(del_ne_counter),
print( "{} supersense tags removed (on MWEs or strange POS).".format(del_ssense_counter), file=sys.stderr)
print( "{} supersense tags modified (complex operators).\n".format(mod_ssense_counter), file=sys.stderr)
#print( "{} subrelations removed from deprel.".format(subrel_counter), file=sys.stderr)
print( "{} non-projective sentences removed:".format(np_counter), file=sys.stderr)
print(", ".join(np_ids), file=sys.stderr)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment