From 5867c7992131569190cda33e03bf027db77a10bf Mon Sep 17 00:00:00 2001 From: ceramisch <carlos.ramisch@lis-lab.fr> Date: Mon, 16 Dec 2024 00:00:38 +0100 Subject: [PATCH] Minimal update un Sequoia simplification script --- cm-code/bert-minimal.py | 4 ++-- cm-code/petits.conllu | 3 ++- lib/conllulib.py | 4 ++-- sequoia/bin/simplify_sequoia.py | 18 +++++++++--------- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cm-code/bert-minimal.py b/cm-code/bert-minimal.py index 477e1e4..114b2e5 100755 --- a/cm-code/bert-minimal.py +++ b/cm-code/bert-minimal.py @@ -6,8 +6,8 @@ from transformers import AutoModel, AutoTokenizer name = 'almanach/camembert-base' #sent = "Des poids lourds et engins en feu \ # dans une entreprise en Vendée ." -#sent = "La gare routière attend toujours ses illuminations ." -sent = "Quelle surprise ! Arturo a la covid" +sent = "La gare routière attend toujours ses illuminations ." +#sent = "Quelle surprise ! Arturo a la covid" tok = AutoTokenizer.from_pretrained(name) model = AutoModel.from_pretrained(name) diff --git a/cm-code/petits.conllu b/cm-code/petits.conllu index ba3517c..be0ba52 100644 --- a/cm-code/petits.conllu +++ b/cm-code/petits.conllu @@ -1,4 +1,5 @@ -# text = Les petits ruisseaux font les grandes rivières . +# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC +# text = Les petits ruisseaux font les grandes rivières. 1 Les le DET _ Definite=Def|Number=Plur|PronType=Art 3 det _ _ 2 petits petit ADJ _ Gender=Masc|Number=Plur 3 amod _ _ 3 ruisseaux ruisseau NOUN _ Gender=Masc|Number=Plur 4 nsubj _ _ diff --git a/lib/conllulib.py b/lib/conllulib.py index b587782..fa707f4 100644 --- a/lib/conllulib.py +++ b/lib/conllulib.py @@ -538,8 +538,8 @@ class TransBasedConfig(object): `next_act` is a string among "SHIFT", "RIGHT-ARC-X" or "LEFT-ARC-X" where "X" is the name of any valid syntactic relation label (deprel). Returns a new syntactic relation added by the action, or None for "SHIFT" - Returned relation is a triple (mod, head, deprel) with modifier, head, and - deprel label if `add_deprel=True` (default), or a pair (mod, head) if + Returned relation is a triple (dep, head, deprel) with dependent, head, and + deprel label if `add_deprel=True` (default), or a pair (dep, head) if `add_deprel=False`. """ if next_act == "SHIFT": diff --git a/sequoia/bin/simplify_sequoia.py b/sequoia/bin/simplify_sequoia.py index 3d4afda..70cd719 100755 --- a/sequoia/bin/simplify_sequoia.py +++ b/sequoia/bin/simplify_sequoia.py @@ -134,10 +134,10 @@ def is_projective(sent): if dep_id > head_id : start = head_id end = dep_id - for token_i in range(start,end-1): # sent is 0-indexed, ID is 1-indexed - if sent[token_i]["head"] < start or sent[token_i]["head"] > end : - return False - return True + for token_i in range(start, end - 1): # sent is 0-indexed, ID is 1-indexed + if sent[token_i]["head"] < start or sent[token_i]["head"] > end : + return False + return True ######################################### @@ -150,14 +150,14 @@ def remove_subrelations(sent): ######################################### if len(sys.argv) != 2: - print('Usage: {} <input_corpus.conllu>'.format(sys.argv[0]), file=sys.stderr) + print('Usage: {} <input_corpus.conllu>'.format(sys.argv[0]), file=sys.stderr) exit(-1) with open(sys.argv[1], "r", encoding="UTF=8") as f: np_counter = range_counter = del_ne_counter = 0 del_ssense_counter = mod_ssense_counter = 0 #subrel_counter = 0 - np_ids = [] - for sent in conllu.parse_incr(f): + np_ids = [] + for sent in conllu.parse_incr(f): range_counter = range_counter + remove_range_tokens(sent) del_ssense_ci, mod_ssense_ci = simplify_supersense(sent) del_ssense_counter = del_ssense_counter + del_ssense_ci @@ -171,7 +171,7 @@ with open(sys.argv[1], "r", encoding="UTF=8") as f: print(sent.serialize(), end="") else: np_counter += 1 - np_ids.append(sent.metadata["sent_id"]) + np_ids.append((sent.metadata["sent_id"],len(sent))) print( "{} range tokens removed.\n".format(range_counter), file=sys.stderr) @@ -182,4 +182,4 @@ print( "{} supersense tags modified (complex operators).\n".format(mod_ssense_co #print( "{} subrelations removed from deprel.".format(subrel_counter), file=sys.stderr) print( "{} non-projective sentences removed:".format(np_counter), file=sys.stderr) -print(", ".join(np_ids), file=sys.stderr) +print("\n".join([f"{np_id} -> {lgth}" for (np_id, lgth) in np_ids]), file=sys.stderr) -- GitLab