diff --git a/cm-code/bert-minimal.py b/cm-code/bert-minimal.py index 477e1e44f3d67ceb5eeb6c6394f0e2f81cf4bd07..114b2e51a860c5a6b782feb96a9ec1f54f3bda54 100755 --- a/cm-code/bert-minimal.py +++ b/cm-code/bert-minimal.py @@ -6,8 +6,8 @@ from transformers import AutoModel, AutoTokenizer name = 'almanach/camembert-base' #sent = "Des poids lourds et engins en feu \ # dans une entreprise en Vendée ." -#sent = "La gare routière attend toujours ses illuminations ." -sent = "Quelle surprise ! Arturo a la covid" +sent = "La gare routière attend toujours ses illuminations ." +#sent = "Quelle surprise ! Arturo a la covid" tok = AutoTokenizer.from_pretrained(name) model = AutoModel.from_pretrained(name) diff --git a/cm-code/petits.conllu b/cm-code/petits.conllu index ba3517c7e19d1972acfdad8c757b5ce3c6c1e619..be0ba525648ac10f02a05202e04ee71c916737ee 100644 --- a/cm-code/petits.conllu +++ b/cm-code/petits.conllu @@ -1,4 +1,5 @@ -# text = Les petits ruisseaux font les grandes rivières . +# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC +# text = Les petits ruisseaux font les grandes rivières. 1 Les le DET _ Definite=Def|Number=Plur|PronType=Art 3 det _ _ 2 petits petit ADJ _ Gender=Masc|Number=Plur 3 amod _ _ 3 ruisseaux ruisseau NOUN _ Gender=Masc|Number=Plur 4 nsubj _ _ diff --git a/lib/conllulib.py b/lib/conllulib.py index b5877821d190cb05232f0304bd0a3a664ea4a6da..fa707f4295b54d515fc24fe281c832140ec8badb 100644 --- a/lib/conllulib.py +++ b/lib/conllulib.py @@ -538,8 +538,8 @@ class TransBasedConfig(object): `next_act` is a string among "SHIFT", "RIGHT-ARC-X" or "LEFT-ARC-X" where "X" is the name of any valid syntactic relation label (deprel). Returns a new syntactic relation added by the action, or None for "SHIFT" - Returned relation is a triple (mod, head, deprel) with modifier, head, and - deprel label if `add_deprel=True` (default), or a pair (mod, head) if + Returned relation is a triple (dep, head, deprel) with dependent, head, and + deprel label if `add_deprel=True` (default), or a pair (dep, head) if `add_deprel=False`. """ if next_act == "SHIFT": diff --git a/sequoia/bin/simplify_sequoia.py b/sequoia/bin/simplify_sequoia.py index 3d4afda390374938cd9814e6178393ad48bb7bc9..70cd719b6436969b152a212b0b20ded6ee0c3f7a 100755 --- a/sequoia/bin/simplify_sequoia.py +++ b/sequoia/bin/simplify_sequoia.py @@ -134,10 +134,10 @@ def is_projective(sent): if dep_id > head_id : start = head_id end = dep_id - for token_i in range(start,end-1): # sent is 0-indexed, ID is 1-indexed - if sent[token_i]["head"] < start or sent[token_i]["head"] > end : - return False - return True + for token_i in range(start, end - 1): # sent is 0-indexed, ID is 1-indexed + if sent[token_i]["head"] < start or sent[token_i]["head"] > end : + return False + return True ######################################### @@ -150,14 +150,14 @@ def remove_subrelations(sent): ######################################### if len(sys.argv) != 2: - print('Usage: {} <input_corpus.conllu>'.format(sys.argv[0]), file=sys.stderr) + print('Usage: {} <input_corpus.conllu>'.format(sys.argv[0]), file=sys.stderr) exit(-1) with open(sys.argv[1], "r", encoding="UTF=8") as f: np_counter = range_counter = del_ne_counter = 0 del_ssense_counter = mod_ssense_counter = 0 #subrel_counter = 0 - np_ids = [] - for sent in conllu.parse_incr(f): + np_ids = [] + for sent in conllu.parse_incr(f): range_counter = range_counter + remove_range_tokens(sent) del_ssense_ci, mod_ssense_ci = simplify_supersense(sent) del_ssense_counter = del_ssense_counter + del_ssense_ci @@ -171,7 +171,7 @@ with open(sys.argv[1], "r", encoding="UTF=8") as f: print(sent.serialize(), end="") else: np_counter += 1 - np_ids.append(sent.metadata["sent_id"]) + np_ids.append((sent.metadata["sent_id"],len(sent))) print( "{} range tokens removed.\n".format(range_counter), file=sys.stderr) @@ -182,4 +182,4 @@ print( "{} supersense tags modified (complex operators).\n".format(mod_ssense_co #print( "{} subrelations removed from deprel.".format(subrel_counter), file=sys.stderr) print( "{} non-projective sentences removed:".format(np_counter), file=sys.stderr) -print(", ".join(np_ids), file=sys.stderr) +print("\n".join([f"{np_id} -> {lgth}" for (np_id, lgth) in np_ids]), file=sys.stderr)