diff --git a/sequoia/bin/simplify_sequoia.py b/sequoia/bin/simplify_sequoia.py index a6c48829d6370ba2927fd636350c8bc4bcd39426..3d4afda390374938cd9814e6178393ad48bb7bc9 100755 --- a/sequoia/bin/simplify_sequoia.py +++ b/sequoia/bin/simplify_sequoia.py @@ -4,6 +4,8 @@ Simplify Sequoia corpus for pedagogical purposes: - Remove all range tokens (e.g. "2-3 du" = "2 de" + "3 le"), keep only full tokens => Range tokens usually contain no annotation: they mark the presence of a contraction => The text may become strange to read, e.g. "L'ambassadrice de le Portugal à les Pays-Bas" +- Column FEATS (TP3) + - Remove extremely rare features appearing only once with a given UPOS - Column FRSEMCOR:NOUN (TP4) - Remove all supersense annotations for multiword units => keeping multiwords would make data preparation unnecessarily complex @@ -77,6 +79,19 @@ def simplify_supersense(sent): return del_ssense_counter, mod_ssense_counter ######################################### + +def simplify_morphology(sent): + for token in sent: + if token["feats"] : + if "Typo" in token["feats"]: + del token['feats']['Typo'] + elif token["upos"] in ["ADV", "NUM"]: + if "Number" in token["feats"]: + del token['feats']['Number'] + elif "Gender" in token['feats']: + del token['feats']['Gender'] + +######################################### def simplify_mwe_ne(sent): ne_ind = 1 # Start new named entities at index 1 in new column @@ -148,6 +163,7 @@ with open(sys.argv[1], "r", encoding="UTF=8") as f: del_ssense_counter = del_ssense_counter + del_ssense_ci mod_ssense_counter = mod_ssense_counter + mod_ssense_ci del_ne_counter = del_ne_counter + simplify_mwe_ne(sent) + simplify_morphology(sent) # subrel_counter = subrel_counter + remove_subrelations(sent) if is_projective(sent) : # Returns false to remove sentence if sent.metadata.get("global.columns", None): # Add header for new column diff --git a/sequoia/sequoia-ud.parseme.frsemcor.simple.dev b/sequoia/sequoia-ud.parseme.frsemcor.simple.dev index d513071bc549e2e0a545b65fcad015431a716948..7f651849ba27a8a615a42357a1817b8a29345334 100644 --- a/sequoia/sequoia-ud.parseme.frsemcor.simple.dev +++ b/sequoia/sequoia-ud.parseme.frsemcor.simple.dev @@ -1519,7 +1519,7 @@ 17 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 18 det _ _ 4 * * 18 myocarde myocarde NOUN _ Gender=Masc|Number=Sing 15 nmod _ _ 4 * * 19 sans sans ADP _ _ 20 case _ _ * * * -20 sus sus NOUN _ Typo=Yes 15 nmod _ CorrectForm=sus-décalage 5:_|MWE|LEX * * +20 sus sus NOUN _ _ 15 nmod _ CorrectForm=sus-décalage 5:_|MWE|LEX * * 21 décalage _ X _ _ 20 goeswith _ _ 5 * * 22 de de ADP _ _ 24 case _ _ 5 * * 23 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 24 det _ _ 5 * * diff --git a/sequoia/sequoia-ud.parseme.frsemcor.simple.full b/sequoia/sequoia-ud.parseme.frsemcor.simple.full index fafca9d26c480c91385340e4e1fd42d093ddce07..56093b321a8e88b7276ad49bb8452d8a1651a3e4 100644 --- a/sequoia/sequoia-ud.parseme.frsemcor.simple.full +++ b/sequoia/sequoia-ud.parseme.frsemcor.simple.full @@ -14461,7 +14461,7 @@ 17 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 18 det _ _ 4 * * 18 myocarde myocarde NOUN _ Gender=Masc|Number=Sing 15 nmod _ _ 4 * * 19 sans sans ADP _ _ 20 case _ _ * * * -20 sus sus NOUN _ Typo=Yes 15 nmod _ CorrectForm=sus-décalage 5:_|MWE|LEX * * +20 sus sus NOUN _ _ 15 nmod _ CorrectForm=sus-décalage 5:_|MWE|LEX * * 21 décalage _ X _ _ 20 goeswith _ _ 5 * * 22 de de ADP _ _ 24 case _ _ 5 * * 23 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 24 det _ _ 5 * * @@ -17189,7 +17189,7 @@ # sent_id = emea-fr-dev_00222 # text = Troubles musculo squelettiques, du tissu conjonctif et des os 1 Troubles trouble NOUN _ Gender=Masc|Number=Plur 0 root _ _ * State * -2 musculo musculo ADJ _ Typo=Yes 1 amod _ CorrectForm=musculo-squelettiques * * * +2 musculo musculo ADJ _ _ 1 amod _ CorrectForm=musculo-squelettiques * * * 3 squelettiques _ X _ _ 2 goeswith _ _ * * * 4 , , PUNCT _ _ 7 punct _ _ * * * 5 de de ADP _ _ 7 case _ _ * * * @@ -42396,7 +42396,7 @@ 27 l' le DET _ Definite=Def|Number=Sing|PronType=Art 28 det _ _ * * * 28 embargo embargo NOUN _ Gender=Masc|Number=Sing 25 nmod _ _ * Act * 29 , , PUNCT _ _ 22 punct _ _ * * * -30 faute faute ADV _ Gender=Fem|Number=Sing 22 advmod _ _ 2:ADP|MWE|IRREG * * +30 faute faute ADV _ Gender=Fem 22 advmod _ _ 2:ADP|MWE|IRREG * * 31 de de ADP _ _ 32 case _ _ 2 * * 32 produits produit NOUN _ Gender=Masc|Number=Plur 30 obl:arg _ _ * Food * 33 alimentaires alimentaire ADJ _ Number=Plur 32 amod _ _ * * * @@ -75442,7 +75442,7 @@ 3 août août NOUN _ Gender=Masc|Number=Sing 2 nmod _ _ * * * 4 , , PUNCT _ _ 13 punct _ _ * * * 5 les le DET _ Definite=Def|Number=Plur|PronType=Art 9 det _ _ * * * -6 neuf neuf NUM _ Gender=Masc|NumType=Card 9 amod _ ExtPos=ADJ * * * +6 neuf neuf NUM _ NumType=Card 9 amod _ ExtPos=ADJ * * * 7 cent cent NUM _ NumType=Card 6 fixed _ _ * * * 8 vingt vingt NUM _ NumType=Card 6 fixed _ _ * * * 9 photos photo NOUN _ Gender=Fem|Number=Plur 13 nsubj:pass _ _ * Artifact * diff --git a/sequoia/sequoia-ud.parseme.frsemcor.simple.test b/sequoia/sequoia-ud.parseme.frsemcor.simple.test index 226136b933e23c69a1c9d8fe3967cdd63d12f25f..94e45f008447ee0db04a7854ea7c7e8c5eb05303 100644 --- a/sequoia/sequoia-ud.parseme.frsemcor.simple.test +++ b/sequoia/sequoia-ud.parseme.frsemcor.simple.test @@ -1272,7 +1272,7 @@ 27 l' le DET _ Definite=Def|Number=Sing|PronType=Art 28 det _ _ * * * 28 embargo embargo NOUN _ Gender=Masc|Number=Sing 25 nmod _ _ * Act * 29 , , PUNCT _ _ 22 punct _ _ * * * -30 faute faute ADV _ Gender=Fem|Number=Sing 22 advmod _ _ 2:ADP|MWE|IRREG * * +30 faute faute ADV _ Gender=Fem 22 advmod _ _ 2:ADP|MWE|IRREG * * 31 de de ADP _ _ 32 case _ _ 2 * * 32 produits produit NOUN _ Gender=Masc|Number=Plur 30 obl:arg _ _ * Food * 33 alimentaires alimentaire ADJ _ Number=Plur 32 amod _ _ * * * diff --git a/sequoia/sequoia-ud.parseme.frsemcor.simple.train b/sequoia/sequoia-ud.parseme.frsemcor.simple.train index d7df3de16652f8a8b16b2fd2360bdfbc959bf095..a6305d82ca8d869371e26c3f98635490ad9cf4a4 100644 --- a/sequoia/sequoia-ud.parseme.frsemcor.simple.train +++ b/sequoia/sequoia-ud.parseme.frsemcor.simple.train @@ -12114,7 +12114,7 @@ # sent_id = emea-fr-dev_00222 # text = Troubles musculo squelettiques, du tissu conjonctif et des os 1 Troubles trouble NOUN _ Gender=Masc|Number=Plur 0 root _ _ * State * -2 musculo musculo ADJ _ Typo=Yes 1 amod _ CorrectForm=musculo-squelettiques * * * +2 musculo musculo ADJ _ _ 1 amod _ CorrectForm=musculo-squelettiques * * * 3 squelettiques _ X _ _ 2 goeswith _ _ * * * 4 , , PUNCT _ _ 7 punct _ _ * * * 5 de de ADP _ _ 7 case _ _ * * * @@ -53929,7 +53929,7 @@ 3 août août NOUN _ Gender=Masc|Number=Sing 2 nmod _ _ * * * 4 , , PUNCT _ _ 13 punct _ _ * * * 5 les le DET _ Definite=Def|Number=Plur|PronType=Art 9 det _ _ * * * -6 neuf neuf NUM _ Gender=Masc|NumType=Card 9 amod _ ExtPos=ADJ * * * +6 neuf neuf NUM _ NumType=Card 9 amod _ ExtPos=ADJ * * * 7 cent cent NUM _ NumType=Card 6 fixed _ _ * * * 8 vingt vingt NUM _ NumType=Card 6 fixed _ _ * * * 9 photos photo NOUN _ Gender=Fem|Number=Plur 13 nsubj:pass _ _ * Artifact *