diff --git a/lib/conllulib.py b/lib/conllulib.py index 4a2f7f33711a302aa7e5a914290a1d7940aca5a5..d8ce6f9ce24f7d9bf6742939d15409200ffabbe9 100644 --- a/lib/conllulib.py +++ b/lib/conllulib.py @@ -332,18 +332,40 @@ class TransBasedSent(object): """ ############################### - def __init__(self, sent): + def __init__(self, sent, actions_only=False): """ `sent`: A `TokenList` as retrieved by the `conllu` library or `readConllu()` + `actions_only`: if True, `__str__` returns only the actions instead of "config -> action" lines """ self.sent = sent + self.actions_only = actions_only ############################### + def __str__(self): + """ + Return the oracle sequence of the sentence: one "config -> action" line per step, then one extra line built from the last yielded pair. + If `self.actions_only` is True, return only the actions, separated by spaces. + """ + result = [] + for config, action in self.get_configs_oracle(): + if not self.actions_only : + result.append("{} -> {}".format(str(config), action)) + else : + result.append(action) + if not self.actions_only : + result.append("{} -> {}".format(str(config), action)) + return "\n".join(result) + else : + return " ".join(result) + + + ############################### + + def get_configs_oracle(self): """ Generator of oracle arc-hybrid configurations based on gold parsing tree. - Yields triples (stack, buffer, action) where action is a string among: + Yields pairs (`TransBasedConfig`, action) where action is a string among: - "SHIFT" -> pop buffer into stack - "LEFT-ARC-X" -> relation "X" from buffer head to stack head, pop stack - "RIGHT-ARC-X" -> relation "X" from stack head to stack second, pop stack @@ -402,6 +424,15 @@ class TransBasedConfig(object): ############################### + def __str__(self): + """ + Return "stack, buffer" as two lists of word forms; the last buffer element is always shown as 0.
+ """ + return "{}, {}".format([self.sent[i - 1]['form'] for i in self.stack], + [self.sent[i - 1]['form'] for i in self.buff[:-1]] + [0]) + + ############################### + + def is_final(self): """ Returns True if configuration is final, False else. diff --git a/sequoia/README.md b/sequoia/README.md index 18df1fe256018ed05475eb352d95c587be62fceb..b04e964e6a28850651bf36a91c277dae629dffeb 100644 --- a/sequoia/README.md +++ b/sequoia/README.md @@ -7,6 +7,7 @@ We obtained the file `trunk/sequoia-ud.parseme.frsemcor` from commit number `ea7 The file is the result of the conversion from Sequoia's source as described on the [documentation](https://deep-sequoia.inria.fr/process/) We keep the original file in `src` folder to make command line completion faster +The file `tiny.conllu` was manually extracted and simplified; it is used in parsing exercises. ### Simplification diff --git a/sequoia/tiny.conllu b/sequoia/tiny.conllu new file mode 100644 index 0000000000000000000000000000000000000000..bf811a212250d2cd0b89f11a2c5a4fdd16a99428 --- /dev/null +++ b/sequoia/tiny.conllu @@ -0,0 +1,11 @@ +# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE +# sent_id = annodis.er_00192 +# text = La gare routière attend toujours ses illuminations. +1 La le DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 2 det _ _ * * * +2 gare gare NOUN _ Gender=Fem|Number=Sing 4 nsubj _ _ 1:_|MWE|SYNT * * +3 routière routier ADJ _ Gender=Fem|Number=Sing 2 amod _ _ 1 * * +4 attend attendre VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ * * * +5 toujours toujours ADV _ _ 4 advmod _ _ * * * +6 ses son DET _ Number=Plur|Poss=Yes 7 det _ _ * * * +7 illuminations illumination NOUN _ Gender=Fem|Number=Plur 4 obj _ _ * Artifact * +8 . . PUNCT _ _ 4 punct _ _ * * *