%% Cell type:markdown id:75eff004 tags:
## Sentence segmentation and tokenization with spaCy-UDPipe
%% Cell type:code id:12f2533d tags:
``` python
#! pip install spacy-udpipe
#! pip install -U stanza
import spacy_udpipe
import os
```
%% Cell type:code id:0039ad52 tags:
``` python
import spacy
from spacy.language import Language
nlp = spacy_udpipe.load("fr")
```
%% Output
/home/tatiana.bladier/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
%% Cell type:code id:728329cd tags:
``` python
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    # Keep clauses separated by a semicolon inside the same sentence
    for token in doc[:-1]:
        if token.text.strip() == ";":
            doc[token.i + 1].is_sent_start = False
    return doc

nlp.add_pipe("parser")
```
%% Output
<spacy.pipeline.dep_parser.DependencyParser at 0x7f356c2670d0>
%% Cell type:code id:8d4237bf tags:
``` python
nlp.add_pipe("set_custom_boundaries", first=True)
nlp.initialize()
```
%% Output
<thinc.optimizers.Optimizer at 0x7f3472e77420>
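%% Cell type:markdown id: tags:
A quick sanity check of the custom boundary component (a minimal sketch with a made-up example sentence, not from the corpus): with the component registered, the clause after the semicolon should stay inside the same sentence.
%% Cell type:code id: tags:
``` python
# Hypothetical example: the semicolon should not trigger a new sentence
demo_doc = nlp("Il chantait seul ; il paraissait heureux.")
print([sent.text for sent in demo_doc.sents])
```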
%% Cell type:code id:5219a6ea tags:
``` python
"""
text = "Celui qui chantait seul, et qui paraissait le maître du bâtiment, se tenait debout \
à la proue et s'accompagnait d'une cythare à trois cordes, pareille à \
celle que les statuaires mettent aux mains d'Euterpe, la muse de \
l'harmonie."
text = "C'est un magnifique enfant du Darfour, noir comme un charbon et qui a \
déjà l'air d'un homme, quoiqu'il n'ait, selon toute probabilité, que \
onze ou douze ans. Je dis _selon toute probabilité_, parce qu'il n'y a \
pas d'exemple qu'un nègre sache son âge. Celui-là... Pardon, \
j'oubliais de vous dire son nom. Il se nomme Abailard."
text = 'Il y a un an à peu près qu’en faisant à la Bibliothèque royale des \
recherches pour mon histoire de Louis XIV, je tombai par hasard sur les \
_Mémoires de M. d’Artagnan_, imprimés,--comme la plus grande partie des \
ouvrages de cette époque, où les auteurs tenaient à dire la vérité sans \
aller faire un tour plus ou moins long à la Bastille,--à Amsterdam, \
chez Pierre Rouge. Le titre me séduisit: je les emportai chez moi, \
avec la permission de M. le conservateur, bien entendu, et je les \
dévorai.'
text = "Un jeune homme...--traçons son portrait d’un seul trait de \
plume:--figurez-vous don Quichotte à dix-huit ans; don Quichotte \
décorselé, sans haubert et sans cuissards."
text = "Nous avons pris possesion de 2 chambres cabines ayant les toilettes en commun: poussière et saleté étaient au rendez-vous!"
"""
text = "Un jeune homme...--traçons son portrait d’un seul trait de \
plume:--figurez-vous don Quichotte à dix-huit ans; don Quichotte \
décorselé, sans haubert et sans cuissards. Un instant elle balança entre \
les violettes et les glaïeuls que lui offrait l'ombrage des arbres de \
Minerve, et les narcisses et les nymphéas qui s'élevaient sur les bords \
du petit fleuve ou flottaient à sa surface; mais bientôt elle se décida \
pour ceux-ci, et, bondissant comme un jeune faon, elle courut vers le \
ruisseau."
```
%% Cell type:code id:1218955c tags:
``` python
doc1 = nlp(text)
sentences = [sent.text for sent in doc1.sents]
print("After:", [sent.text for sent in doc1.sents])
```
%% Output
After: ['Un jeune homme...--traçons son portrait d’un seul trait de plume:--figurez-vous don Quichotte à dix-huit ans; don Quichotte décorselé, sans haubert et sans cuissards.', "Un instant elle balança entre les violettes et les glaïeuls que lui offrait l'ombrage de les arbres de Minerve, et les narcisses et les nymphéas qui s'élevaient sur les bords de le petit fleuve ou flottaient à sa surface; mais bientôt elle se décida pour ceux-ci, et, bondissant comme un jeune faon, elle courut vers le ruisseau."]
%% Cell type:code id:1b99da40 tags:
``` python
def nlpdoc_to_conll(nlpdoc):
    sent_length = len([token.text for token in nlpdoc])
    conll_lst = []
    # Preliminary: whole sentence
    whole_sentence = nlpdoc.text
    #print('# text =', whole_sentence)
    i = 1
    for token in nlpdoc:
        line_str = str(token.i + 1) + "\t" + str(token.text) + "\t" + str(token.lemma_) \
            + "\t" + str(token.pos_) + "\t" + '_' + "\t" \
            + str(token.morph) + "\t" + str(token.head.i + 1) + "\t" \
            + str(token.dep_) + "\t" + '_' + "\t" + '_'
        conll_lst.append(line_str)
        i += 1
    conll_str = '\n'.join(conll_lst)
    return conll_str
```
%% Cell type:code id:af17d1f7 tags:
``` python
nlpdoc_to_conll(doc1)
```
%% Output
"1\tUn\tun\tDET\t_\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\t3\tdet\t_\t_\n2\tjeune\tjeune\tADJ\t_\tGender=Masc|Number=Sing\t3\tamod\t_\t_\n3\thomme\thomme\tNOUN\t_\tGender=Masc|Number=Sing\t6\tnsubj\t_\t_\n4\t...\t...\tPUNCT\t_\t\t3\tpunct\t_\t_\n5\t--\t--\tPUNCT\t_\t\t3\tpunct\t_\t_\n6\ttraçons\ttraçer\tVERB\t_\tMood=Imp|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin\t6\tROOT\t_\t_\n7\tson\tson\tDET\t_\tGender=Masc|Number=Sing|Poss=Yes|PronType=Prs\t8\tdet\t_\t_\n8\tportrait\tportrait\tNOUN\t_\tGender=Masc|Number=Sing\t6\tobj\t_\t_\n9\td’\td’\tPROPN\t_\t\t8\tappos\t_\t_\n10\tun\tun\tDET\t_\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\t12\tdet\t_\t_\n11\tseul\tseul\tADJ\t_\tGender=Masc|Number=Sing\t12\tamod\t_\t_\n12\ttrait\ttrait\tNOUN\t_\tGender=Masc|Number=Sing\t15\tnsubj\t_\t_\n13\tde\tde\tADP\t_\t\t14\tcase\t_\t_\n14\tplume\tplume\tNOUN\t_\tGender=Fem|Number=Sing\t12\tnmod\t_\t_\n15\t:--figurez\t:--figurer\tVERB\t_\tMood=Ind|Number=Plur|Person=2|Tense=Pres|VerbForm=Fin\t6\tparataxis\t_\t_\n16\t-vous\tvous\tPRON\t_\tNumber=Plur|Person=2|PronType=Prs\t15\tobj\t_\t_\n17\tdon\tdon\tADP\t_\t\t18\tcase\t_\t_\n18\tQuichotte\tQuichotte\tPROPN\t_\t\t16\tnmod\t_\t_\n19\tà\tà\tADP\t_\t\t21\tcase\t_\t_\n20\tdix-huit\tdix-huit\tNUM\t_\t\t21\tnummod\t_\t_\n21\tans\tan\tNOUN\t_\tGender=Masc|Number=Plur\t18\tnmod\t_\t_\n22\t;\t;\tPUNCT\t_\t\t24\tpunct\t_\t_\n23\tdon\tdon\tADP\t_\t\t24\tcase\t_\t_\n24\tQuichotte\tQuichotte\tPROPN\t_\t\t6\tobl\t_\t_\n25\tdécorselé\tdécorseler\tVERB\t_\tGender=Masc|Number=Sing|Tense=Past|VerbForm=Part\t24\tacl\t_\t_\n26\t,\t,\tPUNCT\t_\t\t28\tpunct\t_\t_\n27\tsans\tsans\tADP\t_\t\t28\tcase\t_\t_\n28\thaubert\thaubert\tNOUN\t_\tGender=Masc|Number=Sing\t6\tobl\t_\t_\n29\tet\tet\tCCONJ\t_\t\t31\tcc\t_\t_\n30\tsans\tsans\tADP\t_\t\t31\tcase\t_\t_\n31\tcuissards\tcuissard\tNOUN\t_\tGender=Masc|Number=Plur\t28\tconj\t_\t_\n32\t.\t.\tPUNCT\t_\t\t6\tpunct\t_\t_\n33\tUn\tun\tDET\t_\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\t34\tdet\t_\t_\n34\tinstant\tinstant\tNOUN\t_\tGender=Masc|Number=Sing\t36\tobl:mod\t_\t_\n35\telle\til\tPRON\t_\tGender=Fem|Number=Sing|Person=3|PronType=Prs\t36\tnsubj\t_\t_\n36\tbalança\tbalancer\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t36\tROOT\t_\t_\n37\tentre\tentre\tADP\t_\t\t39\tcase\t_\t_\n38\tles\tle\tDET\t_\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\t39\tdet\t_\t_\n39\tviolettes\tviolette\tNOUN\t_\tGender=Fem|Number=Plur\t36\tobl\t_\t_\n40\tet\tet\tCCONJ\t_\t\t42\tcc\t_\t_\n41\tles\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\t42\tdet\t_\t_\n42\tglaïeuls\tglaïeul\tNOUN\t_\tGender=Masc|Number=Plur\t39\tconj\t_\t_\n43\tque\tque\tPRON\t_\tPronType=Rel\t45\tobj\t_\t_\n44\tlui\tlui\tPRON\t_\tNumber=Sing|Person=3|PronType=Prs\t45\tiobj\t_\t_\n45\toffrait\toffrir\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin\t42\tacl:relcl\t_\t_\n46\tl'\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t47\tdet\t_\t_\n47\tombrage\tombrage\tNOUN\t_\tGender=Masc|Number=Sing\t45\tnsubj\t_\t_\n48\tde\tde\tADP\t_\t\t50\tcase\t_\t_\n49\tles\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\t50\tdet\t_\t_\n50\tarbres\tarbre\tNOUN\t_\tGender=Masc|Number=Plur\t47\tnmod\t_\t_\n51\tde\tde\tADP\t_\t\t52\tcase\t_\t_\n52\tMinerve\tMinerve\tPROPN\t_\t\t50\tnmod\t_\t_\n53\t,\t,\tPUNCT\t_\t\t56\tpunct\t_\t_\n54\tet\tet\tCCONJ\t_\t\t56\tcc\t_\t_\n55\tles\tle\tDET\t_\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\t56\tdet\t_\t_\n56\tnarcisses\tnarcisse\tNOUN\t_\tGender=Fem|Nu
mber=Plur\t39\tconj\t_\t_\n57\tet\tet\tCCONJ\t_\t\t59\tcc\t_\t_\n58\tles\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\t59\tdet\t_\t_\n59\tnymphéas\tnymphéa\tNOUN\t_\tGender=Masc|Number=Plur\t56\tconj\t_\t_\n60\tqui\tqui\tPRON\t_\tPronType=Rel\t62\tnsubj\t_\t_\n61\ts'\tse\tPRON\t_\tPerson=3|PronType=Prs\t62\texpl:comp\t_\t_\n62\télevaient\télever\tVERB\t_\tMood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin\t59\tacl:relcl\t_\t_\n63\tsur\tsur\tADP\t_\t\t65\tcase\t_\t_\n64\tles\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\t65\tdet\t_\t_\n65\tbords\tbord\tNOUN\t_\tGender=Masc|Number=Plur\t62\tobl\t_\t_\n66\tde\tde\tADP\t_\t\t69\tcase\t_\t_\n67\tle\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t69\tdet\t_\t_\n68\tpetit\tpetit\tADJ\t_\tGender=Masc|Number=Sing\t69\tamod\t_\t_\n69\tfleuve\tfleuve\tNOUN\t_\tGender=Masc|Number=Sing\t65\tnmod\t_\t_\n70\tou\tou\tCCONJ\t_\t\t71\tcc\t_\t_\n71\tflottaient\tflott\tVERB\t_\tMood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin\t62\tconj\t_\t_\n72\tà\tà\tADP\t_\t\t74\tcase\t_\t_\n73\tsa\tson\tDET\t_\tGender=Fem|Number=Sing|Poss=Yes|PronType=Prs\t74\tdet\t_\t_\n74\tsurface\tsurface\tNOUN\t_\tGender=Fem|Number=Sing\t71\tobl:arg\t_\t_\n75\t;\t;\tPUNCT\t_\t\t80\tpunct\t_\t_\n76\tmais\tmais\tCCONJ\t_\t\t80\tcc\t_\t_\n77\tbientôt\tbientôt\tADV\t_\t\t80\tadvmod\t_\t_\n78\telle\til\tPRON\t_\tGender=Fem|Number=Sing|Person=3|PronType=Prs\t80\tnsubj\t_\t_\n79\tse\tse\tPRON\t_\tPerson=3|PronType=Prs\t80\tobj\t_\t_\n80\tdécida\tdécider\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t36\tconj\t_\t_\n81\tpour\tpour\tADP\t_\t\t82\tcase\t_\t_\n82\tceux-ci\tcelui-ci\tPRON\t_\tGender=Masc|Number=Plur|PronType=Dem\t80\tobl\t_\t_\n83\t,\t,\tPUNCT\t_\t\t86\tpunct\t_\t_\n84\tet\tet\tCCONJ\t_\t\t86\tcc\t_\t_\n85\t,\t,\tPUNCT\t_\t\t84\tpunct\t_\t_\n86\tbondissant\tbondisser\tVERB\t_\tTense=Pres|VerbForm=Part\t80\tconj\t_\t_\n87\tcomme\tcomme\tADP\t_\t\t90\tcase\t_\t_\n88\tun\tun\tDET\t_\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\t90\tdet\t_\t_\n89\tjeune\tjeune\tADJ\t_\tGender=Masc|Number=Sing\t90\tamod\t_\t_\n90\tfaon\tfaon\tNOUN\t_\tGender=Masc|Number=Sing\t86\tobl:mod\t_\t_\n91\t,\t,\tPUNCT\t_\t\t93\tpunct\t_\t_\n92\telle\til\tPRON\t_\tGender=Fem|Number=Sing|Person=3|PronType=Prs\t93\tnsubj\t_\t_\n93\tcourut\tcourir\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t80\tconj\t_\t_\n94\tvers\tvers\tADP\t_\t\t96\tcase\t_\t_\n95\tle\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t96\tdet\t_\t_\n96\truisseau\truisseau\tNOUN\t_\tGender=Masc|Number=Sing\t93\tobl\t_\t_\n97\t.\t.\tPUNCT\t_\t\t36\tpunct\t_\t_"
%% Cell type:code id:5d7c6586 tags:
``` python
def apply_special_conditions(tok_string):
    # Normalize punctuation before tokenization: curly apostrophes -> straight apostrophes,
    # Gutenberg italics markers (_) -> quotes, and space out some punctuation clusters
    tok_string = tok_string.replace("’", "'").replace("_", '"')\
        .replace(":--", " : - ")\
        .replace("...", " ...").replace("!", " !")
    tok_string = tok_string.replace("d' Artagnan", "d'Artagnan")
    return tok_string

def create_tokenized_string(sent_string):
    sent_string = apply_special_conditions(sent_string)
    doc_tokenized = nlp(sent_string)
    tokens_list = []
    for token in doc_tokenized:
        tokens_list.append(token.text)
    text_tokenized = ' '.join(tokens_list)
    #text_tokenized = apply_special_conditions(text_tokenized)
    conll_doc = nlpdoc_to_conll(doc_tokenized)
    return text_tokenized, conll_doc
```
%% Cell type:code id:5bfce4b4 tags:
``` python
for sent in sentences:
    tok_text, conll_text = create_tokenized_string(sent)
    print(tok_text)
    print(conll_text)
```
%% Output
Un jeune homme ... -- traçons son portrait d' un seul trait de plume : - figurez -vous don Quichotte à dix-huit ans ; don Quichotte décorselé , sans haubert et sans cuissards .
1 Un un DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 3 det _ _
2 jeune jeune ADJ _ Gender=Masc|Number=Sing 3 amod _ _
3 homme homme NOUN _ Gender=Masc|Number=Sing 6 nsubj _ _
4 ... ... PUNCT _ 3 punct _ _
5 -- -- PUNCT _ 3 punct _ _
6 traçons traçer VERB _ Mood=Imp|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin 6 ROOT _ _
7 son son DET _ Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs 8 det _ _
8 portrait portrait NOUN _ Gender=Masc|Number=Sing 6 obj _ _
9 d' de ADP _ 12 case _ _
10 un un DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 12 det _ _
11 seul seul ADJ _ Gender=Masc|Number=Sing 12 amod _ _
12 trait trait NOUN _ Gender=Masc|Number=Sing 8 nmod _ _
13 de de ADP _ 14 case _ _
14 plume plume NOUN _ Gender=Fem|Number=Sing 12 nmod _ _
15 : : PUNCT _ 17 punct _ _
16 - - PUNCT _ 17 punct _ _
17 figurez figurer VERB _ Mood=Imp|Number=Plur|Person=2|Tense=Pres|VerbForm=Fin 6 parataxis _ _
18 -vous le PRON _ Number=Plur|Person=2|PronType=Prs 17 nsubj _ _
19 don don ADP _ 20 case _ _
20 Quichotte Quichotte PROPN _ 17 obl:arg _ _
21 à à ADP _ 23 case _ _
22 dix-huit dix-huit NUM _ 23 nummod _ _
23 ans an NOUN _ Gender=Masc|Number=Plur 20 nmod _ _
24 ; ; PUNCT _ 26 punct _ _
25 don don ADP _ 26 case _ _
26 Quichotte Quichotte PROPN _ 20 nmod _ _
27 décorselé décorseler VERB _ Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part 26 acl _ _
28 , , PUNCT _ 30 punct _ _
29 sans sans ADP _ 30 case _ _
30 haubert haubert NOUN _ Gender=Masc|Number=Sing 20 nmod _ _
31 et et CCONJ _ 33 cc _ _
32 sans sans ADP _ 33 case _ _
33 cuissards cuissard NOUN _ Gender=Masc|Number=Plur 30 conj _ _
34 . . PUNCT _ 6 punct _ _
Un instant elle balança entre les violettes et les glaïeuls que lui offrait l' ombrage de les arbres de Minerve , et les narcisses et les nymphéas qui s' élevaient sur les bords de le petit fleuve ou flottaient à sa surface ; mais bientôt elle se décida pour ceux-ci , et , bondissant comme un jeune faon , elle courut vers le ruisseau .
1 Un un DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 2 det _ _
2 instant instant NOUN _ Gender=Masc|Number=Sing 4 obl:mod _ _
3 elle il PRON _ Gender=Fem|Number=Sing|Person=3|PronType=Prs 4 nsubj _ _
4 balança balancer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 ROOT _ _
5 entre entre ADP _ 7 case _ _
6 les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 7 det _ _
7 violettes violette NOUN _ Gender=Fem|Number=Plur 4 obl _ _
8 et et CCONJ _ 10 cc _ _
9 les le DET _ Definite=Def|Gender=Masc|Number=Plur|PronType=Art 10 det _ _
10 glaïeuls glaïeul NOUN _ Gender=Masc|Number=Plur 7 conj _ _
11 que que PRON _ PronType=Rel 13 obj _ _
12 lui lui PRON _ Number=Sing|Person=3|PronType=Prs 13 iobj _ _
13 offrait offrir VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin 10 acl:relcl _ _
14 l' le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 15 det _ _
15 ombrage ombrage NOUN _ Gender=Masc|Number=Sing 13 nsubj _ _
16 de de ADP _ 18 case _ _
17 les le DET _ Definite=Def|Gender=Masc|Number=Plur|PronType=Art 18 det _ _
18 arbres arbre NOUN _ Gender=Masc|Number=Plur 15 nmod _ _
19 de de ADP _ 20 case _ _
20 Minerve Minerve PROPN _ 18 nmod _ _
21 , , PUNCT _ 24 punct _ _
22 et et CCONJ _ 24 cc _ _
23 les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 24 det _ _
24 narcisses narcisse NOUN _ Gender=Fem|Number=Plur 7 conj _ _
25 et et CCONJ _ 27 cc _ _
26 les le DET _ Definite=Def|Gender=Masc|Number=Plur|PronType=Art 27 det _ _
27 nymphéas nymphéa NOUN _ Gender=Masc|Number=Plur 24 conj _ _
28 qui qui PRON _ PronType=Rel 30 nsubj _ _
29 s' se PRON _ Person=3|PronType=Prs 30 expl:comp _ _
30 élevaient élever VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin 27 acl:relcl _ _
31 sur sur ADP _ 33 case _ _
32 les le DET _ Definite=Def|Gender=Masc|Number=Plur|PronType=Art 33 det _ _
33 bords bord NOUN _ Gender=Masc|Number=Plur 30 obl _ _
34 de de ADP _ 37 case _ _
35 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 37 det _ _
36 petit petit ADJ _ Gender=Masc|Number=Sing 37 amod _ _
37 fleuve fleuve NOUN _ Gender=Masc|Number=Sing 33 nmod _ _
38 ou ou CCONJ _ 39 cc _ _
39 flottaient flott VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin 30 conj _ _
40 à à ADP _ 42 case _ _
41 sa son DET _ Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs 42 det _ _
42 surface surface NOUN _ Gender=Fem|Number=Sing 39 obl:arg _ _
43 ; ; PUNCT _ 48 punct _ _
44 mais mais CCONJ _ 48 cc _ _
45 bientôt bientôt ADV _ 48 advmod _ _
46 elle il PRON _ Gender=Fem|Number=Sing|Person=3|PronType=Prs 48 nsubj _ _
47 se se PRON _ Person=3|PronType=Prs 48 obj _ _
48 décida décider VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 conj _ _
49 pour pour ADP _ 50 case _ _
50 ceux-ci celui-ci PRON _ Gender=Masc|Number=Plur|PronType=Dem 48 obl _ _
51 , , PUNCT _ 54 punct _ _
52 et et CCONJ _ 54 cc _ _
53 , , PUNCT _ 52 punct _ _
54 bondissant bondisser VERB _ Tense=Pres|VerbForm=Part 48 conj _ _
55 comme comme ADP _ 58 case _ _
56 un un DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 58 det _ _
57 jeune jeune ADJ _ Gender=Masc|Number=Sing 58 amod _ _
58 faon faon NOUN _ Gender=Masc|Number=Sing 54 obl:mod _ _
59 , , PUNCT _ 61 punct _ _
60 elle il PRON _ Gender=Fem|Number=Sing|Person=3|PronType=Prs 61 nsubj _ _
61 courut courir VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 48 conj _ _
62 vers vers ADP _ 64 case _ _
63 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 64 det _ _
64 ruisseau ruisseau NOUN _ Gender=Masc|Number=Sing 61 obl _ _
65 . . PUNCT _ 4 punct _ _
%% Cell type:markdown id:41d4b38f tags:
### Store the cleaned tokenized text sentence-wise in a file
%% Cell type:code id:65825f08 tags:
``` python
def chunk_file(name, lines_per_chunk, chunks_per_file):
    booktitle = name.split("texts/")[1].replace('.txt', '')
    print(booktitle)
    target_folder = '../data/gutenberg/tokenized_texts/'
    target_file_path = target_folder + booktitle + '.tok'
    #os.makedirs(os.path.dirname(target_folder), exist_ok=True)
    #if os.path.exists(target_file_path): os.remove(target_file_path)
    if os.path.exists(target_file_path + '.conll'): os.remove(target_file_path + '.conll')
    outfile = open(target_file_path, "a")
    outfile_conll = open(target_file_path + '.conll', "a")

    def bad_chunk(chunk_text):
        unwanted_start_of_string = ("Alexandre Dumas", u"ACTÉ",
                                    "Chapitre", "(18", u"Préface",
                                    u"Table des matières", u"_Résumé_",
                                    "[ Illustration", "[Illustration")
        if chunk_text.startswith(unwanted_start_of_string):
            return True

    def write_chunk(chunk_no, chunk):
        chunk_text = ' '.join(chunk).replace('\n', ' ').strip()
        if len(chunk_text) > 0 and not bad_chunk(chunk_text):
            doc = nlp(chunk_text)
            sentences = [sent.text.strip() for sent in doc.sents]
            for sent in sentences:
                tokenized_sent, conll_doc = create_tokenized_string(sent)
                #print(tokenized_sent)
                #print(conll_doc)
                outfile.write(tokenized_sent + '\n')
                outfile_conll.write(conll_doc + '\n\n')

    count, chunk_no, chunk_count, chunk = 1, 1, 0, []
    with open(name, "r") as f:
        for row in f:
            if count > lines_per_chunk and row == "\n":
                chunk_count += 1
                count = 1
                chunk.append("\n")
                if chunk_count == chunks_per_file:
                    write_chunk(chunk_no, chunk)
                    chunk = []
                    chunk_count = 0
                    chunk_no += 1
            else:
                count += 1
                chunk.append(row)
    if chunk:
        write_chunk(chunk_no, chunk)
    outfile.close()
    outfile_conll.close()
```
%% Cell type:code id:95943233 tags:
``` python
import glob
dir_with_raw_files = [x for x in glob.glob('../data/gutenberg/raw_texts/Vingt*')]
for raw_file_path in dir_with_raw_files:
    chunk_file(raw_file_path, 1, 1)
```
%% Output
Vingt_ans_après
%% Cell type:markdown id:f8ccc794 tags:
## Process pre-tokenized texts with Stanza and create train/dev/test splits
%% Cell type:code id:eb6ca7ba tags:
``` python
import stanza
#stanza.download("fr")
nlp_tokenized = stanza.Pipeline(lang='fr', processors='tokenize, pos, lemma, depparse', tokenize_pretokenized=True)
nlp_pos = stanza.Pipeline(lang='fr', processors='tokenize, pos, lemma, depparse', tokenize_pretokenized=True)
```
%% Output
2025-01-09 17:11:12 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 26.7MB/s]
2025-01-09 17:11:15 INFO: Loading these models for language: fr (French):
=================================
| Processor | Package |
---------------------------------
| tokenize | combined |
| pos | combined_charlm |
| lemma | combined_nocharlm |
| depparse | combined_charlm |
=================================
2025-01-09 17:11:15 INFO: Using device: cuda
2025-01-09 17:11:15 INFO: Loading: tokenize
2025-01-09 17:11:15 INFO: Loading: pos
2025-01-09 17:11:19 INFO: Loading: lemma
2025-01-09 17:11:19 INFO: Loading: depparse
2025-01-09 17:11:19 INFO: Done loading processors!
2025-01-09 17:11:19 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 18.9MB/s]
2025-01-09 17:11:20 INFO: Loading these models for language: fr (French):
=================================
| Processor | Package |
---------------------------------
| tokenize | combined |
| pos | combined_charlm |
| lemma | combined_nocharlm |
| depparse | combined_charlm |
=================================
2025-01-09 17:11:20 INFO: Using device: cuda
2025-01-09 17:11:20 INFO: Loading: tokenize
2025-01-09 17:11:20 INFO: Loading: pos
2025-01-09 17:11:21 INFO: Loading: lemma
2025-01-09 17:11:21 INFO: Loading: depparse
2025-01-09 17:11:21 INFO: Done loading processors!
%% Cell type:code id:ad525136 tags:
``` python
text = "Je mange des pommes ."
doc = nlp_tokenized(text)
doc = nlp_pos(doc)
print("{:C}".format(doc) + '\n')
for i, sentence in enumerate(doc.sentences):
    print(sentence)
```
%% Output
# text = Je mange des pommes .
# sent_id = 0
1 Je moi PRON _ Emph=No|Number=Sing|Person=1|PronType=Prs 2 nsubj _ start_char=0|end_char=2
2 mange manger VERB _ Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin 0 root _ start_char=3|end_char=8
3 des un DET _ Definite=Ind|Number=Plur|PronType=Art 4 det _ start_char=9|end_char=12
4 pommes pomme NOUN _ Gender=Fem|Number=Plur 2 obj _ start_char=13|end_char=19
5 . . PUNCT _ _ 2 punct _ start_char=20|end_char=21
[
{
"id": 1,
"text": "Je",
"lemma": "moi",
"upos": "PRON",
"feats": "Emph=No|Number=Sing|Person=1|PronType=Prs",
"head": 2,
"deprel": "nsubj",
"misc": "",
"start_char": 0,
"end_char": 2
},
{
"id": 2,
"text": "mange",
"lemma": "manger",
"upos": "VERB",
"feats": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin",
"head": 0,
"deprel": "root",
"misc": "",
"start_char": 3,
"end_char": 8
},
{
"id": 3,
"text": "des",
"lemma": "un",
"upos": "DET",
"feats": "Definite=Ind|Number=Plur|PronType=Art",
"head": 4,
"deprel": "det",
"misc": "",
"start_char": 9,
"end_char": 12
},
{
"id": 4,
"text": "pommes",
"lemma": "pomme",
"upos": "NOUN",
"feats": "Gender=Fem|Number=Plur",
"head": 2,
"deprel": "obj",
"misc": "",
"start_char": 13,
"end_char": 19
},
{
"id": 5,
"text": ".",
"lemma": ".",
"upos": "PUNCT",
"head": 2,
"deprel": "punct",
"misc": "",
"start_char": 20,
"end_char": 21
}
]
%% Cell type:code id:e8f70a45 tags:
``` python
import glob
from sklearn.model_selection import train_test_split

dir_with_pretokenized_files = [x for x in glob.glob('../data/gutenberg/tokenized_texts/Vingt_ans_après.tok')]

for tokenized_file_path in dir_with_pretokenized_files:
    booktitle = tokenized_file_path.split("texts/")[1].replace('.tok', '')
    print(booktitle)
    with open(tokenized_file_path, 'r') as tokfile:
        documents = tokfile.readlines()

    X_train, X_test = train_test_split(documents, test_size=0.1, shuffle=False, stratify=None)
    X_train, X_val = train_test_split(X_train, test_size=0.1, shuffle=False, stratify=None)

    in_docs_train = [stanza.Document([], text=d) for d in X_train]  # Wrap each document in a stanza.Document object
    out_docs_train = nlp_pos(in_docs_train)  # Call the neural pipeline on this list of documents
    in_docs_test = [stanza.Document([], text=d) for d in X_test]
    out_docs_test = nlp_pos(in_docs_test)
    in_docs_val = [stanza.Document([], text=d) for d in X_val]
    out_docs_val = nlp_pos(in_docs_val)

    print(len(out_docs_train))
    print(len(out_docs_test))
    print(len(out_docs_val))

    target_folder = '../data/gutenberg/tokenized_texts/'
    target_file_path = target_folder + booktitle + '.tok'
    #os.makedirs(os.path.dirname(target_folder), exist_ok=True)

    outfile_conll_val = open(target_file_path + '.dev.conll', "w")
    outfile_conll_train = open(target_file_path + '.train.conll', "w")
    outfile_conll_test = open(target_file_path + '.test.conll', "w")

    for out_doc in out_docs_train:
        conllstr = "{:C}".format(out_doc)
        outfile_conll_train.write(conllstr + '\n\n')
    for out_doc in out_docs_val:
        conllstr = "{:C}".format(out_doc)
        outfile_conll_val.write(conllstr + '\n\n')
    for out_doc in out_docs_test:
        conllstr = "{:C}".format(out_doc)
        outfile_conll_test.write(conllstr + '\n\n')

    outfile_conll_val.close()
    outfile_conll_train.close()
    outfile_conll_test.close()
```
%% Output
Vingt_ans_après
14450
1784
1606
%% Cell type:code id:6b50d131 tags:
``` python
```
%% Cell type:code id:fbedfcb1 tags:
``` python
# add an empty line before each # text =
#%cd ..
#%cd data/gutenberg/tokenized_texts
#! sed -i 's/# text =/\n\n# text =/g' *.conll
#%cd ..
#%cd ..
#%cd ..
#%cd notebooks
```
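%% Cell type:markdown id: tags:
The same post-processing (inserting blank lines before every `# text =` header) can also be done from Python instead of sed. This is a minimal sketch under the assumption that the `.conll` files live in `../data/gutenberg/tokenized_texts/`; the helper name is hypothetical, and the call is left commented out like the sed version above.
%% Cell type:code id: tags:
``` python
import glob

def add_blank_lines_before_text_headers(conll_path):
    # Mirror the sed one-liner: insert empty lines before each "# text =" header
    with open(conll_path, "r", encoding="utf-8") as f:
        content = f.read()
    content = content.replace("# text =", "\n\n# text =")
    with open(conll_path, "w", encoding="utf-8") as f:
        f.write(content)

#for conll_path in glob.glob('../data/gutenberg/tokenized_texts/*.conll'):
#    add_blank_lines_before_text_headers(conll_path)
```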
%% Cell type:code id:960bb524 tags:
``` python
#!ls
```
%% Cell type:code id:3294ef16 tags:
``` python
# concatenate the files for the train, dev, and test split and move them to the incpar folder
#%cd ..
#%cd data/gutenberg/tokenized_texts
```
%% Cell type:code id:1a56e036 tags:
``` python
#! cat *.tok.dev.conll > gutenberg_dumas_dev.tok.conll
```
%% Cell type:code id:3f216cc3 tags:
``` python
#! cat *.tok.train.conll > gutenberg_dumas_train.tok.conll
```
%% Cell type:code id:880e80d9 tags:
``` python
#! cat *.tok.test.conll > gutenberg_dumas_test.tok.conll
```
%% Cell type:code id:ea3c1380 tags:
``` python
#%cd ..
#%cd ..
#%cd ..
#%cd ..
```
%% Cell type:code id:700586eb tags:
``` python
#!mv /home/tatiana.bladier/tokenization-experiments/data/gutenberg/tokenized_texts/gutenberg_dumas_dev.tok.conll incpar/data/gutenberg_dumas
```
%% Cell type:code id:43fe6a94 tags:
``` python
#!mv /home/tatiana.bladier/tokenization-experiments/data/gutenberg/tokenized_texts/gutenberg_dumas_test.tok.conll incpar/data/gutenberg_dumas
```
%% Cell type:code id:82dd0717 tags:
``` python
#!mv /home/tatiana.bladier/tokenization-experiments/data/gutenberg/tokenized_texts/gutenberg_dumas_train.tok.conll incpar/data/gutenberg_dumas
```
%% Cell type:code id:7fbce7a2 tags:
``` python
#!cp /home/tatiana.bladier/incpar/data/gutenberg_dumas/gutenberg_dumas_dev.tok.conll compo-gpt-model/data
```
%% Cell type:code id:45e652a6 tags:
``` python
#!cp /home/tatiana.bladier/incpar/data/gutenberg_dumas/gutenberg_dumas_test.tok.conll compo-gpt-model/data
```
%% Cell type:code id:0078cbb5 tags:
``` python
#!cp /home/tatiana.bladier/incpar/data/gutenberg_dumas/gutenberg_dumas_train.tok.conll compo-gpt-model/data
```
%% Cell type:code id:1b4a088b tags:
``` python
```
%% Cell type:markdown id:96858183-3e82-4a33-ba0f-1b21b5f36018 tags:
## Type-token ratio
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
``` python
def compute_ttr(text):
    """
    Compute the type/token ratio (TTR) from column-formatted text.
    Only the first column is used (tokens).

    Parameters:
    - text: str, the input text in column format

    Returns:
    - ttr: float, the type/token ratio
    """
    tokens = []
    for line in text.strip().splitlines():
        if line.strip():  # skip empty lines
            token = line.split('\t')[0]
            tokens.append(token)
    if not tokens:
        return 0.0
    types = set(tokens)
    return len(types) / len(tokens)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
``` python
sample_text = """
<s> <s> <s> 0
Aramis npp <nul>@@<nul> 0
était v <nul>@@<nul> 0
à p <nul>@@<nul> 0
son det NP@@<nul> 0
poste nc <nul>@@<nul> 1
, ponct <nul>@@<nul> 0
il cls-suj VN@@<nul> 0
était v <nul>@@<nul> 1
tombé vpp <nul>@@<nul> 1
de p PP-DE_OBJ@@Sint-MOD 1
ses det NP@@<nul> 2
bras nc <nul>@@<nul> 3
. ponct <nul>@@<nul> 0
</s> </s> </s> 0
"""
ttr = compute_ttr(sample_text)
print(f"Type/Token Ratio: {ttr:.3f}")
```
%% Output
Type/Token Ratio: 0.933
%% Cell type:code id:8897dcc3-4218-4ee5-9984-17b9a6d8dce2 tags:
``` python
```