diff --git a/.ipynb_checkpoints/gutenberg-preprocess-data-checkpoint.ipynb b/.ipynb_checkpoints/gutenberg-preprocess-data-checkpoint.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..6fa01e12ea9b2013e504ea045c566466ff5a470a
--- /dev/null
+++ b/.ipynb_checkpoints/gutenberg-preprocess-data-checkpoint.ipynb
@@ -0,0 +1,889 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "75eff004",
+   "metadata": {},
+   "source": [
+    "## Sentence segmentation and tokenization with SpaCy UDPipe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "12f2533d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#! pip install spacy-udpipe\n",
+    "#! pip install stanza -U\n",
+    "import spacy_udpipe\n",
+    "import os\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0039ad52",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/tatiana.bladier/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import spacy\n",
+    "from spacy.language import Language\n",
+    "\n",
+    "nlp = spacy_udpipe.load(\"fr\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "728329cd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<spacy.pipeline.dep_parser.DependencyParser at 0x7f356c2670d0>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "@Language.component(\"set_custom_boundaries\")\n",
+    "def set_custom_boundaries(doc):\n",
+    "    # Suppress sentence-start flags on the token right after a semicolon.\n",
+    "    for token in doc[:-1]:\n",
+    "        if token.text == \";\":  # token texts never carry surrounding spaces, so the old list of padded variants never matched\n",
+    "            doc[token.i + 1].is_sent_start = False\n",
+    "    return doc\n",
+    "nlp.add_pipe(\"parser\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8d4237bf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<thinc.optimizers.Optimizer at 0x7f3472e77420>"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp.add_pipe(\"set_custom_boundaries\", first=True)\n",
+    "\n",
+    "nlp.initialize()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "5219a6ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "text = \"Celui qui chantait seul, et qui paraissait le maître du bâtiment, se tenait debout \\\n",
+    "        à la proue et s'accompagnait d'une cythare à trois cordes, pareille à \\\n",
+    "        celle que les statuaires mettent aux mains d'Euterpe, la muse de \\\n",
+    "        l'harmonie.\"\n",
+    "\n",
+    "\n",
+    "text = \"C'est un magnifique enfant du Darfour, noir comme un charbon et qui a \\\n",
+    "        déjà l'air d'un homme, quoiqu'il n'ait, selon toute probabilité, que \\\n",
+    "        onze ou douze ans. Je dis _selon toute probabilité_, parce qu'il n'y a \\\n",
+    "        pas d'exemple qu'un nègre sache son âge. Celui-là... Pardon, \\\n",
+    "        j'oubliais de vous dire son nom. Il se nomme Abailard.\"\n",
+    "\n",
+    "\n",
+    "text = 'Il y a un an à peu près qu’en faisant à la Bibliothèque royale des \\\n",
+    "        recherches pour mon histoire de Louis XIV, je tombai par hasard sur les \\\n",
+    "        _Mémoires de M. d’Artagnan_, imprimés,--comme la plus grande partie des \\\n",
+    "        ouvrages de cette époque, où les auteurs tenaient à dire la vérité sans \\\n",
+    "        aller faire un tour plus ou moins long à la Bastille,--à Amsterdam, \\\n",
+    "        chez Pierre Rouge. Le titre me séduisit: je les emportai chez moi, \\\n",
+    "        avec la permission de M. le conservateur, bien entendu, et je les \\\n",
+    "        dévorai.'\n",
+    "\n",
+    "\n",
+    "text = \"Un jeune homme...--traçons son portrait d’un seul trait de \\\n",
+    "plume:--figurez-vous don Quichotte à dix-huit ans; don Quichotte \\\n",
+    "décorselé, sans haubert et sans cuissards.\"\n",
+    "\n",
+    "\n",
+    "text = \"Nous avons pris possesion de 2 chambres cabines ayant les toilettes en commun: poussière et saleté étaient au rendez-vous!\"\n",
+    "\"\"\"\n",
+    "\n",
+    "text = \"Un jeune homme...--traçons son portrait d’un seul trait de \\\n",
+    "plume:--figurez-vous don Quichotte à dix-huit ans; don Quichotte \\\n",
+    "décorselé, sans haubert et sans cuissards. Un instant elle balança entre \\\n",
+    "les violettes et les glaïeuls que lui offrait l'ombrage des arbres de \\\n",
+    "Minerve, et les narcisses et les nymphéas qui s'élevaient sur les bords \\\n",
+    "du petit fleuve ou flottaient à sa surface; mais bientôt elle se décida \\\n",
+    "pour ceux-ci, et, bondissant comme un jeune faon, elle courut vers le \\\n",
+    "ruisseau.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "1218955c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After: ['Un jeune homme...--traçons son portrait d’un seul trait de plume:--figurez-vous don Quichotte à dix-huit ans; don Quichotte décorselé, sans haubert et sans cuissards.', \"Un instant elle balança entre les violettes et les glaïeuls que lui offrait l'ombrage de les arbres de Minerve, et les narcisses et les nymphéas qui s'élevaient sur les bords de le petit fleuve ou flottaient à sa surface; mais bientôt elle se décida pour ceux-ci, et, bondissant comme un jeune faon, elle courut vers le ruisseau.\"]\n"
+     ]
+    }
+   ],
+   "source": [
+    "doc1 = nlp(text)\n",
+    "sentences = [sent.text for sent in doc1.sents]\n",
+    "\n",
+    "\n",
+    "print(\"After:\", [sent.text for sent in doc1.sents])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "1b99da40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nlpdoc_to_conll(nlpdoc):\n",
+    "    \"\"\"Render a parsed spaCy Doc as a 10-column CoNLL-style string, one token per line.\"\"\"\n",
+    "    conll_lst = []\n",
+    "    \n",
+    "    # Preliminary: whole sentence\n",
+    "    whole_sentence = nlpdoc.text\n",
+    "    #print('# text =', whole_sentence)   \n",
+    "    \n",
+    "    # Columns: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC\n",
+    "    # NOTE(review): the ROOT token is written with head = its own 1-based index, not 0 as CoNLL-U requires.\n",
+    "    for token in nlpdoc:\n",
+    "        line_str = str(token.i +1) + \"\\t\" + str(token.text) + \"\\t\" + str(token.lemma_) \\\n",
+    "                              + \"\\t\" + str(token.pos_) + \"\\t\" + '_' + \"\\t\" \\\n",
+    "                              +  str(token.morph) + \"\\t\" + str(token.head.i + 1) + \"\\t\" \\\n",
+    "                              + str(token.dep_) + \"\\t\" + '_' + \"\\t\" + '_'\n",
+    "        conll_lst.append(line_str)\n",
+    "    \n",
+    "    conll_str = '\\n'.join(conll_lst)\n",
+    "    return conll_str\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "af17d1f7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"1\\tUn\\tun\\tDET\\t_\\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\\t3\\tdet\\t_\\t_\\n2\\tjeune\\tjeune\\tADJ\\t_\\tGender=Masc|Number=Sing\\t3\\tamod\\t_\\t_\\n3\\thomme\\thomme\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t6\\tnsubj\\t_\\t_\\n4\\t...\\t...\\tPUNCT\\t_\\t\\t3\\tpunct\\t_\\t_\\n5\\t--\\t--\\tPUNCT\\t_\\t\\t3\\tpunct\\t_\\t_\\n6\\ttraçons\\ttraçer\\tVERB\\t_\\tMood=Imp|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin\\t6\\tROOT\\t_\\t_\\n7\\tson\\tson\\tDET\\t_\\tGender=Masc|Number=Sing|Poss=Yes|PronType=Prs\\t8\\tdet\\t_\\t_\\n8\\tportrait\\tportrait\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t6\\tobj\\t_\\t_\\n9\\td’\\td’\\tPROPN\\t_\\t\\t8\\tappos\\t_\\t_\\n10\\tun\\tun\\tDET\\t_\\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\\t12\\tdet\\t_\\t_\\n11\\tseul\\tseul\\tADJ\\t_\\tGender=Masc|Number=Sing\\t12\\tamod\\t_\\t_\\n12\\ttrait\\ttrait\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t15\\tnsubj\\t_\\t_\\n13\\tde\\tde\\tADP\\t_\\t\\t14\\tcase\\t_\\t_\\n14\\tplume\\tplume\\tNOUN\\t_\\tGender=Fem|Number=Sing\\t12\\tnmod\\t_\\t_\\n15\\t:--figurez\\t:--figurer\\tVERB\\t_\\tMood=Ind|Number=Plur|Person=2|Tense=Pres|VerbForm=Fin\\t6\\tparataxis\\t_\\t_\\n16\\t-vous\\tvous\\tPRON\\t_\\tNumber=Plur|Person=2|PronType=Prs\\t15\\tobj\\t_\\t_\\n17\\tdon\\tdon\\tADP\\t_\\t\\t18\\tcase\\t_\\t_\\n18\\tQuichotte\\tQuichotte\\tPROPN\\t_\\t\\t16\\tnmod\\t_\\t_\\n19\\tà\\tà\\tADP\\t_\\t\\t21\\tcase\\t_\\t_\\n20\\tdix-huit\\tdix-huit\\tNUM\\t_\\t\\t21\\tnummod\\t_\\t_\\n21\\tans\\tan\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t18\\tnmod\\t_\\t_\\n22\\t;\\t;\\tPUNCT\\t_\\t\\t24\\tpunct\\t_\\t_\\n23\\tdon\\tdon\\tADP\\t_\\t\\t24\\tcase\\t_\\t_\\n24\\tQuichotte\\tQuichotte\\tPROPN\\t_\\t\\t6\\tobl\\t_\\t_\\n25\\tdécorselé\\tdécorseler\\tVERB\\t_\\tGender=Masc|Number=Sing|Tense=Past|VerbForm=Part\\t24\\tacl\\t_\\t_\\n26\\t,\\t,\\tPUNCT\\t_\\t\\t28\\tpunct\\t_\\t_\\n27\\tsans\\tsans\\tADP\\t_\\t\\t28\\tcase\\t_\\t_\\n28\\thaubert\\thaubert\\tNOUN\\t_\\tGender=Masc|Number=Sing\\
t6\\tobl\\t_\\t_\\n29\\tet\\tet\\tCCONJ\\t_\\t\\t31\\tcc\\t_\\t_\\n30\\tsans\\tsans\\tADP\\t_\\t\\t31\\tcase\\t_\\t_\\n31\\tcuissards\\tcuissard\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t28\\tconj\\t_\\t_\\n32\\t.\\t.\\tPUNCT\\t_\\t\\t6\\tpunct\\t_\\t_\\n33\\tUn\\tun\\tDET\\t_\\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\\t34\\tdet\\t_\\t_\\n34\\tinstant\\tinstant\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t36\\tobl:mod\\t_\\t_\\n35\\telle\\til\\tPRON\\t_\\tGender=Fem|Number=Sing|Person=3|PronType=Prs\\t36\\tnsubj\\t_\\t_\\n36\\tbalança\\tbalancer\\tVERB\\t_\\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\\t36\\tROOT\\t_\\t_\\n37\\tentre\\tentre\\tADP\\t_\\t\\t39\\tcase\\t_\\t_\\n38\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\\t39\\tdet\\t_\\t_\\n39\\tviolettes\\tviolette\\tNOUN\\t_\\tGender=Fem|Number=Plur\\t36\\tobl\\t_\\t_\\n40\\tet\\tet\\tCCONJ\\t_\\t\\t42\\tcc\\t_\\t_\\n41\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\\t42\\tdet\\t_\\t_\\n42\\tglaïeuls\\tglaïeul\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t39\\tconj\\t_\\t_\\n43\\tque\\tque\\tPRON\\t_\\tPronType=Rel\\t45\\tobj\\t_\\t_\\n44\\tlui\\tlui\\tPRON\\t_\\tNumber=Sing|Person=3|PronType=Prs\\t45\\tiobj\\t_\\t_\\n45\\toffrait\\toffrir\\tVERB\\t_\\tMood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin\\t42\\tacl:relcl\\t_\\t_\\n46\\tl'\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\\t47\\tdet\\t_\\t_\\n47\\tombrage\\tombrage\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t45\\tnsubj\\t_\\t_\\n48\\tde\\tde\\tADP\\t_\\t\\t50\\tcase\\t_\\t_\\n49\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\\t50\\tdet\\t_\\t_\\n50\\tarbres\\tarbre\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t47\\tnmod\\t_\\t_\\n51\\tde\\tde\\tADP\\t_\\t\\t52\\tcase\\t_\\t_\\n52\\tMinerve\\tMinerve\\tPROPN\\t_\\t\\t50\\tnmod\\t_\\t_\\n53\\t,\\t,\\tPUNCT\\t_\\t\\t56\\tpunct\\t_\\t_\\n54\\tet\\tet\\tCCONJ\\t_\\t\\t56\\tcc\\t_\\t_\\n55\\tles\\tle\\tDET\\t_\\tDe
finite=Def|Gender=Fem|Number=Plur|PronType=Art\\t56\\tdet\\t_\\t_\\n56\\tnarcisses\\tnarcisse\\tNOUN\\t_\\tGender=Fem|Number=Plur\\t39\\tconj\\t_\\t_\\n57\\tet\\tet\\tCCONJ\\t_\\t\\t59\\tcc\\t_\\t_\\n58\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\\t59\\tdet\\t_\\t_\\n59\\tnymphéas\\tnymphéa\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t56\\tconj\\t_\\t_\\n60\\tqui\\tqui\\tPRON\\t_\\tPronType=Rel\\t62\\tnsubj\\t_\\t_\\n61\\ts'\\tse\\tPRON\\t_\\tPerson=3|PronType=Prs\\t62\\texpl:comp\\t_\\t_\\n62\\télevaient\\télever\\tVERB\\t_\\tMood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin\\t59\\tacl:relcl\\t_\\t_\\n63\\tsur\\tsur\\tADP\\t_\\t\\t65\\tcase\\t_\\t_\\n64\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\\t65\\tdet\\t_\\t_\\n65\\tbords\\tbord\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t62\\tobl\\t_\\t_\\n66\\tde\\tde\\tADP\\t_\\t\\t69\\tcase\\t_\\t_\\n67\\tle\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\\t69\\tdet\\t_\\t_\\n68\\tpetit\\tpetit\\tADJ\\t_\\tGender=Masc|Number=Sing\\t69\\tamod\\t_\\t_\\n69\\tfleuve\\tfleuve\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t65\\tnmod\\t_\\t_\\n70\\tou\\tou\\tCCONJ\\t_\\t\\t71\\tcc\\t_\\t_\\n71\\tflottaient\\tflott\\tVERB\\t_\\tMood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin\\t62\\tconj\\t_\\t_\\n72\\tà\\tà\\tADP\\t_\\t\\t74\\tcase\\t_\\t_\\n73\\tsa\\tson\\tDET\\t_\\tGender=Fem|Number=Sing|Poss=Yes|PronType=Prs\\t74\\tdet\\t_\\t_\\n74\\tsurface\\tsurface\\tNOUN\\t_\\tGender=Fem|Number=Sing\\t71\\tobl:arg\\t_\\t_\\n75\\t;\\t;\\tPUNCT\\t_\\t\\t80\\tpunct\\t_\\t_\\n76\\tmais\\tmais\\tCCONJ\\t_\\t\\t80\\tcc\\t_\\t_\\n77\\tbientôt\\tbientôt\\tADV\\t_\\t\\t80\\tadvmod\\t_\\t_\\n78\\telle\\til\\tPRON\\t_\\tGender=Fem|Number=Sing|Person=3|PronType=Prs\\t80\\tnsubj\\t_\\t_\\n79\\tse\\tse\\tPRON\\t_\\tPerson=3|PronType=Prs\\t80\\tobj\\t_\\t_\\n80\\tdécida\\tdécider\\tVERB\\t_\\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\\t36\\tconj\\t_\\t_\\n81\\tpour\\tpour\\tAD
P\\t_\\t\\t82\\tcase\\t_\\t_\\n82\\tceux-ci\\tcelui-ci\\tPRON\\t_\\tGender=Masc|Number=Plur|PronType=Dem\\t80\\tobl\\t_\\t_\\n83\\t,\\t,\\tPUNCT\\t_\\t\\t86\\tpunct\\t_\\t_\\n84\\tet\\tet\\tCCONJ\\t_\\t\\t86\\tcc\\t_\\t_\\n85\\t,\\t,\\tPUNCT\\t_\\t\\t84\\tpunct\\t_\\t_\\n86\\tbondissant\\tbondisser\\tVERB\\t_\\tTense=Pres|VerbForm=Part\\t80\\tconj\\t_\\t_\\n87\\tcomme\\tcomme\\tADP\\t_\\t\\t90\\tcase\\t_\\t_\\n88\\tun\\tun\\tDET\\t_\\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\\t90\\tdet\\t_\\t_\\n89\\tjeune\\tjeune\\tADJ\\t_\\tGender=Masc|Number=Sing\\t90\\tamod\\t_\\t_\\n90\\tfaon\\tfaon\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t86\\tobl:mod\\t_\\t_\\n91\\t,\\t,\\tPUNCT\\t_\\t\\t93\\tpunct\\t_\\t_\\n92\\telle\\til\\tPRON\\t_\\tGender=Fem|Number=Sing|Person=3|PronType=Prs\\t93\\tnsubj\\t_\\t_\\n93\\tcourut\\tcourir\\tVERB\\t_\\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\\t80\\tconj\\t_\\t_\\n94\\tvers\\tvers\\tADP\\t_\\t\\t96\\tcase\\t_\\t_\\n95\\tle\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\\t96\\tdet\\t_\\t_\\n96\\truisseau\\truisseau\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t93\\tobl\\t_\\t_\\n97\\t.\\t.\\tPUNCT\\t_\\t\\t36\\tpunct\\t_\\t_\""
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlpdoc_to_conll(doc1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "5d7c6586",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def apply_special_conditions(tok_string):  # normalize punctuation before (re)tokenization; '_' marks italics in the Gutenberg texts\n",
+    "    tok_string = tok_string.replace(\"’\", \"'\").replace(\"_\", '\"')\\\n",
+    "                  .replace(\":--\", \" : - \")\\\n",
+    "                  .replace(\"...\", \" ...\").replace(\"!\", \" !\")\n",
+    "    tok_string = tok_string.replace(\"d' Artagnan\", \"d'Artagnan\")  # re-join this over-split elision\n",
+    "    return tok_string\n",
+    "    \n",
+    "\n",
+    "def create_tokenized_string(sent_string):  # returns (space-joined token string, CoNLL string) for one sentence\n",
+    "    sent_string = apply_special_conditions(sent_string)\n",
+    "    doc_tokenized = nlp(sent_string)  # full UDPipe pipeline: tokenize + tag + parse\n",
+    "    tokens_list = [ ]\n",
+    "    for token in doc_tokenized:\n",
+    "        tokens_list.append(token.text)\n",
+    "    \n",
+    "    text_tokenized = ' '.join(tokens_list)\n",
+    "    #text_tokenized = apply_special_conditions(text_tokenized)\n",
+    "    \n",
+    "    conll_doc = nlpdoc_to_conll(doc_tokenized)  # helper defined in an earlier cell\n",
+    "\n",
+    "    return text_tokenized, conll_doc\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "5bfce4b4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Un jeune homme ... -- traçons son portrait d' un seul trait de plume : - figurez -vous don Quichotte à dix-huit ans ; don Quichotte décorselé , sans haubert et sans cuissards .\n",
+      "1\tUn\tun\tDET\t_\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\t3\tdet\t_\t_\n",
+      "2\tjeune\tjeune\tADJ\t_\tGender=Masc|Number=Sing\t3\tamod\t_\t_\n",
+      "3\thomme\thomme\tNOUN\t_\tGender=Masc|Number=Sing\t6\tnsubj\t_\t_\n",
+      "4\t...\t...\tPUNCT\t_\t\t3\tpunct\t_\t_\n",
+      "5\t--\t--\tPUNCT\t_\t\t3\tpunct\t_\t_\n",
+      "6\ttraçons\ttraçer\tVERB\t_\tMood=Imp|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin\t6\tROOT\t_\t_\n",
+      "7\tson\tson\tDET\t_\tGender=Masc|Number=Sing|Poss=Yes|PronType=Prs\t8\tdet\t_\t_\n",
+      "8\tportrait\tportrait\tNOUN\t_\tGender=Masc|Number=Sing\t6\tobj\t_\t_\n",
+      "9\td'\tde\tADP\t_\t\t12\tcase\t_\t_\n",
+      "10\tun\tun\tDET\t_\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\t12\tdet\t_\t_\n",
+      "11\tseul\tseul\tADJ\t_\tGender=Masc|Number=Sing\t12\tamod\t_\t_\n",
+      "12\ttrait\ttrait\tNOUN\t_\tGender=Masc|Number=Sing\t8\tnmod\t_\t_\n",
+      "13\tde\tde\tADP\t_\t\t14\tcase\t_\t_\n",
+      "14\tplume\tplume\tNOUN\t_\tGender=Fem|Number=Sing\t12\tnmod\t_\t_\n",
+      "15\t:\t:\tPUNCT\t_\t\t17\tpunct\t_\t_\n",
+      "16\t-\t-\tPUNCT\t_\t\t17\tpunct\t_\t_\n",
+      "17\tfigurez\tfigurer\tVERB\t_\tMood=Imp|Number=Plur|Person=2|Tense=Pres|VerbForm=Fin\t6\tparataxis\t_\t_\n",
+      "18\t-vous\tle\tPRON\t_\tNumber=Plur|Person=2|PronType=Prs\t17\tnsubj\t_\t_\n",
+      "19\tdon\tdon\tADP\t_\t\t20\tcase\t_\t_\n",
+      "20\tQuichotte\tQuichotte\tPROPN\t_\t\t17\tobl:arg\t_\t_\n",
+      "21\tà\tà\tADP\t_\t\t23\tcase\t_\t_\n",
+      "22\tdix-huit\tdix-huit\tNUM\t_\t\t23\tnummod\t_\t_\n",
+      "23\tans\tan\tNOUN\t_\tGender=Masc|Number=Plur\t20\tnmod\t_\t_\n",
+      "24\t;\t;\tPUNCT\t_\t\t26\tpunct\t_\t_\n",
+      "25\tdon\tdon\tADP\t_\t\t26\tcase\t_\t_\n",
+      "26\tQuichotte\tQuichotte\tPROPN\t_\t\t20\tnmod\t_\t_\n",
+      "27\tdécorselé\tdécorseler\tVERB\t_\tGender=Masc|Number=Sing|Tense=Past|VerbForm=Part\t26\tacl\t_\t_\n",
+      "28\t,\t,\tPUNCT\t_\t\t30\tpunct\t_\t_\n",
+      "29\tsans\tsans\tADP\t_\t\t30\tcase\t_\t_\n",
+      "30\thaubert\thaubert\tNOUN\t_\tGender=Masc|Number=Sing\t20\tnmod\t_\t_\n",
+      "31\tet\tet\tCCONJ\t_\t\t33\tcc\t_\t_\n",
+      "32\tsans\tsans\tADP\t_\t\t33\tcase\t_\t_\n",
+      "33\tcuissards\tcuissard\tNOUN\t_\tGender=Masc|Number=Plur\t30\tconj\t_\t_\n",
+      "34\t.\t.\tPUNCT\t_\t\t6\tpunct\t_\t_\n",
+      "Un instant elle balança entre les violettes et les glaïeuls que lui offrait l' ombrage de les arbres de Minerve , et les narcisses et les nymphéas qui s' élevaient sur les bords de le petit fleuve ou flottaient à sa surface ; mais bientôt elle se décida pour ceux-ci , et , bondissant comme un jeune faon , elle courut vers le ruisseau .\n",
+      "1\tUn\tun\tDET\t_\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\t2\tdet\t_\t_\n",
+      "2\tinstant\tinstant\tNOUN\t_\tGender=Masc|Number=Sing\t4\tobl:mod\t_\t_\n",
+      "3\telle\til\tPRON\t_\tGender=Fem|Number=Sing|Person=3|PronType=Prs\t4\tnsubj\t_\t_\n",
+      "4\tbalança\tbalancer\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t4\tROOT\t_\t_\n",
+      "5\tentre\tentre\tADP\t_\t\t7\tcase\t_\t_\n",
+      "6\tles\tle\tDET\t_\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\t7\tdet\t_\t_\n",
+      "7\tviolettes\tviolette\tNOUN\t_\tGender=Fem|Number=Plur\t4\tobl\t_\t_\n",
+      "8\tet\tet\tCCONJ\t_\t\t10\tcc\t_\t_\n",
+      "9\tles\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\t10\tdet\t_\t_\n",
+      "10\tglaïeuls\tglaïeul\tNOUN\t_\tGender=Masc|Number=Plur\t7\tconj\t_\t_\n",
+      "11\tque\tque\tPRON\t_\tPronType=Rel\t13\tobj\t_\t_\n",
+      "12\tlui\tlui\tPRON\t_\tNumber=Sing|Person=3|PronType=Prs\t13\tiobj\t_\t_\n",
+      "13\toffrait\toffrir\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin\t10\tacl:relcl\t_\t_\n",
+      "14\tl'\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t15\tdet\t_\t_\n",
+      "15\tombrage\tombrage\tNOUN\t_\tGender=Masc|Number=Sing\t13\tnsubj\t_\t_\n",
+      "16\tde\tde\tADP\t_\t\t18\tcase\t_\t_\n",
+      "17\tles\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\t18\tdet\t_\t_\n",
+      "18\tarbres\tarbre\tNOUN\t_\tGender=Masc|Number=Plur\t15\tnmod\t_\t_\n",
+      "19\tde\tde\tADP\t_\t\t20\tcase\t_\t_\n",
+      "20\tMinerve\tMinerve\tPROPN\t_\t\t18\tnmod\t_\t_\n",
+      "21\t,\t,\tPUNCT\t_\t\t24\tpunct\t_\t_\n",
+      "22\tet\tet\tCCONJ\t_\t\t24\tcc\t_\t_\n",
+      "23\tles\tle\tDET\t_\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\t24\tdet\t_\t_\n",
+      "24\tnarcisses\tnarcisse\tNOUN\t_\tGender=Fem|Number=Plur\t7\tconj\t_\t_\n",
+      "25\tet\tet\tCCONJ\t_\t\t27\tcc\t_\t_\n",
+      "26\tles\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\t27\tdet\t_\t_\n",
+      "27\tnymphéas\tnymphéa\tNOUN\t_\tGender=Masc|Number=Plur\t24\tconj\t_\t_\n",
+      "28\tqui\tqui\tPRON\t_\tPronType=Rel\t30\tnsubj\t_\t_\n",
+      "29\ts'\tse\tPRON\t_\tPerson=3|PronType=Prs\t30\texpl:comp\t_\t_\n",
+      "30\télevaient\télever\tVERB\t_\tMood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin\t27\tacl:relcl\t_\t_\n",
+      "31\tsur\tsur\tADP\t_\t\t33\tcase\t_\t_\n",
+      "32\tles\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\t33\tdet\t_\t_\n",
+      "33\tbords\tbord\tNOUN\t_\tGender=Masc|Number=Plur\t30\tobl\t_\t_\n",
+      "34\tde\tde\tADP\t_\t\t37\tcase\t_\t_\n",
+      "35\tle\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t37\tdet\t_\t_\n",
+      "36\tpetit\tpetit\tADJ\t_\tGender=Masc|Number=Sing\t37\tamod\t_\t_\n",
+      "37\tfleuve\tfleuve\tNOUN\t_\tGender=Masc|Number=Sing\t33\tnmod\t_\t_\n",
+      "38\tou\tou\tCCONJ\t_\t\t39\tcc\t_\t_\n",
+      "39\tflottaient\tflott\tVERB\t_\tMood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin\t30\tconj\t_\t_\n",
+      "40\tà\tà\tADP\t_\t\t42\tcase\t_\t_\n",
+      "41\tsa\tson\tDET\t_\tGender=Fem|Number=Sing|Poss=Yes|PronType=Prs\t42\tdet\t_\t_\n",
+      "42\tsurface\tsurface\tNOUN\t_\tGender=Fem|Number=Sing\t39\tobl:arg\t_\t_\n",
+      "43\t;\t;\tPUNCT\t_\t\t48\tpunct\t_\t_\n",
+      "44\tmais\tmais\tCCONJ\t_\t\t48\tcc\t_\t_\n",
+      "45\tbientôt\tbientôt\tADV\t_\t\t48\tadvmod\t_\t_\n",
+      "46\telle\til\tPRON\t_\tGender=Fem|Number=Sing|Person=3|PronType=Prs\t48\tnsubj\t_\t_\n",
+      "47\tse\tse\tPRON\t_\tPerson=3|PronType=Prs\t48\tobj\t_\t_\n",
+      "48\tdécida\tdécider\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t4\tconj\t_\t_\n",
+      "49\tpour\tpour\tADP\t_\t\t50\tcase\t_\t_\n",
+      "50\tceux-ci\tcelui-ci\tPRON\t_\tGender=Masc|Number=Plur|PronType=Dem\t48\tobl\t_\t_\n",
+      "51\t,\t,\tPUNCT\t_\t\t54\tpunct\t_\t_\n",
+      "52\tet\tet\tCCONJ\t_\t\t54\tcc\t_\t_\n",
+      "53\t,\t,\tPUNCT\t_\t\t52\tpunct\t_\t_\n",
+      "54\tbondissant\tbondisser\tVERB\t_\tTense=Pres|VerbForm=Part\t48\tconj\t_\t_\n",
+      "55\tcomme\tcomme\tADP\t_\t\t58\tcase\t_\t_\n",
+      "56\tun\tun\tDET\t_\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\t58\tdet\t_\t_\n",
+      "57\tjeune\tjeune\tADJ\t_\tGender=Masc|Number=Sing\t58\tamod\t_\t_\n",
+      "58\tfaon\tfaon\tNOUN\t_\tGender=Masc|Number=Sing\t54\tobl:mod\t_\t_\n",
+      "59\t,\t,\tPUNCT\t_\t\t61\tpunct\t_\t_\n",
+      "60\telle\til\tPRON\t_\tGender=Fem|Number=Sing|Person=3|PronType=Prs\t61\tnsubj\t_\t_\n",
+      "61\tcourut\tcourir\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t48\tconj\t_\t_\n",
+      "62\tvers\tvers\tADP\t_\t\t64\tcase\t_\t_\n",
+      "63\tle\tle\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t64\tdet\t_\t_\n",
+      "64\truisseau\truisseau\tNOUN\t_\tGender=Masc|Number=Sing\t61\tobl\t_\t_\n",
+      "65\t.\t.\tPUNCT\t_\t\t4\tpunct\t_\t_\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "for sent in sentences:\n",
+    "    tok_text, conll_text = create_tokenized_string(sent)\n",
+    "    print(tok_text)\n",
+    "    print(conll_text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "41d4b38f",
+   "metadata": {},
+   "source": [
+    "### Store the cleaned tokenized text sentence-wise in a file\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "65825f08",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def chunk_file(name, lines_per_chunk, chunks_per_file):  # tokenize one raw text file; writes '<title>.tok' and '<title>.tok.conll'\n",
+    "    \n",
+    "    booktitle = name.split(\"texts/\")[1].replace('.txt', '')  # assumes the path contains 'texts/'; IndexError otherwise\n",
+    "    print(booktitle)\n",
+    "    \n",
+    "    target_folder = '../data/gutenberg/tokenized_texts/'\n",
+    "    target_file_path = target_folder + booktitle + '.tok'\n",
+    "\n",
+    "    #os.makedirs(os.path.dirname(target_folder), exist_ok=True)\n",
+    "    \n",
+    "    #if os.path.exists(target_file_path): os.remove(target_file_path)\n",
+    "    if os.path.exists(target_file_path + '.conll'): os.remove(target_file_path + '.conll')  # only the '.conll' file is reset\n",
+    "    \n",
+    "    outfile = open(target_file_path, \"a\")  # NOTE(review): append mode without removing the old '.tok' first -- re-runs duplicate lines\n",
+    "    outfile_conll = open(target_file_path + '.conll', \"a\")\n",
+    "    \n",
+    "    def bad_chunk(chunk_text):  # True for boilerplate chunks; implicitly returns None (falsy) otherwise\n",
+    "        unwanted_start_of_string = (\"Alexandre Dumas\", u\"ACTÉ\", \n",
+    "                                    \"Chapitre\", \"(18\", u\"Préface\",\n",
+    "                                    u\"Table des matières\", u\"_Résumé_\", \n",
+    "                                   \"[ Illustration\", \"[Illustration\")     \n",
+    "        if chunk_text.startswith(unwanted_start_of_string):\n",
+    "            return True\n",
+    "\n",
+    "    def write_chunk(chunk_no, chunk):  # chunk_no is currently unused here\n",
+    "        chunk_text = ' '.join(chunk).replace('\\n', ' ').strip()\n",
+    "        if len(chunk_text) > 0 and not bad_chunk(chunk_text):\n",
+    "            doc = nlp(chunk_text)            \n",
+    "            sentences = [sent.text.strip() for sent in doc.sents]\n",
+    "            for sent in sentences:\n",
+    "                tokenized_sent, conll_doc = create_tokenized_string(sent)  # helper from an earlier cell\n",
+    "                #print(tokenized_sent)\n",
+    "                #print(conll_doc)\n",
+    "                outfile.write(tokenized_sent + '\\n')\n",
+    "                outfile_conll.write(conll_doc + '\\n\\n')\n",
+    "\n",
+    "    count, chunk_no, chunk_count, chunk = 1, 1, 0, []\n",
+    "    with open(name, \"r\") as f:\n",
+    "        for row in f:\n",
+    "            if count > lines_per_chunk and row == \"\\n\":  # a blank line after enough lines closes the current chunk\n",
+    "                chunk_count += 1\n",
+    "                count = 1\n",
+    "                chunk.append(\"\\n\")\n",
+    "                if chunk_count == chunks_per_file:\n",
+    "                    write_chunk(chunk_no, chunk)\n",
+    "                    chunk = []\n",
+    "                    chunk_count = 0\n",
+    "                    chunk_no += 1\n",
+    "            else:\n",
+    "                count += 1\n",
+    "                chunk.append(row)\n",
+    "    if chunk:\n",
+    "        write_chunk(chunk_no, chunk)\n",
+    "    outfile.close()\n",
+    "    outfile_conll.close()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "95943233",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Vingt_ans_après\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "import glob\n",
+    "\n",
+    "\n",
+    "dir_with_raw_files = glob.glob('../data/gutenberg/raw_texts/Vingt*')  # glob.glob already returns a list\n",
+    "\n",
+    "for raw_file_path in dir_with_raw_files:\n",
+    "    chunk_file(raw_file_path, 1, 1)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f8ccc794",
+   "metadata": {},
+   "source": [
+    "## Process pre-tokenized texts with stanza and create train dev test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "eb6ca7ba",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-01-09 17:11:12 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n",
+      "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 26.7MB/s]                    \n",
+      "2025-01-09 17:11:15 INFO: Loading these models for language: fr (French):\n",
+      "=================================\n",
+      "| Processor | Package           |\n",
+      "---------------------------------\n",
+      "| tokenize  | combined          |\n",
+      "| pos       | combined_charlm   |\n",
+      "| lemma     | combined_nocharlm |\n",
+      "| depparse  | combined_charlm   |\n",
+      "=================================\n",
+      "\n",
+      "2025-01-09 17:11:15 INFO: Using device: cuda\n",
+      "2025-01-09 17:11:15 INFO: Loading: tokenize\n",
+      "2025-01-09 17:11:15 INFO: Loading: pos\n",
+      "2025-01-09 17:11:19 INFO: Loading: lemma\n",
+      "2025-01-09 17:11:19 INFO: Loading: depparse\n",
+      "2025-01-09 17:11:19 INFO: Done loading processors!\n",
+      "2025-01-09 17:11:19 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n",
+      "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 18.9MB/s]                    \n",
+      "2025-01-09 17:11:20 INFO: Loading these models for language: fr (French):\n",
+      "=================================\n",
+      "| Processor | Package           |\n",
+      "---------------------------------\n",
+      "| tokenize  | combined          |\n",
+      "| pos       | combined_charlm   |\n",
+      "| lemma     | combined_nocharlm |\n",
+      "| depparse  | combined_charlm   |\n",
+      "=================================\n",
+      "\n",
+      "2025-01-09 17:11:20 INFO: Using device: cuda\n",
+      "2025-01-09 17:11:20 INFO: Loading: tokenize\n",
+      "2025-01-09 17:11:20 INFO: Loading: pos\n",
+      "2025-01-09 17:11:21 INFO: Loading: lemma\n",
+      "2025-01-09 17:11:21 INFO: Loading: depparse\n",
+      "2025-01-09 17:11:21 INFO: Done loading processors!\n"
+     ]
+    }
+   ],
+   "source": [
+    "import stanza\n",
+    "\n",
+    "#stanza.download(\"fr\")\n",
+    "\n",
+    "nlp_tokenized = stanza.Pipeline(lang='fr', processors='tokenize, pos, lemma, depparse', tokenize_pretokenized=True)\n",
+    "nlp_pos = nlp_tokenized  # identical configuration; alias instead of loading all models a second time\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "ad525136",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# text = Je mange des pommes .\n",
+      "# sent_id = 0\n",
+      "1\tJe\tmoi\tPRON\t_\tEmph=No|Number=Sing|Person=1|PronType=Prs\t2\tnsubj\t_\tstart_char=0|end_char=2\n",
+      "2\tmange\tmanger\tVERB\t_\tMood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin\t0\troot\t_\tstart_char=3|end_char=8\n",
+      "3\tdes\tun\tDET\t_\tDefinite=Ind|Number=Plur|PronType=Art\t4\tdet\t_\tstart_char=9|end_char=12\n",
+      "4\tpommes\tpomme\tNOUN\t_\tGender=Fem|Number=Plur\t2\tobj\t_\tstart_char=13|end_char=19\n",
+      "5\t.\t.\tPUNCT\t_\t_\t2\tpunct\t_\tstart_char=20|end_char=21\n",
+      "\n",
+      "[\n",
+      "  {\n",
+      "    \"id\": 1,\n",
+      "    \"text\": \"Je\",\n",
+      "    \"lemma\": \"moi\",\n",
+      "    \"upos\": \"PRON\",\n",
+      "    \"feats\": \"Emph=No|Number=Sing|Person=1|PronType=Prs\",\n",
+      "    \"head\": 2,\n",
+      "    \"deprel\": \"nsubj\",\n",
+      "    \"misc\": \"\",\n",
+      "    \"start_char\": 0,\n",
+      "    \"end_char\": 2\n",
+      "  },\n",
+      "  {\n",
+      "    \"id\": 2,\n",
+      "    \"text\": \"mange\",\n",
+      "    \"lemma\": \"manger\",\n",
+      "    \"upos\": \"VERB\",\n",
+      "    \"feats\": \"Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin\",\n",
+      "    \"head\": 0,\n",
+      "    \"deprel\": \"root\",\n",
+      "    \"misc\": \"\",\n",
+      "    \"start_char\": 3,\n",
+      "    \"end_char\": 8\n",
+      "  },\n",
+      "  {\n",
+      "    \"id\": 3,\n",
+      "    \"text\": \"des\",\n",
+      "    \"lemma\": \"un\",\n",
+      "    \"upos\": \"DET\",\n",
+      "    \"feats\": \"Definite=Ind|Number=Plur|PronType=Art\",\n",
+      "    \"head\": 4,\n",
+      "    \"deprel\": \"det\",\n",
+      "    \"misc\": \"\",\n",
+      "    \"start_char\": 9,\n",
+      "    \"end_char\": 12\n",
+      "  },\n",
+      "  {\n",
+      "    \"id\": 4,\n",
+      "    \"text\": \"pommes\",\n",
+      "    \"lemma\": \"pomme\",\n",
+      "    \"upos\": \"NOUN\",\n",
+      "    \"feats\": \"Gender=Fem|Number=Plur\",\n",
+      "    \"head\": 2,\n",
+      "    \"deprel\": \"obj\",\n",
+      "    \"misc\": \"\",\n",
+      "    \"start_char\": 13,\n",
+      "    \"end_char\": 19\n",
+      "  },\n",
+      "  {\n",
+      "    \"id\": 5,\n",
+      "    \"text\": \".\",\n",
+      "    \"lemma\": \".\",\n",
+      "    \"upos\": \"PUNCT\",\n",
+      "    \"head\": 2,\n",
+      "    \"deprel\": \"punct\",\n",
+      "    \"misc\": \"\",\n",
+      "    \"start_char\": 20,\n",
+      "    \"end_char\": 21\n",
+      "  }\n",
+      "]\n"
+     ]
+    }
+   ],
+   "source": [
+    "text = \"Je mange des pommes .\"\n",
+    "# One pipeline call already runs tokenize, pos, lemma and depparse; the\n",
+    "# original second pass (nlp_pos over an already-annotated Document) was\n",
+    "# redundant duplicate work with an identical pipeline.\n",
+    "doc = nlp_pos(text)\n",
+    "print(\"{:C}\".format(doc) + '\\n')\n",
+    "\n",
+    "for i, sentence in enumerate(doc.sentences):\n",
+    "    print(sentence)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "e8f70a45",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Vingt_ans_après\n",
+      "14450\n",
+      "1784\n",
+      "1606\n"
+     ]
+    }
+   ],
+   "source": [
+    "import glob\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "\n",
+    "# Currently a single hard-coded book; glob keeps the loop ready for a pattern.\n",
+    "dir_with_pretokenized_files = [x for x in glob.glob('../data/gutenberg/tokenized_texts/Vingt_ans_après.tok')]\n",
+    "\n",
+    "for tokenized_file_path in dir_with_pretokenized_files:\n",
+    "\n",
+    "    booktitle = tokenized_file_path.split(\"texts/\")[1].replace('.tok', '')\n",
+    "    print(booktitle)\n",
+    "\n",
+    "    with open(tokenized_file_path, 'r') as tokfile:\n",
+    "        documents = tokfile.readlines()\n",
+    "        # 81% train / 9% val / 10% test; shuffle=False keeps document order.\n",
+    "        X_train, X_test = train_test_split(documents, test_size=0.1, shuffle=False, stratify=None)\n",
+    "        X_train, X_val = train_test_split(X_train, test_size=0.1, shuffle=False, stratify=None)\n",
+    "\n",
+    "    target_folder = '../data/gutenberg/tokenized_texts/'\n",
+    "    target_file_path = target_folder + booktitle + '.tok'\n",
+    "\n",
+    "    # One code path for all three splits (the original had three copy-pasted\n",
+    "    # stanzas and left file handles open if annotation raised). The order\n",
+    "    # train, test, dev reproduces the original printed counts.\n",
+    "    for split_suffix, split_lines in [('train', X_train), ('test', X_test), ('dev', X_val)]:\n",
+    "        # Wrap each pretokenized line in a stanza.Document, then batch-annotate.\n",
+    "        in_docs = [stanza.Document([], text=d) for d in split_lines]\n",
+    "        out_docs = nlp_pos(in_docs)\n",
+    "        print(len(out_docs))\n",
+    "        with open(target_file_path + '.' + split_suffix + '.conll', 'w') as outfile_conll:\n",
+    "            for out_doc in out_docs:\n",
+    "                outfile_conll.write(\"{:C}\".format(out_doc) + '\\n\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b50d131",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbedfcb1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add an empty line before each # text =\n",
+    "\n",
+    "#%cd ..\n",
+    "#%cd data/gutenberg/tokenized_texts\n",
+    "#! sed -i 's/# text =/\\n\\n# text =/g' *.conll\n",
+    "#%cd ..\n",
+    "#%cd ..\n",
+    "#%cd ..\n",
+    "#%cd notebooks\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "960bb524",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!ls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3294ef16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# concatenate the files for the train, dev, and test split  and move them to the incpar folder\n",
+    "#%cd ..\n",
+    "#%cd data/gutenberg/tokenized_texts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a56e036",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#! cat *.tok.dev.conll > gutenberg_dumas_dev.tok.conll\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f216cc3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#! cat *.tok.train.conll > gutenberg_dumas_train.tok.conll\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "880e80d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#! cat *.tok.test.conll > gutenberg_dumas_test.tok.conll\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea3c1380",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#%cd ..\n",
+    "#%cd ..\n",
+    "#%cd ..\n",
+    "#%cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "700586eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!mv /home/tatiana.bladier/tokenization-experiments/data/gutenberg/tokenized_texts/gutenberg_dumas_dev.tok.conll incpar/data/gutenberg_dumas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43fe6a94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!mv /home/tatiana.bladier/tokenization-experiments/data/gutenberg/tokenized_texts/gutenberg_dumas_test.tok.conll incpar/data/gutenberg_dumas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82dd0717",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!mv /home/tatiana.bladier/tokenization-experiments/data/gutenberg/tokenized_texts/gutenberg_dumas_train.tok.conll incpar/data/gutenberg_dumas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7fbce7a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!cp /home/tatiana.bladier/incpar/data/gutenberg_dumas/gutenberg_dumas_dev.tok.conll compo-gpt-model/data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "45e652a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!cp /home/tatiana.bladier/incpar/data/gutenberg_dumas/gutenberg_dumas_test.tok.conll compo-gpt-model/data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0078cbb5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!cp /home/tatiana.bladier/incpar/data/gutenberg_dumas/gutenberg_dumas_train.tok.conll compo-gpt-model/data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b4a088b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/gutenberg-download.ipynb b/gutenberg-download.ipynb
index 6d231391cf8bd1ea2f2df4a6a6500344ec45c411..b8bfb9df251d3705815f7180468dafeb8fb4ddae 100644
--- a/gutenberg-download.ipynb
+++ b/gutenberg-download.ipynb
@@ -34,7 +34,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "id": "e16128ee",
    "metadata": {},
    "outputs": [],
@@ -44,7 +44,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "9ea15819",
    "metadata": {},
    "outputs": [],
@@ -58,7 +58,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "c01a6cb4",
    "metadata": {},
    "outputs": [
@@ -224,7 +224,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "1d6385f5",
    "metadata": {},
    "outputs": [
@@ -389,7 +389,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "id": "78b86a1c",
    "metadata": {},
    "outputs": [
@@ -399,7 +399,7 @@
        "' create Gutenberg database\\nAttention! This needs to be done just once for an OS and it roughly takes 20 minutes '"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -415,7 +415,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "id": "ca9f49f5",
    "metadata": {},
    "outputs": [],
@@ -426,7 +426,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "id": "52b3bb4a",
    "metadata": {},
    "outputs": [
@@ -446,7 +446,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "id": "f81f5dfc",
    "metadata": {},
    "outputs": [
@@ -467,7 +467,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "id": "fdf01c69",
    "metadata": {},
    "outputs": [
@@ -487,7 +487,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "id": "a31b4e73",
    "metadata": {},
    "outputs": [
@@ -507,7 +507,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "id": "4d85e390",
    "metadata": {},
    "outputs": [
@@ -527,7 +527,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "id": "63e2231a",
    "metadata": {},
    "outputs": [
@@ -547,7 +547,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "id": "bb4dc1f4",
    "metadata": {},
    "outputs": [
@@ -567,7 +567,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "id": "c1a748a6",
    "metadata": {},
    "outputs": [
diff --git a/gutenberg-preprocess-data.ipynb b/gutenberg-preprocess-data.ipynb
index 6fa01e12ea9b2013e504ea045c566466ff5a470a..216a3d41113d1e4f03ad9ae5d789c023dcda1326 100644
--- a/gutenberg-preprocess-data.ipynb
+++ b/gutenberg-preprocess-data.ipynb
@@ -10,12 +10,134 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "12f2533d",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Defaulting to user installation because normal site-packages is not writeable\n",
+      "Collecting spacy-udpipe\n",
+      "  Using cached spacy_udpipe-1.0.0-py3-none-any.whl.metadata (5.5 kB)\n",
+      "Collecting spacy<4.0.0,>=3.0.0 (from spacy-udpipe)\n",
+      "  Downloading spacy-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)\n",
+      "Collecting ufal.udpipe>=1.2.0 (from spacy-udpipe)\n",
+      "  Downloading ufal.udpipe-1.3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)\n",
+      "Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)\n",
+      "Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)\n",
+      "Collecting murmurhash<1.1.0,>=0.28.0 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading murmurhash-1.0.12-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)\n",
+      "Collecting cymem<2.1.0,>=2.0.2 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading cymem-2.0.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.5 kB)\n",
+      "Collecting preshed<3.1.0,>=3.0.2 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading preshed-3.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)\n",
+      "Collecting thinc<8.4.0,>=8.3.4 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading thinc-8.3.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)\n",
+      "Collecting wasabi<1.2.0,>=0.9.1 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)\n",
+      "Collecting srsly<3.0.0,>=2.4.3 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading srsly-2.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)\n",
+      "Collecting catalogue<2.1.0,>=2.0.6 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading catalogue-2.0.10-py3-none-any.whl.metadata (14 kB)\n",
+      "Collecting weasel<0.5.0,>=0.1.0 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading weasel-0.4.1-py3-none-any.whl.metadata (4.6 kB)\n",
+      "Collecting typer<1.0.0,>=0.3.0 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading typer-0.15.3-py3-none-any.whl.metadata (15 kB)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/tatiana.bladier/.local/lib/python3.11/site-packages (from spacy<4.0.0,>=3.0.0->spacy-udpipe) (4.67.1)\n",
+      "Requirement already satisfied: numpy>=1.19.0 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from spacy<4.0.0,>=3.0.0->spacy-udpipe) (2.2.4)\n",
+      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from spacy<4.0.0,>=3.0.0->spacy-udpipe) (2.32.3)\n",
+      "Collecting pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading pydantic-2.11.4-py3-none-any.whl.metadata (66 kB)\n",
+      "Requirement already satisfied: jinja2 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from spacy<4.0.0,>=3.0.0->spacy-udpipe) (3.1.6)\n",
+      "Requirement already satisfied: setuptools in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from spacy<4.0.0,>=3.0.0->spacy-udpipe) (75.8.0)\n",
+      "Requirement already satisfied: packaging>=20.0 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from spacy<4.0.0,>=3.0.0->spacy-udpipe) (24.2)\n",
+      "Collecting langcodes<4.0.0,>=3.2.0 (from spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading langcodes-3.5.0-py3-none-any.whl.metadata (29 kB)\n",
+      "Collecting language-data>=1.2 (from langcodes<4.0.0,>=3.2.0->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading language_data-1.3.0-py3-none-any.whl.metadata (4.3 kB)\n",
+      "Collecting annotated-types>=0.6.0 (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n",
+      "Collecting pydantic-core==2.33.2 (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)\n",
+      "Requirement already satisfied: typing-extensions>=4.12.2 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.0.0->spacy-udpipe) (4.13.2)\n",
+      "Collecting typing-inspection>=0.4.0 (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading typing_inspection-0.4.0-py3-none-any.whl.metadata (2.6 kB)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.0.0->spacy-udpipe) (3.4.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.0.0->spacy-udpipe) (3.10)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.0.0->spacy-udpipe) (2.4.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.0.0->spacy-udpipe) (2025.1.31)\n",
+      "Collecting blis<1.4.0,>=1.3.0 (from thinc<8.4.0,>=8.3.4->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading blis-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)\n",
+      "Collecting confection<1.0.0,>=0.0.1 (from thinc<8.4.0,>=8.3.4->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading confection-0.1.5-py3-none-any.whl.metadata (19 kB)\n",
+      "Requirement already satisfied: click>=8.0.0 in /home/tatiana.bladier/.local/lib/python3.11/site-packages (from typer<1.0.0,>=0.3.0->spacy<4.0.0,>=3.0.0->spacy-udpipe) (8.1.8)\n",
+      "Collecting shellingham>=1.3.0 (from typer<1.0.0,>=0.3.0->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading shellingham-1.5.4-py2.py3-none-any.whl.metadata (3.5 kB)\n",
+      "Collecting rich>=10.11.0 (from typer<1.0.0,>=0.3.0->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading rich-14.0.0-py3-none-any.whl.metadata (18 kB)\n",
+      "Collecting cloudpathlib<1.0.0,>=0.7.0 (from weasel<0.5.0,>=0.1.0->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading cloudpathlib-0.21.0-py3-none-any.whl.metadata (14 kB)\n",
+      "Collecting smart-open<8.0.0,>=5.2.1 (from weasel<0.5.0,>=0.1.0->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from jinja2->spacy<4.0.0,>=3.0.0->spacy-udpipe) (3.0.2)\n",
+      "Collecting marisa-trie>=1.1.0 (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading marisa_trie-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)\n",
+      "Collecting markdown-it-py>=2.2.0 (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n",
+      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<4.0.0,>=3.0.0->spacy-udpipe) (2.19.1)\n",
+      "Collecting wrapt (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)\n",
+      "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<4.0.0,>=3.0.0->spacy-udpipe)\n",
+      "  Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n",
+      "Using cached spacy_udpipe-1.0.0-py3-none-any.whl (11 kB)\n",
+      "Downloading spacy-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m30.6/30.6 MB\u001b[0m \u001b[31m70.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading ufal.udpipe-1.3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (936 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m936.8/936.8 kB\u001b[0m \u001b[31m15.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading catalogue-2.0.10-py3-none-any.whl (17 kB)\n",
+      "Downloading cymem-2.0.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (218 kB)\n",
+      "Downloading langcodes-3.5.0-py3-none-any.whl (182 kB)\n",
+      "Downloading murmurhash-1.0.12-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (134 kB)\n",
+      "Downloading preshed-3.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (157 kB)\n",
+      "Downloading pydantic-2.11.4-py3-none-any.whl (443 kB)\n",
+      "Downloading pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m28.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)\n",
+      "Downloading spacy_loggers-1.0.5-py3-none-any.whl (22 kB)\n",
+      "Downloading srsly-2.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m18.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading thinc-8.3.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0meta \u001b[36m0:00:04\u001b[0m\n",
+      "\u001b[?25hDownloading typer-0.15.3-py3-none-any.whl (45 kB)\n",
+      "Downloading wasabi-1.1.3-py3-none-any.whl (27 kB)\n",
+      "Downloading weasel-0.4.1-py3-none-any.whl (50 kB)\n",
+      "Downloading annotated_types-0.7.0-py3-none-any.whl (13 kB)\n",
+      "Downloading blis-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.7/11.7 MB\u001b[0m \u001b[31m43.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading cloudpathlib-0.21.0-py3-none-any.whl (52 kB)\n",
+      "Downloading confection-0.1.5-py3-none-any.whl (35 kB)\n",
+      "Downloading language_data-1.3.0-py3-none-any.whl (5.4 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m46.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading rich-14.0.0-py3-none-any.whl (243 kB)\n",
+      "Downloading shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB)\n",
+      "Downloading smart_open-7.1.0-py3-none-any.whl (61 kB)\n",
+      "Downloading typing_inspection-0.4.0-py3-none-any.whl (14 kB)\n",
+      "Downloading marisa_trie-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m16.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n",
+      "Downloading wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (83 kB)\n",
+      "Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
+      "Installing collected packages: ufal.udpipe, cymem, wrapt, wasabi, typing-inspection, spacy-loggers, spacy-legacy, shellingham, pydantic-core, murmurhash, mdurl, marisa-trie, cloudpathlib, catalogue, blis, annotated-types, srsly, smart-open, pydantic, preshed, markdown-it-py, language-data, rich, langcodes, confection, typer, thinc, weasel, spacy, spacy-udpipe\n",
+      "Successfully installed annotated-types-0.7.0 blis-1.3.0 catalogue-2.0.10 cloudpathlib-0.21.0 confection-0.1.5 cymem-2.0.11 langcodes-3.5.0 language-data-1.3.0 marisa-trie-1.2.1 markdown-it-py-3.0.0 mdurl-0.1.2 murmurhash-1.0.12 preshed-3.0.9 pydantic-2.11.4 pydantic-core-2.33.2 rich-14.0.0 shellingham-1.5.4 smart-open-7.1.0 spacy-3.8.5 spacy-legacy-3.0.12 spacy-loggers-1.0.5 spacy-udpipe-1.0.0 srsly-2.5.1 thinc-8.3.6 typer-0.15.3 typing-inspection-0.4.0 ufal.udpipe-1.3.1.1 wasabi-1.1.3 weasel-0.4.1 wrapt-1.17.2\n"
+     ]
+    }
+   ],
    "source": [
-    "#! pip install spacy-udpipe\n",
+    "! pip install spacy-udpipe\n",
     "#! pip install pip install stanza -U\n",
     "import spacy_udpipe\n",
     "import os\n"
@@ -23,40 +145,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 6,
    "id": "0039ad52",
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/home/tatiana.bladier/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
+      "Downloaded pre-trained UDPipe model for 'fr' language\n"
      ]
     }
    ],
    "source": [
     "import spacy\n",
     "from spacy.language import Language\n",
-    "\n",
+    "spacy_udpipe.download('fr')\n",
     "nlp = spacy_udpipe.load(\"fr\")\n",
     "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 7,
    "id": "728329cd",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<spacy.pipeline.dep_parser.DependencyParser at 0x7f356c2670d0>"
+       "<spacy.pipeline.dep_parser.DependencyParser at 0x7f55c5dca570>"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -75,17 +196,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
    "id": "8d4237bf",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<thinc.optimizers.Optimizer at 0x7f3472e77420>"
+       "<thinc.optimizers.Optimizer at 0x7f56a8f26c00>"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -98,7 +219,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
    "id": "5219a6ea",
    "metadata": {},
    "outputs": [],
@@ -147,7 +268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 10,
    "id": "1218955c",
    "metadata": {},
    "outputs": [
@@ -169,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 11,
    "id": "1b99da40",
    "metadata": {},
    "outputs": [],
@@ -198,7 +319,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 12,
    "id": "af17d1f7",
    "metadata": {},
    "outputs": [
@@ -208,7 +329,7 @@
        "\"1\\tUn\\tun\\tDET\\t_\\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\\t3\\tdet\\t_\\t_\\n2\\tjeune\\tjeune\\tADJ\\t_\\tGender=Masc|Number=Sing\\t3\\tamod\\t_\\t_\\n3\\thomme\\thomme\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t6\\tnsubj\\t_\\t_\\n4\\t...\\t...\\tPUNCT\\t_\\t\\t3\\tpunct\\t_\\t_\\n5\\t--\\t--\\tPUNCT\\t_\\t\\t3\\tpunct\\t_\\t_\\n6\\ttraçons\\ttraçer\\tVERB\\t_\\tMood=Imp|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin\\t6\\tROOT\\t_\\t_\\n7\\tson\\tson\\tDET\\t_\\tGender=Masc|Number=Sing|Poss=Yes|PronType=Prs\\t8\\tdet\\t_\\t_\\n8\\tportrait\\tportrait\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t6\\tobj\\t_\\t_\\n9\\td’\\td’\\tPROPN\\t_\\t\\t8\\tappos\\t_\\t_\\n10\\tun\\tun\\tDET\\t_\\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\\t12\\tdet\\t_\\t_\\n11\\tseul\\tseul\\tADJ\\t_\\tGender=Masc|Number=Sing\\t12\\tamod\\t_\\t_\\n12\\ttrait\\ttrait\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t15\\tnsubj\\t_\\t_\\n13\\tde\\tde\\tADP\\t_\\t\\t14\\tcase\\t_\\t_\\n14\\tplume\\tplume\\tNOUN\\t_\\tGender=Fem|Number=Sing\\t12\\tnmod\\t_\\t_\\n15\\t:--figurez\\t:--figurer\\tVERB\\t_\\tMood=Ind|Number=Plur|Person=2|Tense=Pres|VerbForm=Fin\\t6\\tparataxis\\t_\\t_\\n16\\t-vous\\tvous\\tPRON\\t_\\tNumber=Plur|Person=2|PronType=Prs\\t15\\tobj\\t_\\t_\\n17\\tdon\\tdon\\tADP\\t_\\t\\t18\\tcase\\t_\\t_\\n18\\tQuichotte\\tQuichotte\\tPROPN\\t_\\t\\t16\\tnmod\\t_\\t_\\n19\\tà\\tà\\tADP\\t_\\t\\t21\\tcase\\t_\\t_\\n20\\tdix-huit\\tdix-huit\\tNUM\\t_\\t\\t21\\tnummod\\t_\\t_\\n21\\tans\\tan\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t18\\tnmod\\t_\\t_\\n22\\t;\\t;\\tPUNCT\\t_\\t\\t24\\tpunct\\t_\\t_\\n23\\tdon\\tdon\\tADP\\t_\\t\\t24\\tcase\\t_\\t_\\n24\\tQuichotte\\tQuichotte\\tPROPN\\t_\\t\\t6\\tobl\\t_\\t_\\n25\\tdécorselé\\tdécorseler\\tVERB\\t_\\tGender=Masc|Number=Sing|Tense=Past|VerbForm=Part\\t24\\tacl\\t_\\t_\\n26\\t,\\t,\\tPUNCT\\t_\\t\\t28\\tpunct\\t_\\t_\\n27\\tsans\\tsans\\tADP\\t_\\t\\t28\\tcase\\t_\\t_\\n28\\thaubert\\thaubert\\tNOUN\\t_\\tGender=Masc|Number=Sing\\
t6\\tobl\\t_\\t_\\n29\\tet\\tet\\tCCONJ\\t_\\t\\t31\\tcc\\t_\\t_\\n30\\tsans\\tsans\\tADP\\t_\\t\\t31\\tcase\\t_\\t_\\n31\\tcuissards\\tcuissard\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t28\\tconj\\t_\\t_\\n32\\t.\\t.\\tPUNCT\\t_\\t\\t6\\tpunct\\t_\\t_\\n33\\tUn\\tun\\tDET\\t_\\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\\t34\\tdet\\t_\\t_\\n34\\tinstant\\tinstant\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t36\\tobl:mod\\t_\\t_\\n35\\telle\\til\\tPRON\\t_\\tGender=Fem|Number=Sing|Person=3|PronType=Prs\\t36\\tnsubj\\t_\\t_\\n36\\tbalança\\tbalancer\\tVERB\\t_\\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\\t36\\tROOT\\t_\\t_\\n37\\tentre\\tentre\\tADP\\t_\\t\\t39\\tcase\\t_\\t_\\n38\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\\t39\\tdet\\t_\\t_\\n39\\tviolettes\\tviolette\\tNOUN\\t_\\tGender=Fem|Number=Plur\\t36\\tobl\\t_\\t_\\n40\\tet\\tet\\tCCONJ\\t_\\t\\t42\\tcc\\t_\\t_\\n41\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\\t42\\tdet\\t_\\t_\\n42\\tglaïeuls\\tglaïeul\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t39\\tconj\\t_\\t_\\n43\\tque\\tque\\tPRON\\t_\\tPronType=Rel\\t45\\tobj\\t_\\t_\\n44\\tlui\\tlui\\tPRON\\t_\\tNumber=Sing|Person=3|PronType=Prs\\t45\\tiobj\\t_\\t_\\n45\\toffrait\\toffrir\\tVERB\\t_\\tMood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin\\t42\\tacl:relcl\\t_\\t_\\n46\\tl'\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\\t47\\tdet\\t_\\t_\\n47\\tombrage\\tombrage\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t45\\tnsubj\\t_\\t_\\n48\\tde\\tde\\tADP\\t_\\t\\t50\\tcase\\t_\\t_\\n49\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\\t50\\tdet\\t_\\t_\\n50\\tarbres\\tarbre\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t47\\tnmod\\t_\\t_\\n51\\tde\\tde\\tADP\\t_\\t\\t52\\tcase\\t_\\t_\\n52\\tMinerve\\tMinerve\\tPROPN\\t_\\t\\t50\\tnmod\\t_\\t_\\n53\\t,\\t,\\tPUNCT\\t_\\t\\t56\\tpunct\\t_\\t_\\n54\\tet\\tet\\tCCONJ\\t_\\t\\t56\\tcc\\t_\\t_\\n55\\tles\\tle\\tDET\\t_\\tDe
finite=Def|Gender=Fem|Number=Plur|PronType=Art\\t56\\tdet\\t_\\t_\\n56\\tnarcisses\\tnarcisse\\tNOUN\\t_\\tGender=Fem|Number=Plur\\t39\\tconj\\t_\\t_\\n57\\tet\\tet\\tCCONJ\\t_\\t\\t59\\tcc\\t_\\t_\\n58\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\\t59\\tdet\\t_\\t_\\n59\\tnymphéas\\tnymphéa\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t56\\tconj\\t_\\t_\\n60\\tqui\\tqui\\tPRON\\t_\\tPronType=Rel\\t62\\tnsubj\\t_\\t_\\n61\\ts'\\tse\\tPRON\\t_\\tPerson=3|PronType=Prs\\t62\\texpl:comp\\t_\\t_\\n62\\télevaient\\télever\\tVERB\\t_\\tMood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin\\t59\\tacl:relcl\\t_\\t_\\n63\\tsur\\tsur\\tADP\\t_\\t\\t65\\tcase\\t_\\t_\\n64\\tles\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Plur|PronType=Art\\t65\\tdet\\t_\\t_\\n65\\tbords\\tbord\\tNOUN\\t_\\tGender=Masc|Number=Plur\\t62\\tobl\\t_\\t_\\n66\\tde\\tde\\tADP\\t_\\t\\t69\\tcase\\t_\\t_\\n67\\tle\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\\t69\\tdet\\t_\\t_\\n68\\tpetit\\tpetit\\tADJ\\t_\\tGender=Masc|Number=Sing\\t69\\tamod\\t_\\t_\\n69\\tfleuve\\tfleuve\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t65\\tnmod\\t_\\t_\\n70\\tou\\tou\\tCCONJ\\t_\\t\\t71\\tcc\\t_\\t_\\n71\\tflottaient\\tflott\\tVERB\\t_\\tMood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin\\t62\\tconj\\t_\\t_\\n72\\tà\\tà\\tADP\\t_\\t\\t74\\tcase\\t_\\t_\\n73\\tsa\\tson\\tDET\\t_\\tGender=Fem|Number=Sing|Poss=Yes|PronType=Prs\\t74\\tdet\\t_\\t_\\n74\\tsurface\\tsurface\\tNOUN\\t_\\tGender=Fem|Number=Sing\\t71\\tobl:arg\\t_\\t_\\n75\\t;\\t;\\tPUNCT\\t_\\t\\t80\\tpunct\\t_\\t_\\n76\\tmais\\tmais\\tCCONJ\\t_\\t\\t80\\tcc\\t_\\t_\\n77\\tbientôt\\tbientôt\\tADV\\t_\\t\\t80\\tadvmod\\t_\\t_\\n78\\telle\\til\\tPRON\\t_\\tGender=Fem|Number=Sing|Person=3|PronType=Prs\\t80\\tnsubj\\t_\\t_\\n79\\tse\\tse\\tPRON\\t_\\tPerson=3|PronType=Prs\\t80\\tobj\\t_\\t_\\n80\\tdécida\\tdécider\\tVERB\\t_\\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\\t36\\tconj\\t_\\t_\\n81\\tpour\\tpour\\tAD
P\\t_\\t\\t82\\tcase\\t_\\t_\\n82\\tceux-ci\\tcelui-ci\\tPRON\\t_\\tGender=Masc|Number=Plur|PronType=Dem\\t80\\tobl\\t_\\t_\\n83\\t,\\t,\\tPUNCT\\t_\\t\\t86\\tpunct\\t_\\t_\\n84\\tet\\tet\\tCCONJ\\t_\\t\\t86\\tcc\\t_\\t_\\n85\\t,\\t,\\tPUNCT\\t_\\t\\t84\\tpunct\\t_\\t_\\n86\\tbondissant\\tbondisser\\tVERB\\t_\\tTense=Pres|VerbForm=Part\\t80\\tconj\\t_\\t_\\n87\\tcomme\\tcomme\\tADP\\t_\\t\\t90\\tcase\\t_\\t_\\n88\\tun\\tun\\tDET\\t_\\tDefinite=Ind|Gender=Masc|Number=Sing|PronType=Art\\t90\\tdet\\t_\\t_\\n89\\tjeune\\tjeune\\tADJ\\t_\\tGender=Masc|Number=Sing\\t90\\tamod\\t_\\t_\\n90\\tfaon\\tfaon\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t86\\tobl:mod\\t_\\t_\\n91\\t,\\t,\\tPUNCT\\t_\\t\\t93\\tpunct\\t_\\t_\\n92\\telle\\til\\tPRON\\t_\\tGender=Fem|Number=Sing|Person=3|PronType=Prs\\t93\\tnsubj\\t_\\t_\\n93\\tcourut\\tcourir\\tVERB\\t_\\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\\t80\\tconj\\t_\\t_\\n94\\tvers\\tvers\\tADP\\t_\\t\\t96\\tcase\\t_\\t_\\n95\\tle\\tle\\tDET\\t_\\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\\t96\\tdet\\t_\\t_\\n96\\truisseau\\truisseau\\tNOUN\\t_\\tGender=Masc|Number=Sing\\t93\\tobl\\t_\\t_\\n97\\t.\\t.\\tPUNCT\\t_\\t\\t36\\tpunct\\t_\\t_\""
       ]
      },
-     "execution_count": 8,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -219,7 +340,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 13,
    "id": "5d7c6586",
    "metadata": {},
    "outputs": [],
@@ -249,7 +370,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 14,
    "id": "5bfce4b4",
    "metadata": {},
    "outputs": [
@@ -881,7 +1002,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.10"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb b/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..76b435504a3972105238e6a1cc9f1a95ac0c833b
--- /dev/null
+++ b/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb
@@ -0,0 +1,110 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "96858183-3e82-4a33-ba0f-1b21b5f36018",
+   "metadata": {},
+   "source": [
+    "## Type-token ratio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b6ae41ef-116f-473d-b3f3-115d90fe65b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_ttr(text):\n",
+    "    \"\"\"\n",
+    "    Compute the type/token ratio (TTR) from column-formatted text.\n",
+    "    Only the first column is used (tokens).\n",
+    "\n",
+    "    Parameters:\n",
+    "    - text: str, the input text in column format\n",
+    "\n",
+    "    Returns:\n",
+    "    - ttr: float, the type/token ratio\n",
+    "    \"\"\"\n",
+    "    tokens = []  # surface forms taken from the first tab-separated column\n",
+    "\n",
+    "    for line in text.strip().splitlines():\n",
+    "        if line.strip():  # skip empty lines\n",
+    "            token = line.split('\\t')[0]  # first column holds the token; markers like <s> are counted too\n",
+    "            tokens.append(token)\n",
+    "\n",
+    "    if not tokens:\n",
+    "        return 0.0  # empty input: avoid ZeroDivisionError\n",
+    "\n",
+    "    types = set(tokens)  # unique tokens (case-sensitive comparison)\n",
+    "    return len(types) / len(tokens)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2a882cc9-8f9d-4457-becb-d2e26ab3f14f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Type/Token Ratio: 0.933\n"
+     ]
+    }
+   ],
+   "source": [
+    "sample_text = \"\"\"\n",
+    "<s>\t<s>\t<s>\t0\n",
+    "Aramis\tnpp\t<nul>@@<nul>\t0\n",
+    "était\tv\t<nul>@@<nul>\t0\n",
+    "à\tp\t<nul>@@<nul>\t0\n",
+    "son\tdet\tNP@@<nul>\t0\n",
+    "poste\tnc\t<nul>@@<nul>\t1\n",
+    ",\tponct\t<nul>@@<nul>\t0\n",
+    "il\tcls-suj\tVN@@<nul>\t0\n",
+    "était\tv\t<nul>@@<nul>\t1\n",
+    "tombé\tvpp\t<nul>@@<nul>\t1\n",
+    "de\tp\tPP-DE_OBJ@@Sint-MOD\t1\n",
+    "ses\tdet\tNP@@<nul>\t2\n",
+    "bras\tnc\t<nul>@@<nul>\t3\n",
+    ".\tponct\t<nul>@@<nul>\t0\n",
+    "</s>\t</s>\t</s>\t0\n",
+    "\"\"\"\n",
+    "\n",
+    "ttr = compute_ttr(sample_text)\n",
+    "print(f\"Type/Token Ratio: {ttr:.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8897dcc3-4218-4ee5-9984-17b9a6d8dce2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tania_scripts/tania-some-other-metrics.ipynb b/tania_scripts/tania-some-other-metrics.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..76b435504a3972105238e6a1cc9f1a95ac0c833b
--- /dev/null
+++ b/tania_scripts/tania-some-other-metrics.ipynb
@@ -0,0 +1,110 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "96858183-3e82-4a33-ba0f-1b21b5f36018",
+   "metadata": {},
+   "source": [
+    "## Type-token ratio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b6ae41ef-116f-473d-b3f3-115d90fe65b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_ttr(text):\n",
+    "    \"\"\"\n",
+    "    Compute the type/token ratio (TTR) from column-formatted text.\n",
+    "    Only the first column is used (tokens).\n",
+    "\n",
+    "    Parameters:\n",
+    "    - text: str, the input text in column format\n",
+    "\n",
+    "    Returns:\n",
+    "    - ttr: float, the type/token ratio\n",
+    "    \"\"\"\n",
+    "    tokens = []  # surface forms taken from the first tab-separated column\n",
+    "\n",
+    "    for line in text.strip().splitlines():\n",
+    "        if line.strip():  # skip empty lines\n",
+    "            token = line.split('\\t')[0]  # first column holds the token; markers like <s> are counted too\n",
+    "            tokens.append(token)\n",
+    "\n",
+    "    if not tokens:\n",
+    "        return 0.0  # empty input: avoid ZeroDivisionError\n",
+    "\n",
+    "    types = set(tokens)  # unique tokens (case-sensitive comparison)\n",
+    "    return len(types) / len(tokens)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2a882cc9-8f9d-4457-becb-d2e26ab3f14f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Type/Token Ratio: 0.933\n"
+     ]
+    }
+   ],
+   "source": [
+    "sample_text = \"\"\"\n",
+    "<s>\t<s>\t<s>\t0\n",
+    "Aramis\tnpp\t<nul>@@<nul>\t0\n",
+    "était\tv\t<nul>@@<nul>\t0\n",
+    "à\tp\t<nul>@@<nul>\t0\n",
+    "son\tdet\tNP@@<nul>\t0\n",
+    "poste\tnc\t<nul>@@<nul>\t1\n",
+    ",\tponct\t<nul>@@<nul>\t0\n",
+    "il\tcls-suj\tVN@@<nul>\t0\n",
+    "était\tv\t<nul>@@<nul>\t1\n",
+    "tombé\tvpp\t<nul>@@<nul>\t1\n",
+    "de\tp\tPP-DE_OBJ@@Sint-MOD\t1\n",
+    "ses\tdet\tNP@@<nul>\t2\n",
+    "bras\tnc\t<nul>@@<nul>\t3\n",
+    ".\tponct\t<nul>@@<nul>\t0\n",
+    "</s>\t</s>\t</s>\t0\n",
+    "\"\"\"\n",
+    "\n",
+    "ttr = compute_ttr(sample_text)\n",
+    "print(f\"Type/Token Ratio: {ttr:.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8897dcc3-4218-4ee5-9984-17b9a6d8dce2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}