Skip to content
Snippets Groups Projects
Commit c1922f59 authored by Tatiana BLADIER's avatar Tatiana BLADIER
Browse files

add readability metrics for French

parent da1ae3ab
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id:96858183-3e82-4a33-ba0f-1b21b5f36018 tags:
## Type-token ratio
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
%% Cell type:code id:510c3726-366d-4e26-a2bb-b55391b473bd tags:
``` python
def compute_ttr(text):
    """
    Compute the type/token ratio (TTR) from column-formatted text.

    Only the first column of each tab-separated line is used (tokens).
    Tokens are counted as-is (no lowercasing, punctuation included) —
    see compute_ttr_cleaned for the normalized variant.

    Parameters:
    - text: str, the input text in column format (token \t pos \t ...)

    Returns:
    - ttr: float, the type/token ratio (0.0 for empty input)
    """
    tokens = []
    for line in text.strip().splitlines():
        if line.strip():  # skip empty lines
            token = line.split('\t')[0]
            tokens.append(token)
    if not tokens:
        return 0.0  # avoid division by zero on empty/blank input
    types = set(tokens)
    return len(types) / len(tokens)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
%% Cell type:code id:ee59c294-fdcd-429e-a126-734480d1b0ba tags:
``` python
# Sample parser output: one token per line; columns are
# token / POS / syntactic labels / depth.
# NOTE(review): the TTR functions split on '\t' — confirm the column
# separators below are real tabs, not spaces, or each whole line is
# treated as a single token.
sample_text = """
<s> <s> <s> 0
Aramis npp <nul>@@<nul> 0
était v <nul>@@<nul> 0
à p <nul>@@<nul> 0
son det NP@@<nul> 0
poste nc <nul>@@<nul> 1
, ponct <nul>@@<nul> 0
il cls-suj VN@@<nul> 0
était v <nul>@@<nul> 1
tombé vpp <nul>@@<nul> 1
de p PP-DE_OBJ@@Sint-MOD 1
ses det NP@@<nul> 2
bras nc <nul>@@<nul> 3
. ponct <nul>@@<nul> 0
</s> </s> </s> 0
"""
```
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
``` python
import string

def compute_ttr_cleaned(text):
    """
    Compute the type/token ratio (TTR) from column-formatted text.

    The first tab-separated column supplies the tokens; each token is
    lowercased, and single-character punctuation tokens are dropped
    before the ratio is taken.

    Parameters:
    - text: str, the input text in column format

    Returns:
    - ttr: float, the type/token ratio (0.0 when nothing survives filtering)
    """
    rows = (row for row in text.strip().splitlines() if row.strip())
    lowered = (row.split('\t')[0].lower() for row in rows)
    words = [w for w in lowered if w not in string.punctuation]
    if not words:
        return 0.0
    return len(set(words)) / len(words)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
``` python
# Overall TTR of the sample (raw variant: punctuation and markers included).
ttr = compute_ttr(sample_text)
print(f"Type/Token Ratio: {ttr:.3f}")
```
%% Output
Type/Token Ratio: 0.933
%% Cell type:code id:8897dcc3-4218-4ee5-9984-17b9a6d8dce2 tags:
``` python
def compute_ttr_by_pos(text):
    """
    Compute separate type/token ratios for verbs and for nouns.

    Input is column-formatted text (token \t pos \t ...).
    Verb POS tags: 'v', 'vpp', 'vpr'; noun POS tags: 'nc', 'npp'.
    Tokens are lowercased; single-character punctuation tokens are skipped.

    Returns:
    - dict with keys 'verb_ttr' and 'noun_ttr' (0.0 when a class is empty)
    """
    import string

    tag_groups = {
        'verb': {'v', 'vpp', 'vpr'},
        'noun': {'nc', 'npp'},
    }
    buckets = {'verb': [], 'noun': []}

    for raw in text.strip().splitlines():
        if not raw.strip():
            continue
        cols = raw.split('\t')
        if len(cols) < 2:
            continue  # need at least token and POS columns
        word, tag = cols[0].lower(), cols[1]
        if word in string.punctuation:
            continue  # punctuation never counts toward either class
        for group, tags in tag_groups.items():
            if tag in tags:
                buckets[group].append(word)
                break

    def _ttr(seq):
        # type/token ratio of one bucket; 0.0 guards the empty case
        return len(set(seq)) / len(seq) if seq else 0.0

    return {
        'verb_ttr': _ttr(buckets['verb']),
        'noun_ttr': _ttr(buckets['noun']),
    }
```
%% Cell type:code id:1363f307-fa4b-43ba-93d5-2d1c11ceb9e4 tags:
``` python
# Per-POS lexical diversity of the sample ('était' repeats, hence verb TTR < 1).
result = compute_ttr_by_pos(sample_text)
print(f"Verb TTR: {result['verb_ttr']:.3f}")
print(f"Noun TTR: {result['noun_ttr']:.3f}")
```
%% Output
Verb TTR: 0.667
Noun TTR: 1.000
%% Cell type:code id:1362e192-514a-4a77-a8cb-5c012026e2bb tags:
``` python
def compute_nv_ratios(text):
    """
    Compute the nominal/verb and verb/nominal ratios of column-formatted text.

    The POS tag is read from the second tab-separated column.
    Verb tags: 'v', 'vpp', 'vpr'; noun tags: 'nc', 'npp'.

    Returns:
    - dict with 'nominal_verb_ratio' and 'verb_nominal_ratio'
      (float('inf') when the denominator class is absent)
    """
    verb_tags = {'v', 'vpp', 'vpr'}
    noun_tags = {'nc', 'npp'}

    rows = (ln.split('\t') for ln in text.strip().splitlines() if ln.strip())
    tags = [cols[1] for cols in rows if len(cols) >= 2]

    n_verbs = sum(1 for t in tags if t in verb_tags)
    n_nouns = sum(1 for t in tags if t in noun_tags)

    return {
        'nominal_verb_ratio': n_nouns / n_verbs if n_verbs else float('inf'),
        'verb_nominal_ratio': n_verbs / n_nouns if n_nouns else float('inf'),
    }
```
%% Cell type:code id:544ff6aa-4104-4580-a01f-97429ffcc228 tags:
``` python
# Noun/verb balance of the sample (3 verbs vs 3 nouns in this text gives 1.00).
ratios = compute_nv_ratios(sample_text)
print(f"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}")
print(f"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}")
```
%% Output
Nominal/Verb Ratio: 1.00
Verb/Nominal Ratio: 1.00
%% Cell type:markdown id:d3a929bf-61cb-4ef8-bc00-6e2a59760d37 tags:
## Readability
%% Cell type:markdown id:3fe25ff0-3f83-40fe-8420-08c09ffe98e6 tags:
### 📚 French Readability Metrics

**TODO:** verify these formulas and their coefficients.
This notebook implements and explains three common **readability formulas** tailored for **French texts**:
---
#### 1. 🟦 **Flesch–Douma Index**
An adaptation of the original Flesch Reading Ease formula for French.
$\text{Flesch–Douma} = 207 - (1.015 \times \text{ASL}) - (73.6 \times \text{ASW})$
Where:
- **ASL** = Average Sentence Length = (number of words) / (number of sentences)
- **ASW** = Average Syllables per Word = (number of syllables) / (number of words)
📊 **Interpretation**:
- 90–100: Very easy
- 60–70: Standard
- 30–50: Difficult
- < 30: Very difficult
---
#### 2. 🟨 **LIX Index**
Used widely in French and other European languages. Measures sentence length and lexical complexity.
$\text{LIX} = \frac{\text{number of words}}{\text{number of sentences}} + \frac{100 \times \text{number of long words (≥7 chars)}}{\text{number of words}}$
📊 **Interpretation**:
- $<$ 30: Easy
- 30–40: Medium
- $>$ 50: Difficult
---
#### 3. 🟥 **Kandel–Moles Index**
A linear formula proposed for French readability:
$\text{Kandel–Moles} = 0.1935 \times \text{number of words} + 0.1672 \times \text{number of syllables} - 1.779$
📊 **Interpretation**:
- Higher values indicate more complex texts.
---
These formulas help estimate how easily a French reader can understand a given passage. The metrics can be used to analyze textbooks, articles, instructional materials, etc.
%% Cell type:code id:b9052dc2-ce45-4af4-a0a0-46c60a13da12 tags:
``` python
# Rewriting the readability metric functions here, without relying on downloading external resources
import re
# Naive sentence splitter: break on runs of '.', '!' or '?'
# (a trailing terminator yields an empty final element — callers filter it)
def naive_sentence_tokenize(text):
    sentence_end = r'[.!?]+'
    return re.split(sentence_end, text.strip())
# Naive word tokenizer: lowercase, then extract maximal \w+ runs
# (punctuation and whitespace are discarded)
def naive_word_tokenize(text):
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text.lower())
# Approximate syllable counter for French: each maximal run of vowels
# (including accented vowels and œ) counts as one syllable, minimum 1.
# Assumes the word is already lowercased — callers tokenize with .lower().
def count_syllables(word):
    vowel_run = re.compile(r"[aeiouyàâäéèêëîïôöùûüœ]+")
    return len(vowel_run.findall(word)) or 1
# Compute the Flesch-Douma, LIX and Kandel-Moles readability indices
def compute_french_readability(text):
    """
    Score a French text with three readability indices.

    Uses the naive sentence/word tokenizers and the vowel-group syllable
    counter defined above. Returns a dict with keys 'Flesch-Douma', 'LIX'
    and 'Kandel-Moles', each rounded to 2 decimals (all 0.0 for empty input).
    """
    sentences = [s for s in naive_sentence_tokenize(text) if s.strip()]
    words = [w for w in naive_word_tokenize(text) if re.match(r"\w+", w)]

    n_sent = len(sentences)
    n_words = len(words)
    n_syll = sum(count_syllables(w) for w in words)
    n_long = sum(1 for w in words if len(w) >= 7)  # LIX "long word" cutoff

    if n_sent == 0 or n_words == 0:
        # nothing to measure — return neutral scores instead of dividing by zero
        return {"Flesch-Douma": 0.0, "LIX": 0.0, "Kandel-Moles": 0.0}

    avg_sentence_len = n_words / n_sent   # ASL
    avg_syllables = n_syll / n_words      # ASW

    scores = {
        "Flesch-Douma": 207 - (1.015 * avg_sentence_len) - (73.6 * avg_syllables),
        "LIX": avg_sentence_len + (100 * n_long / n_words),
        "Kandel-Moles": 0.1935 * n_words + 0.1672 * n_syll - 1.779,
    }
    return {name: round(value, 2) for name, value in scores.items()}
```
%% Cell type:code id:1e9dd0fb-db6a-47d1-8bfb-1015845f6d3e tags:
``` python
# Test on a sample French text (smoke test: three short sentences).
sample_french_text = """
Aramis était à son poste. Il était tombé de ses bras. Ce n'était pas un accident.
"""
compute_french_readability(sample_french_text)
```
%% Output
{'Flesch-Douma': 88.68, 'LIX': 11.55, 'Kandel-Moles': 5.86}
%% Cell type:code id:b2cd53c6-1c16-4eaf-8ac8-af166afaa97b tags:
``` python
```
......
%% Cell type:markdown id:96858183-3e82-4a33-ba0f-1b21b5f36018 tags:
## Type-token ratio
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
%% Cell type:code id:510c3726-366d-4e26-a2bb-b55391b473bd tags:
``` python
def compute_ttr(text):
    """
    Compute the type/token ratio (TTR) from column-formatted text.

    Only the first column of each tab-separated line is used (tokens).
    Tokens are counted as-is (no lowercasing, punctuation included) —
    see compute_ttr_cleaned for the normalized variant.

    Parameters:
    - text: str, the input text in column format (token \t pos \t ...)

    Returns:
    - ttr: float, the type/token ratio (0.0 for empty input)
    """
    tokens = []
    for line in text.strip().splitlines():
        if line.strip():  # skip empty lines
            token = line.split('\t')[0]
            tokens.append(token)
    if not tokens:
        return 0.0  # avoid division by zero on empty/blank input
    types = set(tokens)
    return len(types) / len(tokens)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
%% Cell type:code id:ee59c294-fdcd-429e-a126-734480d1b0ba tags:
``` python
# Sample parser output: one token per line; columns are
# token / POS / syntactic labels / depth.
# NOTE(review): the TTR functions split on '\t' — confirm the column
# separators below are real tabs, not spaces, or each whole line is
# treated as a single token.
sample_text = """
<s> <s> <s> 0
Aramis npp <nul>@@<nul> 0
était v <nul>@@<nul> 0
à p <nul>@@<nul> 0
son det NP@@<nul> 0
poste nc <nul>@@<nul> 1
, ponct <nul>@@<nul> 0
il cls-suj VN@@<nul> 0
était v <nul>@@<nul> 1
tombé vpp <nul>@@<nul> 1
de p PP-DE_OBJ@@Sint-MOD 1
ses det NP@@<nul> 2
bras nc <nul>@@<nul> 3
. ponct <nul>@@<nul> 0
</s> </s> </s> 0
"""
```
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
``` python
import string

def compute_ttr_cleaned(text):
    """
    Compute the type/token ratio (TTR) from column-formatted text.

    The first tab-separated column supplies the tokens; each token is
    lowercased, and single-character punctuation tokens are dropped
    before the ratio is taken.

    Parameters:
    - text: str, the input text in column format

    Returns:
    - ttr: float, the type/token ratio (0.0 when nothing survives filtering)
    """
    rows = (row for row in text.strip().splitlines() if row.strip())
    lowered = (row.split('\t')[0].lower() for row in rows)
    words = [w for w in lowered if w not in string.punctuation]
    if not words:
        return 0.0
    return len(set(words)) / len(words)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
``` python
# Overall TTR of the sample (raw variant: punctuation and markers included).
ttr = compute_ttr(sample_text)
print(f"Type/Token Ratio: {ttr:.3f}")
```
%% Output
Type/Token Ratio: 0.933
%% Cell type:code id:8897dcc3-4218-4ee5-9984-17b9a6d8dce2 tags:
``` python
def compute_ttr_by_pos(text):
    """
    Compute separate type/token ratios for verbs and for nouns.

    Input is column-formatted text (token \t pos \t ...).
    Verb POS tags: 'v', 'vpp', 'vpr'; noun POS tags: 'nc', 'npp'.
    Tokens are lowercased; single-character punctuation tokens are skipped.

    Returns:
    - dict with keys 'verb_ttr' and 'noun_ttr' (0.0 when a class is empty)
    """
    import string

    tag_groups = {
        'verb': {'v', 'vpp', 'vpr'},
        'noun': {'nc', 'npp'},
    }
    buckets = {'verb': [], 'noun': []}

    for raw in text.strip().splitlines():
        if not raw.strip():
            continue
        cols = raw.split('\t')
        if len(cols) < 2:
            continue  # need at least token and POS columns
        word, tag = cols[0].lower(), cols[1]
        if word in string.punctuation:
            continue  # punctuation never counts toward either class
        for group, tags in tag_groups.items():
            if tag in tags:
                buckets[group].append(word)
                break

    def _ttr(seq):
        # type/token ratio of one bucket; 0.0 guards the empty case
        return len(set(seq)) / len(seq) if seq else 0.0

    return {
        'verb_ttr': _ttr(buckets['verb']),
        'noun_ttr': _ttr(buckets['noun']),
    }
```
%% Cell type:code id:1363f307-fa4b-43ba-93d5-2d1c11ceb9e4 tags:
``` python
# Per-POS lexical diversity of the sample ('était' repeats, hence verb TTR < 1).
result = compute_ttr_by_pos(sample_text)
print(f"Verb TTR: {result['verb_ttr']:.3f}")
print(f"Noun TTR: {result['noun_ttr']:.3f}")
```
%% Output
Verb TTR: 0.667
Noun TTR: 1.000
%% Cell type:code id:1362e192-514a-4a77-a8cb-5c012026e2bb tags:
``` python
def compute_nv_ratios(text):
    """
    Compute the nominal/verb and verb/nominal ratios of column-formatted text.

    The POS tag is read from the second tab-separated column.
    Verb tags: 'v', 'vpp', 'vpr'; noun tags: 'nc', 'npp'.

    Returns:
    - dict with 'nominal_verb_ratio' and 'verb_nominal_ratio'
      (float('inf') when the denominator class is absent)
    """
    verb_tags = {'v', 'vpp', 'vpr'}
    noun_tags = {'nc', 'npp'}

    rows = (ln.split('\t') for ln in text.strip().splitlines() if ln.strip())
    tags = [cols[1] for cols in rows if len(cols) >= 2]

    n_verbs = sum(1 for t in tags if t in verb_tags)
    n_nouns = sum(1 for t in tags if t in noun_tags)

    return {
        'nominal_verb_ratio': n_nouns / n_verbs if n_verbs else float('inf'),
        'verb_nominal_ratio': n_verbs / n_nouns if n_nouns else float('inf'),
    }
```
%% Cell type:code id:544ff6aa-4104-4580-a01f-97429ffcc228 tags:
``` python
# Noun/verb balance of the sample (3 verbs vs 3 nouns in this text gives 1.00).
ratios = compute_nv_ratios(sample_text)
print(f"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}")
print(f"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}")
```
%% Output
Nominal/Verb Ratio: 1.00
Verb/Nominal Ratio: 1.00
%% Cell type:markdown id:d3a929bf-61cb-4ef8-bc00-6e2a59760d37 tags:
## Readability
%% Cell type:markdown id:3fe25ff0-3f83-40fe-8420-08c09ffe98e6 tags:
### 📚 French Readability Metrics

**TODO:** verify these formulas and their coefficients.
This notebook implements and explains three common **readability formulas** tailored for **French texts**:
---
#### 1. 🟦 **Flesch–Douma Index**
An adaptation of the original Flesch Reading Ease formula for French.
$\text{Flesch–Douma} = 207 - (1.015 \times \text{ASL}) - (73.6 \times \text{ASW})$
Where:
- **ASL** = Average Sentence Length = (number of words) / (number of sentences)
- **ASW** = Average Syllables per Word = (number of syllables) / (number of words)
📊 **Interpretation**:
- 90–100: Very easy
- 60–70: Standard
- 30–50: Difficult
- < 30: Very difficult
---
#### 2. 🟨 **LIX Index**
Used widely in French and other European languages. Measures sentence length and lexical complexity.
$\text{LIX} = \frac{\text{number of words}}{\text{number of sentences}} + \frac{100 \times \text{number of long words (≥7 chars)}}{\text{number of words}}$
📊 **Interpretation**:
- $<$ 30: Easy
- 30–40: Medium
- $>$ 50: Difficult
---
#### 3. 🟥 **Kandel–Moles Index**
A linear formula proposed for French readability:
$\text{Kandel–Moles} = 0.1935 \times \text{number of words} + 0.1672 \times \text{number of syllables} - 1.779$
📊 **Interpretation**:
- Higher values indicate more complex texts.
---
These formulas help estimate how easily a French reader can understand a given passage. The metrics can be used to analyze textbooks, articles, instructional materials, etc.
%% Cell type:code id:b9052dc2-ce45-4af4-a0a0-46c60a13da12 tags:
``` python
# Rewriting the readability metric functions here, without relying on downloading external resources
import re
# Naive sentence splitter: break on runs of '.', '!' or '?'
# (a trailing terminator yields an empty final element — callers filter it)
def naive_sentence_tokenize(text):
    sentence_end = r'[.!?]+'
    return re.split(sentence_end, text.strip())
# Naive word tokenizer: lowercase, then extract maximal \w+ runs
# (punctuation and whitespace are discarded)
def naive_word_tokenize(text):
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text.lower())
# Approximate syllable counter for French: each maximal run of vowels
# (including accented vowels and œ) counts as one syllable, minimum 1.
# Assumes the word is already lowercased — callers tokenize with .lower().
def count_syllables(word):
    vowel_run = re.compile(r"[aeiouyàâäéèêëîïôöùûüœ]+")
    return len(vowel_run.findall(word)) or 1
# Compute the Flesch-Douma, LIX and Kandel-Moles readability indices
def compute_french_readability(text):
    """
    Score a French text with three readability indices.

    Uses the naive sentence/word tokenizers and the vowel-group syllable
    counter defined above. Returns a dict with keys 'Flesch-Douma', 'LIX'
    and 'Kandel-Moles', each rounded to 2 decimals (all 0.0 for empty input).
    """
    sentences = [s for s in naive_sentence_tokenize(text) if s.strip()]
    words = [w for w in naive_word_tokenize(text) if re.match(r"\w+", w)]

    n_sent = len(sentences)
    n_words = len(words)
    n_syll = sum(count_syllables(w) for w in words)
    n_long = sum(1 for w in words if len(w) >= 7)  # LIX "long word" cutoff

    if n_sent == 0 or n_words == 0:
        # nothing to measure — return neutral scores instead of dividing by zero
        return {"Flesch-Douma": 0.0, "LIX": 0.0, "Kandel-Moles": 0.0}

    avg_sentence_len = n_words / n_sent   # ASL
    avg_syllables = n_syll / n_words      # ASW

    scores = {
        "Flesch-Douma": 207 - (1.015 * avg_sentence_len) - (73.6 * avg_syllables),
        "LIX": avg_sentence_len + (100 * n_long / n_words),
        "Kandel-Moles": 0.1935 * n_words + 0.1672 * n_syll - 1.779,
    }
    return {name: round(value, 2) for name, value in scores.items()}
```
%% Cell type:code id:1e9dd0fb-db6a-47d1-8bfb-1015845f6d3e tags:
``` python
# Test on a sample French text (smoke test: three short sentences).
sample_french_text = """
Aramis était à son poste. Il était tombé de ses bras. Ce n'était pas un accident.
"""
compute_french_readability(sample_french_text)
```
%% Output
{'Flesch-Douma': 88.68, 'LIX': 11.55, 'Kandel-Moles': 5.86}
%% Cell type:code id:b2cd53c6-1c16-4eaf-8ac8-af166afaa97b tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment