import string

# NOTE(review): the earlier nltk / sent_tokenize / word_tokenize imports were
# unused once the naive tokenizers elsewhere in this notebook replaced them,
# so they are not re-imported here.

# Sample parse in tab-separated column format:
#   token <TAB> pos-tag <TAB> chunk-labels <TAB> depth
sample_text = """
<s>\t<s>\t<s>\t0
Aramis\tnpp\t<nul>@@<nul>\t0
était\tv\t<nul>@@<nul>\t0
à\tp\t<nul>@@<nul>\t0
son\tdet\tNP@@<nul>\t0
poste\tnc\t<nul>@@<nul>\t1
,\tponct\t<nul>@@<nul>\t0
il\tcls-suj\tVN@@<nul>\t0
était\tv\t<nul>@@<nul>\t1
tombé\tvpp\t<nul>@@<nul>\t1
de\tp\tPP-DE_OBJ@@Sint-MOD\t1
ses\tdet\tNP@@<nul>\t2
bras\tnc\t<nul>@@<nul>\t3
.\tponct\t<nul>@@<nul>\t0
</s>\t</s>\t</s>\t0
"""

# Characters treated as punctuation.  string.punctuation is ASCII-only, so the
# common French typographic marks are added explicitly.
_PUNCT_CHARS = set(string.punctuation) | set("«»…‹›–—")


def compute_ttr_cleaned(text):
    """Compute the type/token ratio (TTR) of column-formatted text.

    Only the first tab-separated column (the token) of each non-empty line
    is used.  Tokens are lowercased and punctuation-only tokens are skipped.

    Parameters:
    - text: str, input in ``token<TAB>pos<TAB>...`` column format.

    Returns:
    - float, number of distinct tokens divided by total tokens
      (0.0 for empty input).
    """
    tokens = []
    for line in text.strip().splitlines():
        if line.strip():  # skip empty lines
            token = line.split('\t')[0].lower()
            # Bug fix: the original tested ``token not in string.punctuation``,
            # which is a substring test and therefore only filters
            # single-character ASCII marks; multi-character punctuation tokens
            # such as "..." or "--" slipped through and inflated the counts.
            if token and not all(ch in _PUNCT_CHARS for ch in token):
                tokens.append(token)

    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)


# Backward-compatible alias: a later notebook cell still calls compute_ttr(),
# which raised NameError once the function was renamed to compute_ttr_cleaned.
compute_ttr = compute_ttr_cleaned
def compute_ttr_by_pos(text):
    """Compute separate type/token ratios for verbs and for nouns.

    Input is column-formatted: ``token \\t pos \\t ...``.
    Verb tags: {'v', 'vpp', 'vpr'}; noun tags: {'nc', 'npp'}.
    Tokens are lowercased; single-character punctuation tokens are ignored.

    Returns:
    - dict with keys 'verb_ttr' and 'noun_ttr' (0.0 when a class is empty).
    """
    import string

    tag_groups = {
        'verb': {'v', 'vpp', 'vpr'},
        'noun': {'nc', 'npp'},
    }
    collected = {'verb': [], 'noun': []}

    for raw_line in text.strip().splitlines():
        if not raw_line.strip():
            continue
        fields = raw_line.split('\t')
        if len(fields) < 2:
            continue
        word, tag = fields[0].lower(), fields[1]
        if word in string.punctuation:  # skip single punctuation marks
            continue
        for group, tags in tag_groups.items():
            if tag in tags:
                collected[group].append(word)
                break  # tag sets are disjoint; first match is the only match

    def _ttr(items):
        # Type/token ratio of one token list; empty list maps to 0.0.
        return len(set(items)) / len(items) if items else 0.0

    return {
        'verb_ttr': _ttr(collected['verb']),
        'noun_ttr': _ttr(collected['noun']),
    }
def compute_nv_ratios(text):
    """Compute nominal/verb and verb/nominal ratios from column-formatted text.

    Only the second tab-separated column (the POS tag) is inspected.
    Verb tags: 'v', 'vpp', 'vpr'; noun tags: 'nc', 'npp'.

    Returns:
    - dict with 'nominal_verb_ratio' and 'verb_nominal_ratio';
      a zero denominator yields float('inf').
    """
    VERB_TAGS = frozenset(('v', 'vpp', 'vpr'))
    NOUN_TAGS = frozenset(('nc', 'npp'))

    verbs = 0
    nouns = 0
    for row in text.strip().splitlines():
        if not row.strip():
            continue
        columns = row.split('\t')
        if len(columns) < 2:
            continue
        tag = columns[1]
        # Tag sets are disjoint, so unconditional bool-adds match an if/elif.
        verbs += tag in VERB_TAGS
        nouns += tag in NOUN_TAGS

    return {
        'nominal_verb_ratio': nouns / verbs if verbs else float('inf'),
        'verb_nominal_ratio': verbs / nouns if nouns else float('inf'),
    }
🟦 **Flesch–Douma Index**\n", + "\n", + "An adaptation of the original Flesch Reading Ease formula for French.\n", + "\n", + "$\\text{Flesch–Douma} = 207 - (1.015 \\times \\text{ASL}) - (73.6 \\times \\text{ASW})$\n", + "\n", + "Where:\n", + "- **ASL** = Average Sentence Length = (number of words) / (number of sentences)\n", + "- **ASW** = Average Syllables per Word = (number of syllables) / (number of words)\n", + "\n", + "📊 **Interpretation**:\n", + "- 90–100: Very easy\n", + "- 60–70: Standard\n", + "- 30–50: Difficult\n", + "- < 30: Very difficult\n", + "\n", + "---\n", + "\n", + "#### 2. 🟨 **LIX Index**\n", + "\n", + "Used widely in French and other European languages. Measures sentence length and lexical complexity.\n", + "\n", + "$\\text{LIX} = \\frac{\\text{number of words}}{\\text{number of sentences}} + \\frac{100 \\times \\text{number of long words (≥7 chars)}}{\\text{number of words}}$\n", + "\n", + "📊 **Interpretation**:\n", + "- $<$ 30: Easy\n", + "- 30–40: Medium\n", + "- $>$ 50: Difficult\n", + "\n", + "---\n", + "\n", + "#### 3. 🟥 **Kandel–Moles Index**\n", + "\n", + "A linear formula proposed for French readability:\n", + "\n", + "$\\text{Kandel–Moles} = 0.1935 \\times \\text{number of words} + 0.1672 \\times \\text{number of syllables} - 1.779$\n", + "\n", + "📊 **Interpretation**:\n", + "- Higher values indicate more complex texts.\n", + "\n", + "---\n", + "\n", + "These formulas help estimate how easily a French reader can understand a given passage. The metrics can be used to analyze textbooks, articles, instructional materials, etc." 
# Readability metric functions, with no external resource downloads required.

import re


def naive_sentence_tokenize(text):
    """Split *text* into sentence fragments on runs of '.', '!' and '?'.

    The result may contain empty strings (e.g. after a trailing period);
    callers are expected to filter those out.
    """
    return re.split(r'[.!?]+', text.strip())


def naive_word_tokenize(text):
    """Lowercase *text* and return its list of ``\\w+`` word tokens."""
    return re.findall(r'\b\w+\b', text.lower())


def count_syllables(word):
    """Approximate the syllable count of a French word.

    Counts maximal groups of (accented) vowels; every word counts as at
    least one syllable.  Bug fix: the input is lowercased first, so
    capitalised words passed directly ("École") are counted correctly —
    the original vowel class was lowercase-only.
    """
    vowels = "aeiouyàâäéèêëîïôöùûüœ"
    groups = re.findall(rf"[{vowels}]+", word.lower())
    return max(1, len(groups))


def compute_french_readability(text):
    """Compute the Flesch-Douma, LIX and Kandel-Moles readability indices.

    Parameters:
    - text: str, plain French prose.

    Returns:
    - dict with keys "Flesch-Douma", "LIX" and "Kandel-Moles", each rounded
      to 2 decimals; all 0.0 when the text contains no sentence or no word.
    """
    sentences = [s for s in naive_sentence_tokenize(text) if s.strip()]
    # naive_word_tokenize already yields only \w+ tokens, so the original
    # per-token re.match(r"\w+", w) filter was always true and is dropped.
    words = naive_word_tokenize(text)

    num_sentences = len(sentences)
    num_words = len(words)
    num_syllables = sum(count_syllables(w) for w in words)
    num_long_words = sum(1 for w in words if len(w) >= 7)

    if num_sentences == 0 or num_words == 0:
        return {
            "Flesch-Douma": 0.0,
            "LIX": 0.0,
            "Kandel-Moles": 0.0,
        }

    asl = num_words / num_sentences   # average sentence length
    asw = num_syllables / num_words   # average syllables per word

    flesch_douma = 207 - (1.015 * asl) - (73.6 * asw)
    lix = asl + (100 * num_long_words / num_words)
    kandel_moles = 0.1935 * num_words + 0.1672 * num_syllables - 1.779

    return {
        "Flesch-Douma": round(flesch_douma, 2),
        "LIX": round(lix, 2),
        "Kandel-Moles": round(kandel_moles, 2),
    }
"ee59c294-fdcd-429e-a126-734480d1b0ba", + "metadata": {}, + "outputs": [], + "source": [ + "sample_text = \"\"\"\n", + "<s>\t<s>\t<s>\t0\n", + "Aramis\tnpp\t<nul>@@<nul>\t0\n", + "était\tv\t<nul>@@<nul>\t0\n", + "à\tp\t<nul>@@<nul>\t0\n", + "son\tdet\tNP@@<nul>\t0\n", + "poste\tnc\t<nul>@@<nul>\t1\n", + ",\tponct\t<nul>@@<nul>\t0\n", + "il\tcls-suj\tVN@@<nul>\t0\n", + "était\tv\t<nul>@@<nul>\t1\n", + "tombé\tvpp\t<nul>@@<nul>\t1\n", + "de\tp\tPP-DE_OBJ@@Sint-MOD\t1\n", + "ses\tdet\tNP@@<nul>\t2\n", + "bras\tnc\t<nul>@@<nul>\t3\n", + ".\tponct\t<nul>@@<nul>\t0\n", + "</s>\t</s>\t</s>\t0\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "id": "b6ae41ef-116f-473d-b3f3-115d90fe65b7", "metadata": {}, "outputs": [], "source": [ - "def compute_ttr(text):\n", + "import string\n", + "\n", + "def compute_ttr_cleaned(text):\n", " \"\"\"\n", " Compute the type/token ratio (TTR) from column-formatted text.\n", - " Only the first column is used (tokens).\n", + " - Only the first column is used (tokens).\n", + " - Tokens are lowercased.\n", + " - Punctuation tokens are ignored.\n", "\n", " Parameters:\n", " - text: str, the input text in column format\n", @@ -30,8 +95,9 @@ "\n", " for line in text.strip().splitlines():\n", " if line.strip(): # skip empty lines\n", - " token = line.split('\\t')[0]\n", - " tokens.append(token)\n", + " token = line.split('\\t')[0].lower()\n", + " if token not in string.punctuation:\n", + " tokens.append(token)\n", "\n", " if not tokens:\n", " return 0.0\n", @@ -42,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 15, "id": "2a882cc9-8f9d-4457-becb-d2e26ab3f14f", "metadata": {}, "outputs": [ @@ -55,34 +121,303 @@ } ], "source": [ - "sample_text = \"\"\"\n", - "<s>\t<s>\t<s>\t0\n", - "Aramis\tnpp\t<nul>@@<nul>\t0\n", - "était\tv\t<nul>@@<nul>\t0\n", - "à\tp\t<nul>@@<nul>\t0\n", - "son\tdet\tNP@@<nul>\t0\n", - "poste\tnc\t<nul>@@<nul>\t1\n", - ",\tponct\t<nul>@@<nul>\t0\n", - 
"il\tcls-suj\tVN@@<nul>\t0\n", - "était\tv\t<nul>@@<nul>\t1\n", - "tombé\tvpp\t<nul>@@<nul>\t1\n", - "de\tp\tPP-DE_OBJ@@Sint-MOD\t1\n", - "ses\tdet\tNP@@<nul>\t2\n", - "bras\tnc\t<nul>@@<nul>\t3\n", - ".\tponct\t<nul>@@<nul>\t0\n", - "</s>\t</s>\t</s>\t0\n", - "\"\"\"\n", - "\n", "ttr = compute_ttr(sample_text)\n", "print(f\"Type/Token Ratio: {ttr:.3f}\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "8897dcc3-4218-4ee5-9984-17b9a6d8dce2", "metadata": {}, "outputs": [], + "source": [ + "def compute_ttr_by_pos(text):\n", + " \"\"\"\n", + " Compute type/token ratios for verbs and nouns in column-formatted text.\n", + " - Columns: token \\t pos \\t ...\n", + " - Verbs: POS in {'v', 'vpp', 'vpr'}\n", + " - Nouns: POS in {'nc', 'npp'}\n", + " - Tokens are lowercased.\n", + " - Punctuation is ignored.\n", + "\n", + " Returns:\n", + " - A dictionary with TTRs for verbs and nouns.\n", + " \"\"\"\n", + " import string\n", + "\n", + " verb_pos = {'v', 'vpp', 'vpr'}\n", + " noun_pos = {'nc', 'npp'}\n", + "\n", + " verb_tokens = []\n", + " noun_tokens = []\n", + "\n", + " for line in text.strip().splitlines():\n", + " if line.strip():\n", + " parts = line.split('\\t')\n", + " if len(parts) >= 2:\n", + " token = parts[0].lower()\n", + " pos = parts[1]\n", + "\n", + " # Ignore punctuation\n", + " if token in string.punctuation:\n", + " continue\n", + "\n", + " if pos in verb_pos:\n", + " verb_tokens.append(token)\n", + " elif pos in noun_pos:\n", + " noun_tokens.append(token)\n", + "\n", + " # Compute TTRs\n", + " ttr_verb = len(set(verb_tokens)) / len(verb_tokens) if verb_tokens else 0.0\n", + " ttr_noun = len(set(noun_tokens)) / len(noun_tokens) if noun_tokens else 0.0\n", + "\n", + " return {\n", + " 'verb_ttr': ttr_verb,\n", + " 'noun_ttr': ttr_noun\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "1363f307-fa4b-43ba-93d5-2d1c11ceb9e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Verb TTR: 0.667\n", + "Noun TTR: 1.000\n" + ] + } + ], + "source": [ + "result = compute_ttr_by_pos(sample_text)\n", + "print(f\"Verb TTR: {result['verb_ttr']:.3f}\")\n", + "print(f\"Noun TTR: {result['noun_ttr']:.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "1362e192-514a-4a77-a8cb-5c012026e2bb", + "metadata": {}, + "outputs": [], + "source": [ + "def compute_nv_ratios(text):\n", + " \"\"\"\n", + " Compute nominal/verb and verb/nominal ratios from column-formatted text.\n", + " - Uses the second column (POS).\n", + " - Verbs: 'v', 'vpp', 'vpr'\n", + " - Nouns: 'nc', 'npp'\n", + "\n", + " Returns:\n", + " - Dictionary with 'nominal_verb_ratio' and 'verb_nominal_ratio'\n", + " \"\"\"\n", + " verb_pos = {'v', 'vpp', 'vpr'}\n", + " noun_pos = {'nc', 'npp'}\n", + "\n", + " verb_count = 0\n", + " noun_count = 0\n", + "\n", + " for line in text.strip().splitlines():\n", + " if line.strip():\n", + " parts = line.split('\\t')\n", + " if len(parts) >= 2:\n", + " pos = parts[1]\n", + " if pos in verb_pos:\n", + " verb_count += 1\n", + " elif pos in noun_pos:\n", + " noun_count += 1\n", + "\n", + " nominal_verb_ratio = noun_count / verb_count if verb_count else float('inf')\n", + " verb_nominal_ratio = verb_count / noun_count if noun_count else float('inf')\n", + "\n", + " return {\n", + " 'nominal_verb_ratio': nominal_verb_ratio,\n", + " 'verb_nominal_ratio': verb_nominal_ratio\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "544ff6aa-4104-4580-a01f-97429ffcc228", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nominal/Verb Ratio: 1.00\n", + "Verb/Nominal Ratio: 1.00\n" + ] + } + ], + "source": [ + "ratios = compute_nv_ratios(sample_text)\n", + "print(f\"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}\")\n", + "print(f\"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}\")" + ] + }, + { + "cell_type": 
"markdown", + "id": "d3a929bf-61cb-4ef8-bc00-6e2a59760d37", + "metadata": {}, + "source": [ + "\n", + "## Readability" + ] + }, + { + "cell_type": "markdown", + "id": "3fe25ff0-3f83-40fe-8420-08c09ffe98e6", + "metadata": {}, + "source": [ + "### 📚 French Readability MetricsTodo: verify this\n", + "\n", + "This notebook implements and explains three common **readability formulas** tailored for **French texts**:\n", + "\n", + "---\n", + "\n", + "#### 1. 🟦 **Flesch–Douma Index**\n", + "\n", + "An adaptation of the original Flesch Reading Ease formula for French.\n", + "\n", + "$\\text{Flesch–Douma} = 207 - (1.015 \\times \\text{ASL}) - (73.6 \\times \\text{ASW})$\n", + "\n", + "Where:\n", + "- **ASL** = Average Sentence Length = (number of words) / (number of sentences)\n", + "- **ASW** = Average Syllables per Word = (number of syllables) / (number of words)\n", + "\n", + "📊 **Interpretation**:\n", + "- 90–100: Very easy\n", + "- 60–70: Standard\n", + "- 30–50: Difficult\n", + "- < 30: Very difficult\n", + "\n", + "---\n", + "\n", + "#### 2. 🟨 **LIX Index**\n", + "\n", + "Used widely in French and other European languages. Measures sentence length and lexical complexity.\n", + "\n", + "$\\text{LIX} = \\frac{\\text{number of words}}{\\text{number of sentences}} + \\frac{100 \\times \\text{number of long words (≥7 chars)}}{\\text{number of words}}$\n", + "\n", + "📊 **Interpretation**:\n", + "- $<$ 30: Easy\n", + "- 30–40: Medium\n", + "- $>$ 50: Difficult\n", + "\n", + "---\n", + "\n", + "#### 3. 🟥 **Kandel–Moles Index**\n", + "\n", + "A linear formula proposed for French readability:\n", + "\n", + "$\\text{Kandel–Moles} = 0.1935 \\times \\text{number of words} + 0.1672 \\times \\text{number of syllables} - 1.779$\n", + "\n", + "📊 **Interpretation**:\n", + "- Higher values indicate more complex texts.\n", + "\n", + "---\n", + "\n", + "These formulas help estimate how easily a French reader can understand a given passage. 
# Readability metric functions, with no external resource downloads required.
# (Duplicate copy of the cell patched into the checkpoint notebook; same fixes.)

import re


def naive_sentence_tokenize(text):
    """Split *text* into sentence fragments on runs of '.', '!' and '?'.

    The result may contain empty strings (e.g. after a trailing period);
    callers are expected to filter those out.
    """
    return re.split(r'[.!?]+', text.strip())


def naive_word_tokenize(text):
    """Lowercase *text* and return its list of ``\\w+`` word tokens."""
    return re.findall(r'\b\w+\b', text.lower())


def count_syllables(word):
    """Approximate the syllable count of a French word.

    Counts maximal groups of (accented) vowels; every word counts as at
    least one syllable.  Bug fix: the input is lowercased first, so
    capitalised words passed directly ("École") are counted correctly —
    the original vowel class was lowercase-only.
    """
    vowels = "aeiouyàâäéèêëîïôöùûüœ"
    groups = re.findall(rf"[{vowels}]+", word.lower())
    return max(1, len(groups))


def compute_french_readability(text):
    """Compute the Flesch-Douma, LIX and Kandel-Moles readability indices.

    Parameters:
    - text: str, plain French prose.

    Returns:
    - dict with keys "Flesch-Douma", "LIX" and "Kandel-Moles", each rounded
      to 2 decimals; all 0.0 when the text contains no sentence or no word.
    """
    sentences = [s for s in naive_sentence_tokenize(text) if s.strip()]
    # naive_word_tokenize already yields only \w+ tokens, so the original
    # per-token re.match(r"\w+", w) filter was always true and is dropped.
    words = naive_word_tokenize(text)

    num_sentences = len(sentences)
    num_words = len(words)
    num_syllables = sum(count_syllables(w) for w in words)
    num_long_words = sum(1 for w in words if len(w) >= 7)

    if num_sentences == 0 or num_words == 0:
        return {
            "Flesch-Douma": 0.0,
            "LIX": 0.0,
            "Kandel-Moles": 0.0,
        }

    asl = num_words / num_sentences   # average sentence length
    asw = num_syllables / num_words   # average syllables per word

    flesch_douma = 207 - (1.015 * asl) - (73.6 * asw)
    lix = asl + (100 * num_long_words / num_words)
    kandel_moles = 0.1935 * num_words + 0.1672 * num_syllables - 1.779

    return {
        "Flesch-Douma": round(flesch_douma, 2),
        "LIX": round(lix, 2),
        "Kandel-Moles": round(kandel_moles, 2),
    }