From b290b140650bd16e46794c161746f096b591bfd4 Mon Sep 17 00:00:00 2001 From: BLADIER Tatiana <tatiana.bladier@lis-lab.fr> Date: Thu, 15 May 2025 12:46:38 +0200 Subject: [PATCH] add some more metrics from the canon paper --- .../tania-some-other-metrics-checkpoint.ipynb | 441 ++++++++++++++++-- tania_scripts/tania-some-other-metrics.ipynb | 441 ++++++++++++++++-- 2 files changed, 802 insertions(+), 80 deletions(-) diff --git a/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb b/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb index 432660b..fceab33 100644 --- a/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb +++ b/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb @@ -10,30 +10,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 1, "id": "510c3726-366d-4e26-a2bb-b55391b473bd", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package punkt to\n", - "[nltk_data] /home/tatiana.bladier/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import re\n", "import nltk\n", @@ -45,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "id": "ee59c294-fdcd-429e-a126-734480d1b0ba", "metadata": {}, "outputs": [], @@ -71,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "id": "b6ae41ef-116f-473d-b3f3-115d90fe65b7", "metadata": {}, "outputs": [], @@ -108,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 5, "id": "2a882cc9-8f9d-4457-becb-d2e26ab3f14f", "metadata": {}, "outputs": [ @@ -116,18 +96,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Type/Token Ratio: 0.933\n" + "Type/Token Ratio: 0.923\n" ] } ], "source": [ - "ttr = compute_ttr(sample_text)\n", + "ttr = compute_ttr_cleaned(sample_text)\n", "print(f\"Type/Token Ratio: {ttr:.3f}\")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "id": "8897dcc3-4218-4ee5-9984-17b9a6d8dce2", "metadata": {}, "outputs": [], @@ -152,6 +132,7 @@ " verb_tokens = []\n", " noun_tokens = []\n", "\n", + "\n", " for line in text.strip().splitlines():\n", " if line.strip():\n", " parts = line.split('\\t')\n", @@ -172,15 +153,17 @@ " ttr_verb = len(set(verb_tokens)) / len(verb_tokens) if verb_tokens else 0.0\n", " ttr_noun = len(set(noun_tokens)) / len(noun_tokens) if noun_tokens else 0.0\n", "\n", + "\n", + "\n", " return {\n", " 'verb_ttr': ttr_verb,\n", - " 'noun_ttr': ttr_noun\n", + " 'noun_ttr': ttr_noun, \n", " }" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "id": "1363f307-fa4b-43ba-93d5-2d1c11ceb9e4", "metadata": {}, "outputs": [ @@ -201,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "id": "1362e192-514a-4a77-a8cb-5c012026e2bb", "metadata": {}, "outputs": [], @@ -218,9 +201,13 @@ " \"\"\"\n", " verb_pos = {'v', 'vpp', 'vpr'}\n", " noun_pos = {'nc', 'npp'}\n", + " adj_pos = {'adj'}\n", + " adv_pos = {'adv'}\n", "\n", " verb_count = 0\n", " noun_count = 0\n", + " adj_count = 0\n", + " adv_count = 0\n", "\n", " for line in text.strip().splitlines():\n", " if line.strip():\n", @@ -229,21 +216,29 @@ " pos = parts[1]\n", " if pos in verb_pos:\n", " verb_count += 1\n", - " 
elif pos in noun_pos:\n", + " if pos in noun_pos:\n", " noun_count += 1\n", + " if pos in adj_pos:\n", + " adj_count += 1\n", + " if pos in adv_pos:\n", + " adv_count += 1\n", "\n", " nominal_verb_ratio = noun_count / verb_count if verb_count else float('inf')\n", " verb_nominal_ratio = verb_count / noun_count if noun_count else float('inf')\n", + " adv_verb_ratio = adv_count / verb_count if verb_count else float('inf')\n", + " adj_noun_verb_ratio = (adj_count + noun_count) / verb_count if verb_count else float('inf')\n", "\n", " return {\n", " 'nominal_verb_ratio': nominal_verb_ratio,\n", - " 'verb_nominal_ratio': verb_nominal_ratio\n", + " 'verb_nominal_ratio': verb_nominal_ratio, \n", + " 'adv_verb_ratio': adv_verb_ratio,\n", + " 'adj_noun_verb_ratio': adj_noun_verb_ratio\n", " }" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "id": "544ff6aa-4104-4580-a01f-97429ffcc228", "metadata": {}, "outputs": [ @@ -252,14 +247,18 @@ "output_type": "stream", "text": [ "Nominal/Verb Ratio: 1.00\n", - "Verb/Nominal Ratio: 1.00\n" + "Verb/Nominal Ratio: 1.00\n", + "Adverb/Verb Ratio: 0.00\n", + "Adj+Noun/Verb Ratio: 1.00\n" ] } ], "source": [ "ratios = compute_nv_ratios(sample_text)\n", "print(f\"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}\")\n", - "print(f\"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}\")" + "print(f\"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}\")\n", + "print(f\"Adverb/Verb Ratio: {ratios['adv_verb_ratio']:.2f}\")\n", + "print(f\"Adj+Noun/Verb Ratio: {ratios['adj_noun_verb_ratio']:.2f}\")" ] }, { @@ -329,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "id": "b9052dc2-ce45-4af4-a0a0-46c60a13da12", "metadata": {}, "outputs": [], @@ -389,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 11, "id": "1e9dd0fb-db6a-47d1-8bfb-1015845f6d3e", "metadata": {}, "outputs": [ @@ -399,7 +398,7 @@ "{'Flesch-Douma': 88.68, 'LIX': 11.55, 'Kandel-Moles': 5.86}" ] }, - "execution_count": 25, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -412,10 +411,372 @@ "compute_french_readability(sample_french_text)" ] }, + { + "cell_type": "markdown", + "id": "8a0c0fff-d605-4349-a698-a11fd404e2e8", + "metadata": {}, + "source": [ + "## Calculate avg scores" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "24bc84a5-b2df-4194-838a-8f24302599bd", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the function to compute average word length and sentence length\n", + "def compute_avg_lengths(sample_text):\n", + " sentences = []\n", + " current_sentence = []\n", + " \n", + " for line in sample_text.strip().split('\\n'):\n", + " cols = line.strip().split('\\t')\n", + " if not cols or len(cols) < 1:\n", + " continue\n", + " token = cols[0]\n", + "\n", + " if token == '<s>':\n", + " current_sentence = []\n", + " elif token == '</s>':\n", + " if current_sentence:\n", + " sentences.append(current_sentence)\n", + " else:\n", + " current_sentence.append(token)\n", + "\n", + " total_words = 0\n", + " total_word_length = 0\n", + " sentence_lengths = []\n", + "\n", + " for sentence in sentences:\n", + " words = [w for w in sentence if re.match(r'\\w+', w) and w not in ['<s>', '</s>']]\n", + " sentence_lengths.append(len(words))\n", + " total_words += len(words)\n", + " total_word_length += sum(len(w) for w in words)\n", + "\n", + " avg_word_length = total_word_length / total_words if total_words else 0\n", + " avg_sentence_length = 
sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0\n",
+ "\n",
+ "    return {\n",
+ "        \"Average Word Length\": round(avg_word_length, 2),\n",
+ "        \"Average Sentence Length\": round(avg_sentence_length, 2)\n",
+ "    }\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "0cdb972f-31b6-4e7e-82a8-371eda344f2c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Average Word Length': 3.79, 'Average Sentence Length': 7.0}"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sample text from the user\n",
+ "sample_text = \"\"\"\n",
+ "<s>	<s>	<s>	0\n",
+ "Aramis	npp	<nul>@@<nul>	0\n",
+ "était	v	<nul>@@<nul>	0\n",
+ "à	p	<nul>@@<nul>	0\n",
+ "son	det	NP@@<nul>	0\n",
+ "poste	nc	<nul>@@<nul>	1\n",
+ ",	ponct	<nul>@@<nul>	0\n",
+ "il	cls-suj	VN@@<nul>	0\n",
+ "était	v	<nul>@@<nul>	1\n",
+ "tombé	vpp	<nul>@@<nul>	1\n",
+ "de	p	PP-DE_OBJ@@Sint-MOD	1\n",
+ "ses	det	NP@@<nul>	2\n",
+ "bras	nc	<nul>@@<nul>	3\n",
+ ".	ponct	<nul>@@<nul>	0\n",
+ "</s>	</s>	</s>	0\n",
+ "<s>	<s>	<s>	0\n",
+ "Aramis	npp	<nul>@@<nul>	0\n",
+ "était	v	<nul>@@<nul>	0\n",
+ "à	p	<nul>@@<nul>	0\n",
+ "</s>	</s>	</s>	0\n",
+ "\"\"\"\n",
+ "\n",
+ "# Compute and display the results\n",
+ "compute_avg_lengths(sample_text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bf5b0b52-e5c4-4b40-b925-495f4dd8e3be",
+ "metadata": {},
+ "source": [
+ "## Calculate POS frequencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "56af520c-d56b-404a-aebf-ad7c2a9ca503",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def compute_pos_frequency(column_text):\n",
+ "    verb_tags = {\"v\", \"vpp\", \"vpr\"}\n",
+ "    noun_tags = {'nc', 'npp'}\n",
+ "    adj_tags = {'adj'}\n",
+ "    adv_tags = {'adv'}\n",
+ "\n",
+ "    total_tokens = 0\n",
+ "    verb_count = 0\n",
+ "    noun_count = 0\n",
+ "    adj_count = 0\n",
+ "    adv_count = 0\n",
+ "\n",
+ "    for line in column_text.strip().split('\\n'):\n",
+ "        parts = line.strip().split('\\t')\n",
+ "        if len(parts) < 2:\n",
+ "            continue\n",
+ "        token, pos = parts[0], parts[1]\n",
+ "        if re.match(r'\\w+', token):  # ignore punctuation\n",
+ "            total_tokens += 1\n",
+ "            if pos in verb_tags:\n",
+ "                verb_count += 1\n",
+ "            if pos in noun_tags:\n",
+ "                noun_count += 1\n",
+ "            if pos in adj_tags:\n",
+ "                adj_count += 1\n",
+ "            if pos in adv_tags:\n",
+ "                adv_count += 1\n",
+ "\n",
+ "    if total_tokens == 0:\n",
+ "        return {'verb_freq': 0.0, 'noun_freq': 0.0, 'adv_freq': 0.0, 'adj_freq': 0.0}\n",
+ "\n",
+ "    return {\n",
+ "        'verb_freq': round(verb_count / total_tokens, 4),\n",
+ "        'noun_freq': round(noun_count / total_tokens, 4),\n",
+ "        'adv_freq': round(adv_count / total_tokens, 4),\n",
+ "        'adj_freq': round(adj_count / total_tokens, 4),\n",
+ "    }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "f7c8b125-4651-4b21-bcc4-93ef78a4239b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Verb Frequency: 0.29\n",
+ "Noun Frequency: 0.29\n",
+ "Adj Frequency: 0.00\n",
+ "Adv Frequency: 0.00\n"
+ ]
+ }
+ ],
+ "source": [
+ "freqs = compute_pos_frequency(sample_text)\n",
+ "\n",
+ "print(f\"Verb Frequency: {freqs['verb_freq']:.2f}\")\n",
+ "print(f\"Noun Frequency: {freqs['noun_freq']:.2f}\")\n",
+ "print(f\"Adj Frequency: {freqs['adj_freq']:.2f}\")\n",
+ "print(f\"Adv Frequency: {freqs['adv_freq']:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cd15f8f-5618-4586-bd43-30f4919c7274",
+ "metadata": {},
+ "source": [
+ "### MSTTR-100 (Mean Segmental Type-Token Ratio)\n",
+ "\n",
+ "MSTTR-100 measures lexical diversity by dividing the text into consecutive segments of 100 tokens and computing the type-token ratio (TTR) for each segment. The final MSTTR-100 is the average TTR over all complete segments; a shorter final remainder is discarded, and texts with fewer than 100 tokens fall back to the plain TTR."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "daa17c33-adca-4695-90eb-741579382939",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "def msttr(text, segment_size):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    if len(words) < segment_size:\n",
+ "        return len(set(words)) / len(words) if words else 0.0\n",
+ "\n",
+ "    segments = [words[i:i+segment_size] for i in range(0, len(words), segment_size)]\n",
+ "    ttrs = [len(set(segment)) / len(segment) for segment in segments if len(segment) == segment_size]\n",
+ "    return sum(ttrs) / len(ttrs)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "80d8fa08-6b7d-4ab7-85cd-987823639277",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MSTTR: 0.8823529411764706\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"MSTTR: \", msttr(sample_french_text, 100))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91c7969a-3fff-4935-9f26-7e1ebb6b64c6",
+ "metadata": {},
+ "source": [
+ "### BZIP TXT\n",
+ "\n",
+ "\"BZIP TXT\" refers to the compression ratio achieved by compressing the text using the BZIP2 algorithm (compressed size divided by original size). It serves as a proxy for the text's redundancy and complexity. Because bzip2 adds a fixed header, very short texts such as the sample below can yield ratios above 1.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "c8bd9186-eab8-4ca6-93bd-82b260cd3d19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import bz2\n",
+ "\n",
+ "def bzip_compression_ratio(text):\n",
+ "    original_size = len(text.encode('utf-8'))\n",
+ "    compressed_size = len(bz2.compress(text.encode('utf-8')))\n",
+ "    return compressed_size / original_size"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "3f9c7dc7-6820-4013-a85c-2af4f846d4f5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "BZIP: 1.1931818181818181\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"BZIP: \", bzip_compression_ratio(sample_french_text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88b6f5f8-90b7-4dfe-b8ee-d54380bf3194",
+ "metadata": {},
+ "source": [
+ "### Word Entropy\n",
+ "\n",
+ "Word entropy quantifies the unpredictability or information content of words in a text. It's calculated using Shannon's entropy formula over the distribution of word frequencies: H = -Σ p(w) log2 p(w), where p(w) is the relative frequency of word w in the text."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "65e1a630-c46e-4b18-9831-b97864de53ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "import math\n",
+ "from collections import Counter\n",
+ "\n",
+ "def word_entropy(text):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    total_words = len(words)\n",
+ "    word_counts = Counter(words)\n",
+ "    return -sum((count/total_words) * math.log2(count/total_words) for count in word_counts.values()) if total_words else 0.0\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "1612e911-12a8-47c9-b811-b2d6885c3647",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WORD ENTROPY: 3.807763576417195\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"WORD ENTROPY: \", word_entropy(sample_french_text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a58d962f-5d90-4ee9-b347-64f5bb52c24a",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### Bigram Entropy\n",
+ "\n",
+ "Bigram entropy measures the unpredictability of word pairs (bigrams) in a text, providing insight into the text's syntactic complexity."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "925a3a75-aaaa-4851-b77b-b42cb1e21e11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def bigram_entropy(text):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    bigrams = list(zip(words, words[1:]))\n",
+ "    total_bigrams = len(bigrams)\n",
+ "    bigram_counts = Counter(bigrams)\n",
+ "    return -sum((count/total_bigrams) * math.log2(count/total_bigrams) for count in bigram_counts.values()) if total_bigrams else 0.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "6fa60897-ad26-43b4-b8de-861290ca6bd3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "BIGRAM ENTROPY: 4.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"BIGRAM ENTROPY: \", bigram_entropy(sample_french_text))"
+ ]
+ },
 {
 "cell_type": "code",
 "execution_count": null,
- "id": "b2cd53c6-1c16-4eaf-8ac8-af166afaa97b",
+ "id": "d2afe949-9351-4ec8-aefc-7fe79b7c5a88",
 "metadata": {},
 "outputs": [],
 "source": []
diff --git a/tania_scripts/tania-some-other-metrics.ipynb b/tania_scripts/tania-some-other-metrics.ipynb
index 432660b..fceab33 100644
--- a/tania_scripts/tania-some-other-metrics.ipynb
+++ b/tania_scripts/tania-some-other-metrics.ipynb
@@ -10,30 +10,10 @@
 },
 {
 "cell_type": "code",
- "execution_count": 12,
+ "execution_count": 1,
 "id": "510c3726-366d-4e26-a2bb-b55391b473bd",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[nltk_data] Downloading package punkt to\n",
- "[nltk_data]     /home/tatiana.bladier/nltk_data...\n",
- "[nltk_data]   Package punkt is already up-to-date!\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
 "source": [
 "import re\n",
 "import nltk\n",
@@ -45,7 +25,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 13,
+ "execution_count": 2,
 "id": "ee59c294-fdcd-429e-a126-734480d1b0ba",
 "metadata": {},
 "outputs": [],
@@ -71,7 +51,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 14,
+ "execution_count": 3,
 "id": "b6ae41ef-116f-473d-b3f3-115d90fe65b7",
 "metadata": {},
 "outputs": [],
@@ -108,7 +88,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 15,
+ "execution_count": 5,
 "id": "2a882cc9-8f9d-4457-becb-d2e26ab3f14f",
 "metadata": {},
 "outputs": [
@@ -116,18 +96,18 @@
 "name": "stdout",
 "output_type": 
"stream", "text": [ - "Type/Token Ratio: 0.933\n" + "Type/Token Ratio: 0.923\n" ] } ], "source": [ - "ttr = compute_ttr(sample_text)\n", + "ttr = compute_ttr_cleaned(sample_text)\n", "print(f\"Type/Token Ratio: {ttr:.3f}\")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "id": "8897dcc3-4218-4ee5-9984-17b9a6d8dce2", "metadata": {}, "outputs": [], @@ -152,6 +132,7 @@ " verb_tokens = []\n", " noun_tokens = []\n", "\n", + "\n", " for line in text.strip().splitlines():\n", " if line.strip():\n", " parts = line.split('\\t')\n", @@ -172,15 +153,17 @@ " ttr_verb = len(set(verb_tokens)) / len(verb_tokens) if verb_tokens else 0.0\n", " ttr_noun = len(set(noun_tokens)) / len(noun_tokens) if noun_tokens else 0.0\n", "\n", + "\n", + "\n", " return {\n", " 'verb_ttr': ttr_verb,\n", - " 'noun_ttr': ttr_noun\n", + " 'noun_ttr': ttr_noun, \n", " }" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "id": "1363f307-fa4b-43ba-93d5-2d1c11ceb9e4", "metadata": {}, "outputs": [ @@ -201,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "id": "1362e192-514a-4a77-a8cb-5c012026e2bb", "metadata": {}, "outputs": [], @@ -218,9 +201,13 @@ " \"\"\"\n", " verb_pos = {'v', 'vpp', 'vpr'}\n", " noun_pos = {'nc', 'npp'}\n", + " adj_pos = {'adj'}\n", + " adv_pos = {'adv'}\n", "\n", " verb_count = 0\n", " noun_count = 0\n", + " adj_count = 0\n", + " adv_count = 0\n", "\n", " for line in text.strip().splitlines():\n", " if line.strip():\n", @@ -229,21 +216,29 @@ " pos = parts[1]\n", " if pos in verb_pos:\n", " verb_count += 1\n", - " elif pos in noun_pos:\n", + " if pos in noun_pos:\n", " noun_count += 1\n", + " if pos in adj_pos:\n", + " adj_count += 1\n", + " if pos in adv_pos:\n", + " adv_count += 1\n", "\n", " nominal_verb_ratio = noun_count / verb_count if verb_count else float('inf')\n", " verb_nominal_ratio = verb_count / noun_count if noun_count else float('inf')\n", + " adv_verb_ratio = adv_count / verb_count if verb_count else float('inf')\n", + " adj_noun_verb_ratio = (adj_count + noun_count) / verb_count if verb_count else float('inf')\n", "\n", " return {\n", " 'nominal_verb_ratio': nominal_verb_ratio,\n", - " 'verb_nominal_ratio': verb_nominal_ratio\n", + " 'verb_nominal_ratio': verb_nominal_ratio, \n", + " 'adv_verb_ratio': adv_verb_ratio,\n", + " 'adj_noun_verb_ratio': adj_noun_verb_ratio\n", " }" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "id": "544ff6aa-4104-4580-a01f-97429ffcc228", "metadata": {}, "outputs": [ @@ -252,14 +247,18 @@ "output_type": "stream", "text": [ "Nominal/Verb Ratio: 1.00\n", - "Verb/Nominal Ratio: 1.00\n" + "Verb/Nominal Ratio: 1.00\n", + "Adverb/Verb Ratio: 0.00\n", + "Adj+Noun/Verb Ratio: 1.00\n" ] } ], "source": [ "ratios = compute_nv_ratios(sample_text)\n", "print(f\"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}\")\n", - "print(f\"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}\")" + "print(f\"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}\")\n", + "print(f\"Adverb/Verb Ratio: {ratios['adv_verb_ratio']:.2f}\")\n", + "print(f\"Adj+Noun/Verb Ratio: {ratios['adj_noun_verb_ratio']:.2f}\")" ] }, { @@ -329,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "id": "b9052dc2-ce45-4af4-a0a0-46c60a13da12", "metadata": {}, "outputs": [], @@ -389,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 11, "id": "1e9dd0fb-db6a-47d1-8bfb-1015845f6d3e", "metadata": {}, "outputs": [ @@ 
-399,7 +398,7 @@ "{'Flesch-Douma': 88.68, 'LIX': 11.55, 'Kandel-Moles': 5.86}" ] }, - "execution_count": 25, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -412,10 +411,372 @@ "compute_french_readability(sample_french_text)" ] }, + { + "cell_type": "markdown", + "id": "8a0c0fff-d605-4349-a698-a11fd404e2e8", + "metadata": {}, + "source": [ + "## Calculate avg scores" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "24bc84a5-b2df-4194-838a-8f24302599bd", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the function to compute average word length and sentence length\n", + "def compute_avg_lengths(sample_text):\n", + " sentences = []\n", + " current_sentence = []\n", + " \n", + " for line in sample_text.strip().split('\\n'):\n", + " cols = line.strip().split('\\t')\n", + " if not cols or len(cols) < 1:\n", + " continue\n", + " token = cols[0]\n", + "\n", + " if token == '<s>':\n", + " current_sentence = []\n", + " elif token == '</s>':\n", + " if current_sentence:\n", + " sentences.append(current_sentence)\n", + " else:\n", + " current_sentence.append(token)\n", + "\n", + " total_words = 0\n", + " total_word_length = 0\n", + " sentence_lengths = []\n", + "\n", + " for sentence in sentences:\n", + " words = [w for w in sentence if re.match(r'\\w+', w) and w not in ['<s>', '</s>']]\n", + " sentence_lengths.append(len(words))\n", + " total_words += len(words)\n", + " total_word_length += sum(len(w) for w in words)\n", + "\n", + " avg_word_length = total_word_length / total_words if total_words else 0\n", + " avg_sentence_length = sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0\n", + "\n", + " return {\n", + " \"Average Word Length\": round(avg_word_length, 2),\n", + " \"Average Sentence Length\": round(avg_sentence_length, 2)\n", + " }\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0cdb972f-31b6-4e7e-82a8-371eda344f2c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Average Word Length': 3.79, 'Average Sentence Length': 7.0}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sample text from the user\n", + "sample_text = \"\"\"\n", + "<s>\t<s>\t<s>\t0\n", + "Aramis\tnpp\t<nul>@@<nul>\t0\n", + "était\tv\t<nul>@@<nul>\t0\n", + "à\tp\t<nul>@@<nul>\t0\n", + "son\tdet\tNP@@<nul>\t0\n", + "poste\tnc\t<nul>@@<nul>\t1\n", + ",\tponct\t<nul>@@<nul>\t0\n", + "il\tcls-suj\tVN@@<nul>\t0\n", + "était\tv\t<nul>@@<nul>\t1\n", + "tombé\tvpp\t<nul>@@<nul>\t1\n", + "de\tp\tPP-DE_OBJ@@Sint-MOD\t1\n", + "ses\tdet\tNP@@<nul>\t2\n", + "bras\tnc\t<nul>@@<nul>\t3\n", + ".\tponct\t<nul>@@<nul>\t0\n", + "</s>\t</s>\t</s>\t0\n", + "<s>\t<s>\t<s>\t0\n", + "Aramis\tnpp\t<nul>@@<nul>\t0\n", + "était\tv\t<nul>@@<nul>\t0\n", + "à\tp\t<nul>@@<nul>\t0\n", + "</s>\t</s>\t</s>\t0\n", + "\"\"\"\n", + "\n", + "# Compute and display the results\n", + "compute_avg_lengths(sample_text)" + ] + }, + { + "cell_type": "markdown", + "id": "bf5b0b52-e5c4-4b40-b925-495f4dd8e3be", + "metadata": {}, + "source": [ + "## Calculate POS frequencies" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "56af520c-d56b-404a-aebf-ad7c2a9ca503", + "metadata": {}, + "outputs": [], + "source": [ + "def compute_pos_frequency(column_text):\n", + " verb_tags = {\"v\", \"vpp\", \"vpr\"}\n", + " noun_tags = {'nc', 'npp'}\n", + " adj_tags = {'adj'}\n", + " adv_tags = {'adv'}\n", + "\n", + " total_tokens = 0\n", + " verb_count = 0\n", + " 
noun_count = 0\n",
+ "    adj_count = 0\n",
+ "    adv_count = 0\n",
+ "\n",
+ "    for line in column_text.strip().split('\\n'):\n",
+ "        parts = line.strip().split('\\t')\n",
+ "        if len(parts) < 2:\n",
+ "            continue\n",
+ "        token, pos = parts[0], parts[1]\n",
+ "        if re.match(r'\\w+', token):  # ignore punctuation\n",
+ "            total_tokens += 1\n",
+ "            if pos in verb_tags:\n",
+ "                verb_count += 1\n",
+ "            if pos in noun_tags:\n",
+ "                noun_count += 1\n",
+ "            if pos in adj_tags:\n",
+ "                adj_count += 1\n",
+ "            if pos in adv_tags:\n",
+ "                adv_count += 1\n",
+ "\n",
+ "    if total_tokens == 0:\n",
+ "        return {'verb_freq': 0.0, 'noun_freq': 0.0, 'adv_freq': 0.0, 'adj_freq': 0.0}\n",
+ "\n",
+ "    return {\n",
+ "        'verb_freq': round(verb_count / total_tokens, 4),\n",
+ "        'noun_freq': round(noun_count / total_tokens, 4),\n",
+ "        'adv_freq': round(adv_count / total_tokens, 4),\n",
+ "        'adj_freq': round(adj_count / total_tokens, 4),\n",
+ "    }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "f7c8b125-4651-4b21-bcc4-93ef78a4239b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Verb Frequency: 0.29\n",
+ "Noun Frequency: 0.29\n",
+ "Adj Frequency: 0.00\n",
+ "Adv Frequency: 0.00\n"
+ ]
+ }
+ ],
+ "source": [
+ "freqs = compute_pos_frequency(sample_text)\n",
+ "\n",
+ "print(f\"Verb Frequency: {freqs['verb_freq']:.2f}\")\n",
+ "print(f\"Noun Frequency: {freqs['noun_freq']:.2f}\")\n",
+ "print(f\"Adj Frequency: {freqs['adj_freq']:.2f}\")\n",
+ "print(f\"Adv Frequency: {freqs['adv_freq']:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cd15f8f-5618-4586-bd43-30f4919c7274",
+ "metadata": {},
+ "source": [
+ "### MSTTR-100 (Mean Segmental Type-Token Ratio)\n",
+ "\n",
+ "MSTTR-100 measures lexical diversity by dividing the text into consecutive segments of 100 tokens and computing the type-token ratio (TTR) for each segment. The final MSTTR-100 is the average TTR over all complete segments; a shorter final remainder is discarded, and texts with fewer than 100 tokens fall back to the plain TTR."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "daa17c33-adca-4695-90eb-741579382939",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "def msttr(text, segment_size):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    if len(words) < segment_size:\n",
+ "        return len(set(words)) / len(words) if words else 0.0\n",
+ "\n",
+ "    segments = [words[i:i+segment_size] for i in range(0, len(words), segment_size)]\n",
+ "    ttrs = [len(set(segment)) / len(segment) for segment in segments if len(segment) == segment_size]\n",
+ "    return sum(ttrs) / len(ttrs)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "80d8fa08-6b7d-4ab7-85cd-987823639277",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MSTTR: 0.8823529411764706\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"MSTTR: \", msttr(sample_french_text, 100))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91c7969a-3fff-4935-9f26-7e1ebb6b64c6",
+ "metadata": {},
+ "source": [
+ "### BZIP TXT\n",
+ "\n",
+ "\"BZIP TXT\" refers to the compression ratio achieved by compressing the text using the BZIP2 algorithm (compressed size divided by original size). It serves as a proxy for the text's redundancy and complexity. Because bzip2 adds a fixed header, very short texts such as the sample below can yield ratios above 1.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "c8bd9186-eab8-4ca6-93bd-82b260cd3d19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import bz2\n",
+ "\n",
+ "def bzip_compression_ratio(text):\n",
+ "    original_size = len(text.encode('utf-8'))\n",
+ "    compressed_size = len(bz2.compress(text.encode('utf-8')))\n",
+ "    return compressed_size / original_size"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "3f9c7dc7-6820-4013-a85c-2af4f846d4f5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "BZIP: 1.1931818181818181\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"BZIP: \", bzip_compression_ratio(sample_french_text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88b6f5f8-90b7-4dfe-b8ee-d54380bf3194",
+ "metadata": {},
+ "source": [
+ "### Word Entropy\n",
+ "\n",
+ "Word entropy quantifies the unpredictability or information content of words in a text. It's calculated using Shannon's entropy formula over the distribution of word frequencies: H = -Σ p(w) log2 p(w), where p(w) is the relative frequency of word w in the text."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "65e1a630-c46e-4b18-9831-b97864de53ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "import math\n",
+ "from collections import Counter\n",
+ "\n",
+ "def word_entropy(text):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    total_words = len(words)\n",
+ "    word_counts = Counter(words)\n",
+ "    return -sum((count/total_words) * math.log2(count/total_words) for count in word_counts.values()) if total_words else 0.0\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "1612e911-12a8-47c9-b811-b2d6885c3647",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WORD ENTROPY: 3.807763576417195\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"WORD ENTROPY: \", word_entropy(sample_french_text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a58d962f-5d90-4ee9-b347-64f5bb52c24a",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### Bigram Entropy\n",
+ "\n",
+ "Bigram entropy measures the unpredictability of word pairs (bigrams) in a text, providing insight into the text's syntactic complexity."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "925a3a75-aaaa-4851-b77b-b42cb1e21e11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def bigram_entropy(text):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    bigrams = list(zip(words, words[1:]))\n",
+ "    total_bigrams = len(bigrams)\n",
+ "    bigram_counts = Counter(bigrams)\n",
+ "    return -sum((count/total_bigrams) * math.log2(count/total_bigrams) for count in bigram_counts.values()) if total_bigrams else 0.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "6fa60897-ad26-43b4-b8de-861290ca6bd3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "BIGRAM ENTROPY: 4.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"BIGRAM ENTROPY: \", bigram_entropy(sample_french_text))"
+ ]
+ },
 {
 "cell_type": "code",
 "execution_count": null,
- "id": "b2cd53c6-1c16-4eaf-8ac8-af166afaa97b",
+ "id": "d2afe949-9351-4ec8-aefc-7fe79b7c5a88",
 "metadata": {},
 "outputs": [],
 "source": []
-- 
GitLab
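
The four plain-text metrics added by this patch (MSTTR, BZIP ratio, word entropy, bigram entropy) all take a raw string, so they can be bundled into a single report once the notebook cells above have been run. A minimal sketch, not part of the patch itself; it assumes msttr, bzip_compression_ratio, word_entropy and bigram_entropy are already in scope, and the helper name text_metrics_report is hypothetical:

    # Minimal sketch: aggregate the canon-paper plain-text metrics for one text.
    # Assumes msttr, bzip_compression_ratio, word_entropy and bigram_entropy
    # from the notebook cells above are defined in the current session;
    # text_metrics_report is a hypothetical helper, not from the notebook.
    def text_metrics_report(text, segment_size=100):
        return {
            'msttr': round(msttr(text, segment_size), 4),
            'bzip_ratio': round(bzip_compression_ratio(text), 4),
            'word_entropy': round(word_entropy(text), 4),
            'bigram_entropy': round(bigram_entropy(text), 4),
        }

    # On sample_french_text this reproduces the values printed above:
    # {'msttr': 0.8824, 'bzip_ratio': 1.1932, 'word_entropy': 3.8078, 'bigram_entropy': 4.0}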