From b290b140650bd16e46794c161746f096b591bfd4 Mon Sep 17 00:00:00 2001 From: BLADIER Tatiana <tatiana.bladier@lis-lab.fr> Date: Thu, 15 May 2025 12:46:38 +0200 Subject: [PATCH] add some more metrics from the canon paper --- .../tania-some-other-metrics-checkpoint.ipynb | 441 ++++++++++++++++-- tania_scripts/tania-some-other-metrics.ipynb | 441 ++++++++++++++++-- 2 files changed, 802 insertions(+), 80 deletions(-) diff --git a/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb b/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb index 432660b..fceab33 100644 --- a/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb +++ b/tania_scripts/.ipynb_checkpoints/tania-some-other-metrics-checkpoint.ipynb @@ -10,30 +10,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 1, "id": "510c3726-366d-4e26-a2bb-b55391b473bd", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package punkt to\n", - "[nltk_data] /home/tatiana.bladier/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import re\n", "import nltk\n", @@ -45,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "id": "ee59c294-fdcd-429e-a126-734480d1b0ba", "metadata": {}, "outputs": [], @@ -71,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "id": "b6ae41ef-116f-473d-b3f3-115d90fe65b7", "metadata": {}, "outputs": [], @@ -108,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 5, "id": "2a882cc9-8f9d-4457-becb-d2e26ab3f14f", "metadata": {}, "outputs": [ @@ -116,18 +96,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Type/Token Ratio: 0.933\n" + "Type/Token Ratio: 0.923\n" ] } ], "source": [ - "ttr = compute_ttr(sample_text)\n", + "ttr = compute_ttr_cleaned(sample_text)\n", "print(f\"Type/Token Ratio: {ttr:.3f}\")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "id": "8897dcc3-4218-4ee5-9984-17b9a6d8dce2", "metadata": {}, "outputs": [], @@ -152,6 +132,7 @@ " verb_tokens = []\n", " noun_tokens = []\n", "\n", + "\n", " for line in text.strip().splitlines():\n", " if line.strip():\n", " parts = line.split('\\t')\n", @@ -172,15 +153,17 @@ " ttr_verb = len(set(verb_tokens)) / len(verb_tokens) if verb_tokens else 0.0\n", " ttr_noun = len(set(noun_tokens)) / len(noun_tokens) if noun_tokens else 0.0\n", "\n", + "\n", + "\n", " return {\n", " 'verb_ttr': ttr_verb,\n", - " 'noun_ttr': ttr_noun\n", + " 'noun_ttr': ttr_noun, \n", " }" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "id": "1363f307-fa4b-43ba-93d5-2d1c11ceb9e4", "metadata": {}, "outputs": [ @@ -201,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "id": "1362e192-514a-4a77-a8cb-5c012026e2bb", "metadata": {}, "outputs": [], @@ -218,9 +201,13 @@ " \"\"\"\n", " verb_pos = {'v', 'vpp', 'vpr'}\n", " noun_pos = {'nc', 'npp'}\n", + " adj_pos = {'adj'}\n", + " adv_pos = {'adv'}\n", "\n", " verb_count = 0\n", " noun_count = 0\n", + " adj_count = 0\n", + " adv_count = 0\n", "\n", " for line in text.strip().splitlines():\n", " if line.strip():\n", @@ -229,21 +216,29 @@ " pos = parts[1]\n", " if pos in verb_pos:\n", " verb_count += 1\n", - " 
elif pos in noun_pos:\n", + " if pos in noun_pos:\n", " noun_count += 1\n", + " if pos in adj_pos:\n", + " adj_count += 1\n", + " if pos in adv_pos:\n", + " adv_count += 1\n", "\n", " nominal_verb_ratio = noun_count / verb_count if verb_count else float('inf')\n", " verb_nominal_ratio = verb_count / noun_count if noun_count else float('inf')\n", + " adv_verb_ratio = adv_count / verb_count if verb_count else float('inf')\n", + " adj_noun_verb_ratio = (adj_count + noun_count) / verb_count if verb_count else float('inf')\n", "\n", " return {\n", " 'nominal_verb_ratio': nominal_verb_ratio,\n", - " 'verb_nominal_ratio': verb_nominal_ratio\n", + " 'verb_nominal_ratio': verb_nominal_ratio, \n", + " 'adv_verb_ratio': adv_verb_ratio,\n", + " 'adj_noun_verb_ratio': adj_noun_verb_ratio\n", " }" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "id": "544ff6aa-4104-4580-a01f-97429ffcc228", "metadata": {}, "outputs": [ @@ -252,14 +247,18 @@ "output_type": "stream", "text": [ "Nominal/Verb Ratio: 1.00\n", - "Verb/Nominal Ratio: 1.00\n" + "Verb/Nominal Ratio: 1.00\n", + "Adverb/Verb Ratio: 0.00\n", + "Adj+Noun/Verb Ratio: 1.00\n" ] } ], "source": [ "ratios = compute_nv_ratios(sample_text)\n", "print(f\"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}\")\n", - "print(f\"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}\")" + "print(f\"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}\")\n", + "print(f\"Adverb/Verb Ratio: {ratios['adv_verb_ratio']:.2f}\")\n", + "print(f\"Adj+Noun/Verb Ratio: {ratios['adj_noun_verb_ratio']:.2f}\")" ] }, { @@ -329,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "id": "b9052dc2-ce45-4af4-a0a0-46c60a13da12", "metadata": {}, "outputs": [], @@ -389,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 11, "id": "1e9dd0fb-db6a-47d1-8bfb-1015845f6d3e", "metadata": {}, "outputs": [ @@ -399,7 +398,7 @@ "{'Flesch-Douma': 88.68, 'LIX': 11.55, 'Kandel-Moles': 5.86}" ] }, - "execution_count": 25, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -412,10 +411,372 @@ "compute_french_readability(sample_french_text)" ] }, + { + "cell_type": "markdown", + "id": "8a0c0fff-d605-4349-a698-a11fd404e2e8", + "metadata": {}, + "source": [ + "## Calculate avg scores" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "24bc84a5-b2df-4194-838a-8f24302599bd", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the function to compute average word length and sentence length\n", + "def compute_avg_lengths(sample_text):\n", + " sentences = []\n", + " current_sentence = []\n", + " \n", + " for line in sample_text.strip().split('\\n'):\n", + " cols = line.strip().split('\\t')\n", + " if not cols or len(cols) < 1:\n", + " continue\n", + " token = cols[0]\n", + "\n", + " if token == '<s>':\n", + " current_sentence = []\n", + " elif token == '</s>':\n", + " if current_sentence:\n", + " sentences.append(current_sentence)\n", + " else:\n", + " current_sentence.append(token)\n", + "\n", + " total_words = 0\n", + " total_word_length = 0\n", + " sentence_lengths = []\n", + "\n", + " for sentence in sentences:\n", + " words = [w for w in sentence if re.match(r'\\w+', w) and w not in ['<s>', '</s>']]\n", + " sentence_lengths.append(len(words))\n", + " total_words += len(words)\n", + " total_word_length += sum(len(w) for w in words)\n", + "\n", + " avg_word_length = total_word_length / total_words if total_words else 0\n", + " avg_sentence_length = 
sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0\n",
+ "\n",
+ "    return {\n",
+ "        \"Average Word Length\": round(avg_word_length, 2),\n",
+ "        \"Average Sentence Length\": round(avg_sentence_length, 2)\n",
+ "    }\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "0cdb972f-31b6-4e7e-82a8-371eda344f2c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Average Word Length': 3.79, 'Average Sentence Length': 7.0}"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sample text from the user\n",
+ "sample_text = \"\"\"\n",
+ "<s>	<s>	<s>	0\n",
+ "Aramis	npp	<nul>@@<nul>	0\n",
+ "était	v	<nul>@@<nul>	0\n",
+ "à	p	<nul>@@<nul>	0\n",
+ "son	det	NP@@<nul>	0\n",
+ "poste	nc	<nul>@@<nul>	1\n",
+ ",	ponct	<nul>@@<nul>	0\n",
+ "il	cls-suj	VN@@<nul>	0\n",
+ "était	v	<nul>@@<nul>	1\n",
+ "tombé	vpp	<nul>@@<nul>	1\n",
+ "de	p	PP-DE_OBJ@@Sint-MOD	1\n",
+ "ses	det	NP@@<nul>	2\n",
+ "bras	nc	<nul>@@<nul>	3\n",
+ ".	ponct	<nul>@@<nul>	0\n",
+ "</s>	</s>	</s>	0\n",
+ "<s>	<s>	<s>	0\n",
+ "Aramis	npp	<nul>@@<nul>	0\n",
+ "était	v	<nul>@@<nul>	0\n",
+ "à	p	<nul>@@<nul>	0\n",
+ "</s>	</s>	</s>	0\n",
+ "\"\"\"\n",
+ "\n",
+ "# Compute and display the results\n",
+ "compute_avg_lengths(sample_text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bf5b0b52-e5c4-4b40-b925-495f4dd8e3be",
+ "metadata": {},
+ "source": [
+ "## Calculate POS frequencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "56af520c-d56b-404a-aebf-ad7c2a9ca503",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def compute_pos_frequency(column_text):\n",
+ "    verb_tags = {\"v\", \"vpp\", \"vpr\"}\n",
+ "    noun_tags = {'nc', 'npp'}\n",
+ "    adj_tags = {'adj'}\n",
+ "    adv_tags = {'adv'}\n",
+ "\n",
+ "    total_tokens = 0\n",
+ "    verb_count = 0\n",
+ "    noun_count = 0\n",
+ "    adj_count = 0\n",
+ "    adv_count = 0\n",
+ "\n",
+ "    for line in column_text.strip().split('\\n'):\n",
+ "        parts = line.strip().split('\\t')\n",
+ "        if len(parts) < 2:\n",
+ "            continue\n",
+ "        token, pos = parts[0], parts[1]\n",
+ "        if re.match(r'\\w+', token):  # ignore punctuation\n",
+ "            total_tokens += 1\n",
+ "            if pos in verb_tags:\n",
+ "                verb_count += 1\n",
+ "            if pos in noun_tags:\n",
+ "                noun_count += 1\n",
+ "            if pos in adj_tags:\n",
+ "                adj_count += 1\n",
+ "            if pos in adv_tags:\n",
+ "                adv_count += 1\n",
+ "\n",
+ "    if total_tokens == 0:\n",
+ "        return {'verb_freq': 0.0, 'noun_freq': 0.0, 'adv_freq': 0.0, 'adj_freq': 0.0}\n",
+ "\n",
+ "    return {\n",
+ "        'verb_freq': round(verb_count / total_tokens, 4),\n",
+ "        'noun_freq': round(noun_count / total_tokens, 4),\n",
+ "        'adv_freq': round(adv_count / total_tokens, 4),\n",
+ "        'adj_freq': round(adj_count / total_tokens, 4),\n",
+ "    }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "f7c8b125-4651-4b21-bcc4-93ef78a4239b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Verb Frequency: 0.29\n",
+ "Noun Frequency: 0.29\n",
+ "Adj Frequency: 0.00\n",
+ "Adv Frequency: 0.00\n"
+ ]
+ }
+ ],
+ "source": [
+ "freqs = compute_pos_frequency(sample_text)\n",
+ "\n",
+ "print(f\"Verb Frequency: {freqs['verb_freq']:.2f}\")\n",
+ "print(f\"Noun Frequency: {freqs['noun_freq']:.2f}\")\n",
+ "print(f\"Adj Frequency: {freqs['adj_freq']:.2f}\")\n",
+ "print(f\"Adv Frequency: {freqs['adv_freq']:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cd15f8f-5618-4586-bd43-30f4919c7274",
+ "metadata": {},
+ "source": [
+ "### MSTTR-100 (Mean Segmental Type-Token Ratio)\n",
+ "\n",
+ "MSTTR-100 measures lexical diversity by dividing the text into consecutive segments of 100 tokens and computing the type-token ratio (TTR) for each segment. The final MSTTR-100 is the average TTR over all complete segments; a shorter final remainder is discarded, and texts with fewer than 100 tokens fall back to the plain TTR."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "daa17c33-adca-4695-90eb-741579382939",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "def msttr(text, segment_size):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    if len(words) < segment_size:\n",
+ "        return len(set(words)) / len(words) if words else 0.0\n",
+ "\n",
+ "    segments = [words[i:i+segment_size] for i in range(0, len(words), segment_size)]\n",
+ "    ttrs = [len(set(segment)) / len(segment) for segment in segments if len(segment) == segment_size]\n",
+ "    return sum(ttrs) / len(ttrs)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "80d8fa08-6b7d-4ab7-85cd-987823639277",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MSTTR: 0.8823529411764706\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"MSTTR: \", msttr(sample_french_text, 100))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91c7969a-3fff-4935-9f26-7e1ebb6b64c6",
+ "metadata": {},
+ "source": [
+ "### BZIP TXT\n",
+ "\n",
+ "\"BZIP TXT\" refers to the compression ratio achieved by compressing the text using the BZIP2 algorithm (compressed size divided by original size). It serves as a proxy for the text's redundancy and complexity. Because bzip2 adds a fixed header, very short texts such as the sample below can yield ratios above 1.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "c8bd9186-eab8-4ca6-93bd-82b260cd3d19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import bz2\n",
+ "\n",
+ "def bzip_compression_ratio(text):\n",
+ "    original_size = len(text.encode('utf-8'))\n",
+ "    compressed_size = len(bz2.compress(text.encode('utf-8')))\n",
+ "    return compressed_size / original_size"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "3f9c7dc7-6820-4013-a85c-2af4f846d4f5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "BZIP: 1.1931818181818181\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"BZIP: \", bzip_compression_ratio(sample_french_text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88b6f5f8-90b7-4dfe-b8ee-d54380bf3194",
+ "metadata": {},
+ "source": [
+ "### Word Entropy\n",
+ "\n",
+ "Word entropy quantifies the unpredictability or information content of words in a text. It's calculated using Shannon's entropy formula over the distribution of word frequencies: H = -Σ p(w) log2 p(w), where p(w) is the relative frequency of word w in the text."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "65e1a630-c46e-4b18-9831-b97864de53ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "import math\n",
+ "from collections import Counter\n",
+ "\n",
+ "def word_entropy(text):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    total_words = len(words)\n",
+ "    word_counts = Counter(words)\n",
+ "    return -sum((count/total_words) * math.log2(count/total_words) for count in word_counts.values()) if total_words else 0.0\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "1612e911-12a8-47c9-b811-b2d6885c3647",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WORD ENTROPY: 3.807763576417195\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"WORD ENTROPY: \", word_entropy(sample_french_text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a58d962f-5d90-4ee9-b347-64f5bb52c24a",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### Bigram Entropy\n",
+ "\n",
+ "Bigram entropy measures the unpredictability of word pairs (bigrams) in a text, providing insight into the text's syntactic complexity."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "925a3a75-aaaa-4851-b77b-b42cb1e21e11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def bigram_entropy(text):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    bigrams = list(zip(words, words[1:]))\n",
+ "    total_bigrams = len(bigrams)\n",
+ "    bigram_counts = Counter(bigrams)\n",
+ "    return -sum((count/total_bigrams) * math.log2(count/total_bigrams) for count in bigram_counts.values()) if total_bigrams else 0.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "6fa60897-ad26-43b4-b8de-861290ca6bd3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "BIGRAM ENTROPY: 4.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"BIGRAM ENTROPY: \", bigram_entropy(sample_french_text))"
+ ]
+ },
 {
 "cell_type": "code",
 "execution_count": null,
- "id": "b2cd53c6-1c16-4eaf-8ac8-af166afaa97b",
+ "id": "d2afe949-9351-4ec8-aefc-7fe79b7c5a88",
 "metadata": {},
 "outputs": [],
 "source": []
diff --git a/tania_scripts/tania-some-other-metrics.ipynb b/tania_scripts/tania-some-other-metrics.ipynb
index 432660b..fceab33 100644
--- a/tania_scripts/tania-some-other-metrics.ipynb
+++ b/tania_scripts/tania-some-other-metrics.ipynb
@@ -10,30 +10,10 @@
 },
 {
 "cell_type": "code",
- "execution_count": 12,
+ "execution_count": 1,
 "id": "510c3726-366d-4e26-a2bb-b55391b473bd",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[nltk_data] Downloading package punkt to\n",
- "[nltk_data]     /home/tatiana.bladier/nltk_data...\n",
- "[nltk_data]   Package punkt is already up-to-date!\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
 "source": [
 "import re\n",
 "import nltk\n",
@@ -45,7 +25,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 13,
+ "execution_count": 2,
 "id": "ee59c294-fdcd-429e-a126-734480d1b0ba",
 "metadata": {},
 "outputs": [],
@@ -71,7 +51,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 14,
+ "execution_count": 3,
 "id": "b6ae41ef-116f-473d-b3f3-115d90fe65b7",
 "metadata": {},
 "outputs": [],
@@ -108,7 +88,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 15,
+ "execution_count": 5,
 "id": "2a882cc9-8f9d-4457-becb-d2e26ab3f14f",
 "metadata": {},
 "outputs": [
@@ -116,18 +96,18 @@
 "name": "stdout",
 "output_type": 
"stream", "text": [ - "Type/Token Ratio: 0.933\n" + "Type/Token Ratio: 0.923\n" ] } ], "source": [ - "ttr = compute_ttr(sample_text)\n", + "ttr = compute_ttr_cleaned(sample_text)\n", "print(f\"Type/Token Ratio: {ttr:.3f}\")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "id": "8897dcc3-4218-4ee5-9984-17b9a6d8dce2", "metadata": {}, "outputs": [], @@ -152,6 +132,7 @@ " verb_tokens = []\n", " noun_tokens = []\n", "\n", + "\n", " for line in text.strip().splitlines():\n", " if line.strip():\n", " parts = line.split('\\t')\n", @@ -172,15 +153,17 @@ " ttr_verb = len(set(verb_tokens)) / len(verb_tokens) if verb_tokens else 0.0\n", " ttr_noun = len(set(noun_tokens)) / len(noun_tokens) if noun_tokens else 0.0\n", "\n", + "\n", + "\n", " return {\n", " 'verb_ttr': ttr_verb,\n", - " 'noun_ttr': ttr_noun\n", + " 'noun_ttr': ttr_noun, \n", " }" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "id": "1363f307-fa4b-43ba-93d5-2d1c11ceb9e4", "metadata": {}, "outputs": [ @@ -201,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "id": "1362e192-514a-4a77-a8cb-5c012026e2bb", "metadata": {}, "outputs": [], @@ -218,9 +201,13 @@ " \"\"\"\n", " verb_pos = {'v', 'vpp', 'vpr'}\n", " noun_pos = {'nc', 'npp'}\n", + " adj_pos = {'adj'}\n", + " adv_pos = {'adv'}\n", "\n", " verb_count = 0\n", " noun_count = 0\n", + " adj_count = 0\n", + " adv_count = 0\n", "\n", " for line in text.strip().splitlines():\n", " if line.strip():\n", @@ -229,21 +216,29 @@ " pos = parts[1]\n", " if pos in verb_pos:\n", " verb_count += 1\n", - " elif pos in noun_pos:\n", + " if pos in noun_pos:\n", " noun_count += 1\n", + " if pos in adj_pos:\n", + " adj_count += 1\n", + " if pos in adv_pos:\n", + " adv_count += 1\n", "\n", " nominal_verb_ratio = noun_count / verb_count if verb_count else float('inf')\n", " verb_nominal_ratio = verb_count / noun_count if noun_count else float('inf')\n", + " adv_verb_ratio = adv_count / verb_count if verb_count else float('inf')\n", + " adj_noun_verb_ratio = (adj_count + noun_count) / verb_count if verb_count else float('inf')\n", "\n", " return {\n", " 'nominal_verb_ratio': nominal_verb_ratio,\n", - " 'verb_nominal_ratio': verb_nominal_ratio\n", + " 'verb_nominal_ratio': verb_nominal_ratio, \n", + " 'adv_verb_ratio': adv_verb_ratio,\n", + " 'adj_noun_verb_ratio': adj_noun_verb_ratio\n", " }" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "id": "544ff6aa-4104-4580-a01f-97429ffcc228", "metadata": {}, "outputs": [ @@ -252,14 +247,18 @@ "output_type": "stream", "text": [ "Nominal/Verb Ratio: 1.00\n", - "Verb/Nominal Ratio: 1.00\n" + "Verb/Nominal Ratio: 1.00\n", + "Adverb/Verb Ratio: 0.00\n", + "Adj+Noun/Verb Ratio: 1.00\n" ] } ], "source": [ "ratios = compute_nv_ratios(sample_text)\n", "print(f\"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}\")\n", - "print(f\"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}\")" + "print(f\"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}\")\n", + "print(f\"Adverb/Verb Ratio: {ratios['adv_verb_ratio']:.2f}\")\n", + "print(f\"Adj+Noun/Verb Ratio: {ratios['adj_noun_verb_ratio']:.2f}\")" ] }, { @@ -329,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "id": "b9052dc2-ce45-4af4-a0a0-46c60a13da12", "metadata": {}, "outputs": [], @@ -389,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 11, "id": "1e9dd0fb-db6a-47d1-8bfb-1015845f6d3e", "metadata": {}, "outputs": [ @@ 
-399,7 +398,7 @@ "{'Flesch-Douma': 88.68, 'LIX': 11.55, 'Kandel-Moles': 5.86}" ] }, - "execution_count": 25, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -412,10 +411,372 @@ "compute_french_readability(sample_french_text)" ] }, + { + "cell_type": "markdown", + "id": "8a0c0fff-d605-4349-a698-a11fd404e2e8", + "metadata": {}, + "source": [ + "## Calculate avg scores" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "24bc84a5-b2df-4194-838a-8f24302599bd", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the function to compute average word length and sentence length\n", + "def compute_avg_lengths(sample_text):\n", + " sentences = []\n", + " current_sentence = []\n", + " \n", + " for line in sample_text.strip().split('\\n'):\n", + " cols = line.strip().split('\\t')\n", + " if not cols or len(cols) < 1:\n", + " continue\n", + " token = cols[0]\n", + "\n", + " if token == '<s>':\n", + " current_sentence = []\n", + " elif token == '</s>':\n", + " if current_sentence:\n", + " sentences.append(current_sentence)\n", + " else:\n", + " current_sentence.append(token)\n", + "\n", + " total_words = 0\n", + " total_word_length = 0\n", + " sentence_lengths = []\n", + "\n", + " for sentence in sentences:\n", + " words = [w for w in sentence if re.match(r'\\w+', w) and w not in ['<s>', '</s>']]\n", + " sentence_lengths.append(len(words))\n", + " total_words += len(words)\n", + " total_word_length += sum(len(w) for w in words)\n", + "\n", + " avg_word_length = total_word_length / total_words if total_words else 0\n", + " avg_sentence_length = sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0\n", + "\n", + " return {\n", + " \"Average Word Length\": round(avg_word_length, 2),\n", + " \"Average Sentence Length\": round(avg_sentence_length, 2)\n", + " }\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0cdb972f-31b6-4e7e-82a8-371eda344f2c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Average Word Length': 3.79, 'Average Sentence Length': 7.0}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sample text from the user\n", + "sample_text = \"\"\"\n", + "<s>\t<s>\t<s>\t0\n", + "Aramis\tnpp\t<nul>@@<nul>\t0\n", + "était\tv\t<nul>@@<nul>\t0\n", + "à\tp\t<nul>@@<nul>\t0\n", + "son\tdet\tNP@@<nul>\t0\n", + "poste\tnc\t<nul>@@<nul>\t1\n", + ",\tponct\t<nul>@@<nul>\t0\n", + "il\tcls-suj\tVN@@<nul>\t0\n", + "était\tv\t<nul>@@<nul>\t1\n", + "tombé\tvpp\t<nul>@@<nul>\t1\n", + "de\tp\tPP-DE_OBJ@@Sint-MOD\t1\n", + "ses\tdet\tNP@@<nul>\t2\n", + "bras\tnc\t<nul>@@<nul>\t3\n", + ".\tponct\t<nul>@@<nul>\t0\n", + "</s>\t</s>\t</s>\t0\n", + "<s>\t<s>\t<s>\t0\n", + "Aramis\tnpp\t<nul>@@<nul>\t0\n", + "était\tv\t<nul>@@<nul>\t0\n", + "à\tp\t<nul>@@<nul>\t0\n", + "</s>\t</s>\t</s>\t0\n", + "\"\"\"\n", + "\n", + "# Compute and display the results\n", + "compute_avg_lengths(sample_text)" + ] + }, + { + "cell_type": "markdown", + "id": "bf5b0b52-e5c4-4b40-b925-495f4dd8e3be", + "metadata": {}, + "source": [ + "## Calculate POS frequencies" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "56af520c-d56b-404a-aebf-ad7c2a9ca503", + "metadata": {}, + "outputs": [], + "source": [ + "def compute_pos_frequency(column_text):\n", + " verb_tags = {\"v\", \"vpp\", \"vpr\"}\n", + " noun_tags = {'nc', 'npp'}\n", + " adj_tags = {'adj'}\n", + " adv_tags = {'adv'}\n", + "\n", + " total_tokens = 0\n", + " verb_count = 0\n", + " 
noun_count = 0\n",
+ "    adj_count = 0\n",
+ "    adv_count = 0\n",
+ "\n",
+ "    for line in column_text.strip().split('\\n'):\n",
+ "        parts = line.strip().split('\\t')\n",
+ "        if len(parts) < 2:\n",
+ "            continue\n",
+ "        token, pos = parts[0], parts[1]\n",
+ "        if re.match(r'\\w+', token):  # ignore punctuation\n",
+ "            total_tokens += 1\n",
+ "            if pos in verb_tags:\n",
+ "                verb_count += 1\n",
+ "            if pos in noun_tags:\n",
+ "                noun_count += 1\n",
+ "            if pos in adj_tags:\n",
+ "                adj_count += 1\n",
+ "            if pos in adv_tags:\n",
+ "                adv_count += 1\n",
+ "\n",
+ "    if total_tokens == 0:\n",
+ "        return {'verb_freq': 0.0, 'noun_freq': 0.0, 'adv_freq': 0.0, 'adj_freq': 0.0}\n",
+ "\n",
+ "    return {\n",
+ "        'verb_freq': round(verb_count / total_tokens, 4),\n",
+ "        'noun_freq': round(noun_count / total_tokens, 4),\n",
+ "        'adv_freq': round(adv_count / total_tokens, 4),\n",
+ "        'adj_freq': round(adj_count / total_tokens, 4),\n",
+ "    }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "f7c8b125-4651-4b21-bcc4-93ef78a4239b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Verb Frequency: 0.29\n",
+ "Noun Frequency: 0.29\n",
+ "Adj Frequency: 0.00\n",
+ "Adv Frequency: 0.00\n"
+ ]
+ }
+ ],
+ "source": [
+ "freqs = compute_pos_frequency(sample_text)\n",
+ "\n",
+ "print(f\"Verb Frequency: {freqs['verb_freq']:.2f}\")\n",
+ "print(f\"Noun Frequency: {freqs['noun_freq']:.2f}\")\n",
+ "print(f\"Adj Frequency: {freqs['adj_freq']:.2f}\")\n",
+ "print(f\"Adv Frequency: {freqs['adv_freq']:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cd15f8f-5618-4586-bd43-30f4919c7274",
+ "metadata": {},
+ "source": [
+ "### MSTTR-100 (Mean Segmental Type-Token Ratio)\n",
+ "\n",
+ "MSTTR-100 measures lexical diversity by dividing the text into consecutive segments of 100 tokens and computing the type-token ratio (TTR) for each segment. The final MSTTR-100 is the average TTR over all complete segments; a shorter final remainder is discarded, and texts with fewer than 100 tokens fall back to the plain TTR."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "daa17c33-adca-4695-90eb-741579382939",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "def msttr(text, segment_size):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    if len(words) < segment_size:\n",
+ "        return len(set(words)) / len(words) if words else 0.0\n",
+ "\n",
+ "    segments = [words[i:i+segment_size] for i in range(0, len(words), segment_size)]\n",
+ "    ttrs = [len(set(segment)) / len(segment) for segment in segments if len(segment) == segment_size]\n",
+ "    return sum(ttrs) / len(ttrs)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "80d8fa08-6b7d-4ab7-85cd-987823639277",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MSTTR: 0.8823529411764706\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"MSTTR: \", msttr(sample_french_text, 100))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91c7969a-3fff-4935-9f26-7e1ebb6b64c6",
+ "metadata": {},
+ "source": [
+ "### BZIP TXT\n",
+ "\n",
+ "\"BZIP TXT\" refers to the compression ratio achieved by compressing the text using the BZIP2 algorithm (compressed size divided by original size). It serves as a proxy for the text's redundancy and complexity. Because bzip2 adds a fixed header, very short texts such as the sample below can yield ratios above 1.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "c8bd9186-eab8-4ca6-93bd-82b260cd3d19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import bz2\n",
+ "\n",
+ "def bzip_compression_ratio(text):\n",
+ "    original_size = len(text.encode('utf-8'))\n",
+ "    compressed_size = len(bz2.compress(text.encode('utf-8')))\n",
+ "    return compressed_size / original_size"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "3f9c7dc7-6820-4013-a85c-2af4f846d4f5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "BZIP: 1.1931818181818181\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"BZIP: \", bzip_compression_ratio(sample_french_text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88b6f5f8-90b7-4dfe-b8ee-d54380bf3194",
+ "metadata": {},
+ "source": [
+ "### Word Entropy\n",
+ "\n",
+ "Word entropy quantifies the unpredictability or information content of words in a text. It's calculated using Shannon's entropy formula over the distribution of word frequencies: H = -Σ p(w) log2 p(w), where p(w) is the relative frequency of word w in the text."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "65e1a630-c46e-4b18-9831-b97864de53ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "import math\n",
+ "from collections import Counter\n",
+ "\n",
+ "def word_entropy(text):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    total_words = len(words)\n",
+ "    word_counts = Counter(words)\n",
+ "    return -sum((count/total_words) * math.log2(count/total_words) for count in word_counts.values()) if total_words else 0.0\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "1612e911-12a8-47c9-b811-b2d6885c3647",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WORD ENTROPY: 3.807763576417195\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"WORD ENTROPY: \", word_entropy(sample_french_text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a58d962f-5d90-4ee9-b347-64f5bb52c24a",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### Bigram Entropy\n",
+ "\n",
+ "Bigram entropy measures the unpredictability of word pairs (bigrams) in a text, providing insight into the text's syntactic complexity."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "925a3a75-aaaa-4851-b77b-b42cb1e21e11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def bigram_entropy(text):\n",
+ "    words = re.findall(r'\\b\\w+\\b', text.lower())\n",
+ "    bigrams = list(zip(words, words[1:]))\n",
+ "    total_bigrams = len(bigrams)\n",
+ "    bigram_counts = Counter(bigrams)\n",
+ "    return -sum((count/total_bigrams) * math.log2(count/total_bigrams) for count in bigram_counts.values()) if total_bigrams else 0.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "6fa60897-ad26-43b4-b8de-861290ca6bd3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "BIGRAM ENTROPY: 4.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"BIGRAM ENTROPY: \", bigram_entropy(sample_french_text))"
+ ]
+ },
 {
 "cell_type": "code",
 "execution_count": null,
- "id": "b2cd53c6-1c16-4eaf-8ac8-af166afaa97b",
+ "id": "d2afe949-9351-4ec8-aefc-7fe79b7c5a88",
 "metadata": {},
 "outputs": [],
 "source": []
-- 
GitLab
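
The four plain-text metrics added by this patch (MSTTR, BZIP ratio, word entropy, bigram entropy) all take a raw string, so they can be bundled into a single report once the notebook cells above have been run. A minimal sketch, not part of the patch itself; it assumes msttr, bzip_compression_ratio, word_entropy and bigram_entropy are already in scope, and the helper name text_metrics_report is hypothetical:

    # Minimal sketch: aggregate the canon-paper plain-text metrics for one text.
    # Assumes msttr, bzip_compression_ratio, word_entropy and bigram_entropy
    # from the notebook cells above are defined in the current session;
    # text_metrics_report is a hypothetical helper, not from the notebook.
    def text_metrics_report(text, segment_size=100):
        return {
            'msttr': round(msttr(text, segment_size), 4),
            'bzip_ratio': round(bzip_compression_ratio(text), 4),
            'word_entropy': round(word_entropy(text), 4),
            'bigram_entropy': round(bigram_entropy(text), 4),
        }

    # On sample_french_text this reproduces the values printed above:
    # {'msttr': 0.8824, 'bzip_ratio': 1.1932, 'word_entropy': 3.8078, 'bigram_entropy': 4.0}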