Skip to content
Snippets Groups Projects
Commit c1922f59 authored by Tatiana BLADIER's avatar Tatiana BLADIER
Browse files

add readability metrics for French

parent da1ae3ab
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id:96858183-3e82-4a33-ba0f-1b21b5f36018 tags:
## Type-token ratio
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
%% Cell type:code id:510c3726-366d-4e26-a2bb-b55391b473bd tags:
``` python
def compute_ttr(text):
    """
    Compute the type/token ratio (TTR) from column-formatted text.

    Only the first column of each tab-separated line is used (tokens).
    Tokens are counted as-is (no lowercasing, punctuation included) —
    see compute_ttr_cleaned for the normalized variant.

    Parameters:
    - text: str, the input text in column format (token \t pos \t ...)

    Returns:
    - ttr: float, the type/token ratio (0.0 for empty input)
    """
    tokens = []
    for line in text.strip().splitlines():
        if line.strip():  # skip empty lines
            token = line.split('\t')[0]
            tokens.append(token)
    if not tokens:
        return 0.0  # avoid division by zero on empty/blank input
    types = set(tokens)
    return len(types) / len(tokens)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
%% Cell type:code id:ee59c294-fdcd-429e-a126-734480d1b0ba tags:
``` python
# Sample parser output: one token per line; columns are
# token / POS / syntactic labels / depth.
# NOTE(review): the TTR functions split on '\t' — confirm the column
# separators below are real tabs, not spaces, or each whole line is
# treated as a single token.
sample_text = """
<s> <s> <s> 0
Aramis npp <nul>@@<nul> 0
était v <nul>@@<nul> 0
à p <nul>@@<nul> 0
son det NP@@<nul> 0
poste nc <nul>@@<nul> 1
, ponct <nul>@@<nul> 0
il cls-suj VN@@<nul> 0
était v <nul>@@<nul> 1
tombé vpp <nul>@@<nul> 1
de p PP-DE_OBJ@@Sint-MOD 1
ses det NP@@<nul> 2
bras nc <nul>@@<nul> 3
. ponct <nul>@@<nul> 0
</s> </s> </s> 0
"""
```
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
``` python
import string

def compute_ttr_cleaned(text):
    """
    Compute the type/token ratio (TTR) from column-formatted text.

    The first tab-separated column supplies the tokens; each token is
    lowercased, and single-character punctuation tokens are dropped
    before the ratio is taken.

    Parameters:
    - text: str, the input text in column format

    Returns:
    - ttr: float, the type/token ratio (0.0 when nothing survives filtering)
    """
    rows = (row for row in text.strip().splitlines() if row.strip())
    lowered = (row.split('\t')[0].lower() for row in rows)
    words = [w for w in lowered if w not in string.punctuation]
    if not words:
        return 0.0
    return len(set(words)) / len(words)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
``` python
# Overall TTR of the sample (raw variant: punctuation and markers included).
ttr = compute_ttr(sample_text)
print(f"Type/Token Ratio: {ttr:.3f}")
```
%% Output
Type/Token Ratio: 0.933
%% Cell type:code id:8897dcc3-4218-4ee5-9984-17b9a6d8dce2 tags:
``` python
def compute_ttr_by_pos(text):
    """
    Compute separate type/token ratios for verbs and for nouns.

    Input is column-formatted text (token \t pos \t ...).
    Verb POS tags: 'v', 'vpp', 'vpr'; noun POS tags: 'nc', 'npp'.
    Tokens are lowercased; single-character punctuation tokens are skipped.

    Returns:
    - dict with keys 'verb_ttr' and 'noun_ttr' (0.0 when a class is empty)
    """
    import string

    tag_groups = {
        'verb': {'v', 'vpp', 'vpr'},
        'noun': {'nc', 'npp'},
    }
    buckets = {'verb': [], 'noun': []}

    for raw in text.strip().splitlines():
        if not raw.strip():
            continue
        cols = raw.split('\t')
        if len(cols) < 2:
            continue  # need at least token and POS columns
        word, tag = cols[0].lower(), cols[1]
        if word in string.punctuation:
            continue  # punctuation never counts toward either class
        for group, tags in tag_groups.items():
            if tag in tags:
                buckets[group].append(word)
                break

    def _ttr(seq):
        # type/token ratio of one bucket; 0.0 guards the empty case
        return len(set(seq)) / len(seq) if seq else 0.0

    return {
        'verb_ttr': _ttr(buckets['verb']),
        'noun_ttr': _ttr(buckets['noun']),
    }
```
%% Cell type:code id:1363f307-fa4b-43ba-93d5-2d1c11ceb9e4 tags:
``` python
# Per-POS lexical diversity of the sample ('était' repeats, hence verb TTR < 1).
result = compute_ttr_by_pos(sample_text)
print(f"Verb TTR: {result['verb_ttr']:.3f}")
print(f"Noun TTR: {result['noun_ttr']:.3f}")
```
%% Output
Verb TTR: 0.667
Noun TTR: 1.000
%% Cell type:code id:1362e192-514a-4a77-a8cb-5c012026e2bb tags:
``` python
def compute_nv_ratios(text):
    """
    Compute the nominal/verb and verb/nominal ratios of column-formatted text.

    The POS tag is read from the second tab-separated column.
    Verb tags: 'v', 'vpp', 'vpr'; noun tags: 'nc', 'npp'.

    Returns:
    - dict with 'nominal_verb_ratio' and 'verb_nominal_ratio'
      (float('inf') when the denominator class is absent)
    """
    verb_tags = {'v', 'vpp', 'vpr'}
    noun_tags = {'nc', 'npp'}

    rows = (ln.split('\t') for ln in text.strip().splitlines() if ln.strip())
    tags = [cols[1] for cols in rows if len(cols) >= 2]

    n_verbs = sum(1 for t in tags if t in verb_tags)
    n_nouns = sum(1 for t in tags if t in noun_tags)

    return {
        'nominal_verb_ratio': n_nouns / n_verbs if n_verbs else float('inf'),
        'verb_nominal_ratio': n_verbs / n_nouns if n_nouns else float('inf'),
    }
```
%% Cell type:code id:544ff6aa-4104-4580-a01f-97429ffcc228 tags:
``` python
# Noun/verb balance of the sample (3 verbs vs 3 nouns in this text gives 1.00).
ratios = compute_nv_ratios(sample_text)
print(f"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}")
print(f"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}")
```
%% Output
Nominal/Verb Ratio: 1.00
Verb/Nominal Ratio: 1.00
%% Cell type:markdown id:d3a929bf-61cb-4ef8-bc00-6e2a59760d37 tags:
## Readability
%% Cell type:markdown id:3fe25ff0-3f83-40fe-8420-08c09ffe98e6 tags:
### 📚 French Readability Metrics

**TODO:** verify these formulas and their coefficients.
This notebook implements and explains three common **readability formulas** tailored for **French texts**:
---
#### 1. 🟦 **Flesch–Douma Index**
An adaptation of the original Flesch Reading Ease formula for French.
$\text{Flesch–Douma} = 207 - (1.015 \times \text{ASL}) - (73.6 \times \text{ASW})$
Where:
- **ASL** = Average Sentence Length = (number of words) / (number of sentences)
- **ASW** = Average Syllables per Word = (number of syllables) / (number of words)
📊 **Interpretation**:
- 90–100: Very easy
- 60–70: Standard
- 30–50: Difficult
- < 30: Very difficult
---
#### 2. 🟨 **LIX Index**
Used widely in French and other European languages. Measures sentence length and lexical complexity.
$\text{LIX} = \frac{\text{number of words}}{\text{number of sentences}} + \frac{100 \times \text{number of long words (≥7 chars)}}{\text{number of words}}$
📊 **Interpretation**:
- $<$ 30: Easy
- 30–40: Medium
- $>$ 50: Difficult
---
#### 3. 🟥 **Kandel–Moles Index**
A linear formula proposed for French readability:
$\text{Kandel–Moles} = 0.1935 \times \text{number of words} + 0.1672 \times \text{number of syllables} - 1.779$
📊 **Interpretation**:
- Higher values indicate more complex texts.
---
These formulas help estimate how easily a French reader can understand a given passage. The metrics can be used to analyze textbooks, articles, instructional materials, etc.
%% Cell type:code id:b9052dc2-ce45-4af4-a0a0-46c60a13da12 tags:
``` python
# Rewriting the readability metric functions here, without relying on downloading external resources
import re
# Naive sentence splitter: break on runs of '.', '!' or '?'
# (a trailing terminator yields an empty final element — callers filter it)
def naive_sentence_tokenize(text):
    sentence_end = r'[.!?]+'
    return re.split(sentence_end, text.strip())
# Naive word tokenizer: lowercase, then extract maximal \w+ runs
# (punctuation and whitespace are discarded)
def naive_word_tokenize(text):
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text.lower())
# Approximate syllable counter for French: each maximal run of vowels
# (including accented vowels and œ) counts as one syllable, minimum 1.
# Assumes the word is already lowercased — callers tokenize with .lower().
def count_syllables(word):
    vowel_run = re.compile(r"[aeiouyàâäéèêëîïôöùûüœ]+")
    return len(vowel_run.findall(word)) or 1
# Compute the Flesch-Douma, LIX and Kandel-Moles readability indices
def compute_french_readability(text):
    """
    Score a French text with three readability indices.

    Uses the naive sentence/word tokenizers and the vowel-group syllable
    counter defined above. Returns a dict with keys 'Flesch-Douma', 'LIX'
    and 'Kandel-Moles', each rounded to 2 decimals (all 0.0 for empty input).
    """
    sentences = [s for s in naive_sentence_tokenize(text) if s.strip()]
    words = [w for w in naive_word_tokenize(text) if re.match(r"\w+", w)]

    n_sent = len(sentences)
    n_words = len(words)
    n_syll = sum(count_syllables(w) for w in words)
    n_long = sum(1 for w in words if len(w) >= 7)  # LIX "long word" cutoff

    if n_sent == 0 or n_words == 0:
        # nothing to measure — return neutral scores instead of dividing by zero
        return {"Flesch-Douma": 0.0, "LIX": 0.0, "Kandel-Moles": 0.0}

    avg_sentence_len = n_words / n_sent   # ASL
    avg_syllables = n_syll / n_words      # ASW

    scores = {
        "Flesch-Douma": 207 - (1.015 * avg_sentence_len) - (73.6 * avg_syllables),
        "LIX": avg_sentence_len + (100 * n_long / n_words),
        "Kandel-Moles": 0.1935 * n_words + 0.1672 * n_syll - 1.779,
    }
    return {name: round(value, 2) for name, value in scores.items()}
```
%% Cell type:code id:1e9dd0fb-db6a-47d1-8bfb-1015845f6d3e tags:
``` python
# Test on a sample French text (smoke test: three short sentences).
sample_french_text = """
Aramis était à son poste. Il était tombé de ses bras. Ce n'était pas un accident.
"""
compute_french_readability(sample_french_text)
```
%% Output
{'Flesch-Douma': 88.68, 'LIX': 11.55, 'Kandel-Moles': 5.86}
%% Cell type:code id:b2cd53c6-1c16-4eaf-8ac8-af166afaa97b tags:
``` python
```
......
%% Cell type:markdown id:96858183-3e82-4a33-ba0f-1b21b5f36018 tags:
## Type-token ratio
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
%% Cell type:code id:510c3726-366d-4e26-a2bb-b55391b473bd tags:
``` python
def compute_ttr(text):
    """
    Compute the type/token ratio (TTR) from column-formatted text.

    Only the first column of each tab-separated line is used (tokens).
    Tokens are counted as-is (no lowercasing, punctuation included) —
    see compute_ttr_cleaned for the normalized variant.

    Parameters:
    - text: str, the input text in column format (token \t pos \t ...)

    Returns:
    - ttr: float, the type/token ratio (0.0 for empty input)
    """
    tokens = []
    for line in text.strip().splitlines():
        if line.strip():  # skip empty lines
            token = line.split('\t')[0]
            tokens.append(token)
    if not tokens:
        return 0.0  # avoid division by zero on empty/blank input
    types = set(tokens)
    return len(types) / len(tokens)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
%% Cell type:code id:ee59c294-fdcd-429e-a126-734480d1b0ba tags:
``` python
# Sample parser output: one token per line; columns are
# token / POS / syntactic labels / depth.
# NOTE(review): the TTR functions split on '\t' — confirm the column
# separators below are real tabs, not spaces, or each whole line is
# treated as a single token.
sample_text = """
<s> <s> <s> 0
Aramis npp <nul>@@<nul> 0
était v <nul>@@<nul> 0
à p <nul>@@<nul> 0
son det NP@@<nul> 0
poste nc <nul>@@<nul> 1
, ponct <nul>@@<nul> 0
il cls-suj VN@@<nul> 0
était v <nul>@@<nul> 1
tombé vpp <nul>@@<nul> 1
de p PP-DE_OBJ@@Sint-MOD 1
ses det NP@@<nul> 2
bras nc <nul>@@<nul> 3
. ponct <nul>@@<nul> 0
</s> </s> </s> 0
"""
```
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
``` python
import string

def compute_ttr_cleaned(text):
    """
    Compute the type/token ratio (TTR) from column-formatted text.

    The first tab-separated column supplies the tokens; each token is
    lowercased, and single-character punctuation tokens are dropped
    before the ratio is taken.

    Parameters:
    - text: str, the input text in column format

    Returns:
    - ttr: float, the type/token ratio (0.0 when nothing survives filtering)
    """
    rows = (row for row in text.strip().splitlines() if row.strip())
    lowered = (row.split('\t')[0].lower() for row in rows)
    words = [w for w in lowered if w not in string.punctuation]
    if not words:
        return 0.0
    return len(set(words)) / len(words)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
``` python
# Overall TTR of the sample (raw variant: punctuation and markers included).
ttr = compute_ttr(sample_text)
print(f"Type/Token Ratio: {ttr:.3f}")
```
%% Output
Type/Token Ratio: 0.933
%% Cell type:code id:8897dcc3-4218-4ee5-9984-17b9a6d8dce2 tags:
``` python
def compute_ttr_by_pos(text):
    """
    Compute separate type/token ratios for verbs and for nouns.

    Input is column-formatted text (token \t pos \t ...).
    Verb POS tags: 'v', 'vpp', 'vpr'; noun POS tags: 'nc', 'npp'.
    Tokens are lowercased; single-character punctuation tokens are skipped.

    Returns:
    - dict with keys 'verb_ttr' and 'noun_ttr' (0.0 when a class is empty)
    """
    import string

    tag_groups = {
        'verb': {'v', 'vpp', 'vpr'},
        'noun': {'nc', 'npp'},
    }
    buckets = {'verb': [], 'noun': []}

    for raw in text.strip().splitlines():
        if not raw.strip():
            continue
        cols = raw.split('\t')
        if len(cols) < 2:
            continue  # need at least token and POS columns
        word, tag = cols[0].lower(), cols[1]
        if word in string.punctuation:
            continue  # punctuation never counts toward either class
        for group, tags in tag_groups.items():
            if tag in tags:
                buckets[group].append(word)
                break

    def _ttr(seq):
        # type/token ratio of one bucket; 0.0 guards the empty case
        return len(set(seq)) / len(seq) if seq else 0.0

    return {
        'verb_ttr': _ttr(buckets['verb']),
        'noun_ttr': _ttr(buckets['noun']),
    }
```
%% Cell type:code id:1363f307-fa4b-43ba-93d5-2d1c11ceb9e4 tags:
``` python
# Per-POS lexical diversity of the sample ('était' repeats, hence verb TTR < 1).
result = compute_ttr_by_pos(sample_text)
print(f"Verb TTR: {result['verb_ttr']:.3f}")
print(f"Noun TTR: {result['noun_ttr']:.3f}")
```
%% Output
Verb TTR: 0.667
Noun TTR: 1.000
%% Cell type:code id:1362e192-514a-4a77-a8cb-5c012026e2bb tags:
``` python
def compute_nv_ratios(text):
    """
    Compute the nominal/verb and verb/nominal ratios of column-formatted text.

    The POS tag is read from the second tab-separated column.
    Verb tags: 'v', 'vpp', 'vpr'; noun tags: 'nc', 'npp'.

    Returns:
    - dict with 'nominal_verb_ratio' and 'verb_nominal_ratio'
      (float('inf') when the denominator class is absent)
    """
    verb_tags = {'v', 'vpp', 'vpr'}
    noun_tags = {'nc', 'npp'}

    rows = (ln.split('\t') for ln in text.strip().splitlines() if ln.strip())
    tags = [cols[1] for cols in rows if len(cols) >= 2]

    n_verbs = sum(1 for t in tags if t in verb_tags)
    n_nouns = sum(1 for t in tags if t in noun_tags)

    return {
        'nominal_verb_ratio': n_nouns / n_verbs if n_verbs else float('inf'),
        'verb_nominal_ratio': n_verbs / n_nouns if n_nouns else float('inf'),
    }
```
%% Cell type:code id:544ff6aa-4104-4580-a01f-97429ffcc228 tags:
``` python
# Noun/verb balance of the sample (3 verbs vs 3 nouns in this text gives 1.00).
ratios = compute_nv_ratios(sample_text)
print(f"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}")
print(f"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}")
```
%% Output
Nominal/Verb Ratio: 1.00
Verb/Nominal Ratio: 1.00
%% Cell type:markdown id:d3a929bf-61cb-4ef8-bc00-6e2a59760d37 tags:
## Readability
%% Cell type:markdown id:3fe25ff0-3f83-40fe-8420-08c09ffe98e6 tags:
### 📚 French Readability Metrics

**TODO:** verify these formulas and their coefficients.
This notebook implements and explains three common **readability formulas** tailored for **French texts**:
---
#### 1. 🟦 **Flesch–Douma Index**
An adaptation of the original Flesch Reading Ease formula for French.
$\text{Flesch–Douma} = 207 - (1.015 \times \text{ASL}) - (73.6 \times \text{ASW})$
Where:
- **ASL** = Average Sentence Length = (number of words) / (number of sentences)
- **ASW** = Average Syllables per Word = (number of syllables) / (number of words)
📊 **Interpretation**:
- 90–100: Very easy
- 60–70: Standard
- 30–50: Difficult
- < 30: Very difficult
---
#### 2. 🟨 **LIX Index**
Used widely in French and other European languages. Measures sentence length and lexical complexity.
$\text{LIX} = \frac{\text{number of words}}{\text{number of sentences}} + \frac{100 \times \text{number of long words (≥7 chars)}}{\text{number of words}}$
📊 **Interpretation**:
- $<$ 30: Easy
- 30–40: Medium
- $>$ 50: Difficult
---
#### 3. 🟥 **Kandel–Moles Index**
A linear formula proposed for French readability:
$\text{Kandel–Moles} = 0.1935 \times \text{number of words} + 0.1672 \times \text{number of syllables} - 1.779$
📊 **Interpretation**:
- Higher values indicate more complex texts.
---
These formulas help estimate how easily a French reader can understand a given passage. The metrics can be used to analyze textbooks, articles, instructional materials, etc.
%% Cell type:code id:b9052dc2-ce45-4af4-a0a0-46c60a13da12 tags:
``` python
# Rewriting the readability metric functions here, without relying on downloading external resources
import re
# Naive sentence splitter: break on runs of '.', '!' or '?'
# (a trailing terminator yields an empty final element — callers filter it)
def naive_sentence_tokenize(text):
    sentence_end = r'[.!?]+'
    return re.split(sentence_end, text.strip())
# Naive word tokenizer: lowercase, then extract maximal \w+ runs
# (punctuation and whitespace are discarded)
def naive_word_tokenize(text):
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text.lower())
# Approximate syllable counter for French: each maximal run of vowels
# (including accented vowels and œ) counts as one syllable, minimum 1.
# Assumes the word is already lowercased — callers tokenize with .lower().
def count_syllables(word):
    vowel_run = re.compile(r"[aeiouyàâäéèêëîïôöùûüœ]+")
    return len(vowel_run.findall(word)) or 1
# Compute the Flesch-Douma, LIX and Kandel-Moles readability indices
def compute_french_readability(text):
    """
    Score a French text with three readability indices.

    Uses the naive sentence/word tokenizers and the vowel-group syllable
    counter defined above. Returns a dict with keys 'Flesch-Douma', 'LIX'
    and 'Kandel-Moles', each rounded to 2 decimals (all 0.0 for empty input).
    """
    sentences = [s for s in naive_sentence_tokenize(text) if s.strip()]
    words = [w for w in naive_word_tokenize(text) if re.match(r"\w+", w)]

    n_sent = len(sentences)
    n_words = len(words)
    n_syll = sum(count_syllables(w) for w in words)
    n_long = sum(1 for w in words if len(w) >= 7)  # LIX "long word" cutoff

    if n_sent == 0 or n_words == 0:
        # nothing to measure — return neutral scores instead of dividing by zero
        return {"Flesch-Douma": 0.0, "LIX": 0.0, "Kandel-Moles": 0.0}

    avg_sentence_len = n_words / n_sent   # ASL
    avg_syllables = n_syll / n_words      # ASW

    scores = {
        "Flesch-Douma": 207 - (1.015 * avg_sentence_len) - (73.6 * avg_syllables),
        "LIX": avg_sentence_len + (100 * n_long / n_words),
        "Kandel-Moles": 0.1935 * n_words + 0.1672 * n_syll - 1.779,
    }
    return {name: round(value, 2) for name, value in scores.items()}
```
%% Cell type:code id:1e9dd0fb-db6a-47d1-8bfb-1015845f6d3e tags:
``` python
# Test on a sample French text (smoke test: three short sentences).
sample_french_text = """
Aramis était à son poste. Il était tombé de ses bras. Ce n'était pas un accident.
"""
compute_french_readability(sample_french_text)
```
%% Output
{'Flesch-Douma': 88.68, 'LIX': 11.55, 'Kandel-Moles': 5.86}
%% Cell type:code id:b2cd53c6-1c16-4eaf-8ac8-af166afaa97b tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment