Commit b77603d8 authored by Tatiana BLADIER

spearman correlation

parent b290b140
%% Cell type:markdown id:96858183-3e82-4a33-ba0f-1b21b5f36018 tags:
## Type-token ratio
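The type/token ratio (TTR) is the number of distinct word forms (types) divided by the total number of tokens; higher values indicate greater lexical diversity.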
%% Cell type:code id:510c3726-366d-4e26-a2bb-b55391b473bd tags:
``` python
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Download the tokenizer models once if not already installed
#nltk.download('punkt')
```
%% Cell type:code id:ee59c294-fdcd-429e-a126-734480d1b0ba tags:
``` python
sample_text = """
<s> <s> <s> 0
Aramis npp <nul>@@<nul> 0
était v <nul>@@<nul> 0
à p <nul>@@<nul> 0
son det NP@@<nul> 0
poste nc <nul>@@<nul> 1
, ponct <nul>@@<nul> 0
il cls-suj VN@@<nul> 0
était v <nul>@@<nul> 1
tombé vpp <nul>@@<nul> 1
de p PP-DE_OBJ@@Sint-MOD 1
ses det NP@@<nul> 2
bras nc <nul>@@<nul> 3
. ponct <nul>@@<nul> 0
</s> </s> </s> 0
"""
```
%% Cell type:code id:b6ae41ef-116f-473d-b3f3-115d90fe65b7 tags:
``` python
import string
def compute_ttr_cleaned(text):
"""
Compute the type/token ratio (TTR) from column-formatted text.
- Only the first column is used (tokens).
- Tokens are lowercased.
    - Sentence markers (<s>, </s>) and punctuation tokens are ignored.
Parameters:
- text: str, the input text in column format
Returns:
- ttr: float, the type/token ratio
"""
tokens = []
    for line in text.strip().splitlines():
        if line.strip():  # skip empty lines
            token = line.split('\t')[0].lower()
            # Skip sentence boundary markers and punctuation tokens
            if token in ('<s>', '</s>') or token in string.punctuation:
                continue
            tokens.append(token)
if not tokens:
return 0.0
types = set(tokens)
return len(types) / len(tokens)
```
%% Cell type:code id:2a882cc9-8f9d-4457-becb-d2e26ab3f14f tags:
``` python
ttr = compute_ttr_cleaned(sample_text)
print(f"Type/Token Ratio: {ttr:.3f}")
```
%% Output
Type/Token Ratio: 0.909
%% Cell type:code id:8897dcc3-4218-4ee5-9984-17b9a6d8dce2 tags:
``` python
def compute_ttr_by_pos(text):
"""
Compute type/token ratios for verbs and nouns in column-formatted text.
- Columns: token \t pos \t ...
- Verbs: POS in {'v', 'vpp', 'vpr'}
- Nouns: POS in {'nc', 'npp'}
- Tokens are lowercased.
- Punctuation is ignored.
Returns:
- A dictionary with TTRs for verbs and nouns.
"""
import string
verb_pos = {'v', 'vpp', 'vpr'}
noun_pos = {'nc', 'npp'}
verb_tokens = []
noun_tokens = []
for line in text.strip().splitlines():
if line.strip():
parts = line.split('\t')
if len(parts) >= 2:
token = parts[0].lower()
pos = parts[1]
# Ignore punctuation
if token in string.punctuation:
continue
if pos in verb_pos:
verb_tokens.append(token)
elif pos in noun_pos:
noun_tokens.append(token)
# Compute TTRs
ttr_verb = len(set(verb_tokens)) / len(verb_tokens) if verb_tokens else 0.0
ttr_noun = len(set(noun_tokens)) / len(noun_tokens) if noun_tokens else 0.0
return {
'verb_ttr': ttr_verb,
'noun_ttr': ttr_noun,
}
```
%% Cell type:code id:1363f307-fa4b-43ba-93d5-2d1c11ceb9e4 tags:
``` python
result = compute_ttr_by_pos(sample_text)
print(f"Verb TTR: {result['verb_ttr']:.3f}")
print(f"Noun TTR: {result['noun_ttr']:.3f}")
```
%% Output
Verb TTR: 0.667
Noun TTR: 1.000
%% Cell type:code id:1362e192-514a-4a77-a8cb-5c012026e2bb tags:
``` python
def compute_nv_ratios(text):
"""
Compute nominal/verb and verb/nominal ratios from column-formatted text.
- Uses the second column (POS).
- Verbs: 'v', 'vpp', 'vpr'
- Nouns: 'nc', 'npp'
Returns:
- Dictionary with 'nominal_verb_ratio' and 'verb_nominal_ratio'
"""
verb_pos = {'v', 'vpp', 'vpr'}
noun_pos = {'nc', 'npp'}
adj_pos = {'adj'}
adv_pos = {'adv'}
verb_count = 0
noun_count = 0
adj_count = 0
adv_count = 0
for line in text.strip().splitlines():
if line.strip():
parts = line.split('\t')
if len(parts) >= 2:
pos = parts[1]
if pos in verb_pos:
verb_count += 1
if pos in noun_pos:
noun_count += 1
if pos in adj_pos:
adj_count += 1
if pos in adv_pos:
adv_count += 1
nominal_verb_ratio = noun_count / verb_count if verb_count else float('inf')
verb_nominal_ratio = verb_count / noun_count if noun_count else float('inf')
adv_verb_ratio = adv_count / verb_count if verb_count else float('inf')
adj_noun_verb_ratio = (adj_count + noun_count) / verb_count if verb_count else float('inf')
return {
'nominal_verb_ratio': nominal_verb_ratio,
'verb_nominal_ratio': verb_nominal_ratio,
'adv_verb_ratio': adv_verb_ratio,
'adj_noun_verb_ratio': adj_noun_verb_ratio
}
```
%% Cell type:code id:544ff6aa-4104-4580-a01f-97429ffcc228 tags:
``` python
ratios = compute_nv_ratios(sample_text)
print(f"Nominal/Verb Ratio: {ratios['nominal_verb_ratio']:.2f}")
print(f"Verb/Nominal Ratio: {ratios['verb_nominal_ratio']:.2f}")
print(f"Adverb/Verb Ratio: {ratios['adv_verb_ratio']:.2f}")
print(f"Adj+Noun/Verb Ratio: {ratios['adj_noun_verb_ratio']:.2f}")
```
%% Output
Nominal/Verb Ratio: 1.00
Verb/Nominal Ratio: 1.00
Adverb/Verb Ratio: 0.00
Adj+Noun/Verb Ratio: 1.00
%% Cell type:markdown id:d3a929bf-61cb-4ef8-bc00-6e2a59760d37 tags:
## Readability
%% Cell type:markdown id:3fe25ff0-3f83-40fe-8420-08c09ffe98e6 tags:
### 📚 French Readability Metrics
*(TODO: verify these formulas against the original sources.)*
This notebook implements and explains three common **readability formulas** tailored for **French texts**:
---
#### 1. 🟦 **Flesch–Douma Index**
An adaptation of the original Flesch Reading Ease formula for French.
$\text{Flesch–Douma} = 207 - (1.015 \times \text{ASL}) - (73.6 \times \text{ASW})$
Where:
- **ASL** = Average Sentence Length = (number of words) / (number of sentences)
- **ASW** = Average Syllables per Word = (number of syllables) / (number of words)
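For example, a text with ASL = 10 and ASW = 1.5 scores 207 − (1.015 × 10) − (73.6 × 1.5) ≈ 86.5, i.e. fairly easy.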
📊 **Interpretation**:
- 90–100: Very easy
- 60–70: Standard
- 30–50: Difficult
- < 30: Very difficult
---
#### 2. 🟨 **LIX Index**
Used widely in French and other European languages. Measures sentence length and lexical complexity.
$\text{LIX} = \frac{\text{number of words}}{\text{number of sentences}} + \frac{100 \times \text{number of long words (≥7 chars)}}{\text{number of words}}$
📊 **Interpretation**:
- < 30: Easy
- 30–40: Medium
- 40–50: Difficult
- > 50: Very difficult
---
#### 3. 🟥 **Kandel–Moles Index**
A linear formula proposed for French readability:
$\text{Kandel–Moles} = 0.1935 \times \text{number of words} + 0.1672 \times \text{number of syllables} - 1.779$
📊 **Interpretation**:
- Higher values indicate more complex texts.
---
These formulas help estimate how easily a French reader can understand a given passage. The metrics can be used to analyze textbooks, articles, instructional materials, etc.
%% Cell type:code id:b9052dc2-ce45-4af4-a0a0-46c60a13da12 tags:
``` python
# Rewriting the readability metric functions here, without relying on downloading external resources
import re
# Naive sentence splitter (based on punctuation)
def naive_sentence_tokenize(text):
return re.split(r'[.!?]+', text.strip())
# Naive word tokenizer (splits on whitespace and punctuation)
def naive_word_tokenize(text):
return re.findall(r'\b\w+\b', text.lower())
# Function to count syllables in a French word (naive method using vowel groups)
def count_syllables(word):
vowels = "aeiouyàâäéèêëîïôöùûüœ"
syllables = re.findall(rf"[{vowels}]+", word)
return max(1, len(syllables))
# Function to compute Flesch-Douma, LIX, and Kandel-Moles indices
def compute_french_readability(text):
sentences = [s for s in naive_sentence_tokenize(text) if s.strip()]
words = [w for w in naive_word_tokenize(text) if re.match(r"\w+", w)]
num_sentences = len(sentences)
num_words = len(words)
num_syllables = sum(count_syllables(w) for w in words)
num_long_words = sum(1 for w in words if len(w) >= 7)
if num_sentences == 0 or num_words == 0:
return {
"Flesch-Douma": 0.0,
"LIX": 0.0,
"Kandel-Moles": 0.0
}
# Flesch-Douma
asl = num_words / num_sentences # Average sentence length
asw = num_syllables / num_words # Average syllables per word
flesch_douma = 207 - (1.015 * asl) - (73.6 * asw)
# LIX
lix = (num_words / num_sentences) + (100 * num_long_words / num_words)
# Kandel-Moles
kandel_moles = 0.1935 * num_words + 0.1672 * num_syllables - 1.779
return {
"Flesch-Douma": round(flesch_douma, 2),
"LIX": round(lix, 2),
"Kandel-Moles": round(kandel_moles, 2)
}
```
%% Cell type:code id:1e9dd0fb-db6a-47d1-8bfb-1015845f6d3e tags:
``` python
# Test on a sample French text
sample_french_text = """
Aramis était à son poste. Il était tombé de ses bras. Ce n'était pas un accident.
"""
compute_french_readability(sample_french_text)
```
%% Output
{'Flesch-Douma': 88.68, 'LIX': 11.55, 'Kandel-Moles': 5.86}
%% Cell type:markdown id:8a0c0fff-d605-4349-a698-a11fd404e2e8 tags:
## Calculate average word and sentence lengths
%% Cell type:code id:24bc84a5-b2df-4194-838a-8f24302599bd tags:
``` python
# Define the function to compute average word length and sentence length
def compute_avg_lengths(sample_text):
sentences = []
current_sentence = []
for line in sample_text.strip().split('\n'):
        if not line.strip():  # skip empty lines
            continue
        cols = line.strip().split('\t')
token = cols[0]
if token == '<s>':
current_sentence = []
elif token == '</s>':
if current_sentence:
sentences.append(current_sentence)
else:
current_sentence.append(token)
total_words = 0
total_word_length = 0
sentence_lengths = []
for sentence in sentences:
words = [w for w in sentence if re.match(r'\w+', w) and w not in ['<s>', '</s>']]
sentence_lengths.append(len(words))
total_words += len(words)
total_word_length += sum(len(w) for w in words)
avg_word_length = total_word_length / total_words if total_words else 0
avg_sentence_length = sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0
return {
"Average Word Length": round(avg_word_length, 2),
"Average Sentence Length": round(avg_sentence_length, 2)
}
```
%% Cell type:code id:0cdb972f-31b6-4e7e-82a8-371eda344f2c tags:
``` python
# Sample text with two sentences in column format
sample_text = """
<s> <s> <s> 0
Aramis npp <nul>@@<nul> 0
était v <nul>@@<nul> 0
à p <nul>@@<nul> 0
son det NP@@<nul> 0
poste nc <nul>@@<nul> 1
, ponct <nul>@@<nul> 0
il cls-suj VN@@<nul> 0
était v <nul>@@<nul> 1
tombé vpp <nul>@@<nul> 1
de p PP-DE_OBJ@@Sint-MOD 1
ses det NP@@<nul> 2
bras nc <nul>@@<nul> 3
. ponct <nul>@@<nul> 0
</s> </s> </s> 0
<s> <s> <s> 0
Aramis npp <nul>@@<nul> 0
était v <nul>@@<nul> 0
à p <nul>@@<nul> 0
</s> </s> </s> 0
"""
# Compute and display the results
compute_avg_lengths(sample_text)
```
%% Output
{'Average Word Length': 3.79, 'Average Sentence Length': 7.0}
%% Cell type:markdown id:bf5b0b52-e5c4-4b40-b925-495f4dd8e3be tags:
## Calculate POS frequencies
%% Cell type:code id:56af520c-d56b-404a-aebf-ad7c2a9ca503 tags:
``` python
def compute_pos_frequency(column_text):
verb_tags = {"v", "vpp", "vpr"}
noun_tags = {'nc', 'npp'}
adj_tags = {'adj'}
adv_tags = {'adv'}
total_tokens = 0
verb_count = 0
noun_count = 0
adj_count = 0
adv_count = 0
for line in column_text.strip().split('\n'):
parts = line.strip().split('\t')
if len(parts) < 2:
continue
token, pos = parts[0], parts[1]
if re.match(r'\w+', token): # ignore punctuation
total_tokens += 1
if pos in verb_tags:
verb_count += 1
if pos in noun_tags:
noun_count += 1
if pos in adj_tags:
adj_count += 1
if pos in adv_tags:
adv_count += 1
    if total_tokens == 0:
        return {'verb_freq': 0.0, 'noun_freq': 0.0, 'adv_freq': 0.0, 'adj_freq': 0.0}
return {
'verb_freq': round(verb_count / total_tokens, 4),
'noun_freq': round(noun_count / total_tokens, 4),
'adv_freq': round(adv_count / total_tokens, 4),
'adj_freq': round(adj_count / total_tokens, 4),
}
```
%% Cell type:code id:f7c8b125-4651-4b21-bcc4-93ef78a4239b tags:
``` python
freqs = compute_pos_frequency(sample_text)
print(f"Verb Frequency: {freqs['verb_freq']:.2f}")
print(f"Noun Frequency: {freqs['noun_freq']:.2f}")
print(f"Adj Frequency: {freqs['adv_freq']:.2f}")
print(f"Adv Frequency: {freqs['adj_freq']:.2f}")
```
%% Output
Verb Frequency: 0.29
Noun Frequency: 0.29
Adj Frequency: 0.00
Adv Frequency: 0.00
%% Cell type:markdown id:4cd15f8f-5618-4586-bd43-30f4919c7274 tags:
### MSTTR-100 (Mean Segmental Type-Token Ratio)
MSTTR-100 measures lexical diversity by dividing the text into consecutive segments of 100 tokens and computing the type-token ratio (TTR) for each segment. The final MSTTR-100 is the average TTR across all segments.
%% Cell type:code id:daa17c33-adca-4695-90eb-741579382939 tags:
``` python
import re
def msttr(text, segment_size):
    words = re.findall(r'\b\w+\b', text.lower())
    if not words:
        return 0.0
    if len(words) < segment_size:
        # Fewer tokens than one segment: fall back to a plain TTR
        return len(set(words)) / len(words)
segments = [words[i:i+segment_size] for i in range(0, len(words), segment_size)]
ttrs = [len(set(segment)) / len(segment) for segment in segments if len(segment) == segment_size]
return sum(ttrs) / len(ttrs)
```
%% Cell type:code id:80d8fa08-6b7d-4ab7-85cd-987823639277 tags:
``` python
print("MSTTR: ", msttr(sample_french_text, 100))
```
%% Output
MSTTR: 0.8823529411764706
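%% Cell type:markdown tags:
The sample text is shorter than 100 tokens, so `msttr` falls back to a plain TTR over the whole text. To see the segmental averaging at work, feed it a longer input; here we simply repeat the sample (an artificial text, for illustration only — repeated segments reuse the same vocabulary, so the score drops):
%% Cell type:code tags:
``` python
# Illustration only: repeating the sample pushes it past one 100-token
# segment; later segments repeat the same vocabulary, lowering the score.
longer_text = " ".join([sample_french_text] * 20)  # ~340 tokens
print("MSTTR on repeated text:", msttr(longer_text, 100))
```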
%% Cell type:markdown id:91c7969a-3fff-4935-9f26-7e1ebb6b64c6 tags:
### BZIP TXT
"BZIP TXT" refers to the compression ratio achieved by compressing the text using the BZIP2 algorithm. It serves as a proxy for the text's redundancy and complexity.
%% Cell type:code id:c8bd9186-eab8-4ca6-93bd-82b260cd3d19 tags:
``` python
import bz2
def bzip_compression_ratio(text):
original_size = len(text.encode('utf-8'))
compressed_size = len(bz2.compress(text.encode('utf-8')))
return compressed_size / original_size
```
%% Cell type:code id:3f9c7dc7-6820-4013-a85c-2af4f846d4f5 tags:
``` python
print("BZIP: ", bzip_compression_ratio(sample_french_text))
```
%% Output
BZIP: 1.1931818181818181
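%% Cell type:markdown tags:
The ratio is above 1 here because BZIP2 adds fixed header and block overhead, which dominates for inputs of only a few dozen bytes. On longer, more redundant texts the ratio falls below 1. A quick illustrative check:
%% Cell type:code tags:
``` python
# Illustration: header overhead dominates short inputs, while redundancy
# makes longer (here artificially repeated) texts highly compressible.
print("short text ratio:   ", bzip_compression_ratio(sample_french_text))
print("repeated text ratio:", bzip_compression_ratio(sample_french_text * 100))
```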
%% Cell type:markdown id:88b6f5f8-90b7-4dfe-b8ee-d54380bf3194 tags:
### Word Entropy
Word entropy quantifies the unpredictability or information content of words in a text. It is calculated with Shannon's entropy formula over the distribution of word frequencies: $H = -\sum_{w} p(w)\,\log_2 p(w)$, where $p(w)$ is the relative frequency of word $w$.
%% Cell type:code id:65e1a630-c46e-4b18-9831-b97864de53ee tags:
``` python
import math
from collections import Counter
def word_entropy(text):
    words = re.findall(r'\b\w+\b', text.lower())
    if not words:
        return 0.0
    total_words = len(words)
    word_counts = Counter(words)
return -sum((count/total_words) * math.log2(count/total_words) for count in word_counts.values())
```
%% Cell type:code id:1612e911-12a8-47c9-b811-b2d6885c3647 tags:
``` python
print("WORD ENTROPY: ", word_entropy(sample_french_text))
```
%% Output
WORD ENTROPY: 3.807763576417195
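%% Cell type:markdown tags:
As a quick sanity check, a text in which every word occurs exactly once has entropy $\log_2 n$ for $n$ words:
%% Cell type:code tags:
``` python
# Four distinct words, uniform distribution -> entropy log2(4) = 2.0
print(word_entropy("un deux trois quatre"))
```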
%% Cell type:markdown id:a58d962f-5d90-4ee9-b347-64f5bb52c24a tags:
### Bigram Entropy
Bigram entropy measures the unpredictability of word pairs (bigrams) in a text, providing insight into the text's syntactic complexity.
%% Cell type:code id:925a3a75-aaaa-4851-b77b-b42cb1e21e11 tags:
``` python
def bigram_entropy(text):
    words = re.findall(r'\b\w+\b', text.lower())
    bigrams = list(zip(words, words[1:]))
    if not bigrams:
        return 0.0
    total_bigrams = len(bigrams)
bigram_counts = Counter(bigrams)
return -sum((count/total_bigrams) * math.log2(count/total_bigrams) for count in bigram_counts.values())
```
%% Cell type:code id:6fa60897-ad26-43b4-b8de-861290ca6bd3 tags:
``` python
print("BIGRAM ENTROPY: ", bigram_entropy(sample_french_text))
```
%% Output
BIGRAM ENTROPY: 4.0
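%% Cell type:markdown tags:
All 16 bigrams in the sample occur exactly once, so the entropy is exactly $\log_2 16 = 4.0$.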
%% Cell type:code id:d2afe949-9351-4ec8-aefc-7fe79b7c5a88 tags:
%% Cell type:markdown id:6ac26902-75a5-4824-8c2b-da3f733c820f tags:
## Spearman Correlation between perplexity and stylometric features
%% Cell type:code id:f3678462-e572-4ce5-8d3d-a5389b2356c8 tags:
``` python
#!pip3 install seaborn
#!pip3 install scipy
```
%% Output
Defaulting to user installation because normal site-packages is not writeable
Collecting scipy
Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Requirement already satisfied: numpy<2.5,>=1.23.5 in /public/conda/Miniconda/envs/pytorch-2.6/lib/python3.11/site-packages (from scipy) (2.2.4)
Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.7 MB)
Installing collected packages: scipy
Successfully installed scipy-1.15.3
%% Cell type:code id:b621b2a8-488f-44db-b085-fe156f453943 tags:
``` python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
# Sample data (replace with your real values)
data = {
"perplexity": [32.5, 45.2, 28.1, 39.0, 50.3],
"avg_word_length": [4.1, 4.3, 4.0, 4.2, 4.5],
"avg_sentence_length": [12.5, 13.0, 11.0, 12.0, 13.5],
"word_entropy": [6.1, 6.3, 6.0, 6.2, 6.4],
"bigram_entropy": [8.0, 8.2, 7.9, 8.1, 8.3]
}
df = pd.DataFrame(data)
# Compute Spearman correlation
corr, _ = spearmanr(df)
corr_df = pd.DataFrame(corr, index=df.columns, columns=df.columns)
# Plot heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(corr_df, annot=True, cmap="coolwarm", fmt=".2f", square=True, linewidths=0.5)
plt.title("Spearman Correlation Heatmap")
plt.tight_layout()
plt.show()
```
%% Output
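%% Cell type:markdown tags:
The heatmap above uses placeholder numbers. Below is a minimal sketch of how the feature table could be assembled from the functions defined earlier in this notebook; the texts and perplexity values are placeholders, not real measurements:
%% Cell type:code tags:
``` python
# Sketch: build a small feature table with the functions defined above.
# The perplexity values are placeholders -- in practice they would come
# from a language model evaluated on the same texts.
texts = [
    sample_french_text,
    "Il pleut. Le chat dort sur le tapis. Tout est calme.",
    "La situation économique internationale demeure préoccupante. Les marchés hésitent.",
]
rows = []
for t in texts:
    readability = compute_french_readability(t)
    rows.append({
        "word_entropy": word_entropy(t),
        "bigram_entropy": bigram_entropy(t),
        "lix": readability["LIX"],
        "msttr": msttr(t, 100),
    })
feat_df = pd.DataFrame(rows)
feat_df["perplexity"] = [32.5, 45.2, 28.1]  # placeholder values

# Pairwise Spearman correlation between perplexity and one feature
rho, pval = spearmanr(feat_df["perplexity"], feat_df["lix"])
print(f"Spearman rho (perplexity vs LIX): {rho:.2f} (p = {pval:.2f})")
```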
%% Cell type:code id:3a6e3b53-7104-45ef-a4b5-e831bdd6ca6f tags:
``` python
```