Skip to content
Snippets Groups Projects
Commit f6ba543d authored by SLIMANI Meriem's avatar SLIMANI Meriem
Browse files

text Statistics

parent b9e70a8f
No related branches found
No related tags found
No related merge requests found
"""
Corpus is the result of the conll file parsing
- Frequency of passive voice is not implemented until the passsive voice annotation issue is solved
"""
from collections import Counter
import math
import conllu
def get_pos_tags(sents):
    """Return the flat list of UPOS tags of every token in every sentence."""
    tags = []
    for sentence in sents:
        for tok in sentence:
            tags.append(tok['upostag'])
    return tags
def sentence_length_distribution(corpus):
    """Map each sentence length (token count) to how many sentences have it."""
    counts = Counter()
    for sent in corpus:
        counts[len(sent)] += 1
    return dict(counts)
def word_length_distribution(corpus):
    """Map each surface-form length to its frequency across the corpus."""
    counts = Counter()
    for sent in corpus:
        for tok in sent:
            counts[len(tok['form'])] += 1
    return dict(counts)
def POS_tags_distribution(corpus):
    """Return a Counter mapping each UPOS tag to its frequency in the corpus."""
    tag_counts = Counter()
    tag_counts.update(get_pos_tags(corpus))
    return tag_counts
def lexeme_length_distribution(corpus):
    """Return a Counter of lemma frequencies across the corpus.

    NOTE(review): despite the name, this counts how often each lemma
    occurs, not lemma lengths.
    """
    lemma_counts = Counter()
    for sent in corpus:
        lemma_counts.update(tok['lemma'] for tok in sent)
    return lemma_counts
def frequency_of_adverbs(pos_tags):
    """Return how many tags in the list are 'ADV'."""
    return sum(1 for tag in pos_tags if tag == 'ADV')
def percentage_of_adverbs(pos_tags):
    """Percentage of 'ADV' tags among all tags, rounded to 2 decimals.

    Returns 0.0 for an empty tag list instead of raising ZeroDivisionError.
    """
    if not pos_tags:
        return 0.0
    return round(pos_tags.count('ADV') / len(pos_tags) * 100, 2)
def percentage_of_adjectives(pos_tags):
    """Percentage of 'ADJ' tags among all tags, rounded to 2 decimals.

    Returns 0.0 for an empty tag list instead of raising ZeroDivisionError.
    """
    if not pos_tags:
        return 0.0
    return round(pos_tags.count('ADJ') / len(pos_tags) * 100, 2)
def percentage_of_verbs(pos_tags):
    """Percentage of 'VERB' tags among all tags, rounded to 2 decimals.

    Returns 0.0 for an empty tag list instead of raising ZeroDivisionError.
    """
    if not pos_tags:
        return 0.0
    return round(pos_tags.count('VERB') / len(pos_tags) * 100, 2)
def verb_noun_ratio(pos_tags):
    """Ratio of VERB tags to noun tags (NOUN + PROPN), rounded to 2 decimals.

    Returns 0 when the tag list contains no nouns.
    """
    noun_total = sum(1 for tag in pos_tags if tag in ('NOUN', 'PROPN'))
    if not noun_total:
        return 0
    verb_total = sum(1 for tag in pos_tags if tag == 'VERB')
    return round(verb_total / noun_total, 2)
def get_tokens_types(corpus):
    """Return (token count, distinct surface-form count) for the corpus."""
    forms = [tok['form'] for sent in corpus for tok in sent]
    return len(forms), len(set(forms))
def cttr(corpus):
    """Corrected type-token ratio: types / sqrt(2 * tokens).

    Returns 0.0 for an empty corpus.
    """
    nb_tokens, nb_types = get_tokens_types(corpus)
    return nb_types / math.sqrt(2 * nb_tokens) if nb_tokens else 0.0
def lexical_redundancy(corpus):
    """Lexical redundancy: 1 - types/tokens (0 = all tokens distinct).

    Returns 0.0 for an empty corpus instead of raising ZeroDivisionError,
    consistent with cttr().
    """
    num_tokens, num_types = get_tokens_types(corpus)
    if num_tokens == 0:
        return 0.0
    return 1 - num_types / num_tokens
# pass a node from the tree after calling .to_tree() on a corpus sentence
def tree_height(node):
    """Height of a dependency-tree node: 0 for a leaf, else 1 + tallest child."""
    children = node.children
    if not children:
        return 0  # leaf
    child_heights = [tree_height(child) for child in children]
    return 1 + max(child_heights)
def tree_depth_distribution(corpus):
    """Counter of dependency-tree heights, one entry per corpus sentence."""
    heights = (tree_height(sentence.to_tree()) for sentence in corpus)
    return Counter(heights)
def syntactic_func_distribution(corpus):
    """Counter of dependency relations ('deprel') across all tokens."""
    rel_counts = Counter()
    for sent in corpus:
        for tok in sent:
            rel_counts[tok['deprel']] += 1
    return rel_counts
%% Cell type:code id:initial_id tags:
``` python
!pip install conllu
!pip install matplotlib.pyplot
!pip install seaborn
```
%% Cell type:code id:d3c704e6c7a7e168 tags:
``` python
import conllu
import matplotlib.pyplot as plt
import seaborn as sns
import textStat
import importlib
# Reload so the notebook picks up edits to textStat.py without a kernel restart.
importlib.reload(textStat)
import pandas as pd
from collections import Counter
```
%% Cell type:code id:84d70a1d2682042 tags:
``` python
import matplotlib.pyplot as plt
import seaborn as sns
```
%% Cell type:code id:5b806172bdfbb340 tags:
``` python
# Change path if needed
file_path = "data/Les_compagnons_de_Jéhu.tok.dev.conll"
# Read the whole CoNLL-U file, then parse it into a list of sentences.
with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()
corpus = conllu.parse(content)
```
%% Cell type:code id:ed7fdaffefe1540f tags:
``` python
# Apply the implemented metrics
# Flat list of UPOS tags, reused by the percentage/ratio metrics below.
pos_tags = textStat.get_pos_tags(corpus)
# Frequency distributions (value -> count).
sent_lengths = textStat.sentence_length_distribution(corpus)
word_lengths = textStat.word_length_distribution(corpus)
pos_dist = textStat.POS_tags_distribution(corpus)
lemma_dist = textStat.lexeme_length_distribution(corpus)  # lemma frequencies, despite the name
# POS-based scalar statistics.
adverb_freq = textStat.frequency_of_adverbs(pos_tags)
adverb_pct = textStat.percentage_of_adverbs(pos_tags)
adj_pct = textStat.percentage_of_adjectives(pos_tags)
verb_pct = textStat.percentage_of_verbs(pos_tags)
verb_noun = textStat.verb_noun_ratio(pos_tags)
# Lexical diversity: corrected type-token ratio and redundancy (1 - types/tokens).
cttr_val = textStat.cttr(corpus)
redundancy = textStat.lexical_redundancy(corpus)
# Dependency-tree / syntactic-function distributions.
depth_dist = textStat.tree_depth_distribution(corpus)
deprel_dist = textStat.syntactic_func_distribution(corpus)
```
%% Cell type:code id:dd0f555230944599 tags:
``` python
# pos_tag_distribution
# Sort tags by descending frequency, then bar-plot the full POS distribution.
df_pos = Counter(pos_dist).most_common()
df_pos = pd.DataFrame(df_pos, columns=["POS", "Count"])
plt.figure(figsize=(10, 6))
sns.barplot(data=df_pos, x="POS", y="Count")
plt.title("Pos Tags Distribution")
plt.xticks(rotation=45)  # tilt tag labels so they don't overlap
plt.show()
```
%% Output
%% Cell type:code id:fe32918ad8bd705c tags:
``` python
# Lemma distribution
# Keep only the 15 most frequent lemmas to keep the bar plot readable.
df_lemma = Counter(lemma_dist).most_common()
df_lemma = pd.DataFrame(df_lemma, columns=["Lemma", "Count"])
df_lemma = df_lemma.sort_values(by="Count", ascending=False).head(15)
plt.figure(figsize=(10, 6))
sns.barplot(data=df_lemma, x="Lemma", y="Count")
plt.title("Top 15 Lemma Distribution")
plt.xticks(rotation=45)
plt.show()
```
%% Output
%% Cell type:code id:7f0c3d0540cb731f tags:
``` python
# sentence_length distribution
# Take the 15 most common sentence lengths and bar-plot them by frequency.
df_sentence_length = Counter(sent_lengths).most_common(15)
df_sentence_length = pd.DataFrame(df_sentence_length, columns=["Sentence Lengths", "Count"])
df_sentence_length = df_sentence_length.sort_values(by= 'Count', ascending=False)
print(df_sentence_length)
plt.figure(figsize=(10, 6))
sns.barplot(data=df_sentence_length, x="Sentence Lengths", y="Count")
plt.title("Sentence Length Distribution")
plt.show()
```
%% Output
Sentence Lengths Count
0 10 49
1 9 45
2 16 42
3 8 40
4 15 39
5 7 39
6 14 37
7 12 33
8 18 32
9 19 31
10 5 31
11 6 30
12 24 30
13 13 30
14 20 29
%% Cell type:code id:69f4124a195511b1 tags:
``` python
# word_length distribution
# All word lengths, sorted by frequency, printed then bar-plotted.
df_word_length = Counter(word_lengths).most_common()
df_word_length = pd.DataFrame(df_word_length, columns=["Word Lengths", "Count"])
df_word_length = df_word_length.sort_values(by= 'Count', ascending=False)
print(df_word_length)
plt.figure(figsize=(10, 6))
sns.barplot(data=df_word_length, x="Word Lengths", y="Count")
plt.title("Word Length Distribution")
plt.show()
```
%% Output
Word Lengths Count
0 2 5680
1 1 3734
2 3 2762
3 4 2413
4 5 2079
5 6 1917
6 7 1419
7 8 902
8 9 778
9 10 416
10 11 233
11 12 114
12 13 60
13 14 38
14 16 6
15 15 2
16 18 1
17 17 1
18 22 1
%% Cell type:code id:b278abc9f968bf12 tags:
``` python
# Sentences tree depth distribution
# depth_dist is a Counter: dependency-tree height -> number of sentences.
df_depth = pd.DataFrame(depth_dist.items(), columns=["Tree Depth", "Count"])
df_depth = df_depth.sort_values(by= 'Count', ascending=False)
print(df_depth)
plt.figure(figsize=(10, 6))
sns.barplot(data=df_depth, x="Tree Depth", y="Count")
plt.title("Sentence Tree Depth Distribution")
plt.show()
```
%% Output
Tree Depth Count
3 3 241
0 4 222
4 2 179
1 5 145
5 6 89
2 1 57
6 7 53
8 8 29
7 9 12
10 11 5
9 10 4
%% Cell type:code id:50c9ca08e334fc7 tags:
``` python
# syntactic function distribution
# Keep only the 15 most frequent dependency relations for readability.
df_dep = pd.DataFrame(deprel_dist.items(), columns=["Syntactic Function", "Count"])
df_dep = df_dep.sort_values(by="Count", ascending=False).head(15)
plt.figure(figsize=(10, 6))
sns.barplot(data=df_dep, x="Syntactic Function", y="Count")
plt.title("Syntactic Function Distribution")
plt.xticks(rotation=45)
plt.show()
```
%% Output
%% Cell type:code id:81c74d12a8deda2e tags:
``` python
# for single numerical value statistics
# Percentages are over all UPOS tags; diversity values come from textStat.
print(f"Adverbs : {adverb_pct}%")
print(f"Adjectives : {adj_pct}%")
print(f"Verbs : {verb_pct}%")
print(f"Verb/Noun ratio : {verb_noun}")
print()
print("Lexical Diversity --> :")
print(f"CTTR : {cttr_val:.2f}")
print(f"Lexical redundancy : {redundancy:.2f}")
```
%% Output
Adverbs : 5.1%
Adjectives : 3.52%
Verbs : 12.58%
Verb/Noun ratio : 0.67
Lexical Diversity --> :
CTTR : 14.55
Lexical redundancy : 0.82
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment