Skip to content
Snippets Groups Projects
Commit f6ba543d authored by SLIMANI Meriem's avatar SLIMANI Meriem
Browse files

text Statistics

parent b9e70a8f
No related branches found
No related tags found
No related merge requests found
"""
Corpus is the result of the conll file parsing
- Frequency of passive voice is not implemented until the passive voice annotation issue is solved
"""
from collections import Counter
import math
import conllu
def get_pos_tags(sents):
    """Collect the ``'upostag'`` value of every token, in corpus order.

    Args:
        sents: Iterable of sentences, each an iterable of token mappings
            that provide an ``'upostag'`` key.

    Returns:
        A flat list holding one ``'upostag'`` value per token.
    """
    tags = []
    for sentence in sents:
        tags.extend(word['upostag'] for word in sentence)
    return tags
def sentence_length_distribution(corpus):
    """Build a histogram of sentence lengths.

    Args:
        corpus: Iterable of sentences; each sentence must support ``len()``.

    Returns:
        A plain ``dict`` mapping a sentence length to the number of
        sentences having that length.
    """
    histogram = Counter(map(len, corpus))
    return dict(histogram)
def word_length_distribution(corpus):
    """Build a histogram of surface-form lengths.

    Args:
        corpus: Iterable of sentences, each an iterable of token mappings
            that provide a ``'form'`` key whose value supports ``len()``.

    Returns:
        A plain ``dict`` mapping a form length to the number of tokens
        whose form has that length.
    """
    histogram = Counter()
    for sentence in corpus:
        for word in sentence:
            histogram[len(word['form'])] += 1
    return dict(histogram)
def POS_tags_distribution(corpus):
    """Count how many times each POS tag occurs in the corpus.

    Args:
        corpus: Sentences forwarded unchanged to ``get_pos_tags``.

    Returns:
        A ``Counter`` mapping each tag returned by ``get_pos_tags`` to its
        number of occurrences.
    """
    tag_counts = Counter()
    tag_counts.update(get_pos_tags(corpus))
    return tag_counts
def lexeme_length_distribution(corpus):
    """Count the occurrences of each lemma in the corpus.

    NOTE(review): despite the function name, this tallies the ``'lemma'``
    values themselves, not their lengths - confirm which one callers expect.

    Args:
        corpus: Iterable of sentences, each an iterable of token mappings
            that provide a ``'lemma'`` key.

    Returns:
        A ``Counter`` mapping each lemma value to its number of occurrences.
    """
    lemma_counts = Counter()
    for sentence in corpus:
        lemma_counts.update(word['lemma'] for word in sentence)
    return lemma_counts
def frequency_of_adverbs(pos_tags):
    """Return the number of ``'ADV'`` entries in ``pos_tags``.

    Args:
        pos_tags: Sequence of POS tags exposing a ``count`` method
            (for example the list built by ``get_pos_tags``).

    Returns:
        The absolute count of ``'ADV'`` tags.
    """
    adverb_total = pos_tags.count('ADV')
    return adverb_total
def percentage_of_adverbs(pos_tags):
    """Return the share of ``'ADV'`` tags as a percentage.

    Args:
        pos_tags: Sequence of POS tags supporting ``count`` and ``len``.

    Returns:
        The percentage of ``'ADV'`` tags rounded to two decimals, or
        ``0.0`` when ``pos_tags`` is empty.
    """
    # An empty tag list would otherwise raise ZeroDivisionError.
    if not pos_tags:
        return 0.0
    return round(pos_tags.count('ADV') / len(pos_tags) * 100, 2)
def percentage_of_adjectives(pos_tags):
    """Return the share of ``'ADJ'`` tags as a percentage.

    Args:
        pos_tags: Sequence of POS tags supporting ``count`` and ``len``.

    Returns:
        The percentage of ``'ADJ'`` tags rounded to two decimals, or
        ``0.0`` when ``pos_tags`` is empty.
    """
    # An empty tag list would otherwise raise ZeroDivisionError.
    if not pos_tags:
        return 0.0
    return round(pos_tags.count('ADJ') / len(pos_tags) * 100, 2)
def percentage_of_verbs(pos_tags):
    """Return the share of ``'VERB'`` tags as a percentage.

    Args:
        pos_tags: Sequence of POS tags supporting ``count`` and ``len``.

    Returns:
        The percentage of ``'VERB'`` tags rounded to two decimals, or
        ``0.0`` when ``pos_tags`` is empty.
    """
    # An empty tag list would otherwise raise ZeroDivisionError.
    if not pos_tags:
        return 0.0
    return round(pos_tags.count('VERB') / len(pos_tags) * 100, 2)
def verb_noun_ratio(pos_tags):
    """Return the ratio of verbs to nouns, rounded to two decimals.

    ``'NOUN'`` and ``'PROPN'`` tags are both counted as nouns.

    Args:
        pos_tags: Sequence of POS tags exposing a ``count`` method.

    Returns:
        ``verbs / nouns`` rounded to two decimals, or ``0`` when there is
        no noun at all.
    """
    noun_total = pos_tags.count('NOUN') + pos_tags.count('PROPN')
    if not noun_total:
        # No denominator available: report a zero ratio.
        return 0
    verb_total = pos_tags.count('VERB')
    return round(verb_total / noun_total, 2)
def get_tokens_types(corpus):
    """Count the tokens and the distinct types of the corpus.

    Both counts are based on the ``'form'`` value of each token.

    Args:
        corpus: Iterable of sentences, each an iterable of token mappings
            that provide a hashable ``'form'`` value.

    Returns:
        A ``(nb_tokens, nb_types)`` tuple: the total number of forms and
        the number of distinct forms.
    """
    forms = [word['form'] for sentence in corpus for word in sentence]
    return len(forms), len(set(forms))
def cttr(corpus):
    """Compute ``types / sqrt(2 * tokens)`` for the corpus.

    Args:
        corpus: Sentences forwarded unchanged to ``get_tokens_types``.

    Returns:
        The ratio as a float, or ``0.0`` when the corpus has no token.
    """
    token_total, type_total = get_tokens_types(corpus)
    if not token_total:
        # Nothing to measure; also avoids dividing by zero.
        return 0.0
    denominator = math.sqrt(2 * token_total)
    return type_total / denominator
def lexical_redundancy(corpus):
    """Compute ``1 - types / tokens`` for the corpus.

    Args:
        corpus: Sentences forwarded unchanged to ``get_tokens_types``.

    Returns:
        The redundancy as a float, or ``0.0`` when the corpus has no token.
    """
    num_tokens, num_types = get_tokens_types(corpus)
    # Guard the empty corpus the same way ``cttr`` does instead of raising
    # ZeroDivisionError.
    if num_tokens == 0:
        return 0.0
    return 1 - num_types / num_tokens
# pass a node from the tree after calling .to_tree() on a corpus sentence
def tree_height(node):
    """Return the height of the tree rooted at ``node``.

    Args:
        node: Object exposing a ``children`` attribute that holds its
            child nodes.

    Returns:
        ``0`` for a node without children, otherwise one more than the
        height of its tallest child.
    """
    offspring = node.children
    if not offspring:
        return 0  # leaf
    return 1 + max(map(tree_height, offspring))
def tree_depth_distribution(corpus):
    """Build a histogram of tree heights over the corpus.

    Each sentence is converted with its ``to_tree()`` method and the result
    is measured with ``tree_height``.

    Args:
        corpus: Iterable of sentences exposing a ``to_tree()`` method.

    Returns:
        A ``Counter`` mapping a tree height to the number of sentences
        whose tree has that height.
    """
    depth_counts = Counter()
    for sentence in corpus:
        depth_counts[tree_height(sentence.to_tree())] += 1
    return depth_counts
def syntactic_func_distribution(corpus):
    """Count how many times each ``'deprel'`` value occurs in the corpus.

    Args:
        corpus: Iterable of sentences, each an iterable of token mappings
            that provide a hashable ``'deprel'`` value.

    Returns:
        A ``Counter`` mapping each ``'deprel'`` value to its number of
        occurrences.
    """
    relation_counts = Counter()
    for sentence in corpus:
        relation_counts.update(word['deprel'] for word in sentence)
    return relation_counts
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment