Skip to content
Snippets Groups Projects
Commit da1ae3ab authored by Tatiana BLADIER's avatar Tatiana BLADIER
Browse files

Merge branch 'main' of gitlab.lis-lab.fr:COMPO/compo-text-eval

parents 609eaa2a f6ba543d
No related branches found
No related tags found
No related merge requests found
"""
Corpus is the result of parsing the CoNLL file.
- Frequency of passive voice is not implemented until the passive voice annotation issue is solved.
"""
from collections import Counter
import math
import conllu
def get_pos_tags(sents):
    """Return the flat list of ``token['upostag']`` values, in corpus order.

    Args:
        sents: iterable of sentences, each an iterable of tokens that
            support ``token['upostag']``.

    Returns:
        A list with one entry per token, sentence after sentence.
    """
    tags = []
    for sentence in sents:
        tags.extend(tok['upostag'] for tok in sentence)
    return tags
def sentence_length_distribution(corpus):
    """Count how many sentences have each length.

    Args:
        corpus: iterable of sentences; each sentence must support ``len()``.

    Returns:
        A plain ``dict`` mapping sentence length to the number of sentences
        of that length.
    """
    length_counts = Counter(map(len, corpus))
    return dict(length_counts)
def word_length_distribution(corpus):
    """Count how many tokens have each surface-form length.

    Args:
        corpus: iterable of sentences, each an iterable of tokens that
            support ``token['form']``.

    Returns:
        A plain ``dict`` mapping ``len(token['form'])`` to the number of
        tokens with that length.
    """
    length_counts = Counter()
    for sentence in corpus:
        for tok in sentence:
            length_counts[len(tok['form'])] += 1
    return dict(length_counts)
def POS_tags_distribution(corpus):
    """Count the occurrences of every ``token['upostag']`` value in the corpus.

    Args:
        corpus: iterable of sentences, each an iterable of tokens that
            support ``token['upostag']``.

    Returns:
        A ``Counter`` mapping each tag to its number of occurrences.
    """
    tag_counts = Counter()
    for sentence in corpus:
        for tok in sentence:
            tag_counts[tok['upostag']] += 1
    return tag_counts
def lexeme_length_distribution(corpus):
    """Count the occurrences of every ``token['lemma']`` value in the corpus.

    NOTE(review): despite the name, the counts are keyed by the lemma itself,
    not by ``len(lemma)`` -- confirm whether a length distribution was
    intended before relying on this.

    Args:
        corpus: iterable of sentences, each an iterable of tokens that
            support ``token['lemma']``.

    Returns:
        A ``Counter`` mapping each lemma to its number of occurrences.
    """
    lemma_counts = Counter()
    for sentence in corpus:
        lemma_counts.update(tok['lemma'] for tok in sentence)
    return lemma_counts
def frequency_of_adverbs(pos_tags):
    """Return how many entries of ``pos_tags`` are equal to ``'ADV'``.

    Args:
        pos_tags: sequence of POS tags exposing a ``count`` method
            (e.g. a list of strings).
    """
    adverb_total = pos_tags.count('ADV')
    return adverb_total
def percentage_of_adverbs(pos_tags):
    """Return the share of ``'ADV'`` tags in ``pos_tags`` as a percentage.

    Args:
        pos_tags: sequence of POS tags (e.g. a list of strings).

    Returns:
        The percentage rounded to two decimals, or ``0.0`` when ``pos_tags``
        is empty.
    """
    total = len(pos_tags)
    # An empty tag list would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return round(pos_tags.count('ADV') / total * 100, 2)
def percentage_of_adjectives(pos_tags):
    """Return the share of ``'ADJ'`` tags in ``pos_tags`` as a percentage.

    Args:
        pos_tags: sequence of POS tags (e.g. a list of strings).

    Returns:
        The percentage rounded to two decimals, or ``0.0`` when ``pos_tags``
        is empty.
    """
    total = len(pos_tags)
    # An empty tag list would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return round(pos_tags.count('ADJ') / total * 100, 2)
def percentage_of_verbs(pos_tags):
    """Return the share of ``'VERB'`` tags in ``pos_tags`` as a percentage.

    Args:
        pos_tags: sequence of POS tags (e.g. a list of strings).

    Returns:
        The percentage rounded to two decimals, or ``0.0`` when ``pos_tags``
        is empty.
    """
    total = len(pos_tags)
    # An empty tag list would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return round(pos_tags.count('VERB') / total * 100, 2)
def verb_noun_ratio(pos_tags):
    """Return the ratio of verbs to nouns, rounded to two decimals.

    Nouns are the tags ``'NOUN'`` and ``'PROPN'`` taken together; verbs are
    the tag ``'VERB'``.

    Args:
        pos_tags: sequence of POS tags exposing a ``count`` method.

    Returns:
        ``round(verbs / nouns, 2)``, or ``0`` when there are no nouns.
    """
    verb_total = pos_tags.count('VERB')
    noun_total = pos_tags.count('NOUN') + pos_tags.count('PROPN')
    if not noun_total:
        # No nouns: the ratio is undefined, report 0.
        return 0
    return round(verb_total / noun_total, 2)
def get_tokens_types(corpus):
    """Return the number of tokens and of distinct surface forms in the corpus.

    Args:
        corpus: iterable of sentences, each an iterable of tokens that
            support ``token['form']``.

    Returns:
        A ``(nb_tokens, nb_types)`` tuple: the total number of forms and the
        number of distinct forms (compared exactly, so case matters).
    """
    forms = [tok['form'] for sentence in corpus for tok in sentence]
    return len(forms), len(set(forms))
def cttr(corpus):
    """Return the corrected type-token ratio of the corpus.

    Computed as ``nb_types / sqrt(2 * nb_tokens)`` from the counts given by
    ``get_tokens_types``.

    Args:
        corpus: value forwarded unchanged to ``get_tokens_types``.

    Returns:
        The ratio as a float, or ``0.0`` when the corpus has no tokens.
    """
    token_count, type_count = get_tokens_types(corpus)
    # Guard the division: an empty corpus has no meaningful ratio.
    return type_count / math.sqrt(2 * token_count) if token_count != 0 else 0.0
def lexical_redundancy(corpus):
    """Return the lexical redundancy of the corpus: ``1 - types / tokens``.

    Args:
        corpus: value forwarded unchanged to ``get_tokens_types``.

    Returns:
        A float that is ``0.0`` when every token is a distinct form and
        approaches ``1`` as forms repeat. An empty corpus yields ``0.0``,
        mirroring the empty-corpus guard in ``cttr``.
    """
    num_tokens, num_types = get_tokens_types(corpus)
    # An empty corpus would otherwise raise ZeroDivisionError.
    if num_tokens == 0:
        return 0.0
    return 1 - num_types / num_tokens
def tree_height(node):
    """Return the height of the tree rooted at ``node``.

    Pass a node of the tree obtained by calling ``.to_tree()`` on a corpus
    sentence.

    Args:
        node: object with a ``children`` attribute holding its child nodes
            (empty or ``None`` for a leaf).

    Returns:
        ``0`` for a leaf, otherwise one more than the height of the tallest
        child subtree.
    """
    children = node.children
    if not children:
        return 0  # leaf
    return 1 + max(map(tree_height, children))
def tree_depth_distribution(corpus):
    """Count how many sentences have each tree height.

    Args:
        corpus: iterable of sentences, each exposing ``to_tree()``; the
            result is measured with ``tree_height``.

    Returns:
        A ``Counter`` mapping tree height to the number of sentences whose
        tree has that height.
    """
    height_counts = Counter()
    for sentence in corpus:
        height_counts[tree_height(sentence.to_tree())] += 1
    return height_counts
def syntactic_func_distribution(corpus):
    """Count the occurrences of every ``token['deprel']`` value in the corpus.

    Args:
        corpus: iterable of sentences, each an iterable of tokens that
            support ``token['deprel']``.

    Returns:
        A ``Counter`` mapping each dependency relation label to its number
        of occurrences.
    """
    relation_counts = Counter()
    for sentence in corpus:
        relation_counts.update(tok['deprel'] for tok in sentence)
    return relation_counts
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment