Merge branch 'main' of gitlab.lis-lab.fr:COMPO/compo-text-eval

da1ae3ab · Tatiana BLADIER · 609eaa2a · f6ba543d · da1ae3ab · da1ae3ab
Commit da1ae3ab authored 2 months ago by Tatiana BLADIER
--- a/textStat.py
+++ b/textStat.py
+"""
+Corpus is the result of the conll file parsing
+- Frequency of passive voice is not implemented until the passive voice annotation issue is solved
+"""
+from collections import Counter
+import math
+import conllu
def get_pos_tags(sents):
    """Collect the ``'upostag'`` value of every token, in corpus order.

    Args:
        sents: Iterable of sentences; each sentence is an iterable of
            tokens that support ``token['upostag']`` lookup.

    Returns:
        A flat list with one tag per token.
    """
    tags = []
    for sentence in sents:
        tags.extend(word['upostag'] for word in sentence)
    return tags
def sentence_length_distribution(corpus):
    """Build a histogram of sentence lengths.

    Args:
        corpus: Iterable of sentences; each sentence must support ``len()``.

    Returns:
        A plain ``dict`` mapping a sentence length (number of tokens) to the
        number of sentences having that length.
    """
    histogram = Counter()
    for sentence in corpus:
        histogram[len(sentence)] += 1
    return dict(histogram)
def word_length_distribution(corpus):
    """Build a histogram of word-form lengths.

    Args:
        corpus: Iterable of sentences; each sentence is an iterable of
            tokens that support ``token['form']`` lookup.

    Returns:
        A plain ``dict`` mapping the character length of a token's
        ``'form'`` to the number of tokens with that length.
    """
    histogram = Counter()
    for sentence in corpus:
        histogram.update(len(word['form']) for word in sentence)
    return dict(histogram)
def POS_tags_distribution(corpus):
    """Count how often each POS tag occurs in the corpus.

    Args:
        corpus: Iterable of sentences, passed unchanged to ``get_pos_tags``.

    Returns:
        A ``Counter`` mapping each tag to its number of occurrences.
    """
    all_tags = get_pos_tags(corpus)
    tag_counts = Counter(all_tags)
    return tag_counts
def lexeme_length_distribution(corpus):
    """Build a histogram of lemma (lexeme) lengths.

    The previous implementation counted the lemma strings themselves, which
    yields a lemma-frequency table rather than the length distribution the
    function name promises. Lengths are now counted, mirroring what
    ``word_length_distribution`` does for surface forms.

    Args:
        corpus: Iterable of sentences; each sentence is an iterable of
            tokens that support ``token['lemma']`` lookup.

    Returns:
        A ``Counter`` mapping the character length of a token's ``'lemma'``
        to the number of tokens whose lemma has that length.
    """
    return Counter(len(token['lemma']) for sent in corpus for token in sent)
def frequency_of_adverbs(pos_tags):
    """Return the number of entries in ``pos_tags`` equal to ``'ADV'``.

    Args:
        pos_tags: List of POS tag strings.

    Returns:
        The adverb count as an ``int``.
    """
    return sum(1 for tag in pos_tags if tag == 'ADV')
def percentage_of_adverbs(pos_tags):
    """Return the share of ``'ADV'`` tags as a percentage.

    Args:
        pos_tags: List of POS tag strings.

    Returns:
        The percentage rounded to two decimals, or ``0.0`` when ``pos_tags``
        is empty.
    """
    # An empty tag list used to raise ZeroDivisionError; report 0.0 instead,
    # in line with how verb_noun_ratio and cttr treat empty input.
    if not pos_tags:
        return 0.0
    return round(pos_tags.count('ADV') / len(pos_tags) * 100, 2)
def percentage_of_adjectives(pos_tags):
    """Return the share of ``'ADJ'`` tags as a percentage.

    Args:
        pos_tags: List of POS tag strings.

    Returns:
        The percentage rounded to two decimals, or ``0.0`` when ``pos_tags``
        is empty.
    """
    # An empty tag list used to raise ZeroDivisionError; report 0.0 instead,
    # in line with how verb_noun_ratio and cttr treat empty input.
    if not pos_tags:
        return 0.0
    return round(pos_tags.count('ADJ') / len(pos_tags) * 100, 2)
def percentage_of_verbs(pos_tags):
    """Return the share of ``'VERB'`` tags as a percentage.

    Args:
        pos_tags: List of POS tag strings.

    Returns:
        The percentage rounded to two decimals, or ``0.0`` when ``pos_tags``
        is empty.
    """
    # An empty tag list used to raise ZeroDivisionError; report 0.0 instead,
    # in line with how verb_noun_ratio and cttr treat empty input.
    if not pos_tags:
        return 0.0
    return round(pos_tags.count('VERB') / len(pos_tags) * 100, 2)
def verb_noun_ratio(pos_tags):
    """Return the ratio of verbs to nouns.

    Both ``'NOUN'`` and ``'PROPN'`` tags count as nouns.

    Args:
        pos_tags: List of POS tag strings.

    Returns:
        ``VERB`` count divided by noun count, rounded to two decimals, or
        ``0`` when there are no nouns.
    """
    noun_total = pos_tags.count('NOUN') + pos_tags.count('PROPN')
    if not noun_total:
        return 0
    verb_total = pos_tags.count('VERB')
    return round(verb_total / noun_total, 2)
def get_tokens_types(corpus):
    """Count tokens and distinct word forms in the corpus.

    Args:
        corpus: Iterable of sentences; each sentence is an iterable of
            tokens that support ``token['form']`` lookup.

    Returns:
        A ``(nb_tokens, nb_types)`` tuple: the total number of tokens and
        the number of distinct ``'form'`` values (compared exactly, so case
        differences make distinct types).
    """
    forms = [word['form'] for sent in corpus for word in sent]
    return len(forms), len(set(forms))
def cttr(corpus):
    """Compute the corrected type-token ratio of the corpus.

    The value is ``types / sqrt(2 * tokens)``, using the counts returned by
    ``get_tokens_types``.

    Args:
        corpus: Iterable of sentences, passed unchanged to
            ``get_tokens_types``.

    Returns:
        The ratio as a ``float``; ``0.0`` when the corpus has no tokens.
    """
    token_count, type_count = get_tokens_types(corpus)
    # Guard the empty corpus so the square root is never zero in the divisor.
    return type_count / math.sqrt(2 * token_count) if token_count else 0.0
def lexical_redundancy(corpus):
    """Compute lexical redundancy as ``1 - types / tokens``.

    Args:
        corpus: Iterable of sentences, passed unchanged to
            ``get_tokens_types``.

    Returns:
        The redundancy as a ``float``; ``0.0`` when the corpus has no
        tokens.
    """
    num_tokens, num_types = get_tokens_types(corpus)
    # An empty corpus used to raise ZeroDivisionError; report 0.0 instead,
    # matching the empty-corpus guard in cttr.
    if num_tokens == 0:
        return 0.0
    return 1 - num_types / num_tokens
# pass a node from the tree after calling .to_tree() on a corpus sentence
def tree_height(node):
    """Return the height of the tree rooted at ``node``.

    Args:
        node: Object exposing a ``children`` attribute holding its child
            nodes (an empty or falsy value marks a leaf).

    Returns:
        ``0`` for a leaf, otherwise one more than the height of the tallest
        child subtree.
    """
    kids = node.children or ()
    # With no children the default of -1 makes a leaf evaluate to height 0.
    return 1 + max((tree_height(kid) for kid in kids), default=-1)
def tree_depth_distribution(corpus):
    """Build a histogram of dependency-tree heights.

    Args:
        corpus: Iterable of sentences; each must provide ``to_tree()``
            returning a root node accepted by ``tree_height``.

    Returns:
        A ``Counter`` mapping a tree height to the number of sentences whose
        tree has that height.
    """
    histogram = Counter()
    for sentence in corpus:
        histogram[tree_height(sentence.to_tree())] += 1
    return histogram
def syntactic_func_distribution(corpus):
    """Count how often each dependency relation occurs in the corpus.

    Args:
        corpus: Iterable of sentences; each sentence is an iterable of
            tokens that support ``token['deprel']`` lookup.

    Returns:
        A ``Counter`` mapping each ``'deprel'`` value to its number of
        occurrences.
    """
    relations = Counter()
    for sentence in corpus:
        relations.update(word['deprel'] for word in sentence)
    return relations
--- a/text_analysis_visualization.ipynb
+++ b/text_analysis_visualization.ipynb