text Statistics

f6ba543d · SLIMANI Meriem · b9e70a8f · f6ba543d · f6ba543d
Commit f6ba543d authored 1 month ago by SLIMANI Meriem
--- a/textStat.py
+++ b/textStat.py
+"""
+In this module, "corpus" is the result of parsing a CoNLL-U file.
+- Frequency of passive voice is not implemented until the passive voice annotation issue is solved.
+"""
+from collections import Counter
+import math
+import conllu
+
+
+def get_pos_tags(sents):
+    """Collect the 'upostag' value of every token, in corpus order.
+
+    sents: iterable of sentences, each an iterable of tokens indexable by 'upostag'.
+    Returns a flat list with one entry per token.
+    """
+    collected = []
+    for sentence in sents:
+        collected.extend(word['upostag'] for word in sentence)
+    return collected
+
+def sentence_length_distribution(corpus):
+    """Map each sentence length (its len()) to the number of sentences of that length.
+
+    Returns a plain dict {length: sentence count}.
+    """
+    histogram = Counter()
+    for sentence in corpus:
+        histogram[len(sentence)] += 1
+    return dict(histogram)
+
+def word_length_distribution(corpus):
+    """Map each word length (len of the token's 'form') to the number of tokens of that length.
+
+    Returns a plain dict {length: token count}.
+    """
+    histogram = Counter()
+    for sentence in corpus:
+        histogram.update(len(word['form']) for word in sentence)
+    return dict(histogram)
+
+def POS_tags_distribution(corpus):
+    """Return a Counter mapping each POS tag from get_pos_tags(corpus) to its number of occurrences."""
+    tag_counts = Counter()
+    tag_counts.update(get_pos_tags(corpus))
+    return tag_counts
+
+def lexeme_length_distribution(corpus):
+    """Return a Counter mapping each token 'lemma' value to its number of occurrences.
+
+    NOTE(review): despite the name, this tallies the lemma values themselves,
+    not their lengths -- confirm which of the two callers expect.
+    """
+    lemma_counts = Counter()
+    for sentence in corpus:
+        lemma_counts.update(word['lemma'] for word in sentence)
+    return lemma_counts
+
+def frequency_of_adverbs(pos_tags):
+    """Return how many entries of pos_tags are equal to 'ADV' (uses pos_tags.count)."""
+    adverb_total = pos_tags.count('ADV')
+    return adverb_total
+
+def percentage_of_adverbs(pos_tags):
+    """Return the share of 'ADV' entries in pos_tags as a percentage rounded to 2 decimals.
+
+    An empty pos_tags yields 0.0 instead of raising ZeroDivisionError.
+    """
+    total = len(pos_tags)
+    if total == 0:
+        # Nothing to measure: report 0 % rather than dividing by zero.
+        return 0.0
+    return round(pos_tags.count('ADV') / total * 100, 2)
+
+def percentage_of_adjectives(pos_tags):
+    """Return the share of 'ADJ' entries in pos_tags as a percentage rounded to 2 decimals.
+
+    An empty pos_tags yields 0.0 instead of raising ZeroDivisionError.
+    """
+    total = len(pos_tags)
+    if total == 0:
+        # Nothing to measure: report 0 % rather than dividing by zero.
+        return 0.0
+    return round(pos_tags.count('ADJ') / total * 100, 2)
+
+def percentage_of_verbs(pos_tags):
+    """Return the share of 'VERB' entries in pos_tags as a percentage rounded to 2 decimals.
+
+    An empty pos_tags yields 0.0 instead of raising ZeroDivisionError.
+    """
+    total = len(pos_tags)
+    if total == 0:
+        # Nothing to measure: report 0 % rather than dividing by zero.
+        return 0.0
+    return round(pos_tags.count('VERB') / total * 100, 2)
+
+def verb_noun_ratio(pos_tags):
+    """Return the 'VERB' count divided by the 'NOUN' + 'PROPN' count, rounded to 2 decimals.
+
+    Returns 0 when pos_tags contains neither 'NOUN' nor 'PROPN'.
+    """
+    noun_total = sum(pos_tags.count(tag) for tag in ('NOUN', 'PROPN'))
+    if not noun_total:
+        return 0
+    return round(pos_tags.count('VERB') / noun_total, 2)
+
+def get_tokens_types(corpus):
+    """Return (number of tokens, number of types) for the corpus.
+
+    Tokens are the 'form' values of every token in every sentence; types are
+    the distinct values among them (exact comparison, so case-sensitive).
+    """
+    forms = [word['form'] for sentence in corpus for word in sentence]
+    return len(forms), len(set(forms))
+
+def cttr(corpus):
+    """Return number of types / sqrt(2 * number of tokens) for the corpus.
+
+    Counts come from get_tokens_types; a corpus without tokens yields 0.0.
+    """
+    token_total, type_total = get_tokens_types(corpus)
+    if token_total:
+        return type_total / math.sqrt(2 * token_total)
+    return 0.0
+
+def lexical_redundancy(corpus):
+    """Return 1 - (number of types / number of tokens) for the corpus.
+
+    Counts come from get_tokens_types. A corpus without tokens yields 0.0
+    instead of raising ZeroDivisionError, mirroring the empty-corpus guard in cttr.
+    """
+    num_tokens, num_types = get_tokens_types(corpus)
+    if num_tokens == 0:
+        # No tokens means nothing is repeated; avoid dividing by zero.
+        return 0.0
+    return 1 - num_types / num_tokens
+
+# Pass a node of the tree obtained by calling .to_tree() on a corpus sentence.
+def tree_height(node):
+    """Return the number of edges on the longest downward path from node to a leaf.
+
+    A node whose .children is empty (or otherwise falsy) is a leaf and has height 0.
+    """
+    height = 0
+    # Walk the tree one level at a time; each non-empty level adds one to the height.
+    level = node.children or []
+    while level:
+        height += 1
+        level = [grandchild for child in level for grandchild in (child.children or [])]
+    return height
+
+def tree_depth_distribution(corpus):
+    """Return a Counter mapping each tree height to the number of sentences with that height.
+
+    Each sentence is converted with its .to_tree() method and measured with tree_height.
+    """
+    height_counts = Counter()
+    for sentence in corpus:
+        height_counts[tree_height(sentence.to_tree())] += 1
+    return height_counts
+
+def syntactic_func_distribution(corpus):
+    """Return a Counter mapping each token 'deprel' value to its number of occurrences."""
+    relation_counts = Counter()
+    for sentence in corpus:
+        relation_counts.update(word['deprel'] for word in sentence)
+    return relation_counts
+
--- a/text_analysis_visualization.ipynb
+++ b/text_analysis_visualization.ipynb