Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
compo-text-eval
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
COMPO
compo-text-eval
Commits
f6ba543d
Commit
f6ba543d
authored
1 month ago
by
SLIMANI Meriem
Browse files
Options
Downloads
Patches
Plain Diff
text Statistics
parent
b9e70a8f
No related branches found
No related tags found
No related merge requests found
Changes
2
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
textStat.py
+76
-0
76 additions, 0 deletions
textStat.py
text_analysis_visualization.ipynb
+432
-0
432 additions, 0 deletions
text_analysis_visualization.ipynb
with
508 additions
and
0 deletions
textStat.py
0 → 100644
+
76
−
0
View file @
f6ba543d
"""
`corpus` is the result of parsing a CoNLL-U file.
- Frequency of passive voice is not implemented until the passive voice annotation issue is solved.
"""
from
collections
import
Counter
import
math
import
conllu
def get_pos_tags(sents):
    """Collect the 'upostag' value of every token in the given sentences.

    Args:
        sents: Iterable of sentences; each sentence is an iterable of tokens
            that can be indexed with the key 'upostag'.

    Returns:
        A flat list of the 'upostag' values, in sentence order and then
        token order.
    """
    tags = []
    for sentence in sents:
        tags.extend(word['upostag'] for word in sentence)
    return tags
def sentence_length_distribution(corpus):
    """Count how many sentences have each length.

    Args:
        corpus: Iterable of sentences; each sentence must support len().

    Returns:
        A plain dict mapping a sentence length to the number of sentences
        with that length.
    """
    length_counts = Counter(map(len, corpus))
    return dict(length_counts)
def word_length_distribution(corpus):
    """Count how many tokens have each surface-form length.

    Args:
        corpus: Iterable of sentences; each sentence is an iterable of tokens
            that can be indexed with the key 'form'.

    Returns:
        A plain dict mapping len(token['form']) to the number of tokens with
        that length.
    """
    length_counts = Counter()
    for sentence in corpus:
        for word in sentence:
            length_counts[len(word['form'])] += 1
    return dict(length_counts)
def POS_tags_distribution(corpus):
    """Count the occurrences of each POS tag in the corpus.

    Args:
        corpus: Sentences passed unchanged to get_pos_tags().

    Returns:
        A Counter built from the list returned by get_pos_tags(corpus).
    """
    tags = get_pos_tags(corpus)
    return Counter(tags)
def lexeme_length_distribution(corpus):
    """Count the occurrences of each lemma in the corpus.

    NOTE(review): despite the name, this counts the 'lemma' values
    themselves, not their lengths (compare word_length_distribution, which
    counts len(token['form'])). Confirm which behaviour is intended.

    Args:
        corpus: Iterable of sentences; each sentence is an iterable of tokens
            that can be indexed with the key 'lemma'.

    Returns:
        A Counter mapping each 'lemma' value to its number of occurrences.
    """
    lemma_counts = Counter()
    for sentence in corpus:
        lemma_counts.update(word['lemma'] for word in sentence)
    return lemma_counts
def frequency_of_adverbs(pos_tags):
    """Return the number of 'ADV' entries in pos_tags.

    Args:
        pos_tags: Object exposing a count() method, e.g. a list of POS tags.

    Returns:
        The result of pos_tags.count('ADV').
    """
    adverb_tag = 'ADV'
    return pos_tags.count(adverb_tag)
def percentage_of_adverbs(pos_tags):
    """Return the percentage of 'ADV' tags, rounded to two decimals.

    Args:
        pos_tags: Sequence of POS tags supporting count() and len().

    Returns:
        count('ADV') / len(pos_tags) * 100 rounded to two decimals, or 0.0
        when pos_tags is empty.
    """
    total = len(pos_tags)
    if total == 0:
        # An empty tag list used to raise ZeroDivisionError.
        return 0.0
    return round(pos_tags.count('ADV') / total * 100, 2)
def percentage_of_adjectives(pos_tags):
    """Return the percentage of 'ADJ' tags, rounded to two decimals.

    Args:
        pos_tags: Sequence of POS tags supporting count() and len().

    Returns:
        count('ADJ') / len(pos_tags) * 100 rounded to two decimals, or 0.0
        when pos_tags is empty.
    """
    total = len(pos_tags)
    if total == 0:
        # An empty tag list used to raise ZeroDivisionError.
        return 0.0
    return round(pos_tags.count('ADJ') / total * 100, 2)
def percentage_of_verbs(pos_tags):
    """Return the percentage of 'VERB' tags, rounded to two decimals.

    Args:
        pos_tags: Sequence of POS tags supporting count() and len().

    Returns:
        count('VERB') / len(pos_tags) * 100 rounded to two decimals, or 0.0
        when pos_tags is empty.
    """
    total = len(pos_tags)
    if total == 0:
        # An empty tag list used to raise ZeroDivisionError.
        return 0.0
    return round(pos_tags.count('VERB') / total * 100, 2)
def verb_noun_ratio(pos_tags):
    """Return the ratio of verbs to nouns, rounded to two decimals.

    Nouns are the entries tagged 'NOUN' or 'PROPN'; verbs are the entries
    tagged 'VERB'.

    Args:
        pos_tags: Object exposing a count() method, e.g. a list of POS tags.

    Returns:
        round(verbs / nouns, 2), or the integer 0 when there are no nouns.
    """
    noun_total = sum(pos_tags.count(tag) for tag in ('NOUN', 'PROPN'))
    if not noun_total:
        # No nouns: avoid dividing by zero.
        return 0
    verb_total = pos_tags.count('VERB')
    return round(verb_total / noun_total, 2)
def get_tokens_types(corpus):
    """Count the tokens and the distinct surface forms in the corpus.

    Args:
        corpus: Iterable of sentences; each sentence is an iterable of tokens
            that can be indexed with the key 'form'.

    Returns:
        A tuple (nb_tokens, nb_types): the total number of 'form' values and
        the number of distinct 'form' values.
    """
    forms = [word['form'] for sentence in corpus for word in sentence]
    return len(forms), len(set(forms))
def cttr(corpus):
    """Compute types / sqrt(2 * tokens) for the corpus.

    Token and type counts come from get_tokens_types(corpus).

    Args:
        corpus: Sentences passed unchanged to get_tokens_types().

    Returns:
        nb_types / sqrt(2 * nb_tokens) as a float, or 0.0 when the corpus
        contains no tokens.
    """
    token_count, type_count = get_tokens_types(corpus)
    if token_count:
        return type_count / math.sqrt(2 * token_count)
    # No tokens: avoid dividing by zero.
    return 0.0
def lexical_redundancy(corpus):
    """Compute 1 - types / tokens for the corpus.

    Token and type counts come from get_tokens_types(corpus).

    Args:
        corpus: Sentences passed unchanged to get_tokens_types().

    Returns:
        1 - num_types / num_tokens as a float, or 0.0 when the corpus
        contains no tokens.
    """
    num_tokens, num_types = get_tokens_types(corpus)
    if num_tokens == 0:
        # An empty corpus used to raise ZeroDivisionError; cttr() already
        # returns 0.0 in the same situation.
        return 0.0
    return 1 - num_types / num_tokens
def tree_height(node):
    """Return the height of the tree rooted at node.

    Pass a node from the tree obtained by calling .to_tree() on a corpus
    sentence.

    Args:
        node: Object with a `children` attribute holding its child nodes.

    Returns:
        0 for a leaf (no children), otherwise 1 plus the greatest height
        among the children.
    """
    children = node.children
    if children:
        return max(map(tree_height, children)) + 1
    # Leaf node.
    return 0
def tree_depth_distribution(corpus):
    """Count how many sentences have each tree height.

    Args:
        corpus: Iterable of sentences, each exposing a to_tree() method whose
            result is accepted by tree_height().

    Returns:
        A Counter mapping a tree height to the number of sentences whose
        tree has that height.
    """
    height_counts = Counter()
    for sentence in corpus:
        root = sentence.to_tree()
        height_counts[tree_height(root)] += 1
    return height_counts
def syntactic_func_distribution(corpus):
    """Count the occurrences of each 'deprel' value in the corpus.

    Args:
        corpus: Iterable of sentences; each sentence is an iterable of tokens
            that can be indexed with the key 'deprel'.

    Returns:
        A Counter mapping each 'deprel' value to its number of occurrences.
    """
    relation_counts = Counter()
    for sentence in corpus:
        relation_counts.update(word['deprel'] for word in sentence)
    return relation_counts
This diff is collapsed.
Click to expand it.
text_analysis_visualization.ipynb
0 → 100644
+
432
−
0
View file @
f6ba543d
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment