Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
macaon_data
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Container Registry
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Franck Dary
macaon_data
Commits
29b7b6c4
Commit
29b7b6c4
authored
2 years ago
by
Franck Dary
Browse files
Options
Downloads
Patches
Plain Diff
Added script to append column containing lexicon pos information into conllu file
parent
2f3c0537
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
scripts/addLefff2Conllu.py
+171
-0
171 additions, 0 deletions
scripts/addLefff2Conllu.py
with
171 additions
and
0 deletions
scripts/addLefff2Conllu.py
0 → 100755
+
171
−
0
View file @
29b7b6c4
#! /usr/bin/env python3
# Take as input the lefff lexicon and conllu files.
# For each word form it will compute the possible POS.
# Then it will add a column encoding these possible POS to output conllu files.
import
sys
import
argparse
from
readMCD
import
readMCD
# The 17 universal part-of-speech tags, lowercased, in the order they are
# listed at https://universaldependencies.org/u/pos/index.html
# Any POS handled by this script is expected to be one of these values.
allPos = (
    "adj adp adv aux cconj det intj noun num "
    "part pron propn punct sconj sym verb x"
).split()
# Mapping from a lefff part-of-speech tag to the (lowercased) UD UPOS tag used
# by this script. Keys are the tags found in the POS column of the lefff file.
lefffPOS2UD = {
    "adj": "adj",
    "csu": "sconj",
    "que": "sconj",  # Open question: is "que" always a subordinating conjunction?
    "det": "det",
    "pres": "intj",  # No exact UD equivalent; INTJ or X are the candidates.
    "v": "verb",
    "nc": "noun",
    "cfi": "noun",
    # Prefixes and suffixes have no meaning with the UD tokenization.
    "advPref": "x",
    "adjPref": "x",
    "suffAdj": "x",
    # Clitics and pronouns.
    "cln": "pron",
    "ce": "pron",
    "clg": "adp",
    "cll": "pron",
    "ilimp": "pron",
    "cla": "pron",
    "cld": "pron",
    "pro": "pron",
    "caimp": "pron",
    "pri": "adv",
    "prel": "pron",
    "clr": "pron",
    "clar": "pron",
    "cldr": "pron",
    # Adverbs and coordination.
    "adv": "adv",
    "advm": "adv",
    "advp": "adv",
    "coo": "cconj",
    "ponctw": "punct",
    "advneg": "adv",
    "clneg": "adv",
    "que_restr": "sconj",
    "np": "propn",
    # Punctuation-like entries.
    "poncts": "punct",
    "parento": "punct",
    "epsilon": "punct",
    "parentf": "punct",
    "prep": "adp",
    # Auxiliaries.
    "auxAvoir": "aux",
    "auxEtre": "aux",
}
def _normalizeForm(rawForm):
    """Return the lookup key used for a word form.

    The form is lowercased and every space is replaced by a dotted circle:
    lefff forms may contain spaces, which the w2v format does not allow.
    """
    return rawForm.lower().replace(" ", "◌")


def _declaredColumns(line):
    """Return the column list declared by a '# global.columns =' comment.

    `line` must already be stripped. Returns the text following
    'global.columns =' (stripped), or None when the line is not such a comment.
    """
    if line.startswith("#") and "global.columns =" in line:
        return line.split("global.columns =")[-1].strip()
    return None


def _registerColumn(conllMCD, conllMCDr, colName):
    """Add colName to the column description if missing and build the header.

    conllMCD is used as a name -> index dict and conllMCDr as the reverse
    index -> name dict (both as returned by readMCD); they are updated in place.
    Returns the '# global.columns = ...' line describing the resulting layout.
    """
    if colName not in conllMCD:
        conllMCD[colName] = len(conllMCD)
        conllMCDr[len(conllMCDr)] = colName
    names = [conllMCDr[i] for i in range(len(conllMCDr))]
    return "# global.columns = %s" % " ".join(names)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--lefff", type=str,
        help="Lefff file in tab separated columns: FORM POS LEMMA MORPHO.")
    parser.add_argument("--conllu", nargs="+", type=str,
        help="Input conllu files, to find possible POS for each word.")
    parser.add_argument("--output", nargs="+", type=str,
        help="Output conllu files. Must be existing conllu files, this script adds a new column in place.")
    parser.add_argument("--colName", type=str, default="LEXICON",
        help="Name of the column that will be added by the script. If the column already exists, it will be replaced.")
    args = parser.parse_args()

    if args.lefff is None and args.conllu is None:
        print("ERROR: must provide --lefff and/or --conllu", file=sys.stderr)
        sys.exit(1)
    if args.output is None:
        print("ERROR: must provide --output", file=sys.stderr)
        sys.exit(1)

    # Column layout assumed for a conllu file that has no global.columns line.
    baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"

    # Dict with key=FORM and value=dict associating each pos with its number of occurrences.
    form2pos = {}
    # Number of occurrences of each form in the conllu files (lefff entries count as 0).
    # NOTE(review): nothing in the visible script reads this; kept in case later code does.
    formCount = {}

    # Read lefff and populate form2pos with a number of occurrences of 1.
    if args.lefff is not None:
        with open(args.lefff, "r", encoding="utf-8") as lefffFile:
            for lineNumber, line in enumerate(lefffFile, 1):
                splited = line.strip().split("\t")
                if splited == [""]:
                    continue  # Blank line: nothing to read.
                if len(splited) < 2:
                    print("ERROR: %s line %d has no POS column" % (args.lefff, lineNumber), file=sys.stderr)
                    sys.exit(1)
                if splited[1] not in lefffPOS2UD:
                    # Fail with a readable message instead of a bare KeyError.
                    print("ERROR: Unknown lefff pos '%s' at %s line %d (check lefffPOS2UD in the script)"
                          % (splited[1], args.lefff, lineNumber), file=sys.stderr)
                    sys.exit(1)

                form = _normalizeForm(splited[0])
                pos = lefffPOS2UD[splited[1]]
                if pos not in allPos:
                    print("ERROR: Unknown pos '%s' (check allPos in the script)" % pos, file=sys.stderr)

                formCount.setdefault(form, 0)
                form2pos.setdefault(form, {}).setdefault(pos, 1)

    # If conllu files are provided, count the occurrences of each (form, pos) pair.
    if args.conllu is not None:
        for filename in args.conllu:
            conllMCD, conllMCDr = readMCD(baseMCD)
            with open(filename, "r", encoding="utf-8") as conlluFile:
                for line in conlluFile:
                    line = line.strip()
                    declared = _declaredColumns(line)
                    if declared is not None:
                        conllMCD, conllMCDr = readMCD(declared)
                        continue
                    if not line or line.startswith("#"):
                        continue

                    splited = line.split("\t")
                    # IDs containing '-' are ranges (multi-word tokens): not counted.
                    if "-" in splited[conllMCD["ID"]]:
                        continue

                    form = _normalizeForm(splited[conllMCD["FORM"]])
                    pos = splited[conllMCD["UPOS"]].lower()
                    if pos not in allPos:
                        print("ERROR: Unknown pos '%s' (check allPos in the script)" % pos, file=sys.stderr)

                    posCounts = form2pos.setdefault(form, {})
                    posCounts[pos] = posCounts.get(pos, 0) + 1
                    formCount[form] = formCount.get(form, 0) + 1

    # Reshape form2pos to be form -> pos set as string (ex. adj|verb).
    form2pos = {form: "|".join(posCounts) for form, posCounts in form2pos.items()}

    # Read all output conllu files and rewrite them in place with the new column.
    for filename in args.output:
        conllMCD, conllMCDr = readMCD(baseMCD)
        # The first output line is always the global.columns header.
        newLines = [_registerColumn(conllMCD, conllMCDr, args.colName)]

        with open(filename, "r", encoding="utf-8") as conlluFile:
            for line in conlluFile:
                line = line.strip()
                declared = _declaredColumns(line)
                if declared is not None:
                    conllMCD, conllMCDr = readMCD(declared)
                    # Replace only the header so lines already collected are not lost.
                    newLines[0] = _registerColumn(conllMCD, conllMCDr, args.colName)
                    continue
                if not line or line.startswith("#"):
                    newLines.append(line)
                    continue

                splited = line.split("\t")
                form = _normalizeForm(splited[conllMCD["FORM"]])
                newColIndex = conllMCD[args.colName]
                # Pad short rows with the conllu placeholder so the index exists.
                while len(splited) <= newColIndex:
                    splited.append("_")
                # Forms never seen in lefff nor in the conllu inputs get "none".
                splited[newColIndex] = form2pos.get(form, "none")
                newLines.append("\t".join(splited))

        # The input handle is closed at this point, so rewriting in place is safe.
        with open(filename, "w", encoding="utf-8") as conlluFile:
            print("\n".join(newLines), file=conlluFile)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment