Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Franck Dary
old_macaon_data
Commits
af40bbba
Commit
af40bbba
authored
Oct 01, 2021
by
Franck Dary
Browse files
Updated orfeo data selection
parent
fee91c64
Changes
5
Hide whitespace changes
Inline
Side-by-side
fr_orpheo/data/Makefile
View file @
af40bbba
ORFEO_DIR
=
../../data/fr_orpheo
conllu
:
./prepareOrfeoData.py
$(ORFEO_DIR)
/mcf/
$(ORFEO_DIR)
/meta_data/
./rmBlankLines.py train.conllu test.conllu
./getOrfeoTrainDevTest.py
$(ORFEO_DIR)
/mcf/ 0.15 0.15
~/macaon_data/scripts/conlluCheckProblems.py train.conllu
>
train 2> pbTrain
~/macaon_data/scripts/conlluCheckProblems.py dev.conllu
>
dev 2> pbDev
~/macaon_data/scripts/conlluCheckProblems.py test.conllu
>
test
2> pbTest
mv
train train.conllu
mv
dev dev.conllu
mv test
test.conllu
~/oculometry/scripts/splitTrainDevTest.py
--dev
0.1
--test
0.0 train.conllu
wc
-l
*
\.
conllu
wc
-l
pb
*
clean
:
-
rm
*
\.
conll
*
-
rm
pbTrain
-
rm
pbDev
-
rm
pbTest
fr_orpheo/data/conllu.mcd
deleted
100644 → 0
View file @
fee91c64
0 FILENAME
1 ID
2 FORM
3 LEMMA
4 POS
5 POS2
6 EMPTY
7 GOV
8 LABEL
9 EMPTY
10 EMPTY
11 TIME1
12 TIME2
13 SPKR
14 NBLOCUTEURS
15 MILIEU
16 TYPE
17 SECTEUR
fr_orpheo/data/getOrfeoTrainDevTest.py
View file @
af40bbba
...
...
@@ -13,6 +13,8 @@ if __name__ == "__main__" :
if
len
(
sys
.
argv
)
!=
4
:
printUsageAndExit
()
mcd
=
"# global.columns = FILE ID FORM LEMMA XPOS UPOS FEATS HEAD DEPREL NONE1 NONE2 TIME1 TIME2 SPEAKER"
random
.
seed
(
0
)
baseDir
=
sys
.
argv
[
1
]
+
"/"
...
...
@@ -27,7 +29,9 @@ if __name__ == "__main__" :
for
line
in
open
(
baseDir
+
filename
,
"r"
)
:
line
=
line
.
strip
()
filesPerFamily
[
family
][
-
1
][
1
].
append
(
line
)
# Remove consecutive blank lines
if
len
(
line
)
>
0
or
len
(
filesPerFamily
[
family
][
-
1
][
1
])
==
0
or
len
(
filesPerFamily
[
family
][
-
1
][
1
][
-
1
])
!=
0
:
filesPerFamily
[
family
][
-
1
][
1
].
append
(
line
)
totalTrain
=
[]
totalDev
=
[]
...
...
@@ -81,14 +85,15 @@ if __name__ == "__main__" :
totalTest
+=
filesPerFamily
[
family
][
i
][
1
]
with
open
(
"train.conllu"
,
"w"
)
as
out
:
print
(
"
\n
"
.
join
(
totalTrain
),
file
=
out
)
print
(
"
\n
"
.
join
(
[
mcd
]
+
totalTrain
),
file
=
out
)
with
open
(
"dev.conllu"
,
"w"
)
as
out
:
print
(
"
\n
"
.
join
(
totalDev
),
file
=
out
)
print
(
"
\n
"
.
join
(
[
mcd
]
+
totalDev
),
file
=
out
)
with
open
(
"test.conllu"
,
"w"
)
as
out
:
print
(
"
\n
"
.
join
(
totalTest
),
file
=
out
)
print
(
"
\n
"
.
join
(
[
mcd
]
+
totalTest
),
file
=
out
)
totalLines
=
len
(
totalTrain
)
+
len
(
totalDev
)
+
len
(
totalTest
)
print
(
"Total : %d lines"
%
totalLines
)
print
(
"Train : %d (%.2f%%)"
%
(
len
(
totalTrain
),
100.0
*
len
(
totalTrain
)
/
totalLines
))
print
(
"Dev : %d (%.2f%%)"
%
(
len
(
totalDev
),
100.0
*
len
(
totalDev
)
/
totalLines
))
print
(
"Test : %d (%.2f%%)"
%
(
len
(
totalTest
),
100.0
*
len
(
totalTest
)
/
totalLines
))
fr_orpheo/data/prepareOrfeoData.py
deleted
100755 → 0
View file @
fee91c64
#! /usr/bin/python3
import
sys
import
os
def mcd():
    """Return the `# global.columns` header written at the top of each output file.

    The first fourteen names describe the columns already present in the raw
    mcf files; the last four name the metadata features that treatDirectory
    appends to every token line, in the order it appends them
    (nbLocuteurs, milieu, type, secteur).
    """
    return "# global.columns = FILE ID FORM LEMMA POS UPOS FEATS HEAD DEPREL NONE1 NONE2 TIME1 TIME2 SPEAKER NBLOCS MILIEU TYPE SECTEUR"
def printUsageAndExit():
    """Print the command-line usage on stderr and terminate with exit status 1.

    Raises:
        SystemExit: always, with code 1.
    """
    print("USAGE : %s rawMcfDirectory metaDataDirectory" % (sys.argv[0]), file=sys.stderr)
    # sys.exit instead of the bare exit() builtin: the latter is injected by the
    # `site` module and is not guaranteed to exist (e.g. when run with python -S).
    sys.exit(1)
def cleanString(s):
    """Strip surrounding whitespace from `s`, then drop every double quote and every "/>"."""
    cleaned = s.strip()
    # Order matters: quotes are removed before the tag terminator.
    for unwanted in ("\"", "/>"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
def _listMcfFiles(mcfs):
    """Split the regular files of directory `mcfs` into (trains, tests) by extension."""
    trains = []
    tests = []
    for entry in os.listdir(mcfs):
        if not os.path.isfile(os.path.join(mcfs, entry)):
            continue
        if entry.endswith(".train"):
            trains.append(entry)
        elif entry.endswith(".test"):
            tests.append(entry)
        else:
            print("ERROR : unknown file %s" % entry, file=sys.stderr)
            sys.exit(1)
    return trains, tests


def _readMetadata(metadatas):
    """Collect the corresp -> target pairs of every `<name>.xml` file in `metadatas`."""
    metas = {}
    for entry in os.listdir(metadatas):
        path = os.path.join(metadatas, entry)
        if not os.path.isfile(path):
            continue
        splited = entry.split('.')
        # Only files named exactly "<name>.xml" are metadata files.
        if len(splited) != 2 or splited[1] != "xml":
            continue
        name = splited[0]
        with open(path, "r") as xmlFile:
            for line in xmlFile:
                if "corresp" not in line:
                    continue
                target = ""
                corresp = ""
                for attribute in line.split(' '):
                    keyValue = attribute.split('=')
                    # Tokens that are not of the form key=value are ignored.
                    if len(keyValue) != 2:
                        continue
                    if keyValue[0] == "target":
                        target = cleanString(keyValue[1])
                    elif keyValue[0] == "corresp":
                        corresp = cleanString(keyValue[1])
                    else:
                        print("ERROR : wrong line \'%s\'." % line, file=sys.stderr)
                        sys.exit(1)
                metas.setdefault(name, {})[corresp] = target
    return metas


def _featuresForFile(mcf, metas, features, featuresDecoda):
    """Return the list of feature values to append to each token line of `mcf`."""
    name = mcf.split(".")[0]
    featsForFile = list(features)
    if name not in metas:
        if "RATP" not in name:
            print("ERROR : metadata unknown for file %s." % mcf, file=sys.stderr)
            sys.exit(1)
        # Files whose name contains "RATP" and that have no metadata get fixed values.
        featsForFile = list(featuresDecoda)
    if "RATP" not in name:
        # Features missing from the metadata are written as "n/a".
        featsForFile = [metas[name].get(feature, "n/a") for feature in features]
    # NOTE(review): a "RATP" file that *does* have metadata falls through both
    # branches and gets the feature names themselves as values; kept as in the
    # original - confirm whether such files exist and what they should receive.
    return featsForFile


def _writeConllu(outputName, mcfs, mcfFiles, metas, features, featuresDecoda):
    """Write the header then every file of `mcfFiles`, feature-augmented, to `outputName`."""
    with open(outputName, "w") as output:
        print(mcd(), file=output)
        for mcf in mcfFiles:
            featsForFile = _featuresForFile(mcf, metas, features, featuresDecoda)
            suffix = "".join("\t" + feat for feat in featsForFile)
            with open(os.path.join(mcfs, mcf), "r") as mcfFile:
                for line in mcfFile:
                    # The test is on the raw line (newline included): anything of
                    # at most two characters is emitted as an empty line.
                    if len(line) <= 2:
                        print(file=output)
                        continue
                    print(line.strip() + suffix, file=output)


def treatDirectory(mcfs, metadatas):
    """Build train.conllu and test.conllu in the current directory.

    Every regular file of `mcfs` must end in ".train" or ".test". Each of its
    non-empty lines is copied to the matching output file with four
    tab-separated feature values appended (nbLocuteurs, milieu, type, secteur).
    The values come from the `<name>.xml` file of `metadatas`, where <name> is
    the part of the mcf file name before its first dot.

    Args:
        mcfs: directory containing the raw mcf files.
        metadatas: directory containing the xml metadata files.

    Raises:
        SystemExit: with code 1 on an unknown file extension, a malformed
            metadata line, or a non-"RATP" file without metadata.
    """
    features = ["nbLocuteurs", "milieu", "type", "secteur"]
    featuresDecoda = ["2", "assistance", "finalise", "professionnel"]

    trains, tests = _listMcfFiles(mcfs)
    metas = _readMetadata(metadatas)

    _writeConllu("train.conllu", mcfs, trains, metas, features, featuresDecoda)
    _writeConllu("test.conllu", mcfs, tests, metas, features, featuresDecoda)
def main():
    """Check the command line (exactly two arguments) and run treatDirectory on it."""
    if len(sys.argv) != 3:
        printUsageAndExit()

    treatDirectory(sys.argv[1], sys.argv[2])


# Guarded so that importing this file does not start processing.
if __name__ == "__main__":
    main()
fr_orpheo/data/rmBlankLines.py
deleted
100755 → 0
View file @
fee91c64
#! /usr/bin/env python3
import
sys
def rmBlankLines(filename):
    """Rewrite `filename` in place, collapsing runs of blank lines into one.

    Every line is stripped of leading and trailing whitespace. An empty line is
    dropped when the previously kept line is empty too; a leading empty line is
    kept.
    """
    lines = []
    # Read everything and close the handle before the file is reopened for writing.
    with open(filename, "r") as source:
        for line in source:
            line = line.strip()
            if len(line) == 0 and len(lines) > 0 and len(lines[-1]) == 0:
                continue
            lines.append(line)

    with open(filename, "w") as out:
        for line in lines:
            print(line, file=out)


if __name__ == "__main__":
    for filename in sys.argv[1:]:
        rmBlankLines(filename)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment