Skip to content
Snippets Groups Projects
Commit af40bbba authored by Franck Dary's avatar Franck Dary
Browse files

Updated orfeo data selection

parent fee91c64
Branches nouvellesExp
No related tags found
No related merge requests found
# Root of the ORFEO corpus; its mcf/ and meta_data/ subdirectories are read below.
ORFEO_DIR=../../data/fr_orpheo
# NOTE(review): this listing was recovered from a rendered commit diff, so recipe
# lines removed and lines added by that commit may both appear below - confirm the
# exact sequence against the repository before relying on it.
# Target that generates the .conllu files and their problem reports.
conllu :
./prepareOrfeoData.py $(ORFEO_DIR)/mcf/ $(ORFEO_DIR)/meta_data/
./rmBlankLines.py train.conllu test.conllu
./getOrfeoTrainDevTest.py $(ORFEO_DIR)/mcf/ 0.15 0.15
# Run the checker on each split: its stdout goes to a temporary file named after
# the split, its stderr to the matching pb* report.
~/macaon_data/scripts/conlluCheckProblems.py train.conllu > train 2> pbTrain
~/macaon_data/scripts/conlluCheckProblems.py dev.conllu > dev 2> pbDev
~/macaon_data/scripts/conlluCheckProblems.py test.conllu > test 2> pbTest
# Replace each split with the checker's stdout.
mv train train.conllu
mv dev dev.conllu
mv test test.conllu
~/oculometry/scripts/splitTrainDevTest.py --dev 0.1 --test 0.0 train.conllu
# Print the line counts of the generated files and of the problem reports.
wc -l *\.conllu
wc -l pb*
# Remove the generated files; the leading '-' tells make to carry on when rm fails.
clean:
- rm *\.conll*
- rm pbTrain
- rm pbDev
- rm pbTest
0 FILENAME
1 ID
2 FORM
3 LEMMA
4 POS
5 POS2
6 EMPTY
7 GOV
8 LABEL
9 EMPTY
10 EMPTY
11 TIME1
12 TIME2
13 SPKR
14 NBLOCUTEURS
15 MILIEU
16 TYPE
17 SECTEUR
......@@ -13,6 +13,8 @@ if __name__ == "__main__" :
if len(sys.argv) != 4 :
printUsageAndExit()
mcd = "# global.columns = FILE ID FORM LEMMA XPOS UPOS FEATS HEAD DEPREL NONE1 NONE2 TIME1 TIME2 SPEAKER"
random.seed(0)
baseDir = sys.argv[1] + "/"
......@@ -27,6 +29,8 @@ if __name__ == "__main__" :
for line in open(baseDir+filename, "r") :
line = line.strip()
# Remove consecutive blank lines
if len(line) > 0 or len(filesPerFamily[family][-1][1]) == 0 or len(filesPerFamily[family][-1][1][-1]) != 0 :
filesPerFamily[family][-1][1].append(line)
totalTrain = []
......@@ -81,14 +85,15 @@ if __name__ == "__main__" :
totalTest += filesPerFamily[family][i][1]
with open("train.conllu", "w") as out :
print("\n".join(totalTrain), file=out)
print("\n".join([mcd]+totalTrain), file=out)
with open("dev.conllu", "w") as out :
print("\n".join(totalDev), file=out)
print("\n".join([mcd]+totalDev), file=out)
with open("test.conllu", "w") as out :
print("\n".join(totalTest), file=out)
print("\n".join([mcd]+totalTest), file=out)
totalLines = len(totalTrain) + len(totalDev) + len(totalTest)
print("Total : %d lines"%totalLines)
print("Train : %d (%.2f%%)"%(len(totalTrain), 100.0*len(totalTrain)/totalLines))
print("Dev : %d (%.2f%%)"%(len(totalDev), 100.0*len(totalDev)/totalLines))
print("Test : %d (%.2f%%)"%(len(totalTest), 100.0*len(totalTest)/totalLines))
#! /usr/bin/python3
import sys
import os
def mcd() :
    """Return the `# global.columns` header line written first in each generated .conllu file."""
    # NOTE(review): treatDirectory appends four values per token (nbLocuteurs,
    # milieu, type, secteur, in that order) while only three names follow
    # SPEAKER here, with TYPE before MILIEU - confirm the intended header.
    columns = [
        "FILE", "ID", "FORM", "LEMMA", "POS", "UPOS", "FEATS", "HEAD", "DEPREL",
        "NONE1", "NONE2", "TIME1", "TIME2", "SPEAKER", "NBLOCS", "TYPE", "MILIEU",
    ]
    return "# global.columns = " + " ".join(columns)
def printUsageAndExit() :
    """Print the command-line usage on stderr and terminate the program.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print("USAGE : %s rawMcfDirectory metaDataDirectory"%(sys.argv[0]), file=sys.stderr)
    # sys.exit instead of the bare exit() helper: the latter is injected by the
    # site module for interactive use and is absent when site is not loaded.
    sys.exit(1)
def cleanString(s) :
    """Return s without surrounding whitespace, double quotes, or "/>" sequences.

    The steps run in that order, so a "/>" that only appears once the quotes
    are gone is removed as well.
    """
    stripped = s.strip()
    withoutQuotes = stripped.replace('"', "")
    return withoutQuotes.replace("/>", "")
def _listSplitFiles(mcfs) :
    """Return (trains, tests): names of the regular files in mcfs ending in .train / .test.

    Any other regular file is reported on stderr and terminates the program
    with exit status 1.
    """
    trains = []
    tests = []
    for entry in os.listdir(mcfs) :
        if not os.path.isfile(os.path.join(mcfs, entry)) :
            continue
        if entry.endswith(".train") :
            trains.append(entry)
        elif entry.endswith(".test") :
            tests.append(entry)
        else :
            print("ERROR : unknown file %s"%entry, file=sys.stderr)
            sys.exit(1)
    return trains, tests


def _readMetadata(metadatas) :
    """Return {fileName: {corresp: target}} read from the `<name>.xml` files in metadatas.

    Only lines containing "corresp" are considered. Each one is split on
    spaces and every `key=value` token must have the key `target` or
    `corresp`; any other key is reported on stderr and terminates the program
    with exit status 1.
    """
    metas = {}
    for entry in os.listdir(metadatas) :
        path = os.path.join(metadatas, entry)
        if not os.path.isfile(path) :
            continue
        splited = entry.split('.')
        if len(splited) != 2 or splited[1] != "xml" :
            continue
        name = splited[0]
        with open(path, "r") as metaFile :
            for line in metaFile :
                if "corresp" not in line :
                    continue
                target = ""
                corresp = ""
                for attribute in line.split(' ') :
                    keyValue = attribute.split('=')
                    if len(keyValue) != 2 :
                        continue
                    if keyValue[0] == "target" :
                        target = cleanString(keyValue[1])
                    elif keyValue[0] == "corresp" :
                        corresp = cleanString(keyValue[1])
                    else :
                        print("ERROR : wrong line '%s'."%line, file=sys.stderr)
                        sys.exit(1)
                metas.setdefault(name, {})[corresp] = target
    return metas


def _featuresForFile(mcf, metas, features, featuresDecoda) :
    """Return the list of metadata values appended to every token line of file mcf.

    Files whose name contains "RATP" and that have no metadata entry get the
    fixed featuresDecoda values. Other files without metadata are reported on
    stderr and terminate the program with exit status 1. Missing individual
    values are replaced by "n/a".
    """
    name = mcf.split(".")[0]
    isRatp = "RATP" in name
    if name not in metas :
        if not isRatp :
            print("ERROR : metadata unknown for file %s."%mcf, file=sys.stderr)
            sys.exit(1)
        return list(featuresDecoda)
    if isRatp :
        # NOTE(review): kept from the original - a "RATP" file that does have
        # metadata gets the feature *names* as values. Confirm this is intended.
        return list(features)
    return [metas[name].get(feature, "n/a") for feature in features]


def _writeSplit(outputName, mcfs, files, metas, features, featuresDecoda) :
    """Write outputName: the mcd() header, then every line of files with metadata columns appended."""
    with open(outputName, "w") as output :
        print(mcd(), file=output)
        for mcf in files :
            featsForFile = _featuresForFile(mcf, metas, features, featuresDecoda)
            # os.path.join so that mcfs works with or without a trailing slash.
            with open(os.path.join(mcfs, mcf), "r") as mcfFile :
                for line in mcfFile :
                    clean = line.strip()
                    # Lines of at most 2 raw characters were already treated as
                    # blank; whitespace-only lines are too, so they no longer
                    # yield a row holding only the metadata columns.
                    if len(line) <= 2 or not clean :
                        print(file=output)
                        continue
                    print("\t".join([clean] + featsForFile), file=output)


def treatDirectory(mcfs, metadatas) :
    """Build train.conllu and test.conllu in the current directory.

    Args:
        mcfs: directory holding the `.train` and `.test` MCF files.
        metadatas: directory holding one `<name>.xml` metadata file per MCF file.

    Each output starts with the mcd() header, followed by the lines of the
    matching MCF files with four tab-separated metadata values appended
    (nbLocuteurs, milieu, type, secteur).

    Raises:
        SystemExit: with status 1 on an unknown file in mcfs, a malformed
            metadata line, or a non-"RATP" file that has no metadata.
    """
    features = ["nbLocuteurs", "milieu", "type", "secteur"]
    featuresDecoda = ["2", "assistance", "finalise", "professionnel"]

    trains, tests = _listSplitFiles(mcfs)
    metas = _readMetadata(metadatas)

    _writeSplit("train.conllu", mcfs, trains, metas, features, featuresDecoda)
    _writeSplit("test.conllu", mcfs, tests, metas, features, featuresDecoda)
def main() :
    """Check the command line and run treatDirectory on the two directories given.

    Expects exactly two arguments (raw MCF directory, metadata directory);
    otherwise the usage message is printed and the program exits.
    """
    if len(sys.argv) != 3 :
        printUsageAndExit()
    treatDirectory(sys.argv[1], sys.argv[2])


# Guarded so that importing this file does not start the conversion.
if __name__ == "__main__" :
    main()
#! /usr/bin/env python3
import sys
def rmConsecutiveBlankLines(filename) :
    """Rewrite filename in place, collapsing each run of blank lines into one.

    Every line is stripped of leading and trailing whitespace before being
    written back. A blank line is dropped only when the previously kept line
    is blank too, so a single leading blank line is preserved.

    Args:
        filename: path of the text file to rewrite.
    """
    lines = []
    # Read everything and close the file before reopening the same path for
    # writing, instead of relying on garbage collection to release the handle.
    with open(filename, "r") as source :
        for line in source :
            line = line.strip()
            if not line and lines and not lines[-1] :
                continue
            lines.append(line)
    with open(filename, "w") as out :
        for line in lines :
            print(line, file=out)


if __name__ == "__main__" :
    for filename in sys.argv[1:] :
        rmConsecutiveBlankLines(filename)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment