Commit af40bbba authored by Franck Dary's avatar Franck Dary
Browse files

Updated orfeo data selection

parent fee91c64
# Root of the Orfeo corpus. The recipe below reads $(ORFEO_DIR)/mcf/ and
# $(ORFEO_DIR)/meta_data/. Override on the command line if needed:
#   make conllu ORFEO_DIR=/some/other/path
ORFEO_DIR := ../../data/fr_orpheo

# conllu is a task name, not a file this Makefile produces. Declaring it
# phony keeps the recipe running even if a file called "conllu" exists.
.PHONY: conllu

# Build train.conllu / dev.conllu / test.conllu in the current directory.
# Each conlluCheckProblems.py call has its stdout captured in a temporary
# file (train, dev, test) that then replaces the matching .conllu file, and
# its stderr saved in pbTrain / pbDev / pbTest. The two wc calls at the end
# print the line counts of the resulting files and of the problem reports.
# NOTE(review): both prepareOrfeoData.py and getOrfeoTrainDevTest.py write
# train.conllu and test.conllu - confirm that running both is intended.
conllu:
	./prepareOrfeoData.py $(ORFEO_DIR)/mcf/ $(ORFEO_DIR)/meta_data/
	./rmBlankLines.py train.conllu test.conllu
	./getOrfeoTrainDevTest.py $(ORFEO_DIR)/mcf/ 0.15 0.15
	~/macaon_data/scripts/conlluCheckProblems.py train.conllu > train 2> pbTrain
	~/macaon_data/scripts/conlluCheckProblems.py dev.conllu > dev 2> pbDev
	~/macaon_data/scripts/conlluCheckProblems.py test.conllu > test 2> pbTest
	mv train train.conllu
	mv dev dev.conllu
	mv test test.conllu
	~/oculometry/scripts/splitTrainDevTest.py --dev 0.1 --test 0.0 train.conllu
	wc -l *.conllu
	wc -l pb*
# clean is a task name, not a file; declaring it phony keeps it working even
# if a file called "clean" appears in this directory.
.PHONY: clean

# Remove the generated corpus files and the problem reports. $(RM) expands to
# "rm -f", so files that are already absent are skipped silently instead of
# printing an error that then has to be ignored with a "-" prefix.
clean:
	$(RM) *.conll*
	$(RM) pbTrain pbDev pbTest
0 FILENAME
1 ID
2 FORM
3 LEMMA
4 POS
5 POS2
6 EMPTY
7 GOV
8 LABEL
9 EMPTY
10 EMPTY
11 TIME1
12 TIME2
13 SPKR
14 NBLOCUTEURS
15 MILIEU
16 TYPE
17 SECTEUR
......@@ -13,6 +13,8 @@ if __name__ == "__main__" :
if len(sys.argv) != 4 :
printUsageAndExit()
mcd = "# global.columns = FILE ID FORM LEMMA XPOS UPOS FEATS HEAD DEPREL NONE1 NONE2 TIME1 TIME2 SPEAKER"
random.seed(0)
baseDir = sys.argv[1] + "/"
......@@ -27,7 +29,9 @@ if __name__ == "__main__" :
for line in open(baseDir+filename, "r") :
line = line.strip()
filesPerFamily[family][-1][1].append(line)
# Remove consecutive blank lines
if len(line) > 0 or len(filesPerFamily[family][-1][1]) == 0 or len(filesPerFamily[family][-1][1][-1]) != 0 :
filesPerFamily[family][-1][1].append(line)
totalTrain = []
totalDev = []
......@@ -81,14 +85,15 @@ if __name__ == "__main__" :
totalTest += filesPerFamily[family][i][1]
with open("train.conllu", "w") as out :
print("\n".join(totalTrain), file=out)
print("\n".join([mcd]+totalTrain), file=out)
with open("dev.conllu", "w") as out :
print("\n".join(totalDev), file=out)
print("\n".join([mcd]+totalDev), file=out)
with open("test.conllu", "w") as out :
print("\n".join(totalTest), file=out)
print("\n".join([mcd]+totalTest), file=out)
totalLines = len(totalTrain) + len(totalDev) + len(totalTest)
print("Total : %d lines"%totalLines)
print("Train : %d (%.2f%%)"%(len(totalTrain), 100.0*len(totalTrain)/totalLines))
print("Dev : %d (%.2f%%)"%(len(totalDev), 100.0*len(totalDev)/totalLines))
print("Test : %d (%.2f%%)"%(len(totalTest), 100.0*len(totalTest)/totalLines))
#! /usr/bin/python3
import sys
import os
def mcd() :
    """Return the '# global.columns' header line written at the top of each output file.

    The last four names label the metadata values that treatDirectory appends
    to every token line, in the order it appends them: number of speakers,
    milieu, type, secteur. The header previously listed TYPE before MILIEU and
    had no name for the fourth (secteur) column, so it did not match the data.
    """
    columns = [
        "FILE", "ID", "FORM", "LEMMA", "POS", "UPOS", "FEATS", "HEAD", "DEPREL",
        "NONE1", "NONE2", "TIME1", "TIME2", "SPEAKER",
        # Metadata columns, same order as the features list in treatDirectory.
        "NBLOCS", "MILIEU", "TYPE", "SECTEUR",
    ]
    return "# global.columns = " + " ".join(columns)
def printUsageAndExit() :
    """Print the command-line usage on stderr and terminate with exit status 1.

    Raises:
        SystemExit: always, with code 1.
    """
    print("USAGE : %s rawMcfDirectory metaDataDirectory"%(sys.argv[0]), file=sys.stderr)
    # sys.exit is always available, whereas the bare exit() helper is only
    # installed by the site module (absent under "python -S" or when frozen).
    sys.exit(1)
def cleanString(s) :
    """Return s stripped of surrounding whitespace, with every double quote and every '/>' removed."""
    cleaned = s.strip()
    # Same removal order as before: quotes first, then the tag terminator.
    for unwanted in ("\"", "/>") :
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
def _listMcfFiles(mcfs) :
    """Return (trains, tests): the regular files of mcfs ending in '.train' and '.test'.

    Any other regular file is reported on stderr and ends the program with status 1.
    """
    trains = []
    tests = []
    for entry in os.listdir(mcfs) :
        if not os.path.isfile(os.path.join(mcfs, entry)) :
            continue
        if entry.endswith(".train") :
            trains.append(entry)
        elif entry.endswith(".test") :
            tests.append(entry)
        else :
            print("ERROR : unknown file %s"%entry, file=sys.stderr)
            sys.exit(1)
    return trains, tests

def _readMetadata(metadatas) :
    """Return {name: {corresp: target}} read from the 'name.xml' files of metadatas.

    Only lines containing 'corresp' are inspected. Each such line is split on
    spaces and its key=value tokens are examined; a key other than 'target' or
    'corresp' is reported on stderr and ends the program with status 1.
    NOTE(review): splitting on spaces truncates attribute values that contain
    a space - confirm the metadata never has such values.
    """
    metas = {}
    for entry in os.listdir(metadatas) :
        path = os.path.join(metadatas, entry)
        if not os.path.isfile(path) :
            continue
        parts = entry.split('.')
        if len(parts) != 2 or parts[1] != "xml" :
            continue
        name = parts[0]
        with open(path, "r") as metaFile :
            for line in metaFile :
                if "corresp" not in line :
                    continue
                target = ""
                corresp = ""
                for token in line.split(' ') :
                    keyValue = token.split('=')
                    if len(keyValue) != 2 :
                        continue
                    if keyValue[0] == "target" :
                        target = cleanString(keyValue[1])
                    elif keyValue[0] == "corresp" :
                        corresp = cleanString(keyValue[1])
                    else :
                        print("ERROR : wrong line \'%s\'."%line, file=sys.stderr)
                        sys.exit(1)
                metas.setdefault(name, {})[corresp] = target
    return metas

def _writeConllu(outputName, mcfs, mcfNames, metas, features, featuresDecoda) :
    """Write outputName: the mcd() header, then every line of each listed mcf file with its metadata values appended.

    Blank input lines are copied as empty lines; every other line is stripped
    and gets one tab-separated value per entry of features.
    """
    with open(outputName, "w") as output :
        print(mcd(), file=output)
        for mcf in mcfNames :
            name = mcf.split(".")[0]
            if "RATP" in name :
                # Files whose name contains RATP always take the fixed values,
                # never a metadata lookup.
                featsForFile = featuresDecoda
            elif name in metas :
                featsForFile = [metas[name].get(feature, "n/a") for feature in features]
            else :
                print("ERROR : metadata unknown for file %s."%mcf, file=sys.stderr)
                sys.exit(1)
            suffix = "".join("\t" + feat for feat in featsForFile)
            with open(os.path.join(mcfs, mcf), "r") as mcfFile :
                for line in mcfFile :
                    clean = line.strip()
                    # Lines of at most two raw characters were already treated
                    # as blank; whitespace-only lines are too, so they no longer
                    # yield a row made of metadata columns alone.
                    if len(line) <= 2 or len(clean) == 0 :
                        print(file=output)
                        continue
                    print(clean + suffix, file=output)

def treatDirectory(mcfs, metadatas) :
    """Build train.conllu and test.conllu in the current directory.

    Args:
        mcfs: directory holding the '*.train' and '*.test' mcf files.
        metadatas: directory holding one 'name.xml' metadata file per mcf name.

    Both directories may be given with or without a trailing separator.
    Raises SystemExit(1) on an unexpected file in mcfs, on an unexpected
    attribute in a metadata line, or when a non-RATP mcf file has no metadata.
    """
    features = ["nbLocuteurs", "milieu", "type", "secteur"]
    featuresDecoda = ["2", "assistance", "finalise", "professionnel"]
    trains, tests = _listMcfFiles(mcfs)
    metas = _readMetadata(metadatas)
    _writeConllu("train.conllu", mcfs, trains, metas, features, featuresDecoda)
    _writeConllu("test.conllu", mcfs, tests, metas, features, featuresDecoda)
def main() :
    """Check that exactly two command-line arguments were given, then run treatDirectory on them.

    The first argument is the mcf directory, the second the metadata directory.
    With any other argument count, the usage message is printed and the program exits.
    """
    if len(sys.argv) != 3 :
        printUsageAndExit()
    treatDirectory(sys.argv[1], sys.argv[2])

# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__" :
    main()
#! /usr/bin/env python3
import sys
def collapseBlankLines(lines) :
    """Return the stripped lines with each run of consecutive blank lines reduced to one.

    Args:
        lines: iterable of strings (for example an open text file).

    Returns:
        A list of whitespace-stripped strings. A line that is empty after
        stripping is dropped when the previously kept line is empty too.
    """
    kept = []
    for line in lines :
        line = line.strip()
        if len(line) == 0 and len(kept) > 0 and len(kept[-1]) == 0 :
            continue
        kept.append(line)
    return kept

# Rewrite each file named on the command line in place. The input handle is
# closed before the same path is reopened for writing.
if __name__ == "__main__" :
    for filename in sys.argv[1:] :
        with open(filename, "r") as source :
            lines = collapseBlankLines(source)
        with open(filename, "w") as out :
            for line in lines :
                print(line, file=out)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment