diff --git a/fr_orpheo/data/Makefile b/fr_orpheo/data/Makefile index 3d02d76df48b98ea6bcca1f4cf319a8d3289ef37..eea93cce9cad9b869e160d091000d1011775e1ce 100644 --- a/fr_orpheo/data/Makefile +++ b/fr_orpheo/data/Makefile @@ -1,16 +1,19 @@ ORFEO_DIR=../../data/fr_orpheo conllu : - ./prepareOrfeoData.py $(ORFEO_DIR)/mcf/ $(ORFEO_DIR)/meta_data/ - ./rmBlankLines.py train.conllu test.conllu + ./getOrfeoTrainDevTest.py $(ORFEO_DIR)/mcf/ 0.15 0.15 ~/macaon_data/scripts/conlluCheckProblems.py train.conllu > train 2> pbTrain + ~/macaon_data/scripts/conlluCheckProblems.py dev.conllu > dev 2> pbDev ~/macaon_data/scripts/conlluCheckProblems.py test.conllu > test 2> pbTest mv train train.conllu + mv dev dev.conllu mv test test.conllu - ~/oculometry/scripts/splitTrainDevTest.py --dev 0.1 --test 0.0 train.conllu + wc -l *\.conllu + wc -l pb* clean: - rm *\.conll* - rm pbTrain + - rm pbDev - rm pbTest diff --git a/fr_orpheo/data/conllu.mcd b/fr_orpheo/data/conllu.mcd deleted file mode 100644 index 9becf724f31d4ffef87d805d045dda73a521409d..0000000000000000000000000000000000000000 --- a/fr_orpheo/data/conllu.mcd +++ /dev/null @@ -1,18 +0,0 @@ -0 FILENAME -1 ID -2 FORM -3 LEMMA -4 POS -5 POS2 -6 EMPTY -7 GOV -8 LABEL -9 EMPTY -10 EMPTY -11 TIME1 -12 TIME2 -13 SPKR -14 NBLOCUTEURS -15 MILIEU -16 TYPE -17 SECTEUR diff --git a/fr_orpheo/data/getOrfeoTrainDevTest.py b/fr_orpheo/data/getOrfeoTrainDevTest.py index 4731e0bdf0ea16e32f12b25571b650f368e54597..4e39c4a20b350b634018efaee07c2e2a009abff0 100755 --- a/fr_orpheo/data/getOrfeoTrainDevTest.py +++ b/fr_orpheo/data/getOrfeoTrainDevTest.py @@ -13,6 +13,8 @@ if __name__ == "__main__" : if len(sys.argv) != 4 : printUsageAndExit() + mcd = "# global.columns = FILE ID FORM LEMMA XPOS UPOS FEATS HEAD DEPREL NONE1 NONE2 TIME1 TIME2 SPEAKER" + random.seed(0) baseDir = sys.argv[1] + "/" @@ -27,7 +29,9 @@ if __name__ == "__main__" : for line in open(baseDir+filename, "r") : line = line.strip() - filesPerFamily[family][-1][1].append(line) + # Remove consecutives blanklines + if len(line) > 0 or len(filesPerFamily[family][-1][1]) == 0 or len(filesPerFamily[family][-1][1][-1]) != 0 : + filesPerFamily[family][-1][1].append(line) totalTrain = [] totalDev = [] @@ -81,14 +85,15 @@ if __name__ == "__main__" : totalTest += filesPerFamily[family][i][1] with open("train.conllu", "w") as out : - print("\n".join(totalTrain), file=out) + print("\n".join([mcd]+totalTrain), file=out) with open("dev.conllu", "w") as out : - print("\n".join(totalDev), file=out) + print("\n".join([mcd]+totalDev), file=out) with open("test.conllu", "w") as out : - print("\n".join(totalTest), file=out) + print("\n".join([mcd]+totalTest), file=out) totalLines = len(totalTrain) + len(totalDev) + len(totalTest) print("Total : %d lines"%totalLines) print("Train : %d (%.2f%%)"%(len(totalTrain), 100.0*len(totalTrain)/totalLines)) print("Dev : %d (%.2f%%)"%(len(totalDev), 100.0*len(totalDev)/totalLines)) print("Test : %d (%.2f%%)"%(len(totalTest), 100.0*len(totalTest)/totalLines)) + diff --git a/fr_orpheo/data/prepareOrfeoData.py b/fr_orpheo/data/prepareOrfeoData.py deleted file mode 100755 index 1438732e76ffb2c531c93f3a97495dc7a33afb8e..0000000000000000000000000000000000000000 --- a/fr_orpheo/data/prepareOrfeoData.py +++ /dev/null @@ -1,127 +0,0 @@ -#! /usr/bin/python3 - -import sys -import os - -def mcd() : - return "# global.columns = FILE ID FORM LEMMA POS UPOS FEATS HEAD DEPREL NONE1 NONE2 TIME1 TIME2 SPEAKER NBLOCS TYPE MILIEU" - -def printUsageAndExit() : - print("USAGE : %s rawMcfDirectory metaDataDirectory"%(sys.argv[0]), file=sys.stderr) - exit(1) - -def cleanString(s) : - result = s.strip() - - return result.replace("\"", "").replace("/>", "") - -def treatDirectory(mcfs, metadatas) : - trains = [] - tests = [] - metas = {} - features = ["nbLocuteurs", "milieu", "type", "secteur"] - featuresDecoda = ["2", "assistance", "finalise", "professionnel"] - - for entry in os.listdir(mcfs) : - if os.path.isfile(os.path.join(mcfs, entry)) : - if entry.endswith(".train") : - trains.append(entry) - elif entry.endswith(".test") : - tests.append(entry) - else : - print("ERROR : unknown file %s"%entry, file=sys.stderr) - exit(1) - - for entry in os.listdir(metadatas) : - if os.path.isfile(os.path.join(metadatas, entry)) : - splited = entry.split('.') - if len(splited) != 2 or splited[1] != "xml" : - continue - name = splited[0] - for line in open(metadatas+entry, "r") : - if "corresp" in line : - splited = line.split(' ') - target = "" - corresp = "" - for s in splited : - splited2 = s.split('=') - if len(splited2) != 2 : - continue - if splited2[0] == "target" : - target = cleanString(splited2[1]) - elif splited2[0] == "corresp" : - corresp = cleanString(splited2[1]) - else : - print("ERROR : wrong line \'%s\'."%line, file=sys.stderr) - exit(1) - if name not in metas : - metas[name] = {} - metas[name][corresp] = target - - output = open("train.conllu", "w") - - print(mcd(), file=output) - for mcf in trains : - featsForFile = list.copy(features) - name = mcf.split(".")[0] - if name not in metas : - if "RATP" in name : - featsForFile = featuresDecoda - else : - print("ERROR : metadata unknown for file %s."%mcf, file=sys.stderr) - exit(1) - if "RATP" not in name : - for i in range(len(features)) : - featValue = "n/a" - if features[i] in metas[name] : - featValue = metas[name][features[i]] - - featsForFile[i] = featValue - - for line in open(mcfs+mcf, "r") : - clean = line.strip() - if len(line) <= 2 : - print(file=output) - continue - completeLine = clean - for feat in featsForFile : - completeLine += "\t" + feat - print(completeLine,file=output) - - output = open("test.conllu", "w") - - print(mcd(), file=output) - for mcf in tests : - featsForFile = list.copy(features) - name = mcf.split(".")[0] - if name not in metas : - if "RATP" in name : - featsForFile = featuresDecoda - else : - print("ERROR : metadata unknown for file %s."%mcf, file=sys.stderr) - exit(1) - if "RATP" not in name : - for i in range(len(features)) : - featValue = "n/a" - if features[i] in metas[name] : - featValue = metas[name][features[i]] - - featsForFile[i] = featValue - - for line in open(mcfs+mcf, "r") : - clean = line.strip() - if len(line) <= 2 : - print(file=output) - continue - completeLine = clean - for feat in featsForFile : - completeLine += "\t" + feat - print(completeLine,file=output) - -def main() : - if len(sys.argv) != 3 : - printUsageAndExit() - - treatDirectory(sys.argv[1], sys.argv[2]) - -main() diff --git a/fr_orpheo/data/rmBlankLines.py b/fr_orpheo/data/rmBlankLines.py deleted file mode 100755 index 641bfe1b3117d20b0d48f7036a572f1fe6c6e175..0000000000000000000000000000000000000000 --- a/fr_orpheo/data/rmBlankLines.py +++ /dev/null @@ -1,16 +0,0 @@ -#! /usr/bin/env python3 - -import sys - -for filename in sys.argv[1:] : - lines = [] - for line in open(filename, "r") : - line = line.strip() - if len(line) == 0 and len(lines) > 0 and len(lines[-1]) == 0 : - continue - lines.append(line) - - with open(filename, "w") as out : - for line in lines : - print(line, file=out) -