#! /usr/bin/env python3
"""Split the sentences of a directory of CoNLL-U files into k folds.

Every file of ``UDDir`` whose name contains ".conllu" is read, the sentences
are pooled and shuffled with a fixed seed, the shuffled corpus is dumped on
stdout, and for each fold ``k`` the files ``train.conllu``, ``dev.conllu`` and
``test.conllu`` are written into ``outputDir/UDDir_k``.

USAGE : createKFolds.py UDDir outputDir nbFolds
"""

import sys
import os
import random

# Column declaration used when no input file provides its own.
DEFAULT_HEADER = "# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"


################################################################################
def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE : %s UDDir outputDir nbFolds" % sys.argv[0], file=sys.stderr)
    # sys.exit instead of the `exit` builtin, which is only injected by `site`.
    sys.exit(1)
################################################################################


################################################################################
def readSentences(inputFiles):
    """Read all the sentences contained in the given CoNLL-U files.

    Sentences are runs of non-empty lines separated by blank lines. Lines
    containing "# global.columns =" are not part of any sentence: the last one
    encountered is returned as the header.

    Args:
        inputFiles: iterable of paths to read, in order.

    Returns:
        A ``(header, sentences)`` pair where ``sentences`` is a list of
        sentences, each one being the list of its stripped lines.
    """
    sentences = []
    header = DEFAULT_HEADER
    for filename in inputFiles:
        prevWasBlank = True
        # `with` guarantees the file is closed; CoNLL-U files are UTF-8.
        with open(filename, "r", encoding="utf-8") as inFile:
            for line in inFile:
                line = line.strip()
                if not line:
                    prevWasBlank = True
                    continue
                if "# global.columns =" in line:
                    header = line
                    continue
                if prevWasBlank:
                    sentences.append([])
                    prevWasBlank = False
                sentences[-1].append(line)
    return header, sentences
################################################################################


################################################################################
def computeFoldRanges(nbSentences, nbFolds):
    """Compute the index range of the test set of every fold.

    Each fold gets ``nbSentences // nbFolds`` consecutive indexes, except the
    last one which also absorbs the remainder, so that the ranges are disjoint
    and cover ``range(nbSentences)`` exactly.

    Args:
        nbSentences: total number of sentences to distribute.
        nbFolds: number of folds to produce.

    Returns:
        A list of exactly ``nbFolds`` ``range`` objects.

    Raises:
        ValueError: if ``nbFolds`` is not between 1 and ``nbSentences``.
    """
    if nbFolds < 1 or nbFolds > nbSentences:
        raise ValueError(
            "nbFolds must be between 1 and the number of sentences (%d), got %d"
            % (nbSentences, nbFolds)
        )
    testSize = nbSentences // nbFolds
    # Build the nbFolds lower bounds explicitly: stepping through
    # range(0, nbSentences, testSize) yields a wrong number of folds whenever
    # the remainder is 0 or at least testSize.
    bounds = [k * testSize for k in range(nbFolds)] + [nbSentences]
    return [range(begin, end) for begin, end in zip(bounds, bounds[1:])]
################################################################################


################################################################################
def writeSentences(header, sentences, outFile):
    """Write the header followed by every sentence and a blank line to outFile."""
    print(header, file=outFile)
    for sentence in sentences:
        print("\n".join(sentence) + "\n", file=outFile)
################################################################################


################################################################################
def main():
    """Entry point: read sys.argv, build the folds and write them to disk."""
    if len(sys.argv) != 4:
        printUsageAndExit()
    try:
        nbFolds = int(sys.argv[3])
    except ValueError:
        printUsageAndExit()

    random.seed(100)

    # os.listdir returns entries in arbitrary order; sorting makes the seeded
    # shuffle below reproducible from one machine to another.
    inputFiles = sorted(
        sys.argv[1] + "/" + filename
        for filename in os.listdir(sys.argv[1])
        if ".conllu" in filename
    )
    header, sentences = readSentences(inputFiles)

    # Validate before producing any output so that a bad nbFolds fails early.
    try:
        partition = computeFoldRanges(len(sentences), nbFolds)
    except ValueError as error:
        print("ERROR : %s" % error, file=sys.stderr)
        sys.exit(1)
    testSize = len(sentences) // nbFolds

    random.shuffle(sentences)

    writeSentences(header, sentences, sys.stdout)

    outPrefix = (sys.argv[2] + "/" + sys.argv[1]).rstrip("/")
    for k, testRange in enumerate(partition):
        test = sentences[testRange.start:testRange.stop]
        trainDev = sentences[:testRange.start] + sentences[testRange.stop:]
        # The dev set has the nominal fold size and is taken from the end of
        # what remains once the test sentences have been removed.
        train = trainDev[:-testSize]
        dev = trainDev[-testSize:]

        outDir = outPrefix + "_" + str(k)
        os.makedirs(outDir, exist_ok=True)
        for sents, name in [(train, "train"), (dev, "dev"), (test, "test")]:
            with open(outDir + "/" + "%s.conllu" % name, "w", encoding="utf-8") as outFile:
                writeSentences(header, sents, outFile)
################################################################################


################################################################################
if __name__ == "__main__":
    main()
################################################################################