Select Git revision
createKFolds.py 2.41 KiB
#! /usr/bin/env python3
import sys
import os
import random
################################################################################
def printUsageAndExit():
    """Print the command-line usage on stderr and terminate with exit status 1.

    Raises:
        SystemExit: always, with code 1.
    """
    print("USAGE : %s UDDir outputDir nbFolds" % sys.argv[0], file=sys.stderr)
    # sys.exit is used instead of the bare exit() builtin: the latter is injected
    # by the `site` module for interactive use and is not guaranteed to exist
    # (e.g. with `python -S`).
    sys.exit(1)
################################################################################
################################################################################
def readSentences(filenames, defaultHeader):
    """Read every sentence from a list of CoNLL-U files.

    A sentence is a maximal run of non-blank lines; each line is stripped of
    leading/trailing whitespace. Lines containing "# global.columns =" are not
    stored in any sentence: the last one encountered becomes the header.

    Args:
        filenames: paths of the files to read, in reading order.
        defaultHeader: header returned when no file declares its columns.

    Returns:
        A (header, sentences) tuple where sentences is a list of lists of lines.
    """
    header = defaultHeader
    sentences = []
    for filename in filenames:
        # Reset per file so the first line of a file never gets glued to the
        # last sentence of the previous file.
        prevWasBlank = True
        with open(filename, "r", encoding="utf-8") as inFile:
            for line in inFile:
                line = line.strip()
                if not line:
                    prevWasBlank = True
                    continue
                if "# global.columns =" in line:
                    header = line
                    continue
                if prevWasBlank:
                    sentences.append([])
                prevWasBlank = False
                sentences[-1].append(line)
    return header, sentences


def computeFoldRanges(nbSentences, nbFolds):
    """Split the indexes [0, nbSentences) into exactly nbFolds contiguous ranges.

    Every fold holds nbSentences // nbFolds indexes, except the last one which
    also absorbs the remainder of the division.

    Args:
        nbSentences: total number of sentences to distribute.
        nbFolds: number of folds requested.

    Returns:
        A list of nbFolds `range` objects covering [0, nbSentences) with no overlap.

    Raises:
        ValueError: if nbFolds is lower than 2 or greater than nbSentences
            (which would produce empty folds).
    """
    if nbFolds < 2:
        raise ValueError("nbFolds must be at least 2 (got %d)" % nbFolds)
    if nbFolds > nbSentences:
        raise ValueError("cannot split %d sentences into %d folds" % (nbSentences, nbFolds))
    testSize = nbSentences // nbFolds
    folds = [range(k * testSize, (k + 1) * testSize) for k in range(nbFolds)]
    # The leftover sentences (nbSentences % nbFolds) go to the last fold.
    folds[-1] = range(folds[-1].start, nbSentences)
    return folds


def writeConllu(path, header, sentences):
    """Write header then every sentence (followed by a blank line) to path."""
    with open(path, "w", encoding="utf-8") as outFile:
        print(header, file=outFile)
        for sentence in sentences:
            print("\n".join(sentence) + "\n", file=outFile)


def main():
    """Build k-fold train/dev/test splits from the .conllu files of a directory.

    Command line: UDDir outputDir nbFolds. For each fold k, the directory
    outputDir/<corpusName>_k is created with train.conllu, dev.conllu and
    test.conllu. The shuffled corpus is also printed on stdout.
    """
    if len(sys.argv) != 4:
        printUsageAndExit()
    try:
        nbFolds = int(sys.argv[3])
    except ValueError:
        printUsageAndExit()

    random.seed(100)

    # Corpus name = last component of the input directory, ignoring trailing '/'.
    corpusName = sys.argv[1].rstrip('/').split('/')[-1]

    # os.listdir returns names in arbitrary order; sorting is required for the
    # fixed seed above to actually yield the same folds on every machine.
    inputFiles = [sys.argv[1] + "/" + filename
                  for filename in sorted(os.listdir(sys.argv[1]))
                  if ".conllu" in filename]

    defaultHeader = "# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
    header, sentences = readSentences(inputFiles, defaultHeader)

    try:
        folds = computeFoldRanges(len(sentences), nbFolds)
    except ValueError as error:
        print("ERROR : %s" % error, file=sys.stderr)
        sys.exit(1)

    random.shuffle(sentences)

    # Dump of the whole shuffled corpus on stdout.
    print(header)
    for sentence in sentences:
        print("\n".join(sentence) + "\n")

    # Size of a regular fold; dev always takes this many sentences.
    testSize = len(folds[0])
    outBase = (sys.argv[2] + "/" + corpusName).rstrip('/')

    for k, fold in enumerate(folds):
        test = sentences[fold.start:fold.stop]
        # Folds are contiguous, so everything outside the fold is two slices.
        trainDev = sentences[:fold.start] + sentences[fold.stop:]
        train = trainDev[:-testSize]
        dev = trainDev[-testSize:]

        outDir = outBase + "_" + str(k)
        print("Creating '%s'" % outDir, file=sys.stderr)
        if not train:
            # Happens with 2 folds: test and dev consume every sentence.
            print("WARNING : train set of '%s' is empty" % outDir, file=sys.stderr)
        os.makedirs(outDir, exist_ok=True)
        for sents, name in [(train, "train"), (dev, "dev"), (test, "test")]:
            writeConllu(outDir + "/" + "%s.conllu" % name, header, sents)


if __name__ == "__main__":
    main()
################################################################################