Skip to content
Snippets Groups Projects
Commit 282c818e authored by Franck Dary
Browse files

Added script to split UD corpus into k folds

parent ada1cdb1
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
import sys
import os
import random
################################################################################
def printUsageAndExit():
    """Print the command-line usage on stderr and terminate the program.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print("USAGE : %s UDDir outputDir nbFolds"%sys.argv[0], file=sys.stderr)
    # Use sys.exit instead of the bare exit() helper: exit() is injected by
    # the site module and is missing under `python -S` or in frozen builds.
    sys.exit(1)
################################################################################
################################################################################
# Column declaration used when no input file provides its own.
DEFAULT_COLUMNS_HEADER = "# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"

################################################################################
def readSentences(filenames, header=DEFAULT_COLUMNS_HEADER):
    """Read every sentence of the given CoNLL-U files.

    A sentence is a maximal run of non-blank lines (comment lines included);
    each line is stored stripped of surrounding whitespace. Lines containing
    "# global.columns =" are not stored in any sentence: the last one seen
    replaces `header`.

    Args:
        filenames: iterable of paths to read, in order.
        header: columns header returned when no file declares one.

    Returns:
        A pair (sentences, header) where sentences is a list of lists of lines.
    """
    sentences = []
    for filename in filenames:
        # Reset per file so that a file lacking a trailing blank line does not
        # merge its last sentence with the first sentence of the next file.
        prevWasBlank = True
        with open(filename, "r", encoding="utf-8") as inFile:
            for line in inFile:
                line = line.strip()
                if not line:
                    prevWasBlank = True
                    continue
                if "# global.columns =" in line:
                    header = line
                    continue
                if prevWasBlank:
                    sentences.append([])
                    prevWasBlank = False
                sentences[-1].append(line)
    return sentences, header
################################################################################

################################################################################
def computeFoldBounds(nbSentences, nbFolds):
    """Return exactly nbFolds contiguous half-open (start, end) index ranges.

    Every fold holds nbSentences // nbFolds sentences, except the last one
    which also absorbs the remainder so that the folds cover [0, nbSentences).

    Raises:
        ValueError: if nbFolds is not between 1 and nbSentences.
    """
    if nbFolds < 1 or nbFolds > nbSentences:
        raise ValueError("nbFolds must be between 1 and the number of sentences (%d), got %d"
                         % (nbSentences, nbFolds))
    foldSize = nbSentences // nbFolds
    bounds = [(k*foldSize, (k+1)*foldSize) for k in range(nbFolds)]
    # The last fold takes whatever is left after the integer division.
    bounds[-1] = (bounds[-1][0], nbSentences)
    return bounds
################################################################################

################################################################################
def splitFold(sentences, start, end, devSize):
    """Split sentences into (train, dev, test) for one fold.

    test is sentences[start:end]; dev is the last devSize of the remaining
    sentences (all of them if fewer remain); train is what is left before dev.
    """
    test = sentences[start:end]
    trainDev = sentences[:start] + sentences[end:]
    # Explicit cut index: a negative slice bound would misbehave for devSize 0.
    cut = max(len(trainDev) - devSize, 0)
    return trainDev[:cut], trainDev[cut:], test
################################################################################

################################################################################
def writeCorpus(path, header, sents):
    """Write header then every sentence followed by a blank line to path."""
    with open(path, "w", encoding="utf-8") as outFile:
        print(header, file=outFile)
        for sentence in sents:
            print("\n".join(sentence)+"\n", file=outFile)
################################################################################

################################################################################
if __name__ == "__main__":
    if len(sys.argv) != 4:
        printUsageAndExit()
    try:
        nbFolds = int(sys.argv[3])
    except ValueError:
        printUsageAndExit()

    random.seed(100)
    # os.listdir returns entries in arbitrary order; sort them so that the
    # seeded shuffle below yields the same folds on every machine.
    inputFiles = sorted(sys.argv[1]+"/"+filename
                        for filename in os.listdir(sys.argv[1]) if ".conllu" in filename)
    sentences, header = readSentences(inputFiles)
    random.shuffle(sentences)

    # NOTE(review): the whole shuffled corpus is echoed on stdout, as in the
    # previous version of this script - confirm this is still wanted.
    print(header)
    for sentence in sentences:
        print("\n".join(sentence)+"\n")

    try:
        bounds = computeFoldBounds(len(sentences), nbFolds)
    except ValueError as error:
        print("ERROR : %s"%error, file=sys.stderr)
        sys.exit(1)

    # The dev set has the same size as a regular (non-last) test fold.
    devSize = len(sentences) // nbFolds
    # Output directories are named <outputDir>/<UDDir>_<k>; trailing slashes of
    # UDDir are dropped so that the suffix attaches to the directory name.
    outBase = (sys.argv[2]+"/"+sys.argv[1]).rstrip("/")

    for k, (start, end) in enumerate(bounds):
        train, dev, test = splitFold(sentences, start, end, devSize)
        outDir = outBase + "_" + str(k)
        os.makedirs(outDir, exist_ok=True)
        for sents, name in [(train, "train"), (dev, "dev"), (test, "test")]:
            writeCorpus(outDir + "/" + "%s.conllu"%name, header, sents)
################################################################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment