#! /usr/bin/env python3
"""Generate artificial CoNLL-U-style corpora for a toy sequence-labelling task.

Running this script writes ``train.conllu``, ``dev.conllu`` and
``test.conllu`` into the current working directory.  Every token line has
three tab-separated columns (ID, FORM, UPOS):

* ID   -- 1-based position of the token in its sentence;
* FORM -- ``"a"`` or ``"b"`` (``"b"`` with probability ``probaB``);
* UPOS -- name (taken from ``labels``) of the distance between the token and
  the nearest ``"b"`` in the same sentence, capped at ``len(labels) - 1``.
"""

import numpy
import random

size = 10000  # each file receives sentences until it holds at least this many tokens
probaB = 0.10  # probability that a token's FORM is "b"
sentSizeCenter = 20  # scale (standard deviation) of the sentence-length draw
labels = ["zero", "one", "two", "three"]  # labels[d] names a capped distance d


def genSentSize():
    """Return a random sentence length, always an ``int`` >= 1.

    The length is the absolute value of a zero-mean normal draw with standard
    deviation ``sentSizeCenter``, truncated to an integer, plus one.
    """
    # Draw a scalar (no ``size`` argument): converting a 1-element array to
    # int is deprecated in recent NumPy releases.
    return int(abs(numpy.random.normal(0.0, sentSizeCenter))) + 1


def computeLabels(forms):
    """Return the capped distance from each token to the nearest ``"b"``.

    Args:
        forms: sequence of token forms (``"a"`` / ``"b"``).

    Returns:
        A list of ints, one per form: 0 for a ``"b"``, otherwise the number of
        positions to the closest ``"b"`` on either side, never exceeding
        ``len(labels) - 1`` (also used when the sentence contains no ``"b"``).
    """
    cap = len(labels) - 1
    distances = [cap] * len(forms)

    # Left-to-right sweep: distance to the closest "b" at or before position i.
    nearest = cap
    for i, form in enumerate(forms):
        nearest = 0 if form == "b" else min(nearest + 1, cap)
        distances[i] = nearest

    # Right-to-left sweep: combine with the closest "b" at or after position i.
    nearest = cap
    for i in range(len(forms) - 1, -1, -1):
        nearest = 0 if forms[i] == "b" else min(nearest + 1, cap)
        distances[i] = min(distances[i], nearest)

    return distances


def genSentence(sentSize):
    """Return ``sentSize`` random tokens as ``[ID, FORM, UPOS]`` rows."""
    # random.random() is uniform on [0, 1), so "b" is drawn with probability
    # exactly probaB (an inclusive randint range would be off by one outcome).
    forms = ["b" if random.random() < probaB else "a" for _ in range(sentSize)]
    return [
        [tokenId, form, labels[distance]]
        for tokenId, (form, distance) in enumerate(
            zip(forms, computeLabels(forms)), start=1
        )
    ]


# The corpora are produced at module level: this file is meant to be executed
# as a plain script.
for filename in ["train.conllu", "dev.conllu", "test.conllu"]:
    with open(filename, "w", encoding="utf-8") as out:
        print("# global.columns = ID FORM UPOS", file=out)
        totalSize = 0
        sentId = 0
        # The last sentence may overshoot ``size``; sentences are never cut.
        while totalSize < size:
            sentSize = genSentSize()
            totalSize += sentSize
            sentId += 1
            print(f"# sent_id = {sentId}", file=out)
            for elem in genSentence(sentSize):
                print("\t".join(map(str, elem)), file=out)
            # A blank line terminates each sentence.
            print("", file=out)