Skip to content
Snippets Groups Projects
Commit 24e86989 authored by Franck Dary
Browse files

Added artificial dataset to test backtrack

parent 6c8dcca4
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
import numpy
import random
# Minimum number of tokens written to each generated file: sentences keep
# being produced until the running token total reaches this value, so the
# last sentence may overshoot it.
size = 10000
# Target probability that a token's FORM is "b" rather than "a".
probaB = 0.10
# Spread of the sentence lengths. Despite the name, this value is passed as
# the standard deviation (not the mean) of the zero-mean normal distribution
# sampled in genSentSize().
sentSizeCenter = 20
# Label names written in the UPOS column. A token's label index is its
# distance to the nearest "b" token, capped at the last index of this list.
labels = ["zero", "one", "two", "three"]
def genSentSize(scale=None):
    """Draw a random sentence length that is always at least 1.

    The length is the truncated absolute value of a sample from a zero-mean
    normal distribution, plus one. Note that the module constant
    ``sentSizeCenter`` is used as the standard deviation of that
    distribution, not as its centre.

    Args:
        scale: Standard deviation of the normal distribution (must be
            non-negative). When ``None``, the module-level
            ``sentSizeCenter`` is used.

    Returns:
        A Python ``int`` greater than or equal to 1.
    """
    if scale is None:
        scale = sentSizeCenter
    # Draw a scalar instead of a 1-element array: calling int() on an array
    # with ndim > 0 is deprecated since NumPy 1.25 and slated to become an
    # error.
    sample = numpy.random.normal(0.0, scale)
    return int(abs(sample)) + 1
def labelSentence(forms, maxLabel):
    """Compute each token's capped distance to the nearest "b" token.

    Args:
        forms: Sequence of token forms; only equality with "b" is tested.
        maxLabel: Largest value a distance may take. Tokens farther than
            this from any "b" (or sentences without "b") get ``maxLabel``.

    Returns:
        A list of ints, one per token, each in ``[0, maxLabel]``.
    """
    count = len(forms)
    dists = [maxLabel] * count

    # Left-to-right sweep: capped distance to the closest "b" on the left.
    running = maxLabel
    for i, form in enumerate(forms):
        running = 0 if form == "b" else min(running + 1, maxLabel)
        dists[i] = running

    # Right-to-left sweep: also account for the closest "b" on the right.
    running = maxLabel
    for i in reversed(range(count)):
        running = min(dists[i], running + 1)
        dists[i] = running

    return dists


def writeCorpus(filename):
    """Write one artificial dataset file with ID, FORM and UPOS columns.

    Sentences of random length (see ``genSentSize``) are appended until the
    total number of tokens reaches the module-level ``size``. Each token's
    FORM is "b" with probability ``probaB`` and "a" otherwise; its UPOS is
    the entry of ``labels`` indexed by the capped distance to the nearest
    "b" in the same sentence.

    Args:
        filename: Path of the file to create (overwritten if it exists).
    """
    maxLabel = len(labels) - 1
    with open(filename, "w", encoding="utf-8") as out:
        print("# global.columns = ID FORM UPOS", file=out)
        totalSize = 0
        sentId = 0
        while totalSize < size:
            sentSize = genSentSize()
            totalSize += sentSize
            sentId += 1
            # random.random() is uniform on [0, 1), so "b" is drawn with
            # probability exactly probaB (never when probaB is 0). The
            # previous inclusive randint(0, 10000) test was slightly biased.
            forms = ["b" if random.random() < probaB else "a"
                     for _ in range(sentSize)]
            dists = labelSentence(forms, maxLabel)
            print("# sent_id = %d" % sentId, file=out)
            for tokenId, (form, dist) in enumerate(zip(forms, dists), start=1):
                print("\t".join([str(tokenId), form, labels[dist]]), file=out)
            # Blank line terminates the sentence.
            print("", file=out)


# Running the script generates the three splits in the current directory.
if __name__ == "__main__":
    for filename in ["train.conllu", "dev.conllu", "test.conllu"]:
        writeCorpus(filename)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment