Added artificial dataset to test backtrack

24e86989 · Franck Dary · 6c8dcca4 · 24e86989
Commit 24e86989 authored 3 years ago by Franck Dary
--- a/data/artificial/generate.py
+++ b/data/artificial/generate.py
+#! /usr/bin/env python3
+
+import numpy
+import random
+
+# Minimum number of tokens written to each generated file: sentences are
+# appended until the running token count reaches this value.
+size = 10000
+# Target probability that a token's FORM is "b" rather than "a".
+probaB = 0.10
+# Scale (standard deviation) of the zero-mean normal distribution from which
+# sentence lengths are drawn; despite the name it is not the mean length.
+sentSizeCenter = 20
+# Label names, indexed by a token's distance to the nearest "b" in its
+# sentence, capped at the last index.
+labels = ["zero", "one", "two", "three"]
+
+def genSentSize() :
+  """Draw a random sentence length, always at least 1.
+
+  The length is the absolute value of a sample from a normal distribution
+  with mean 0.0 and standard deviation sentSizeCenter, truncated to an int
+  and shifted by one so that no sentence is empty.
+  """
+  # Draw a scalar (no size argument) rather than a 1-element array: calling
+  # int() on an array with ndim > 0 is deprecated since NumPy 1.25.
+  return int(abs(numpy.random.normal(0.0, sentSizeCenter)))+1
+
+# Write the three dataset splits; each one is generated independently with
+# the same settings.
+for filename in ["train.conllu", "dev.conllu", "test.conllu"] :
+  with open(filename, "w") as out :
+    print("# global.columns = ID FORM UPOS", file=out)
+    maxLabel = len(labels)-1
+    totalSize = 0
+    sentId = 0
+    # Keep adding sentences until at least `size` tokens have been produced;
+    # the last sentence may overshoot the target.
+    while totalSize < size :
+      sentSize = genSentSize()
+      totalSize += sentSize
+      sentId += 1
+      # One [ID, FORM, label index] row per token. random.random() is uniform
+      # on [0, 1), so FORM is "b" with probability exactly probaB (the former
+      # randint(0,10000) draw had 10001 outcomes and was slightly biased).
+      sentence = [[ID+1, "b" if random.random() < probaB else "a", maxLabel] for ID in range(sentSize)]
+      lastIndex = len(sentence)-1
+      # Left-to-right sweep followed by a right-to-left sweep: afterwards each
+      # token holds its distance to the nearest "b", capped at maxLabel.
+      for i in list(range(len(sentence)))+list(range(len(sentence)))[::-1] :
+        if sentence[i][1] == "b" :
+          sentence[i][2] = 0
+          continue
+        # A neighbour only contributes when it exists, which also avoids the
+        # negative-index wrap-around of sentence[i-1] at i == 0.
+        candidates = [maxLabel]
+        if i > 0 :
+          candidates.append(sentence[i-1][2]+1)
+        if i < lastIndex :
+          candidates.append(sentence[i+1][2]+1)
+        sentence[i][2] = min(candidates)
+      print("# sent_id = %d"%sentId, file=out)
+      # Emit tab-separated columns, turning the label index into its name.
+      for tokenId, form, labelIndex in sentence :
+        print("\t".join([str(tokenId), form, labels[labelIndex]]), file=out)
+      # A blank line terminates the sentence.
+      print("", file=out)