diff --git a/UD_fr-GSD/morpho/error_morpho.cla b/UD_fr-GSD/morpho/error_morpho.cla index fd22d1c53e7f8da4e5c6aaa6dd18954767c7e265..0042b16e5e6e8cb0cc5578d01b797b48097612e1 100644 --- a/UD_fr-GSD/morpho/error_morpho.cla +++ b/UD_fr-GSD/morpho/error_morpho.cla @@ -3,5 +3,5 @@ Type : Prediction Oracle : none Feature Model : error_morpho.fm Action Set : error_morpho.as -Topology : (500,RELU,0.3) -Batchsize : 1 +Topology : R(500,RELU,0.3) +Batchsize : 50 diff --git a/UD_fr-GSD/morpho/error_morpho.fm b/UD_fr-GSD/morpho/error_morpho.fm index 2a8344ed89f077d2645e27c26243af97b3fda347..ddfe4426849c82aee0187b858f793f790a4ab266 100644 --- a/UD_fr-GSD/morpho/error_morpho.fm +++ b/UD_fr-GSD/morpho/error_morpho.fm @@ -1,29 +1,30 @@ # Features classiques # FORM -b.0#FORM.fasttext -b.1#FORM.fasttext -b.2#FORM.fasttext -b.-1#FORM.fasttext -b.-2#FORM.fasttext +#b.0#FORM.fasttext +#b.1#FORM.fasttext +#b.2#FORM.fasttext +#b.-1#FORM.fasttext +#b.-2#FORM.fasttext # POS b.0#POS b.-1#POS b.-2#POS b.-3#POS +b.-4#POS # MORPHO b.-1#MORPHO b.-2#MORPHO +b.-3#MORPHO +b.-4#MORPHO # UPPERCASE b.0#FORM.U -b.1#FORM.U -# UPPERCASE +b.-1#FORM.U +b.-2#FORM.U +b.-3#FORM.U +b.-4#FORM.U +# LENGTH b.0#FORM.LEN -# SUFFIXES -b.0#FORM.PART.-4.-4 -b.0#FORM.PART.-3.-3 -b.0#FORM.PART.-2.-2 -b.0#FORM.PART.-1.-1 -b.0#FORM.PART.0.0 -b.0#FORM.PART.1.1 -b.0#FORM.PART.2.2 -b.0#FORM.PART.3.3 +b.-1#FORM.LEN +b.-2#FORM.LEN +b.-3#FORM.LEN +b.-4#FORM.LEN diff --git a/UD_fr-GSD/morpho/machine.tm b/UD_fr-GSD/morpho/machine.tm index 7af22171d2c4151bffc121cd2dc48e6ef3e21205..530fada046f062d3782971c3eb121d17bfa5e67f 100644 --- a/UD_fr-GSD/morpho/machine.tm +++ b/UD_fr-GSD/morpho/machine.tm @@ -1,12 +1,8 @@ -Name : Morpho with error correction +Name : Morpho Machine Dicts : morpho.dicts %CLASSIFIERS morpho morpho.cla -error_morpho error_morpho.cla %STATES -morpho morpho -error_morpho error_morpho +morpho1 morpho %TRANSITIONS -morpho error_morpho 0 * -error_morpho morpho 0 BACK -error_morpho morpho 1 * +morpho1 morpho1 +1 * diff --git a/UD_fr-GSD/morpho/morpho.cla b/UD_fr-GSD/morpho/morpho.cla index 684ebb21140a368ac8a6398ab553cb109405be8d..97000f557666dac107f9615716780df331982222 100644 --- a/UD_fr-GSD/morpho/morpho.cla +++ b/UD_fr-GSD/morpho/morpho.cla @@ -3,4 +3,4 @@ Type : Prediction Oracle : morpho Feature Model : morpho.fm Action Set : morpho.as -Topology : (1000,RELU,0.3) +Topology : R(1000,RELU,0.3) diff --git a/UD_fr-GSD/morpho/morpho.fm b/UD_fr-GSD/morpho/morpho.fm index d52d03c318f202dcdf87ba610183a43dab2db23d..3e542b814ffb5f185c3949d4ce25d8145a1b6273 100644 --- a/UD_fr-GSD/morpho/morpho.fm +++ b/UD_fr-GSD/morpho/morpho.fm @@ -29,10 +29,10 @@ b.0#FORM.PART.2.2 b.0#FORM.PART.3.3 # ERROR CORRECTION b.1#POS -b.1#MORPHO +#b.1#MORPHO b.2#POS -b.2#MORPHO +#b.2#MORPHO b.3#POS -b.3#MORPHO +#b.3#MORPHO b.4#POS -b.4#MORPHO +#b.4#MORPHO diff --git a/toy_backtrack/.gitignore b/toy_backtrack/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0d2b44f555401e5a5f1144224bfa157b15e21448 --- /dev/null +++ b/toy_backtrack/.gitignore @@ -0,0 +1,3 @@ +data/*\.mcf +eval/toy_backtrack.res +eval/stderr.log diff --git a/toy_backtrack/data/Makefile b/toy_backtrack/data/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..86192091f1abcbb98200dc21e189654c32d79131 --- /dev/null +++ b/toy_backtrack/data/Makefile @@ -0,0 +1,17 @@ +MCD=wpmlgfs.mcd +TOOLS=../../tools + +all: train.mcf test.mcf dev.mcf + +train.mcf: + ./createMCF.py $(MCD) 100000 100 > $@ + +test.mcf: + ./createMCF.py $(MCD) 10000 200 > $@ + +dev.mcf: + ./createMCF.py $(MCD) 10000 300 > $@ + +clean: + - rm *\.mcf + diff --git a/toy_backtrack/data/createMCF.py b/toy_backtrack/data/createMCF.py new file mode 100755 index 0000000000000000000000000000000000000000..85f2cafba25132f89848ac822782cc276af82b8a --- /dev/null +++ b/toy_backtrack/data/createMCF.py @@ -0,0 +1,86 @@ +#! /usr/bin/python3 + +import sys +import random + +def printUsageAndExit() : + print("Usage : %s mcd nbLines seed"%sys.argv[0], file=sys.stderr) + exit(1) + +def readMCD(mcdFilename) : + mcd = {} + for line in open(mcdFilename, "r", encoding="utf8") : + clean = line.strip() + if len(line) < 2 or line[0] == '#' : + continue + splited = line.split(' ') + if len(splited) != 2 : + print("ERROR : invalid mcd line \'%s\'. Aborting"%line, file=sys.stderr) + exit(1) + mcd[splited[0].strip()] = splited[1].strip() + + return mcd + +def main() : + if len(sys.argv) != 4 : + printUsageAndExit() + + random.seed(int(sys.argv[3])) + + mcfMCD = readMCD(sys.argv[1]) + mcfMCDr = {v: k for k, v in mcfMCD.items()} + + data = [] + + nbLines = int(sys.argv[2]) + for i in range(nbLines) : + word = random.randint(0,5) + pos = word + random.randint(-2,2) + morpho = (pos+word) % 6 + eos = "_" + if len(data) % 50 == 0 and len(data) > 0 : + eos = "1" + data += [[word,pos,morpho,eos]] + + for i in range(len(data)) : + if random.random() < 0.1 : + continue + + if data[i][0] <= 1 and i > 0 and data[i-1][3] == "_" : + data[i][1] = data[i-1][1] + + if data[i][1] >= 3 and i > 0 and data[i-1][3] == "_" : + data[i][1] = data[i-1][1] + + data[i][2] = (data[i][0] + data[i][1]) % 6 + + for i in range(len(data)) : + if i > 0 and data[i-1][2] == 2 and data[i-1][3] == "_" : + data[i][2] = 0 + continue + if i > 0 and data[i-1][2] == 3 and data[i-1][3] == "_": + data[i][2] = 0 + continue + if i > 0 and data[i-1][2] < 0 and data[i-1][3] == "_" : + data[i][2] = 0 + continue + + if i < len(data)-1 and data[i][0] == data[i+1][0] and data[i][3] == "_" : + data[i][2] = 2 + if i < len(data)-2 and data[i][0] == data[i+2][0] and data[i][3]+data[i+1][3] == "__" : + data[i][2] = 3 + elif i < len(data)-1 and data[i][1] == data[i+1][1] and data[i][3] == "_" : + data[i][2] = -2 + if i < len(data)-2 and data[i][1] == data[i+2][1] and data[i][3]+data[i+1][3] == "__" : + data[i][2] = -3 + + + for i in range(len(data)) : + print(data[i][0],end="\t") + print(data[i][1],end="\t") + print(data[i][2],end="\t") + print(data[i][3],end="\n") + +if __name__ == "__main__" : + main() + diff --git a/toy_backtrack/data/createMCF2.py b/toy_backtrack/data/createMCF2.py new file mode 100755 index 0000000000000000000000000000000000000000..d47b953f77a28865f2ba26d85e764b654ad7383e --- /dev/null +++ b/toy_backtrack/data/createMCF2.py @@ -0,0 +1,49 @@ +#! /usr/bin/python3 + +import sys +import random + +def printUsageAndExit() : + print("Usage : %s mcd nbLines"%sys.argv[0], file=sys.stderr) + exit(1) + +def readMCD(mcdFilename) : + mcd = {} + for line in open(mcdFilename, "r", encoding="utf8") : + clean = line.strip() + if len(line) < 2 or line[0] == '#' : + continue + splited = line.split(' ') + if len(splited) != 2 : + print("ERROR : invalid mcd line \'%s\'. Aborting"%line, file=sys.stderr) + exit(1) + mcd[splited[0].strip()] = splited[1].strip() + + return mcd + +def main() : + if len(sys.argv) != 3 : + printUsageAndExit() + + random.seed(100) + + mcfMCD = readMCD(sys.argv[1]) + mcfMCDr = {v: k for k, v in mcfMCD.items()} + + data = [] + + nbLines = int(sys.argv[2]) + for i in range(nbLines) : + word = random.randint(0,5) + pos = word + random.randint(-2,2) + morpho = (pos+word) % 6 + data += [[word,pos,morpho]] + + for i in range(len(data)) : + print(data[i][0],end="\t") + print(data[i][1],end="\t") + print(data[i][2],end="\n") + +if __name__ == "__main__" : + main() + diff --git a/toy_backtrack/data/wpmlgfs.mcd b/toy_backtrack/data/wpmlgfs.mcd new file mode 100644 index 0000000000000000000000000000000000000000..8ec6631e1558cbe0cfcba45c283a6fa56e062885 --- /dev/null +++ b/toy_backtrack/data/wpmlgfs.mcd @@ -0,0 +1,4 @@ +0 FORM +1 POS +2 MORPHO +3 EOS diff --git a/toy_backtrack/eval/average.sh b/toy_backtrack/eval/average.sh new file mode 100755 index 0000000000000000000000000000000000000000..93eb09e582d87f8506e4961109e742a0bffb193d --- /dev/null +++ b/toy_backtrack/eval/average.sh @@ -0,0 +1,6 @@ +#! /bin/bash + +LANG=fr +RES=$LANG.res + +exec ../../scripts/average.py $RES diff --git a/toy_backtrack/eval/eval.sh b/toy_backtrack/eval/eval.sh new file mode 100755 index 0000000000000000000000000000000000000000..06ba9f8eec071653f6f97f8f732409829ca2a8ac --- /dev/null +++ b/toy_backtrack/eval/eval.sh @@ -0,0 +1,8 @@ +#! /bin/bash + +LANG=toy_backtrack +MCF=../data/test.mcf +MCD=../data/wpmlgfs.mcd +ARGS="--ignore FORM --ignore POS" + +exec ../../scripts/eval.py $LANG $MCF $MCD $* $ARGS diff --git a/toy_backtrack/eval/totalEval.sh b/toy_backtrack/eval/totalEval.sh new file mode 100755 index 0000000000000000000000000000000000000000..927fb5f44537e0e9851ca839f7f1fb8c1c804eb3 --- /dev/null +++ b/toy_backtrack/eval/totalEval.sh @@ -0,0 +1,3 @@ +#! /bin/bash + +./eval.sh tagger morpho lemmatizer parser tagparser tagger+morpho+lemmatizer+parser diff --git a/toy_backtrack/morpho/errorCorrection.tm b/toy_backtrack/morpho/errorCorrection.tm new file mode 100644 index 0000000000000000000000000000000000000000..7af22171d2c4151bffc121cd2dc48e6ef3e21205 --- /dev/null +++ b/toy_backtrack/morpho/errorCorrection.tm @@ -0,0 +1,12 @@ +Name : Morpho with error correction +Dicts : morpho.dicts +%CLASSIFIERS +morpho morpho.cla +error_morpho error_morpho.cla +%STATES +morpho morpho +error_morpho error_morpho +%TRANSITIONS +morpho error_morpho 0 * +error_morpho morpho 0 BACK +error_morpho morpho 1 * diff --git a/toy_backtrack/morpho/error_morpho.as b/toy_backtrack/morpho/error_morpho.as new file mode 100644 index 0000000000000000000000000000000000000000..e9182f3bc4fd349a75722301d1ee825d7de8fe70 --- /dev/null +++ b/toy_backtrack/morpho/error_morpho.as @@ -0,0 +1,5 @@ +EPSILON +BACK 1 +BACK 2 +BACK 3 +BACK 4 diff --git a/toy_backtrack/morpho/error_morpho.cla b/toy_backtrack/morpho/error_morpho.cla new file mode 100644 index 0000000000000000000000000000000000000000..fd22d1c53e7f8da4e5c6aaa6dd18954767c7e265 --- /dev/null +++ b/toy_backtrack/morpho/error_morpho.cla @@ -0,0 +1,7 @@ +Name : Error_Morpho +Type : Prediction +Oracle : none +Feature Model : error_morpho.fm +Action Set : error_morpho.as +Topology : (500,RELU,0.3) +Batchsize : 1 diff --git a/toy_backtrack/morpho/error_morpho.fm b/toy_backtrack/morpho/error_morpho.fm new file mode 100644 index 0000000000000000000000000000000000000000..2a8344ed89f077d2645e27c26243af97b3fda347 --- /dev/null +++ b/toy_backtrack/morpho/error_morpho.fm @@ -0,0 +1,29 @@ +# Features classiques +# FORM +b.0#FORM.fasttext +b.1#FORM.fasttext +b.2#FORM.fasttext +b.-1#FORM.fasttext +b.-2#FORM.fasttext +# POS +b.0#POS +b.-1#POS +b.-2#POS +b.-3#POS +# MORPHO +b.-1#MORPHO +b.-2#MORPHO +# UPPERCASE +b.0#FORM.U +b.1#FORM.U +# UPPERCASE +b.0#FORM.LEN +# SUFFIXES +b.0#FORM.PART.-4.-4 +b.0#FORM.PART.-3.-3 +b.0#FORM.PART.-2.-2 +b.0#FORM.PART.-1.-1 +b.0#FORM.PART.0.0 +b.0#FORM.PART.1.1 +b.0#FORM.PART.2.2 +b.0#FORM.PART.3.3 diff --git a/toy_backtrack/morpho/machine.tm b/toy_backtrack/morpho/machine.tm new file mode 100644 index 0000000000000000000000000000000000000000..530fada046f062d3782971c3eb121d17bfa5e67f --- /dev/null +++ b/toy_backtrack/morpho/machine.tm @@ -0,0 +1,8 @@ +Name : Morpho Machine +Dicts : morpho.dicts +%CLASSIFIERS +morpho morpho.cla +%STATES +morpho1 morpho +%TRANSITIONS +morpho1 morpho1 +1 * diff --git a/toy_backtrack/morpho/morpho.as b/toy_backtrack/morpho/morpho.as new file mode 100644 index 0000000000000000000000000000000000000000..1d3ff7c1f428ad63e013580f13cd2b6367ccd2ea --- /dev/null +++ b/toy_backtrack/morpho/morpho.as @@ -0,0 +1,8 @@ +WRITE b.0 MORPHO 0 +WRITE b.0 MORPHO 1 +WRITE b.0 MORPHO -2 +WRITE b.0 MORPHO 2 +WRITE b.0 MORPHO -3 +WRITE b.0 MORPHO 3 +WRITE b.0 MORPHO 4 +WRITE b.0 MORPHO 5 diff --git a/toy_backtrack/morpho/morpho.cla b/toy_backtrack/morpho/morpho.cla new file mode 100644 index 0000000000000000000000000000000000000000..8201c433ca18714ef34b83dfb8228ed5cd960c24 --- /dev/null +++ b/toy_backtrack/morpho/morpho.cla @@ -0,0 +1,6 @@ +Name : Morpho +Type : Prediction +Oracle : morpho +Feature Model : morpho.fm +Action Set : morpho.as +Topology : (500,RELU,0.0) diff --git a/toy_backtrack/morpho/morpho.dicts b/toy_backtrack/morpho/morpho.dicts new file mode 100644 index 0000000000000000000000000000000000000000..0285d2c0a02a4752ba08ec58a29b0cdf1919d735 --- /dev/null +++ b/toy_backtrack/morpho/morpho.dicts @@ -0,0 +1,17 @@ +#Name Dimension Mode # +################################### +Morpho_bool 10 Embeddings +Morpho_int 10 Embeddings +Morpho_letters 30 Embeddings +Morpho_pos 30 Embeddings +Morpho_form 100 Embeddings +Morpho_morpho 30 Embeddings +Morpho_actions 30 Embeddings +# ERROR_MORPHO +Error_Morpho_actions 18 Embeddings _ +Error_Morpho_bool 16 Embeddings _ +Error_Morpho_int 16 Embeddings _ +Error_Morpho_pos 18 Embeddings _ +Error_Morpho_form 30 Embeddings _ +Error_Morpho_letters 30 Embeddings _ +Error_Morpho_morpho 22 Embeddings _ diff --git a/toy_backtrack/morpho/morpho.fm b/toy_backtrack/morpho/morpho.fm new file mode 100644 index 0000000000000000000000000000000000000000..a45494d648d05bd9100eb07e6d094eedc67ded04 --- /dev/null +++ b/toy_backtrack/morpho/morpho.fm @@ -0,0 +1,25 @@ +# Features classiques +# FORM +b.0#FORM +b.1#FORM +b.2#FORM +b.-1#FORM +b.-2#FORM +# POS +b.0#POS +b.-1#POS +b.-2#POS +b.-3#POS +# MORPHO +b.-1#MORPHO +b.-2#MORPHO +b.-1#MORPHO.PART.0.0 +# ERROR CORRECTION +#b.1#POS +#b.1#MORPHO +#b.2#POS +#b.2#MORPHO +#b.3#POS +#b.3#MORPHO +#b.4#POS +#b.4#MORPHO diff --git a/toy_backtrack/morpho/normal.tm b/toy_backtrack/morpho/normal.tm new file mode 100644 index 0000000000000000000000000000000000000000..530fada046f062d3782971c3eb121d17bfa5e67f --- /dev/null +++ b/toy_backtrack/morpho/normal.tm @@ -0,0 +1,8 @@ +Name : Morpho Machine +Dicts : morpho.dicts +%CLASSIFIERS +morpho morpho.cla +%STATES +morpho1 morpho +%TRANSITIONS +morpho1 morpho1 +1 * diff --git a/toy_backtrack/morpho/test.bd b/toy_backtrack/morpho/test.bd new file mode 100644 index 0000000000000000000000000000000000000000..bdf2eeb34fe2f59101628ddc41896bbc5479fdf2 --- /dev/null +++ b/toy_backtrack/morpho/test.bd @@ -0,0 +1,6 @@ +#Name ref/hyp dict Policy Must print?# +############################################ +FORM ref form Final 1 +POS ref pos Final 1 +MORPHO hyp morpho Final 1 +EOS ref eos Final 1 diff --git a/toy_backtrack/morpho/train.bd b/toy_backtrack/morpho/train.bd new file mode 100644 index 0000000000000000000000000000000000000000..eeecd23dd992e741440c13f5dafe9b1089edccbf --- /dev/null +++ b/toy_backtrack/morpho/train.bd @@ -0,0 +1,6 @@ +#Name ref/hyp dict Policy Must print?# +############################################ +FORM ref form FromZero 1 +POS ref pos FromZero 1 +MORPHO hyp morpho FromZero 1 +EOS ref eos FromZero 1 diff --git a/toy_backtrack/train.sh b/toy_backtrack/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..21aba79015140dce3360eb2d8741f365b3d3c482 --- /dev/null +++ b/toy_backtrack/train.sh @@ -0,0 +1,5 @@ +#! /bin/bash + +LANG="toy_backtrack" + +../scripts/train.sh $LANG $@