Skip to content
Snippets Groups Projects
Commit 7e80ea34 authored by Franck Dary
Browse files

added new language

parent b47cccd8
Branches
No related tags found
No related merge requests found
Showing
with 278 additions and 30 deletions
...@@ -3,5 +3,5 @@ Type : Prediction ...@@ -3,5 +3,5 @@ Type : Prediction
Oracle : none Oracle : none
Feature Model : error_morpho.fm Feature Model : error_morpho.fm
Action Set : error_morpho.as Action Set : error_morpho.as
Topology : (500,RELU,0.3) Topology : R(500,RELU,0.3)
Batchsize : 1 Batchsize : 50
# Features classiques # Features classiques
# FORM # FORM
b.0#FORM.fasttext #b.0#FORM.fasttext
b.1#FORM.fasttext #b.1#FORM.fasttext
b.2#FORM.fasttext #b.2#FORM.fasttext
b.-1#FORM.fasttext #b.-1#FORM.fasttext
b.-2#FORM.fasttext #b.-2#FORM.fasttext
# POS # POS
b.0#POS b.0#POS
b.-1#POS b.-1#POS
b.-2#POS b.-2#POS
b.-3#POS b.-3#POS
b.-4#POS
# MORPHO # MORPHO
b.-1#MORPHO b.-1#MORPHO
b.-2#MORPHO b.-2#MORPHO
b.-3#MORPHO
b.-4#MORPHO
# UPPERCASE # UPPERCASE
b.0#FORM.U b.0#FORM.U
b.1#FORM.U b.-1#FORM.U
# UPPERCASE b.-2#FORM.U
b.-3#FORM.U
b.-4#FORM.U
# LENGTH
b.0#FORM.LEN b.0#FORM.LEN
# SUFFIXES b.-1#FORM.LEN
b.0#FORM.PART.-4.-4 b.-2#FORM.LEN
b.0#FORM.PART.-3.-3 b.-3#FORM.LEN
b.0#FORM.PART.-2.-2 b.-4#FORM.LEN
b.0#FORM.PART.-1.-1
b.0#FORM.PART.0.0
b.0#FORM.PART.1.1
b.0#FORM.PART.2.2
b.0#FORM.PART.3.3
Name : Morpho with error correction Name : Morpho Machine
Dicts : morpho.dicts Dicts : morpho.dicts
%CLASSIFIERS %CLASSIFIERS
morpho morpho.cla morpho morpho.cla
error_morpho error_morpho.cla
%STATES %STATES
morpho morpho morpho1 morpho
error_morpho error_morpho
%TRANSITIONS %TRANSITIONS
morpho error_morpho 0 * morpho1 morpho1 +1 *
error_morpho morpho 0 BACK
error_morpho morpho 1 *
...@@ -3,4 +3,4 @@ Type : Prediction ...@@ -3,4 +3,4 @@ Type : Prediction
Oracle : morpho Oracle : morpho
Feature Model : morpho.fm Feature Model : morpho.fm
Action Set : morpho.as Action Set : morpho.as
Topology : (1000,RELU,0.3) Topology : R(1000,RELU,0.3)
...@@ -29,10 +29,10 @@ b.0#FORM.PART.2.2 ...@@ -29,10 +29,10 @@ b.0#FORM.PART.2.2
b.0#FORM.PART.3.3 b.0#FORM.PART.3.3
# ERROR CORRECTION # ERROR CORRECTION
b.1#POS b.1#POS
b.1#MORPHO #b.1#MORPHO
b.2#POS b.2#POS
b.2#MORPHO #b.2#MORPHO
b.3#POS b.3#POS
b.3#MORPHO #b.3#MORPHO
b.4#POS b.4#POS
b.4#MORPHO #b.4#MORPHO
data/*\.mcf
eval/toy_backtrack.res
eval/stderr.log
# Generates the toy corpora (train / test / dev) with createMCF.py.
MCD := wpmlgfs.mcd
TOOLS := ../../tools

# Delete a target whose recipe failed: the recipes redirect into $@, so
# without this a crashed generator would leave an empty file that make
# then considers up to date.
.DELETE_ON_ERROR:

# all and clean are commands, not files to build.
.PHONY: all clean

all: train.mcf test.mcf dev.mcf

# Each corpus is rebuilt when the generator script or the MCD changes.
# createMCF.py arguments: mcd, number of lines, random seed.
train.mcf: createMCF.py $(MCD)
	./createMCF.py $(MCD) 100000 100 > $@

test.mcf: createMCF.py $(MCD)
	./createMCF.py $(MCD) 10000 200 > $@

dev.mcf: createMCF.py $(MCD)
	./createMCF.py $(MCD) 10000 300 > $@

# $(RM) is `rm -f`, so cleaning an already clean directory is not an error.
clean:
	$(RM) *.mcf
#! /usr/bin/python3
import sys
import random
def printUsageAndExit() :
    """Print the command-line usage on stderr and terminate with exit status 1."""
    print("Usage : %s mcd nbLines seed"%sys.argv[0], file=sys.stderr)
    # sys.exit rather than the bare exit(): the latter is injected by the
    # site module and is not available when Python runs without it.
    sys.exit(1)
def readMCD(mcdFilename) :
    """Parse an MCD file into a dict mapping the first field of each line to the second.

    Blank lines and lines whose first non-blank character is '#' are skipped.
    Every other line must hold exactly two whitespace-separated fields;
    otherwise an error is printed on stderr and the program exits with status 1.

    mcdFilename -- path of the UTF-8 encoded MCD file.
    Returns the dict {first field: second field}.
    """
    mcd = {}
    # 'with' guarantees the file is closed, including on the error exit below.
    with open(mcdFilename, "r", encoding="utf8") as mcdFile :
        for line in mcdFile :
            clean = line.strip()
            # Test the stripped line so that indented comments and
            # whitespace-only lines are skipped as well.
            if not clean or clean[0] == '#' :
                continue
            # split() without argument tolerates tabs and repeated or trailing blanks.
            splited = clean.split()
            if len(splited) != 2 :
                print("ERROR : invalid mcd line \'%s\'. Aborting"%clean, file=sys.stderr)
                sys.exit(1)
            mcd[splited[0]] = splited[1]
    return mcd
def main() :
# Entry point. Expects exactly three command-line arguments (mcd, nbLines, seed);
# otherwise the usage message is printed and the program exits.
# NOTE(review): this copy of the script has lost its indentation, so the nesting of
# the statements below cannot be read off the text. Restore it and confirm against
# the repository version; remarks marked NOTE(review) below depend on that nesting.
if len(sys.argv) != 4 :
printUsageAndExit()
# Seed the random generator with the third argument.
random.seed(int(sys.argv[3]))
# The MCD is parsed and inverted here, but neither mapping is referenced again
# in this function.
mcfMCD = readMCD(sys.argv[1])
mcfMCDr = {v: k for k, v in mcfMCD.items()}
data = []
nbLines = int(sys.argv[2])
# Pass 1: draw nbLines rows of the form [word, pos, morpho, eos].
for i in range(nbLines) :
# word is drawn in 0..5; pos is word shifted by an offset drawn in -2..2,
# so it can be negative or greater than 5; morpho is (pos+word) modulo 6.
word = random.randint(0,5)
pos = word + random.randint(-2,2)
morpho = (pos+word) % 6
# eos is "_" unless the number of rows already stored is a non-zero
# multiple of 50, in which case it is "1".
eos = "_"
if len(data) % 50 == 0 and len(data) > 0 :
eos = "1"
data += [[word,pos,morpho,eos]]
# Pass 2: rewrite pos from the previous row.
for i in range(len(data)) :
# Roughly one row in ten is skipped and keeps its pass-1 values.
if random.random() < 0.1 :
continue
# A row whose word is <= 1 copies the pos of the previous row, provided
# that previous row has eos "_".
if data[i][0] <= 1 and i > 0 and data[i-1][3] == "_" :
data[i][1] = data[i-1][1]
# The same copy is made when the (possibly just updated) pos is >= 3.
if data[i][1] >= 3 and i > 0 and data[i-1][3] == "_" :
data[i][1] = data[i-1][1]
# morpho is recomputed from word and pos.
# NOTE(review): presumably at loop level so that it follows either copy above — confirm.
data[i][2] = (data[i][0] + data[i][1]) % 6
# Pass 3: overwrite morpho with context-dependent values.
for i in range(len(data)) :
# When the previous row has eos "_" and its morpho is 2, 3 or negative, this
# row's morpho becomes 0 and the remaining rules are skipped for it.
# Rows are visited in increasing order, so the previous row's morpho read here
# is the one already rewritten by this pass; negative values are only assigned
# further down in this loop.
if i > 0 and data[i-1][2] == 2 and data[i-1][3] == "_" :
data[i][2] = 0
continue
if i > 0 and data[i-1][2] == 3 and data[i-1][3] == "_":
data[i][2] = 0
continue
if i > 0 and data[i-1][2] < 0 and data[i-1][3] == "_" :
data[i][2] = 0
continue
# Look-ahead on word: same word as the next row (current eos "_") sets morpho
# to 2; same word as the row after next (current and next eos both "_") sets 3.
if i < len(data)-1 and data[i][0] == data[i+1][0] and data[i][3] == "_" :
data[i][2] = 2
if i < len(data)-2 and data[i][0] == data[i+2][0] and data[i][3]+data[i+1][3] == "__" :
data[i][2] = 3
# Look-ahead on pos, with the same shape: -2 for the next row, -3 for the row after next.
# NOTE(review): which `if` this `elif` attaches to, and whether the 3 / -3 checks
# are nested inside the 2 / -2 branches, depends on the lost indentation — confirm.
elif i < len(data)-1 and data[i][1] == data[i+1][1] and data[i][3] == "_" :
data[i][2] = -2
if i < len(data)-2 and data[i][1] == data[i+2][1] and data[i][3]+data[i+1][3] == "__" :
data[i][2] = -3
# Emit one line per row: word, pos, morpho and eos separated by tabs.
for i in range(len(data)) :
print(data[i][0],end="\t")
print(data[i][1],end="\t")
print(data[i][2],end="\t")
print(data[i][3],end="\n")
# Run main() only when the file is executed as a script.
if __name__ == "__main__" :
main()
#! /usr/bin/python3
import sys
import random
def printUsageAndExit() :
    """Print the command-line usage on stderr and terminate with exit status 1."""
    print("Usage : %s mcd nbLines"%sys.argv[0], file=sys.stderr)
    # sys.exit rather than the bare exit(): the latter is injected by the
    # site module and is not available when Python runs without it.
    sys.exit(1)
def readMCD(mcdFilename) :
    """Parse an MCD file into a dict mapping the first field of each line to the second.

    Blank lines and lines whose first non-blank character is '#' are skipped.
    Every other line must hold exactly two whitespace-separated fields;
    otherwise an error is printed on stderr and the program exits with status 1.

    mcdFilename -- path of the UTF-8 encoded MCD file.
    Returns the dict {first field: second field}.
    """
    mcd = {}
    # 'with' guarantees the file is closed, including on the error exit below.
    with open(mcdFilename, "r", encoding="utf8") as mcdFile :
        for line in mcdFile :
            clean = line.strip()
            # Test the stripped line so that indented comments and
            # whitespace-only lines are skipped as well.
            if not clean or clean[0] == '#' :
                continue
            # split() without argument tolerates tabs and repeated or trailing blanks.
            splited = clean.split()
            if len(splited) != 2 :
                print("ERROR : invalid mcd line \'%s\'. Aborting"%clean, file=sys.stderr)
                sys.exit(1)
            mcd[splited[0]] = splited[1]
    return mcd
def main() :
    """Print nbLines random rows (word, pos, morpho), tab-separated, on stdout.

    Command line: mcd nbLines. With any other argument count the usage
    message is printed and the program exits. The random seed is fixed to
    100, so two runs with the same nbLines print the same rows.
    """
    if len(sys.argv) != 3 :
        printUsageAndExit()
    random.seed(100)
    # Parsed only for validation: readMCD aborts on a malformed MCD file.
    # The resulting mapping is not needed to generate the rows.
    readMCD(sys.argv[1])
    nbLines = int(sys.argv[2])
    # Rows are printed as they are drawn; nothing reads them back, so there
    # is no need to hold the whole corpus in memory first.
    for _ in range(nbLines) :
        word = random.randint(0,5)
        pos = word + random.randint(-2,2)
        morpho = (pos+word) % 6
        print(word, pos, morpho, sep="\t")

# Run main() only when the file is executed as a script.
if __name__ == "__main__" :
    main()
0 FORM
1 POS
2 MORPHO
3 EOS
#! /bin/bash
# Runs the shared average.py script on the results file of this experiment.

# Lower-case name on purpose: LANG is the locale environment variable, and
# assigning to it changes the locale seen by the exec'ed script whenever
# LANG is exported.
lang=fr
res="$lang.res"

exec ../../scripts/average.py "$res"
#! /bin/bash
# Runs the shared eval.py script for the toy_backtrack experiment.
# Arguments given to this script are forwarded unchanged; they are placed
# after the corpus/MCD paths and before the fixed --ignore options.

# Lower-case name on purpose: LANG is the locale environment variable, and
# assigning to it changes the locale seen by the exec'ed script whenever
# LANG is exported.
lang=toy_backtrack
mcf=../data/test.mcf
mcd=../data/wpmlgfs.mcd
# An array keeps every option a separate word without relying on word splitting.
args=(--ignore FORM --ignore POS)

# "$@" (instead of unquoted $*) forwards each argument intact, even when it
# contains blanks or glob characters.
exec ../../scripts/eval.py "$lang" "$mcf" "$mcd" "$@" "${args[@]}"
#! /bin/bash
# Calls eval.sh once, passing the names listed below as its arguments,
# in this order.
names=(
    tagger
    morpho
    lemmatizer
    parser
    tagparser
    tagger+morpho+lemmatizer+parser
)

./eval.sh "${names[@]}"
Name : Morpho with error correction
Dicts : morpho.dicts
%CLASSIFIERS
morpho morpho.cla
error_morpho error_morpho.cla
%STATES
morpho morpho
error_morpho error_morpho
%TRANSITIONS
morpho error_morpho 0 *
error_morpho morpho 0 BACK
error_morpho morpho 1 *
EPSILON
BACK 1
BACK 2
BACK 3
BACK 4
Name : Error_Morpho
Type : Prediction
Oracle : none
Feature Model : error_morpho.fm
Action Set : error_morpho.as
Topology : (500,RELU,0.3)
Batchsize : 1
# Features classiques
# FORM
b.0#FORM.fasttext
b.1#FORM.fasttext
b.2#FORM.fasttext
b.-1#FORM.fasttext
b.-2#FORM.fasttext
# POS
b.0#POS
b.-1#POS
b.-2#POS
b.-3#POS
# MORPHO
b.-1#MORPHO
b.-2#MORPHO
# UPPERCASE
b.0#FORM.U
b.1#FORM.U
# LENGTH
b.0#FORM.LEN
# SUFFIXES
b.0#FORM.PART.-4.-4
b.0#FORM.PART.-3.-3
b.0#FORM.PART.-2.-2
b.0#FORM.PART.-1.-1
b.0#FORM.PART.0.0
b.0#FORM.PART.1.1
b.0#FORM.PART.2.2
b.0#FORM.PART.3.3
Name : Morpho Machine
Dicts : morpho.dicts
%CLASSIFIERS
morpho morpho.cla
%STATES
morpho1 morpho
%TRANSITIONS
morpho1 morpho1 +1 *
WRITE b.0 MORPHO 0
WRITE b.0 MORPHO 1
WRITE b.0 MORPHO -2
WRITE b.0 MORPHO 2
WRITE b.0 MORPHO -3
WRITE b.0 MORPHO 3
WRITE b.0 MORPHO 4
WRITE b.0 MORPHO 5
Name : Morpho
Type : Prediction
Oracle : morpho
Feature Model : morpho.fm
Action Set : morpho.as
Topology : (500,RELU,0.0)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment