Skip to content
Snippets Groups Projects
Commit 070b6c70 authored by Franck Dary's avatar Franck Dary
Browse files

Changed eval for UD

parent e2049d70
No related branches found
No related tags found
No related merge requests found
...@@ -3,9 +3,7 @@ UD_DIR=../../data/UD_French-GSD ...@@ -3,9 +3,7 @@ UD_DIR=../../data/UD_French-GSD
TRAIN=$(UD_DIR)/train.conllu TRAIN=$(UD_DIR)/train.conllu
TEST=$(UD_DIR)/test.conllu TEST=$(UD_DIR)/test.conllu
DEV=$(UD_DIR)/dev.conllu DEV=$(UD_DIR)/dev.conllu
MCD=wpmlgfs.mcd MCD=conllu.mcd
CONLLUMCD=conllu.mcd
CONLLU2MCF=$(TOOLS)/conllu2mcf.py
CONLL2TXT=$(TOOLS)/conll2text.py CONLL2TXT=$(TOOLS)/conll2text.py
#This part is for lemmatizer rules and exceptions computation #This part is for lemmatizer rules and exceptions computation
...@@ -16,38 +14,29 @@ FP_FILENAME=fP ...@@ -16,38 +14,29 @@ FP_FILENAME=fP
RULES_FILENAME=maca_trans_lemmatizer_rules.txt RULES_FILENAME=maca_trans_lemmatizer_rules.txt
EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm
all: train.mcf test.mcf dev.mcf $(FPLM_FILENAME) $(FP_FILENAME) $(RULES_FILENAME) all: train_tiny.conllu dev_tiny.conllu test_tiny.conllu $(FPLM_FILENAME) $(FP_FILENAME) $(RULES_FILENAME)
$(CONLLUMCD): $(MCD):
cp $(UD_DIR)/*\.conll* . cp $(UD_DIR)/*\.conll* .
cp $(UD_DIR)/*\.mcd . cp $(UD_DIR)/*\.mcd .
train.mcf: $(TRAIN) $(CONLLUMCD) train_tiny.conllu: $(TRAIN) $(MCD)
$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) $(TOOLS)/conllu2splits.py $< $(MCD) > splits.txt
$(TOOLS)/conllu2splits.py $< $(CONLLUMCD) > splits.txt
$(CONLL2TXT) $< $$'\n' > train.txt $(CONLL2TXT) $< $$'\n' > train.txt
$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > train_noponct.mcf $(TOOLS)/conlluShuffleAndMakeDev.py $< $ 0.02 > $@
$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf train_tiny.mcf
rm dummy.mcf
test.mcf: $(TEST) test_tiny.conllu: $(TEST) $(MCD)
$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
$(CONLL2TXT) $< $$'\n' > test.txt $(CONLL2TXT) $< $$'\n' > test.txt
$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > test_noponct.mcf $(TOOLS)/conlluShuffleAndMakeDev.py $< $ 0.2 > $@
$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.6 dummy.mcf test_tiny.mcf
rm dummy.mcf
dev.mcf: $(DEV) dev_tiny.conllu: $(DEV) $(MCD)
$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
$(CONLL2TXT) $< $$'\n' > dev.txt $(CONLL2TXT) $< $$'\n' > dev.txt
$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > dev_noponct.mcf $(TOOLS)/conlluShuffleAndMakeDev.py $< $ 0.2 > $@
$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf dev_tiny.mcf
rm dummy.mcf
$(FPLM_FILENAME): train.mcf dev.mcf test.mcf $(FPLM_FILENAME): $(TRAIN) $(TEST) $(DEV)
cat train.mcf dev.mcf test.mcf > all.mcf cat $(TRAIN) $(TEST) $(DEV) > all.conllu
$(TOOLS)/mcf2fplm.py all.mcf $(MCD) > $@ $(TOOLS)/conllu2fplm.py all.conllu $(MCD) > $@
rm all.mcf rm all.conllu
$(FP_FILENAME): $(FPLM_FILENAME) $(FP_FILENAME): $(FPLM_FILENAME)
$(TOOLS)/fplm2fP.py $< > $@ $(TOOLS)/fplm2fP.py $< > $@
...@@ -56,7 +45,6 @@ $(RULES_FILENAME): $(FPLM_FILENAME) ...@@ -56,7 +45,6 @@ $(RULES_FILENAME): $(FPLM_FILENAME)
macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r $(RULES_FILENAME) $(STRICT) -t $(THRESHOLD) macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r $(RULES_FILENAME) $(STRICT) -t $(THRESHOLD)
clean: clean:
- rm *\.mcf
- rm *\.txt - rm *\.txt
- rm *\.conll* - rm *\.conll*
- rm conll*\.mcd - rm conll*\.mcd
......
0 ID
1 FORM
2 POS
3 MORPHO
4 LEMMA
5 GOV
6 LABEL
7 EOS
8 TEXT
#! /bin/bash #! /bin/bash
LANG=UD_fr-GSD LANG=UD_fr-GSD
MCF=../data/test.mcf TEST=../data/test.conllu
MCD=../data/wpmlgfs.mcd MCD=../data/conllu.mcd
ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM --ignore TEXT" ARGS=""
exec ../../scripts/eval.py $LANG $MCF $MCD $* $ARGS exec ../../scripts/eval.py $LANG $TEST $MCD $* $ARGS
...@@ -66,3 +66,25 @@ morpho_corr_batched 100.00% 97.77% ...@@ -66,3 +66,25 @@ morpho_corr_batched 100.00% 97.77%
morpho_corr_batched 100.00% 97.74% morpho_corr_batched 100.00% 97.74%
morpho_corr_form 100.00% 97.57% morpho_corr_form 100.00% 97.57%
tool POS MORPHO LEMMA GOV LABEL|GOV EOS
tagparser_corr_nofuture 96.22% 93.17% 98.02% 82.74% 77.09% 99.79%
Nouvelle version :
parser_test2 100.00% 100.00% 100.00% 88.93% 84.57% 99.76%
Ancienne version :
parser_test 100.00% 100.00% 100.00% 88.93% 84.57% 99.76%
Nouvelle version sans bugs sur l'eos :
parser_test4 100.00% 100.00% 100.00% 88.54% 84.04% 99.79%
parser_test4 100.00% 100.00% 100.00% 87.67% 83.27% 99.73%
Pareil mais pas tiny :
5 iter :
parser_test 100.00% 100.00% 100.00% 90.60% 87.28% 99.79%
6 iter :
parser_test 100.00% 100.00% 100.00% 90.89% 87.30% 99.82%
2 iter :
parser_test 100.00% 100.00% 100.00% 89.80% 86.16% 99.74%
tool POS MORPHO LEMMA GOV LABEL|GOV EOS
1m30 : tagparser_classic 97.24% 94.79% 98.40% 85.73% 80.96% 99.73%
8m08 : tagparser_corr_sys_2 97.46% 94.87% 98.58% 86.42% 81.12% 99.84%
4 iter :
20m : tagparser_corr_pred 97.26% 95.20% 98.53% 84.75% 78.99% 99.81%
5 iter :
11m tagparser_corr_pred 97.28% 95.09% 98.51% 86.77% 80.78% 99.77%
SYSTEMATIC :
--------------------------------------------------------------------------------
Error_Morpho :
90.88% (61381) EPSILON
9.11% ( 6156) BACK 3
0.00% ( 1) BACK 1
0.00% ( 1) BACK 2
--------------------------------------------------------------------------------
Error_Parser :
92.35% (114903)EPSILON
7.65% ( 9514) BACK 2
0.00% ( 1) BACK 1
--------------------------------------------------------------------------------
Error_Tagger :
90.88% (61381) EPSILON
9.11% ( 6155) BACK 4
0.00% ( 2) BACK 1
0.00% ( 1) BACK 2
--------------------------------------------------------------------------------
PREDICTED :
--------------------------------------------------------------------------------
Error_Morpho :
87.43% (42827) EPSILON
12.57% ( 6157) BACK 2
--------------------------------------------------------------------------------
Error_Parser :
89.31% (80321) EPSILON
10.69% ( 9612) BACK 2
--------------------------------------------------------------------------------
Error_Tagger :
87.43% (42827) EPSILON
12.57% ( 6157) BACK 2
--------------------------------------------------------------------------------
...@@ -12,21 +12,19 @@ if len(sys.argv) < 5 : ...@@ -12,21 +12,19 @@ if len(sys.argv) < 5 :
printUsageAndExit() printUsageAndExit()
lang = sys.argv[1] lang = sys.argv[1]
mcf = sys.argv[2] test = sys.argv[2]
testText = test[:-6]+"txt"
mcd = sys.argv[3] mcd = sys.argv[3]
experiences = [] experiences = []
debug = "" debug = ""
evalArgs = " " evalArgs = " "
beamSize = "" beamSize = ""
readSize = "" readSize = ""
rawInput = ""
binpath = os.environ["MACAON_DIR"] + "/" + lang + "/bin" binpath = os.environ["MACAON_DIR"] + "/" + lang + "/bin"
eval_mcf = "../../tools/eval_mcf.py"
evalConll = "../../scripts/conll18_ud_eval.py" evalConll = "../../scripts/conll18_ud_eval.py"
mcf2conllu = "../../tools/mcf2conllu.py" addMissingColumns = "../../tools/conlluAddMissingColumns.py"
addBiWords = "../../tools/addBiWords.py"
conlluMCD = "../data/conllu.mcd"
conlluRef = "../data/test.conll*"
result_file = lang + ".res" result_file = lang + ".res"
output = "output.txt" output = "output.txt"
err = "stderr.log" err = "stderr.log"
...@@ -39,6 +37,8 @@ for i in range(4, len(sys.argv)) : ...@@ -39,6 +37,8 @@ for i in range(4, len(sys.argv)) :
arg = sys.argv[i] arg = sys.argv[i]
if arg == "-d" or arg == "--debug" : if arg == "-d" or arg == "--debug" :
debug = "-d" debug = "-d"
elif arg == "--rawInput" :
rawInput = " --rawInput "
elif arg == "--beamSize" : elif arg == "--beamSize" :
beamSize = " --beamSize " + sys.argv[i+1] + " " beamSize = " --beamSize " + sys.argv[i+1] + " "
toskip = True toskip = True
...@@ -76,13 +76,15 @@ for experience in experiences : ...@@ -76,13 +76,15 @@ for experience in experiences :
print("Evaluation of",abridged_name,"...",end="") print("Evaluation of",abridged_name,"...",end="")
sys.stdout.flush() sys.stdout.flush()
input_file = mcf input_file = test
if len(rawInput) > 0 :
input_file = testText
error_occured = False error_occured = False
for exp in experience : for exp in experience :
process = subprocess.Popen(binpath + "/maca_tm_" + exp + " " + input_file + " " + mcd + " " + debug + beamSize + readSize + " --interactive 0 > " + output + " 2> " + err, shell=True) process = subprocess.Popen(binpath + "/maca_tm_" + exp + " " + input_file + " " + mcd + " " + debug + beamSize + readSize + rawInput + "--interactive 0 > " + output + " 2> " + err, shell=True)
process.wait() process.wait()
subprocess.Popen("cp " + output + " tmp_input", shell=True).wait() subprocess.Popen(addMissingColumns + " " + output + " " + mcd + " > tmp_input", shell=True).wait()
input_file = "tmp_input" input_file = "tmp_input"
if process.returncode != 0 : if process.returncode != 0 :
print(" ERROR (see " + err + " for details)") print(" ERROR (see " + err + " for details)")
...@@ -96,21 +98,14 @@ for experience in experiences : ...@@ -96,21 +98,14 @@ for experience in experiences :
redirect = " > " redirect = " > "
header = " --printHeader " header = " --printHeader "
toolname = " --toolname " + abridged_name + " " toolname = " --toolname " + abridged_name + " "
process = subprocess.Popen(eval_mcf + " " + mcd + " " + mcf + " " + output + header + evalArgs + toolname + redirect + result_file + " 2>> " + err, shell=True) process = subprocess.Popen(evalConll + " " + test + " " + input_file + " -v " + evalArgs + redirect + result_file + " 2>> " + err, shell=True)
process.wait() process.wait()
if process.returncode != 0 : if process.returncode != 0 :
print(" ERROR (see " + err + " for details)") print(" ERROR (see " + err + " for details)")
error_occured = True error_occured = True
break break
subprocess.Popen("sed -i -e \"s/" + "__tool__" + "/" + abridged_name + "/g\" " + result_file, shell=True).wait()
print(" Done !") print(" Done !")
firstWrite = False firstWrite = False
if os.path.isfile(conlluMCD) :
subprocess.Popen(mcf2conllu+" "+output+" "+mcd+" "+conlluMCD +"> " + output+".brut.conllu", shell=True).wait()
subprocess.Popen(addBiWords+" "+output+".brut.conllu "+conlluRef+" > " + output+".conllu", shell=True).wait()
subprocess.Popen(evalConll+" -v "+conlluRef+" "+output+".conllu >> " + result_file, shell=True).wait()
else :
print("not a file <%s>"%conlluMCD)
subprocess.Popen("rm " + output + "* tmp_input", shell=True).wait() subprocess.Popen("rm " + output + "* tmp_input", shell=True).wait()
......
#! /bin/bash #! /bin/bash
TRAIN=../../data/train_tiny.mcf TRAIN=../../data/train_tiny.conllu
DEV=../../data/dev_tiny.mcf DEV=../../data/dev_tiny.conllu
if [ "$2" == "-h" ]; then if [ "$2" == "-h" ]; then
macaon_train "-h" macaon_train "-h"
...@@ -45,4 +45,4 @@ if [ ! -d "$TEMPLATEPATH" ]; then ...@@ -45,4 +45,4 @@ if [ ! -d "$TEMPLATEPATH" ]; then
fi fi
# We start the training # We start the training
macaon_train --tm machine.tm --bd train.bd --mcd ../../data/wpmlgfs.mcd -T $TRAIN --dev $DEV --expName $EXPNAME --lang $LANG $ARGS --templateName $TEMPLATENAME macaon_train --tm machine.tm --bd train.bd --mcd ../../data/conllu.mcd -T $TRAIN --dev $DEV --expName $EXPNAME --lang $LANG $ARGS --templateName $TEMPLATENAME
#! /usr/bin/python3
import sys
def printUsageAndExit() :
    """Print the expected command line on stderr and terminate with exit status 1."""
    print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr)
    # sys.exit rather than the site-provided exit(), which is an interactive helper
    # and is not defined when the interpreter runs without the site module.
    sys.exit(1)
def readMCD(mcdFilename) :
    """Read an MCD description file and return it as a dict.

    Every meaningful line must hold exactly two whitespace-separated fields;
    the first one becomes the key and the second one the value (both kept as
    strings). Blank lines and lines whose first non-blank character is '#'
    are ignored.

    mcdFilename -- path of the file to read (decoded as utf8).

    Any other malformed line prints an error on stderr and terminates the
    program with exit status 1.
    """
    mcd = {}
    # Context manager so the handle is released even on the error path.
    with open(mcdFilename, "r", encoding="utf8") as mcdFile :
        for line in mcdFile :
            clean = line.strip()
            # Test the stripped text, so whitespace-only lines and indented
            # comments are skipped instead of being reported as invalid.
            if len(clean) == 0 or clean[0] == '#' :
                continue
            # split() without argument accepts tabs and runs of spaces.
            splited = clean.split()
            if len(splited) != 2 :
                print("ERROR : invalid mcd line \'%s\'. Aborting"%clean, file=sys.stderr)
                sys.exit(1)
            mcd[splited[0]] = splited[1]
    return mcd
if __name__ == "__main__" :
    # Script entry point: reads the CoNLL-U file given as first argument and
    # prints, for every kept token line, the FORM, POS, LEMMA and MORPHO
    # columns (in that order) separated by tabs. Column positions come from
    # the MCD file given as second argument.
    if len(sys.argv) != 3 :
        printUsageAndExit()

    conllMCD = readMCD(sys.argv[2])
    # Reverse mapping : column name -> column index (as a string).
    conllMCDr = {v: k for k, v in conllMCD.items()}

    # The column positions do not change from line to line : resolve them once.
    # A KeyError here means the MCD does not declare one of the needed columns.
    idColumn = int(conllMCDr["ID"])
    wantedColumns = [int(conllMCDr[name]) for name in ("FORM", "POS", "LEMMA", "MORPHO")]

    # Explicit utf8, consistent with how readMCD decodes the MCD file.
    with open(sys.argv[1], "r", encoding="utf8") as conlluFile :
        for line in conlluFile :
            clean = line.strip()
            # Skip (near-)empty lines and comment lines.
            if len(clean) < 3 or clean[0] == '#' :
                continue
            columns = clean.split('\t')
            # Lines whose ID contains '-' are skipped
            # (presumably CoNLL-U multiword token ranges such as "3-4").
            if len(columns[idColumn].split('-')) > 1 :
                continue
            print('\t'.join(columns[index] for index in wantedColumns))
#! /usr/bin/python3
import sys
def printUsageAndExit() :
    """Print the expected command line on stderr and terminate with exit status 1."""
    print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr)
    # sys.exit rather than the site-provided exit(), which is an interactive helper
    # and is not defined when the interpreter runs without the site module.
    sys.exit(1)
def readMCD(mcdFilename) :
    """Read an MCD description file and return it as a dict.

    Every meaningful line must hold exactly two whitespace-separated fields;
    the first one becomes the key and the second one the value (both kept as
    strings). Blank lines and lines whose first non-blank character is '#'
    are ignored.

    mcdFilename -- path of the file to read (decoded as utf8).

    Any other malformed line prints an error on stderr and terminates the
    program with exit status 1.
    """
    mcd = {}
    # Context manager so the handle is released even on the error path.
    with open(mcdFilename, "r", encoding="utf8") as mcdFile :
        for line in mcdFile :
            clean = line.strip()
            # Test the stripped text, so whitespace-only lines and indented
            # comments are skipped instead of being reported as invalid.
            if len(clean) == 0 or clean[0] == '#' :
                continue
            # split() without argument accepts tabs and runs of spaces.
            splited = clean.split()
            if len(splited) != 2 :
                print("ERROR : invalid mcd line \'%s\'. Aborting"%clean, file=sys.stderr)
                sys.exit(1)
            mcd[splited[0]] = splited[1]
    return mcd
if __name__ == "__main__" :
    # Script entry point: copies the CoNLL-U file given as first argument to
    # stdout, padding every token line so that it has at least as many
    # columns as the MCD (second argument) declares, and replacing every
    # empty column by a placeholder value.
    if len(sys.argv) != 3 :
        printUsageAndExit()

    conllMCD = readMCD(sys.argv[2])
    # Reverse mapping : column name -> column index (as a string).
    conllMCDr = {v: k for k, v in conllMCD.items()}
    # Minimal width of a token line : highest declared index + 1 (0 if the MCD is empty).
    nbColumns = max((int(col) for col in conllMCD), default=-1) + 1

    lastWasEmpty = False
    # Explicit utf8, consistent with how readMCD decodes the MCD file.
    with open(sys.argv[1], "r", encoding="utf8") as conlluFile :
        for line in conlluFile :
            clean = line.strip()
            # (Near-)empty lines and comments are echoed untouched.
            lastWasEmpty = len(clean) < 2
            if lastWasEmpty or clean[0] == '#' :
                print(clean)
                continue

            columns = clean.split('\t')
            # Pad with empty columns up to the width declared by the MCD.
            columns += [""] * (nbColumns - len(columns))

            filled = []
            for i, value in enumerate(columns) :
                if len(value) > 0 :
                    filled.append(value)
                # .get : a line may be wider than the MCD, such extra columns
                # have no name and simply receive the generic placeholder.
                elif conllMCD.get(str(i)) == "GOV" :
                    # Placeholder governor : 0 for the token whose ID is 1,
                    # '_' when the ID contains '-', 1 for every other token.
                    tokenId = columns[int(conllMCDr["ID"])]
                    if tokenId == "1" :
                        filled.append("0")
                    elif '-' in tokenId :
                        filled.append("_")
                    else :
                        filled.append("1")
                else :
                    filled.append("_")
            print("\t".join(filled))

    # Make sure the output ends with an empty line.
    if not lastWasEmpty :
        print("")
#! /usr/bin/python3
import sys
import random
def printUsageAndExit() :
    """Print the expected command line on stderr and terminate with exit status 1."""
    # stderr, not stdout : the selected sentences are written on stdout, which
    # callers redirect into a file, so the usage text must not end up there.
    print("USAGE : %s input.conllu ratio (outputRest.conllu)"%sys.argv[0], file=sys.stderr)
    # sys.exit rather than the site-provided exit(), which is an interactive helper.
    sys.exit(1)
if __name__ == "__main__" :
    # Script entry point: shuffles the sentences of a CoNLL-U file, prints the
    # first `ratio` fraction of them on stdout and, when a third argument is
    # given, writes all the remaining sentences into that file.
    if len(sys.argv) != 3 and len(sys.argv) != 4 :
        printUsageAndExit()

    inputFile = sys.argv[1]
    ratio = sys.argv[2]

    # Each sentence is the list of its (stripped) lines, comments included.
    sentences = []
    with open(inputFile, "r", encoding="utf8") as conlluFile :
        for line in conlluFile :
            clean = line.strip()
            if len(clean) < 3 :
                continue
            # A "# sent_id = ..." comment opens a new sentence. Lines found
            # before the first such comment open one too, instead of
            # crashing on sentences[-1] with an empty list.
            if clean.split('=')[0] == "# sent_id " or len(sentences) == 0 :
                sentences.append([])
            sentences[-1].append(clean)

    random.shuffle(sentences)

    # Index of the first sentence that is NOT part of the selection.
    cut = int(len(sentences)*float(ratio))

    for sentence in sentences[:cut] :
        for word in sentence :
            print(word)
        print("")

    if len(sys.argv) == 4 :
        with open(sys.argv[3], "w", encoding="utf8") as outputRest :
            # Start exactly at `cut` : starting at cut+1 silently dropped
            # one sentence from both outputs.
            for sentence in sentences[cut:] :
                for word in sentence :
                    print(word, file=outputRest)
                print("", file=outputRest)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment