diff --git a/UD_fr-GSD/data/Makefile b/UD_fr-GSD/data/Makefile index 1a62b88521194393a30f64cf424f673f486b4478..c3996c0ead4c288f95d140afd1843e790a19d8a8 100644 --- a/UD_fr-GSD/data/Makefile +++ b/UD_fr-GSD/data/Makefile @@ -3,9 +3,7 @@ UD_DIR=../../data/UD_French-GSD TRAIN=$(UD_DIR)/train.conllu TEST=$(UD_DIR)/test.conllu DEV=$(UD_DIR)/dev.conllu -MCD=wpmlgfs.mcd -CONLLUMCD=conllu.mcd -CONLLU2MCF=$(TOOLS)/conllu2mcf.py +MCD=conllu.mcd CONLL2TXT=$(TOOLS)/conll2text.py #This part is for lemmatizer rules and excpetions computation @@ -16,38 +14,29 @@ FP_FILENAME=fP RULES_FILENAME=maca_trans_lemmatizer_rules.txt EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm -all: train.mcf test.mcf dev.mcf $(FPLM_FILENAME) $(FP_FILENAME) $(RULES_FILENAME) +all: train_tiny.conllu dev_tiny.conllu test_tiny.conllu $(FPLM_FILENAME) $(FP_FILENAME) $(RULES_FILENAME) -$(CONLLUMCD): +$(MCD): cp $(UD_DIR)/*\.conll* . cp $(UD_DIR)/*\.mcd . -train.mcf: $(TRAIN) $(CONLLUMCD) - $(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) - $(TOOLS)/conllu2splits.py $< $(CONLLUMCD) > splits.txt +train_tiny.conllu: $(TRAIN) $(MCD) + $(TOOLS)/conllu2splits.py $< $(MCD) > splits.txt $(CONLL2TXT) $< $$'\n' > train.txt - $(TOOLS)/mcfRemovePonct.py $@ $(MCD) > train_noponct.mcf - $(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf train_tiny.mcf - rm dummy.mcf + $(TOOLS)/conlluShuffleAndMakeDev.py $< $ 0.02 > $@ -test.mcf: $(TEST) - $(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) +test_tiny.conllu: $(TEST) $(MCD) $(CONLL2TXT) $< $$'\n' > test.txt - $(TOOLS)/mcfRemovePonct.py $@ $(MCD) > test_noponct.mcf - $(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.6 dummy.mcf test_tiny.mcf - rm dummy.mcf + $(TOOLS)/conlluShuffleAndMakeDev.py $< $ 0.2 > $@ -dev.mcf: $(DEV) - $(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) +dev_tiny.conllu: $(DEV) $(MCD) $(CONLL2TXT) $< $$'\n' > dev.txt - $(TOOLS)/mcfRemovePonct.py $@ $(MCD) > dev_noponct.mcf - $(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf dev_tiny.mcf - rm dummy.mcf + $(TOOLS)/conlluShuffleAndMakeDev.py $< $ 0.2 > $@ -$(FPLM_FILENAME): train.mcf dev.mcf test.mcf - cat train.mcf dev.mcf test.mcf > all.mcf - $(TOOLS)/mcf2fplm.py all.mcf $(MCD) > $@ - rm all.mcf +$(FPLM_FILENAME): $(TRAIN) $(TEST) $(DEV) + cat $(TRAIN) $(TEST) $(DEV) > all.conllu + $(TOOLS)/conllu2fplm.py all.conllu $(MCD) > $@ + rm all.conllu $(FP_FILENAME): $(FPLM_FILENAME) $(TOOLS)/fplm2fP.py $< > $@ @@ -56,7 +45,6 @@ $(RULES_FILENAME): $(FPLM_FILENAME) macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r $(RULES_FILENAME) $(STRICT) -t $(THRESHOLD) clean: - - rm *\.mcf - rm *\.txt - rm *\.conll* - rm conll*\.mcd diff --git a/UD_fr-GSD/data/wpmlgfs.mcd b/UD_fr-GSD/data/wpmlgfs.mcd deleted file mode 100644 index 6bfb75240101b2b2a65b69c548727f5fd30f58ed..0000000000000000000000000000000000000000 --- a/UD_fr-GSD/data/wpmlgfs.mcd +++ /dev/null @@ -1,9 +0,0 @@ -0 ID -1 FORM -2 POS -3 MORPHO -4 LEMMA -5 GOV -6 LABEL -7 EOS -8 TEXT diff --git a/UD_fr-GSD/eval/eval.sh b/UD_fr-GSD/eval/eval.sh index 17e85d89de3a769daddba87e1a389b983900213b..4dc8065edd27ea90ca30b78e0d91f9abd5e6244f 100755 --- a/UD_fr-GSD/eval/eval.sh +++ b/UD_fr-GSD/eval/eval.sh @@ -1,8 +1,8 @@ #! /bin/bash LANG=UD_fr-GSD -MCF=../data/test.mcf -MCD=../data/wpmlgfs.mcd -ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM --ignore TEXT" +TEST=../data/test.conllu +MCD=../data/conllu.mcd +ARGS="" -exec ../../scripts/eval.py $LANG $MCF $MCD $* $ARGS +exec ../../scripts/eval.py $LANG $TEST $MCD $* $ARGS diff --git a/UD_fr-GSD/eval/experiences.txt b/UD_fr-GSD/eval/experiences.txt index cf98828e30078e87fa74b7ea7f06f7c0866edbab..fea15a3b021a69a5e1dbbe63f012d21dcbc52ccb 100644 --- a/UD_fr-GSD/eval/experiences.txt +++ b/UD_fr-GSD/eval/experiences.txt @@ -66,3 +66,25 @@ morpho_corr_batched 100.00% 97.77% morpho_corr_batched 100.00% 97.74% morpho_corr_form 100.00% 97.57% + +tool POS MORPHO LEMMA GOV LABEL|GOV EOS +tagparser_corr_nofuture 96.22% 93.17% 98.02% 82.74% 77.09% 99.79% + + +Nouvelle version : +parser_test2 100.00% 100.00% 100.00% 88.93% 84.57% 99.76% +Ancienne version : +parser_test 100.00% 100.00% 100.00% 88.93% 84.57% 99.76% + +Nouvelle version sans bugs sur l'eos : +parser_test4 100.00% 100.00% 100.00% 88.54% 84.04% 99.79% +parser_test4 100.00% 100.00% 100.00% 87.67% 83.27% 99.73% +Pareil mais pas tiny : + + +5 iter : +parser_test 100.00% 100.00% 100.00% 90.60% 87.28% 99.79% +6 iter : +parser_test 100.00% 100.00% 100.00% 90.89% 87.30% 99.82% +2 iter : +parser_test 100.00% 100.00% 100.00% 89.80% 86.16% 99.74% diff --git a/UD_fr-GSD/eval/recap.txt b/UD_fr-GSD/eval/recap.txt deleted file mode 100644 index 51515300dec16e10ecb1beb4f56850f7875cf57a..0000000000000000000000000000000000000000 --- a/UD_fr-GSD/eval/recap.txt +++ /dev/null @@ -1,43 +0,0 @@ - tool POS MORPHO LEMMA GOV LABEL|GOV EOS -1m30 : tagparser_classic 97.24% 94.79% 98.40% 85.73% 80.96% 99.73% -8m08 : tagparser_corr_sys_2 97.46% 94.87% 98.58% 86.42% 81.12% 99.84% -4 iter : -20m : tagparser_corr_pred 97.26% 95.20% 98.53% 84.75% 78.99% 99.81% -5 iter : -11m tagparser_corr_pred 97.28% 95.09% 98.51% 86.77% 80.78% 99.77% - -SYSTEMATIC : --------------------------------------------------------------------------------- -Error_Morpho : - 90.88% (61381) EPSILON - 9.11% ( 6156) BACK 3 - 0.00% ( 1) BACK 1 - 0.00% ( 1) BACK 2 --------------------------------------------------------------------------------- -Error_Parser : - 92.35% (114903)EPSILON - 7.65% ( 9514) BACK 2 - 0.00% ( 1) BACK 1 --------------------------------------------------------------------------------- -Error_Tagger : - 90.88% (61381) EPSILON - 9.11% ( 6155) BACK 4 - 0.00% ( 2) BACK 1 - 0.00% ( 1) BACK 2 --------------------------------------------------------------------------------- - - -PREDICTED : --------------------------------------------------------------------------------- -Error_Morpho : - 87.43% (42827) EPSILON - 12.57% ( 6157) BACK 2 --------------------------------------------------------------------------------- -Error_Parser : - 89.31% (80321) EPSILON - 10.69% ( 9612) BACK 2 --------------------------------------------------------------------------------- -Error_Tagger : - 87.43% (42827) EPSILON - 12.57% ( 6157) BACK 2 --------------------------------------------------------------------------------- diff --git a/scripts/eval.py b/scripts/eval.py index a0736aa705835f0f3180a529e190b75520f4f908..d9ad1ae443c6259ca669dc31ed72c8133f2ad121 100755 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -12,21 +12,19 @@ if len(sys.argv) < 5 : printUsageAndExit() lang = sys.argv[1] -mcf = sys.argv[2] +test = sys.argv[2] +testText = test[:-6]+"txt" mcd = sys.argv[3] experiences = [] debug = "" evalArgs = " " beamSize = "" readSize = "" +rawInput = "" binpath = os.environ["MACAON_DIR"] + "/" + lang + "/bin" -eval_mcf = "../../tools/eval_mcf.py" evalConll = "../../scripts/conll18_ud_eval.py" -mcf2conllu = "../../tools/mcf2conllu.py" -addBiWords = "../../tools/addBiWords.py" -conlluMCD = "../data/conllu.mcd" -conlluRef = "../data/test.conll*" +addMissingColumns = "../../tools/conlluAddMissingColumns.py" result_file = lang + ".res" output = "output.txt" err = "stderr.log" @@ -39,6 +37,8 @@ for i in range(4, len(sys.argv)) : arg = sys.argv[i] if arg == "-d" or arg == "--debug" : debug = "-d" + elif arg == "--rawInput" : + rawInput = " --rawInput " elif arg == "--beamSize" : beamSize = " --beamSize " + sys.argv[i+1] + " " toskip = True @@ -76,13 +76,15 @@ for experience in experiences : print("Evaluation of",abridged_name,"...",end="") sys.stdout.flush() - input_file = mcf + input_file = test + if len(rawInput) > 0 : + input_file = testText error_occured = False for exp in experience : - process = subprocess.Popen(binpath + "/maca_tm_" + exp + " " + input_file + " " + mcd + " " + debug + beamSize + readSize + " --interactive 0 > " + output + " 2> " + err, shell=True) + process = subprocess.Popen(binpath + "/maca_tm_" + exp + " " + input_file + " " + mcd + " " + debug + beamSize + readSize + rawInput + "--interactive 0 > " + output + " 2> " + err, shell=True) process.wait() - subprocess.Popen("cp " + output + " tmp_input", shell=True).wait() + subprocess.Popen(addMissingColumns + " " + output + " " + mcd + " > tmp_input", shell=True).wait() input_file = "tmp_input" if process.returncode != 0 : print(" ERROR (see " + err + " for details)") @@ -96,21 +98,14 @@ for experience in experiences : redirect = " > " header = " --printHeader " toolname = " --toolname " + abridged_name + " " - process = subprocess.Popen(eval_mcf + " " + mcd + " " + mcf + " " + output + header + evalArgs + toolname + redirect + result_file + " 2>> " + err, shell=True) + process = subprocess.Popen(evalConll + " " + test + " " + input_file + " -v " + evalArgs + redirect + result_file + " 2>> " + err, shell=True) process.wait() if process.returncode != 0 : print(" ERROR (see " + err + " for details)") error_occured = True break - subprocess.Popen("sed -i -e \"s/" + "__tool__" + "/" + abridged_name + "/g\" " + result_file, shell=True).wait() print(" Done !") firstWrite = False - if os.path.isfile(conlluMCD) : - subprocess.Popen(mcf2conllu+" "+output+" "+mcd+" "+conlluMCD +"> " + output+".brut.conllu", shell=True).wait() - subprocess.Popen(addBiWords+" "+output+".brut.conllu "+conlluRef+" > " + output+".conllu", shell=True).wait() - subprocess.Popen(evalConll+" -v "+conlluRef+" "+output+".conllu >> " + result_file, shell=True).wait() - else : - print("not a file <%s>"%conlluMCD) subprocess.Popen("rm " + output + "* tmp_input", shell=True).wait() diff --git a/scripts/train.sh b/scripts/train.sh index b614ce4c00aa0b83ae4efc36ec05bc7c7a02569a..cbf7c7f0ec61fe2f5385080b733450e0578d2b41 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -1,7 +1,7 @@ #! /bin/bash -TRAIN=../../data/train_tiny.mcf -DEV=../../data/dev_tiny.mcf +TRAIN=../../data/train_tiny.conllu +DEV=../../data/dev_tiny.conllu if [ "$2" == "-h" ]; then macaon_train "-h" @@ -45,4 +45,4 @@ if [ ! -d "$TEMPLATEPATH" ]; then fi # We start the training -macaon_train --tm machine.tm --bd train.bd --mcd ../../data/wpmlgfs.mcd -T $TRAIN --dev $DEV --expName $EXPNAME --lang $LANG $ARGS --templateName $TEMPLATENAME +macaon_train --tm machine.tm --bd train.bd --mcd ../../data/conllu.mcd -T $TRAIN --dev $DEV --expName $EXPNAME --lang $LANG $ARGS --templateName $TEMPLATENAME diff --git a/tools/conllu2fplm.py b/tools/conllu2fplm.py new file mode 100755 index 0000000000000000000000000000000000000000..300475ff4d105499cc132a96579469ac5f795775 --- /dev/null +++ b/tools/conllu2fplm.py @@ -0,0 +1,44 @@ +#! /usr/bin/python3 + +import sys + +def printUsageAndExit() : + print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr) + exit(1) + +def readMCD(mcdFilename) : + mcd = {} + for line in open(mcdFilename, "r", encoding="utf8") : + clean = line.strip() + if len(line) < 2 or line[0] == '#' : + continue + splited = line.split(' ') + if len(splited) != 2 : + print("ERROR : invalid mcd line \'%s\'. Aborting"%line, file=sys.stderr) + exit(1) + mcd[splited[0].strip()] = splited[1].strip() + + return mcd + +if __name__ == "__main__" : + if len(sys.argv) != 3 : + printUsageAndExit() + + conllMCD = readMCD(sys.argv[2]) + conllMCDr = {v: k for k, v in conllMCD.items()} + + for line in open(sys.argv[1], "r") : + if len(line.strip()) < 3 : + continue + if line.strip()[0] == '#' : + continue + + columns = line.strip().split('\t') + if len(columns[int(conllMCDr["ID"])].split('-')) > 1 : + continue + + print(columns[int(conllMCDr["FORM"])],end='\t') + print(columns[int(conllMCDr["POS"])],end='\t') + print(columns[int(conllMCDr["LEMMA"])],end='\t') + print(columns[int(conllMCDr["MORPHO"])],end='\n') + diff --git a/tools/conlluAddMissingColumns.py b/tools/conlluAddMissingColumns.py new file mode 100755 index 0000000000000000000000000000000000000000..e125838251d491fb8ef811011670599e353369ac --- /dev/null +++ b/tools/conlluAddMissingColumns.py @@ -0,0 +1,64 @@ +#! /usr/bin/python3 + +import sys + +def printUsageAndExit() : + print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr) + exit(1) + +def readMCD(mcdFilename) : + mcd = {} + for line in open(mcdFilename, "r", encoding="utf8") : + clean = line.strip() + if len(line) < 2 or line[0] == '#' : + continue + splited = line.split(' ') + if len(splited) != 2 : + print("ERROR : invalid mcd line \'%s\'. Aborting"%line, file=sys.stderr) + exit(1) + mcd[splited[0].strip()] = splited[1].strip() + + return mcd + +if __name__ == "__main__" : + if len(sys.argv) != 3 : + printUsageAndExit() + + conllMCD = readMCD(sys.argv[2]) + conllMCDr = {v: k for k, v in conllMCD.items()} + + lastWasEmpty = False + + for line in open(sys.argv[1], "r") : + lastWasEmpty = False + if len(line.strip()) < 2 : + lastWasEmpty = True + print(line.strip()) + continue + elif line[0] == '#' : + print(line.strip()) + continue + columns = line.strip().split('\t') + for col in conllMCD : + while len(columns) <= int(col) : + columns.append("") + for i in range(len(columns)) : + suffix = "\t" + if i == len(columns)-1 : + suffix = "\n" + if len(columns[i]) > 0 : + print(columns[i], end=suffix) + elif conllMCD[str(i)] == "GOV" : + id = columns[int(conllMCDr["ID"])] + if id == "1" : + print("0", end=suffix) + elif len(id.split('-')) > 1 : + print("_", end=suffix) + else : + print("1", end=suffix) + else : + print("_", end=suffix) + + if not lastWasEmpty : + print("") + diff --git a/tools/conlluShuffleAndMakeDev.py b/tools/conlluShuffleAndMakeDev.py new file mode 100755 index 0000000000000000000000000000000000000000..4918a3ea2e349c3e0ba7334e91cd4409487d1733 --- /dev/null +++ b/tools/conlluShuffleAndMakeDev.py @@ -0,0 +1,41 @@ +#! /usr/bin/python3 + +import sys +import random + +def printUsageAndExit() : + print("USAGE : %s input.conllu ratio (outputRest.conllu)"%sys.argv[0]) + exit(1) + +if __name__ == "__main__" : + if len(sys.argv) != 3 and len(sys.argv) != 4 : + printUsageAndExit() + + inputFile = sys.argv[1] + ratio = sys.argv[2] + + sentences = [] + + for line in open(inputFile, "r") : + if len(line.strip()) < 3 : + continue + if line.strip().split('=')[0] == "# sent_id " : + sentences += [[]] + sentences[-1] += [line.strip()] + + random.shuffle(sentences) + + for sentence in sentences[:int(len(sentences)*float(ratio))] : + for word in sentence : + print(word) + print("") + + if len(sys.argv) == 3 : + exit(0) + + outputRest = open(sys.argv[3], "w") + for sentence in sentences[int(len(sentences)*float(ratio))+1:] : + for word in sentence : + print(word, file=outputRest) + print("", file=outputRest) +