diff --git a/UD_fr-GSD/data/Makefile b/UD_fr-GSD/data/Makefile index 382877018c97217fc3c6fa2014890b030ce2d3dc..1a62b88521194393a30f64cf424f673f486b4478 100644 --- a/UD_fr-GSD/data/Makefile +++ b/UD_fr-GSD/data/Makefile @@ -6,6 +6,7 @@ DEV=$(UD_DIR)/dev.conllu MCD=wpmlgfs.mcd CONLLUMCD=conllu.mcd CONLLU2MCF=$(TOOLS)/conllu2mcf.py +CONLL2TXT=$(TOOLS)/conll2text.py #This part is for lemmatizer rules and excpetions computation THRESHOLD=10 @@ -23,18 +24,22 @@ $(CONLLUMCD): train.mcf: $(TRAIN) $(CONLLUMCD) $(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) + $(TOOLS)/conllu2splits.py $< $(CONLLUMCD) > splits.txt + $(CONLL2TXT) $< $$'\n' > train.txt $(TOOLS)/mcfRemovePonct.py $@ $(MCD) > train_noponct.mcf $(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf train_tiny.mcf rm dummy.mcf test.mcf: $(TEST) $(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) + $(CONLL2TXT) $< $$'\n' > test.txt $(TOOLS)/mcfRemovePonct.py $@ $(MCD) > test_noponct.mcf $(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.6 dummy.mcf test_tiny.mcf rm dummy.mcf dev.mcf: $(DEV) $(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) + $(CONLL2TXT) $< $$'\n' > dev.txt $(TOOLS)/mcfRemovePonct.py $@ $(MCD) > dev_noponct.mcf $(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf dev_tiny.mcf rm dummy.mcf @@ -52,6 +57,7 @@ $(RULES_FILENAME): $(FPLM_FILENAME) clean: - rm *\.mcf + - rm *\.txt - rm *\.conll* - rm conll*\.mcd - rm $(RULES_FILENAME) diff --git a/UD_fr-GSD/data/wpmlgfs.mcd b/UD_fr-GSD/data/wpmlgfs.mcd index 3656ca67232e55af51ee6f68fb3c7de04de7b2a6..6bfb75240101b2b2a65b69c548727f5fd30f58ed 100644 --- a/UD_fr-GSD/data/wpmlgfs.mcd +++ b/UD_fr-GSD/data/wpmlgfs.mcd @@ -1,7 +1,9 @@ -0 FORM -1 POS -2 MORPHO -3 LEMMA -4 GOV -5 LABEL -6 EOS +0 ID +1 FORM +2 POS +3 MORPHO +4 LEMMA +5 GOV +6 LABEL +7 EOS +8 TEXT diff --git a/UD_fr-GSD/eval/eval.sh b/UD_fr-GSD/eval/eval.sh index 762d8f9128b82e846b55b3ea5d3d489c105c04f8..17e85d89de3a769daddba87e1a389b983900213b 100755 --- a/UD_fr-GSD/eval/eval.sh +++ b/UD_fr-GSD/eval/eval.sh @@ -3,6 +3,6 @@ LANG=UD_fr-GSD MCF=../data/test.mcf MCD=../data/wpmlgfs.mcd -ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM" +ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM --ignore TEXT" exec ../../scripts/eval.py $LANG $MCF $MCD $* $ARGS diff --git a/UD_fr-GSD/tagger/machine.tm b/UD_fr-GSD/tagger/machine.tm index 8d2fd44a553d55c300d2d6a05e2928bec55bf0ca..c494ebdb843a3f43fd377c86a0faf0b7a340371b 100644 --- a/UD_fr-GSD/tagger/machine.tm +++ b/UD_fr-GSD/tagger/machine.tm @@ -1,19 +1,19 @@ -Name : Tagger with error correction +Name : Tagger Machine Dicts : tagger.dicts %CLASSIFIERS strategy strategy.cla tagger tagger.cla +tokenizer tokenizer.cla signature signature.cla -error_tagger error_tagger.cla %STATES strategy strategy +tokenizer tokenizer signature signature tagger tagger -error_tagger error_tagger %TRANSITIONS strategy signature MOVE signature strategy tagger MOVE tagger -tagger error_tagger * -error_tagger tagger BACK -error_tagger strategy * +strategy tokenizer MOVE tokenizer +tagger strategy * signature strategy * +tokenizer strategy * diff --git a/UD_fr-GSD/tagger/normal.tm b/UD_fr-GSD/tagger/normal.tm index d28513e078d7b9f13acd0ebbc9a4136a8b1c2133..c494ebdb843a3f43fd377c86a0faf0b7a340371b 100644 --- a/UD_fr-GSD/tagger/normal.tm +++ b/UD_fr-GSD/tagger/normal.tm @@ -3,13 +3,17 @@ Dicts : tagger.dicts %CLASSIFIERS strategy strategy.cla tagger tagger.cla +tokenizer tokenizer.cla signature signature.cla %STATES strategy strategy +tokenizer tokenizer signature signature tagger tagger %TRANSITIONS strategy signature MOVE signature strategy tagger MOVE tagger +strategy tokenizer MOVE tokenizer tagger strategy * signature strategy * +tokenizer strategy * diff --git a/UD_fr-GSD/tagger/strategy.cla b/UD_fr-GSD/tagger/strategy.cla index 12765cd204125534b3d99081d44feeea5ff31078..4013fa9851e97466104f86a8b2f1aba532ecbcac 100644 --- a/UD_fr-GSD/tagger/strategy.cla +++ b/UD_fr-GSD/tagger/strategy.cla @@ -1,4 +1,4 @@ Name : Strategy Type : Information -Oracle : strategy_tagger +Oracle : strategy_tokenizer,tagger Oracle Filename : none diff --git a/UD_fr-GSD/tagger/tagger.dicts b/UD_fr-GSD/tagger/tagger.dicts index 1130185fc416f423a0c1d832d7d820b966f884ee..85e77993945dec797f69cb6f5deb9c8773560db1 100644 --- a/UD_fr-GSD/tagger/tagger.dicts +++ b/UD_fr-GSD/tagger/tagger.dicts @@ -18,6 +18,15 @@ Tagger_sgn 10 Embeddings Tagger_actions 05 Embeddings Tagger_entropy 05 Embeddings ######################################################################### +Tokenizer_bool 02 Embeddings +Tokenizer_int 05 Embeddings +Tokenizer_letters 30 Embeddings +Tokenizer_pos 15 Embeddings +Tokenizer_form 30 Embeddings +Tokenizer_sgn 10 Embeddings +Tokenizer_actions 05 Embeddings +Tokenizer_entropy 05 Embeddings +######################################################################### Error_Tagger_bool 02 Embeddings Error_Tagger_int 05 Embeddings Error_Tagger_letters 30 Embeddings diff --git a/UD_fr-GSD/tagger/tagger.fm b/UD_fr-GSD/tagger/tagger.fm index 22f91fcf02536fc31b026966ee02a418d9983311..4e4ff5b2fdc2edea835c185df954dd7320f76c87 100644 --- a/UD_fr-GSD/tagger/tagger.fm +++ b/UD_fr-GSD/tagger/tagger.fm @@ -19,8 +19,6 @@ b.0#FORM.U #b.1#FORM.U # UPPERCASE b.0#FORM.LEN -# EOS -b.-2#EOS # SUFFIXES b.0#FORM.PART.-4.-4 b.0#FORM.PART.-3.-3 diff --git a/UD_fr-GSD/tagger/test.bd b/UD_fr-GSD/tagger/test.bd index 342ce5bfd24c68965b9c63422a6279209f9aff02..11f50fa71c84935b56d7a29fd5806c612f5dc74b 100644 --- a/UD_fr-GSD/tagger/test.bd +++ b/UD_fr-GSD/tagger/test.bd @@ -1,5 +1,7 @@ #Name ref/hyp dict Policy Must print?# ############################################ -FORM ref form Final 1 +ID hyp none FromZero 1 +FORM hyp form Final 1 POS hyp pos Final 1 SGN hyp sgn Final 0 +TEXT ref none Final 0 diff --git a/UD_fr-GSD/tagger/tokenizer.as b/UD_fr-GSD/tagger/tokenizer.as new file mode 100644 index 0000000000000000000000000000000000000000..eb137555c630a43d4417d9ce7797a68b0b68e710 --- /dev/null +++ b/UD_fr-GSD/tagger/tokenizer.as @@ -0,0 +1,16 @@ +Default : IGNORECHAR +SPLITWORD des@de@les +SPLITWORD du@de@le +SPLITWORD au@à@le +SPLITWORD Au@à@le +SPLITWORD aux@à@les +SPLITWORD auxquelles@à@lesquelles +SPLITWORD Des@de@les +SPLITWORD auquel@à@lequel +SPLITWORD Du@de@le +SPLITWORD Aux@à@les +SPLITWORD duquel@de@lequel +SPLITWORD auxquels@à@lesquels +SPLITWORD desquelles@de@lesquelles +ADDCHARTOWORD +ENDWORD diff --git a/UD_fr-GSD/tagger/tokenizer.cla b/UD_fr-GSD/tagger/tokenizer.cla new file mode 100644 index 0000000000000000000000000000000000000000..e0a1578142a5e5c9f4c446f3c3c7dda93dfb92e8 --- /dev/null +++ b/UD_fr-GSD/tagger/tokenizer.cla @@ -0,0 +1,6 @@ +Name : Tokenizer +Type : Prediction +Oracle : tokenizer +Feature Model : tokenizer.fm +Action Set : tokenizer.as +Topology : (500,RELU,0.3) diff --git a/UD_fr-GSD/tagger/tokenizer.fm b/UD_fr-GSD/tagger/tokenizer.fm new file mode 100644 index 0000000000000000000000000000000000000000..e85005759543eee3de06d04bea68eb3603a04614 --- /dev/null +++ b/UD_fr-GSD/tagger/tokenizer.fm @@ -0,0 +1,37 @@ +# Features classiques +# FORM +b.0#FORM.fasttext +b.-1#FORM.fasttext +b.-2#FORM.fasttext +# POS +b.-1#POS +b.-2#POS +b.-3#POS +# SIGNATURES +b.-1#SGN +b.0#SGN +# UPPERCASE +b.0#FORM.U +# UPPERCASE +b.0#FORM.LEN +# SUFFIXES +b.0#FORM.PART.-4.-4 +b.0#FORM.PART.-3.-3 +b.0#FORM.PART.-2.-2 +b.0#FORM.PART.-1.-1 +b.0#FORM.PART.0.0 +b.0#FORM.PART.1.1 +b.0#FORM.PART.2.2 +b.0#FORM.PART.3.3 +# RAW INPUT +raw.-5 +raw.-4 +raw.-3 +raw.-2 +raw.-1 +raw.0 +raw.2 +raw.3 +raw.4 +raw.5 +raw.6 diff --git a/UD_fr-GSD/tagger/train.bd b/UD_fr-GSD/tagger/train.bd index 49dc16a0d1f84d14e8b0f9d5a5da76d238c9b22e..f8765fa78024db9bfd3bb325bea8f3d7fa8626f4 100644 --- a/UD_fr-GSD/tagger/train.bd +++ b/UD_fr-GSD/tagger/train.bd @@ -1,6 +1,8 @@ #Name ref/hyp dict Policy Must print?# ############################################ -FORM ref form FromZero 1 +ID hyp none FromZero 1 +FORM hyp form FromZero 1 POS hyp pos FromZero 1 SGN hyp sgn FromZero 1 EOS ref int FromZero 1 +TEXT ref none Final 0 diff --git a/scripts/train.sh b/scripts/train.sh index c38f6f5c5cb501db7dd087df3e5a4e8ddefdacd7..b614ce4c00aa0b83ae4efc36ec05bc7c7a02569a 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -1,7 +1,7 @@ #! /bin/bash -TRAIN=../../data/train.mcf -DEV=../../data/dev.mcf +TRAIN=../../data/train_tiny.mcf +DEV=../../data/dev_tiny.mcf if [ "$2" == "-h" ]; then macaon_train "-h" diff --git a/tools/conllu2mcf.py b/tools/conllu2mcf.py index 3612feb88e99471ccab04f968a0b6da315db26e4..2ca8c7a8db71dc3a63dcc1040773b717c47b4cc1 100755 --- a/tools/conllu2mcf.py +++ b/tools/conllu2mcf.py @@ -32,16 +32,35 @@ def main() : output = [] previousId = -1 + currentSentence = "" for line in open(sys.argv[1], encoding="utf8") : clean = line.strip() if len(clean) < 2 : continue - if line[0] == '#' : + if line.split('=')[0] == "# sent_id " : + continue + if line.split('=')[0] == "# text " : + currentSentence = line[8:].strip() continue columns = clean.split('\t') if len(columns[int(conllMCDr["ID"])].split('-')) > 1 : + lineInMCF = [] + for index in mcfMCD : + colName = mcfMCD[index] + while len(lineInMCF) < int(index)+1 : + lineInMCF.append("") + value = "_" + if colName == "EOS" : + if int(columns[int(conllMCDr["ID"])].split('-')[0]) < previousId : + value = "1" + previousId = int(columns[int(conllMCDr["ID"])].split('-')[0]) + if mcfMCD[index] in conllMCDr : + indexInColumns = int(conllMCDr[mcfMCD[index]]) + value = columns[indexInColumns] + lineInMCF[int(index)] = value; + output.append(lineInMCF) continue id = int(columns[int(conllMCDr["ID"])]) @@ -50,10 +69,12 @@ def main() : if gov == 0 : relGov = 0 eos = "_" + textValue = "_" if id < previousId : eos = "1" previousId = id + textValue = currentSentence lineInMCF = [] for index in mcfMCD : @@ -63,6 +84,8 @@ def main() : value = eos elif colName == "GOV" : value = relGov + elif colName == "TEXT" : + value = textValue else : indexInColumns = int(conllMCDr[mcfMCD[index]]) value = columns[indexInColumns] @@ -72,11 +95,24 @@ def main() : lineInMCF[int(index)] = value output.append(lineInMCF) + hasText = False + textIndex = 0 EOSIndex = int(mcfMCDr["EOS"]) + + if "TEXT" in mcfMCDr : + hasText = True + textIndex = int(mcfMCDr["TEXT"]) + for i in range(len(output)-1) : output[i][EOSIndex] = output[i+1][EOSIndex] + output[-1][EOSIndex] = "1" + if hasText : + for i in range(len(output)) : + if output[i][EOSIndex] != "1" : + output[i][textIndex] = "_" + outputFile = open(sys.argv[3], "w", encoding="utf8") for outputLine in output : for i in range(len(outputLine)) : diff --git a/tools/mcf2conllu.py b/tools/mcf2conllu.py index ae369143a8dcea77fead862e472b794e96c7bc04..d678e2cb219278552150a274e3141084c7922a0a 100755 --- a/tools/mcf2conllu.py +++ b/tools/mcf2conllu.py @@ -36,9 +36,6 @@ def main() : if len(line) < 2 or line[0] == '#' : continue splited = striped.split('\t') - if len(splited) != len(mcfMCD) : - print("ERROR : line \'%s\' wrong format.\n"%line) - exit(1) toPrint = "" @@ -46,29 +43,42 @@ def main() : col = conllMCD[str(ind)] if col == "EMPTY" : toPrint += "_\t" - elif col == "ID" : + elif col == "ID" and "ID" not in mcfMCDr : toPrint += str(curID) + "\t" elif col == "GOV" : - relInd = int(splited[int(mcfMCDr["GOV"])]) - gov = 0 - if relInd != 0 : - gov = relInd + curID - toPrint += str(gov) + '\t' + if int(mcfMCDr["GOV"]) >= len(splited) : + if "ID" in mcfMCDr : + curID = int(splited[int(mcfMCDr["ID"])].split('-')[0]) + if (curID == 1) and not ("EOS" in mcfMCDr and int(mcfMCDr["EOS"]) < len(splited)) : + toPrint += str(0)+'\t' + else : + toPrint += str(1)+'\t' + else : + relInd = int(splited[int(mcfMCDr["GOV"])]) + gov = 0 + if relInd != 0 : + gov = relInd + curID + toPrint += str(gov) + '\t' else : if col not in mcfMCDr : print("ERROR : %s not in mcf.mcd."%col) exit(1) - toPrint += splited[int(mcfMCDr[col])] + '\t' + if int(mcfMCDr[col]) >= len(splited) : + toPrint += '_\t' + else : + toPrint += splited[int(mcfMCDr[col])] + '\t' print(toPrint[:-1]) - curID += 1 + + if "ID" not in mcfMCDr : + curID += 1 - if "EOS" in mcfMCDr : + if "EOS" in mcfMCDr and int(mcfMCDr["EOS"]) < len(splited) : if splited[int(mcfMCDr["EOS"])] == "1" : print("") curID = 1 - if __name__ == "__main__" : main() + print("")