Skip to content
Snippets Groups Projects
Commit 5cf44f70 authored by Franck Dary
Browse files

Added tokenization

parent a3a763db
No related branches found
No related tags found
No related merge requests found
...@@ -6,6 +6,7 @@ DEV=$(UD_DIR)/dev.conllu ...@@ -6,6 +6,7 @@ DEV=$(UD_DIR)/dev.conllu
MCD=wpmlgfs.mcd MCD=wpmlgfs.mcd
CONLLUMCD=conllu.mcd CONLLUMCD=conllu.mcd
CONLLU2MCF=$(TOOLS)/conllu2mcf.py CONLLU2MCF=$(TOOLS)/conllu2mcf.py
CONLL2TXT=$(TOOLS)/conll2text.py
#This part is for lemmatizer rules and exceptions computation #This part is for lemmatizer rules and exceptions computation
THRESHOLD=10 THRESHOLD=10
...@@ -23,18 +24,22 @@ $(CONLLUMCD): ...@@ -23,18 +24,22 @@ $(CONLLUMCD):
train.mcf: $(TRAIN) $(CONLLUMCD) train.mcf: $(TRAIN) $(CONLLUMCD)
$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) $(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
$(TOOLS)/conllu2splits.py $< $(CONLLUMCD) > splits.txt
$(CONLL2TXT) $< $$'\n' > train.txt
$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > train_noponct.mcf $(TOOLS)/mcfRemovePonct.py $@ $(MCD) > train_noponct.mcf
$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf train_tiny.mcf $(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf train_tiny.mcf
rm dummy.mcf rm dummy.mcf
test.mcf: $(TEST) test.mcf: $(TEST)
$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) $(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
$(CONLL2TXT) $< $$'\n' > test.txt
$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > test_noponct.mcf $(TOOLS)/mcfRemovePonct.py $@ $(MCD) > test_noponct.mcf
$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.6 dummy.mcf test_tiny.mcf $(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.6 dummy.mcf test_tiny.mcf
rm dummy.mcf rm dummy.mcf
dev.mcf: $(DEV) dev.mcf: $(DEV)
$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD) $(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
$(CONLL2TXT) $< $$'\n' > dev.txt
$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > dev_noponct.mcf $(TOOLS)/mcfRemovePonct.py $@ $(MCD) > dev_noponct.mcf
$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf dev_tiny.mcf $(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf dev_tiny.mcf
rm dummy.mcf rm dummy.mcf
...@@ -52,6 +57,7 @@ $(RULES_FILENAME): $(FPLM_FILENAME) ...@@ -52,6 +57,7 @@ $(RULES_FILENAME): $(FPLM_FILENAME)
clean: clean:
- rm *\.mcf - rm *\.mcf
- rm *\.txt
- rm *\.conll* - rm *\.conll*
- rm conll*\.mcd - rm conll*\.mcd
- rm $(RULES_FILENAME) - rm $(RULES_FILENAME)
......
0 FORM 0 ID
1 POS 1 FORM
2 MORPHO 2 POS
3 LEMMA 3 MORPHO
4 GOV 4 LEMMA
5 LABEL 5 GOV
6 EOS 6 LABEL
7 EOS
8 TEXT
...@@ -3,6 +3,6 @@ ...@@ -3,6 +3,6 @@
LANG=UD_fr-GSD LANG=UD_fr-GSD
MCF=../data/test.mcf MCF=../data/test.mcf
MCD=../data/wpmlgfs.mcd MCD=../data/wpmlgfs.mcd
ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM" ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM --ignore TEXT"
exec ../../scripts/eval.py $LANG $MCF $MCD $* $ARGS exec ../../scripts/eval.py $LANG $MCF $MCD $* $ARGS
Name : Tagger with error correction Name : Tagger Machine
Dicts : tagger.dicts Dicts : tagger.dicts
%CLASSIFIERS %CLASSIFIERS
strategy strategy.cla strategy strategy.cla
tagger tagger.cla tagger tagger.cla
tokenizer tokenizer.cla
signature signature.cla signature signature.cla
error_tagger error_tagger.cla
%STATES %STATES
strategy strategy strategy strategy
tokenizer tokenizer
signature signature signature signature
tagger tagger tagger tagger
error_tagger error_tagger
%TRANSITIONS %TRANSITIONS
strategy signature MOVE signature strategy signature MOVE signature
strategy tagger MOVE tagger strategy tagger MOVE tagger
tagger error_tagger * strategy tokenizer MOVE tokenizer
error_tagger tagger BACK tagger strategy *
error_tagger strategy *
signature strategy * signature strategy *
tokenizer strategy *
...@@ -3,13 +3,17 @@ Dicts : tagger.dicts ...@@ -3,13 +3,17 @@ Dicts : tagger.dicts
%CLASSIFIERS %CLASSIFIERS
strategy strategy.cla strategy strategy.cla
tagger tagger.cla tagger tagger.cla
tokenizer tokenizer.cla
signature signature.cla signature signature.cla
%STATES %STATES
strategy strategy strategy strategy
tokenizer tokenizer
signature signature signature signature
tagger tagger tagger tagger
%TRANSITIONS %TRANSITIONS
strategy signature MOVE signature strategy signature MOVE signature
strategy tagger MOVE tagger strategy tagger MOVE tagger
strategy tokenizer MOVE tokenizer
tagger strategy * tagger strategy *
signature strategy * signature strategy *
tokenizer strategy *
Name : Strategy Name : Strategy
Type : Information Type : Information
Oracle : strategy_tagger Oracle : strategy_tokenizer,tagger
Oracle Filename : none Oracle Filename : none
...@@ -18,6 +18,15 @@ Tagger_sgn 10 Embeddings ...@@ -18,6 +18,15 @@ Tagger_sgn 10 Embeddings
Tagger_actions 05 Embeddings Tagger_actions 05 Embeddings
Tagger_entropy 05 Embeddings Tagger_entropy 05 Embeddings
######################################################################### #########################################################################
Tokenizer_bool 02 Embeddings
Tokenizer_int 05 Embeddings
Tokenizer_letters 30 Embeddings
Tokenizer_pos 15 Embeddings
Tokenizer_form 30 Embeddings
Tokenizer_sgn 10 Embeddings
Tokenizer_actions 05 Embeddings
Tokenizer_entropy 05 Embeddings
#########################################################################
Error_Tagger_bool 02 Embeddings Error_Tagger_bool 02 Embeddings
Error_Tagger_int 05 Embeddings Error_Tagger_int 05 Embeddings
Error_Tagger_letters 30 Embeddings Error_Tagger_letters 30 Embeddings
......
...@@ -19,8 +19,6 @@ b.0#FORM.U ...@@ -19,8 +19,6 @@ b.0#FORM.U
#b.1#FORM.U #b.1#FORM.U
# LENGTH # LENGTH
b.0#FORM.LEN b.0#FORM.LEN
# EOS
b.-2#EOS
# SUFFIXES # SUFFIXES
b.0#FORM.PART.-4.-4 b.0#FORM.PART.-4.-4
b.0#FORM.PART.-3.-3 b.0#FORM.PART.-3.-3
......
#Name ref/hyp dict Policy Must print?# #Name ref/hyp dict Policy Must print?#
############################################ ############################################
FORM ref form Final 1 ID hyp none FromZero 1
FORM hyp form Final 1
POS hyp pos Final 1 POS hyp pos Final 1
SGN hyp sgn Final 0 SGN hyp sgn Final 0
TEXT ref none Final 0
Default : IGNORECHAR
SPLITWORD des@de@les
SPLITWORD du@de@le
SPLITWORD au@à@le
SPLITWORD Au@à@le
SPLITWORD aux@à@les
SPLITWORD auxquelles@à@lesquelles
SPLITWORD Des@de@les
SPLITWORD auquel@à@lequel
SPLITWORD Du@de@le
SPLITWORD Aux@à@les
SPLITWORD duquel@de@lequel
SPLITWORD auxquels@à@lesquels
SPLITWORD desquelles@de@lesquelles
ADDCHARTOWORD
ENDWORD
Name : Tokenizer
Type : Prediction
Oracle : tokenizer
Feature Model : tokenizer.fm
Action Set : tokenizer.as
Topology : (500,RELU,0.3)
# Features classiques
# FORM
b.0#FORM.fasttext
b.-1#FORM.fasttext
b.-2#FORM.fasttext
# POS
b.-1#POS
b.-2#POS
b.-3#POS
# SIGNATURES
b.-1#SGN
b.0#SGN
# UPPERCASE
b.0#FORM.U
# LENGTH
b.0#FORM.LEN
# SUFFIXES
b.0#FORM.PART.-4.-4
b.0#FORM.PART.-3.-3
b.0#FORM.PART.-2.-2
b.0#FORM.PART.-1.-1
b.0#FORM.PART.0.0
b.0#FORM.PART.1.1
b.0#FORM.PART.2.2
b.0#FORM.PART.3.3
# RAW INPUT
raw.-5
raw.-4
raw.-3
raw.-2
raw.-1
raw.0
raw.2
raw.3
raw.4
raw.5
raw.6
#Name ref/hyp dict Policy Must print?# #Name ref/hyp dict Policy Must print?#
############################################ ############################################
FORM ref form FromZero 1 ID hyp none FromZero 1
FORM hyp form FromZero 1
POS hyp pos FromZero 1 POS hyp pos FromZero 1
SGN hyp sgn FromZero 1 SGN hyp sgn FromZero 1
EOS ref int FromZero 1 EOS ref int FromZero 1
TEXT ref none Final 0
#! /bin/bash #! /bin/bash
TRAIN=../../data/train.mcf TRAIN=../../data/train_tiny.mcf
DEV=../../data/dev.mcf DEV=../../data/dev_tiny.mcf
if [ "$2" == "-h" ]; then if [ "$2" == "-h" ]; then
macaon_train "-h" macaon_train "-h"
......
...@@ -32,16 +32,35 @@ def main() : ...@@ -32,16 +32,35 @@ def main() :
output = [] output = []
previousId = -1 previousId = -1
currentSentence = ""
for line in open(sys.argv[1], encoding="utf8") : for line in open(sys.argv[1], encoding="utf8") :
clean = line.strip() clean = line.strip()
if len(clean) < 2 : if len(clean) < 2 :
continue continue
if line[0] == '#' : if line.split('=')[0] == "# sent_id " :
continue
if line.split('=')[0] == "# text " :
currentSentence = line[8:].strip()
continue continue
columns = clean.split('\t') columns = clean.split('\t')
if len(columns[int(conllMCDr["ID"])].split('-')) > 1 : if len(columns[int(conllMCDr["ID"])].split('-')) > 1 :
lineInMCF = []
for index in mcfMCD :
colName = mcfMCD[index]
while len(lineInMCF) < int(index)+1 :
lineInMCF.append("")
value = "_"
if colName == "EOS" :
if int(columns[int(conllMCDr["ID"])].split('-')[0]) < previousId :
value = "1"
previousId = int(columns[int(conllMCDr["ID"])].split('-')[0])
if mcfMCD[index] in conllMCDr :
indexInColumns = int(conllMCDr[mcfMCD[index]])
value = columns[indexInColumns]
lineInMCF[int(index)] = value;
output.append(lineInMCF)
continue continue
id = int(columns[int(conllMCDr["ID"])]) id = int(columns[int(conllMCDr["ID"])])
...@@ -50,10 +69,12 @@ def main() : ...@@ -50,10 +69,12 @@ def main() :
if gov == 0 : if gov == 0 :
relGov = 0 relGov = 0
eos = "_" eos = "_"
textValue = "_"
if id < previousId : if id < previousId :
eos = "1" eos = "1"
previousId = id previousId = id
textValue = currentSentence
lineInMCF = [] lineInMCF = []
for index in mcfMCD : for index in mcfMCD :
...@@ -63,6 +84,8 @@ def main() : ...@@ -63,6 +84,8 @@ def main() :
value = eos value = eos
elif colName == "GOV" : elif colName == "GOV" :
value = relGov value = relGov
elif colName == "TEXT" :
value = textValue
else : else :
indexInColumns = int(conllMCDr[mcfMCD[index]]) indexInColumns = int(conllMCDr[mcfMCD[index]])
value = columns[indexInColumns] value = columns[indexInColumns]
...@@ -72,11 +95,24 @@ def main() : ...@@ -72,11 +95,24 @@ def main() :
lineInMCF[int(index)] = value lineInMCF[int(index)] = value
output.append(lineInMCF) output.append(lineInMCF)
hasText = False
textIndex = 0
EOSIndex = int(mcfMCDr["EOS"]) EOSIndex = int(mcfMCDr["EOS"])
if "TEXT" in mcfMCDr :
hasText = True
textIndex = int(mcfMCDr["TEXT"])
for i in range(len(output)-1) : for i in range(len(output)-1) :
output[i][EOSIndex] = output[i+1][EOSIndex] output[i][EOSIndex] = output[i+1][EOSIndex]
output[-1][EOSIndex] = "1" output[-1][EOSIndex] = "1"
if hasText :
for i in range(len(output)) :
if output[i][EOSIndex] != "1" :
output[i][textIndex] = "_"
outputFile = open(sys.argv[3], "w", encoding="utf8") outputFile = open(sys.argv[3], "w", encoding="utf8")
for outputLine in output : for outputLine in output :
for i in range(len(outputLine)) : for i in range(len(outputLine)) :
......
...@@ -36,9 +36,6 @@ def main() : ...@@ -36,9 +36,6 @@ def main() :
if len(line) < 2 or line[0] == '#' : if len(line) < 2 or line[0] == '#' :
continue continue
splited = striped.split('\t') splited = striped.split('\t')
if len(splited) != len(mcfMCD) :
print("ERROR : line \'%s\' wrong format.\n"%line)
exit(1)
toPrint = "" toPrint = ""
...@@ -46,9 +43,17 @@ def main() : ...@@ -46,9 +43,17 @@ def main() :
col = conllMCD[str(ind)] col = conllMCD[str(ind)]
if col == "EMPTY" : if col == "EMPTY" :
toPrint += "_\t" toPrint += "_\t"
elif col == "ID" : elif col == "ID" and "ID" not in mcfMCDr :
toPrint += str(curID) + "\t" toPrint += str(curID) + "\t"
elif col == "GOV" : elif col == "GOV" :
if int(mcfMCDr["GOV"]) >= len(splited) :
if "ID" in mcfMCDr :
curID = int(splited[int(mcfMCDr["ID"])].split('-')[0])
if (curID == 1) and not ("EOS" in mcfMCDr and int(mcfMCDr["EOS"]) < len(splited)) :
toPrint += str(0)+'\t'
else :
toPrint += str(1)+'\t'
else :
relInd = int(splited[int(mcfMCDr["GOV"])]) relInd = int(splited[int(mcfMCDr["GOV"])])
gov = 0 gov = 0
if relInd != 0 : if relInd != 0 :
...@@ -58,17 +63,22 @@ def main() : ...@@ -58,17 +63,22 @@ def main() :
if col not in mcfMCDr : if col not in mcfMCDr :
print("ERROR : %s not in mcf.mcd."%col) print("ERROR : %s not in mcf.mcd."%col)
exit(1) exit(1)
if int(mcfMCDr[col]) >= len(splited) :
toPrint += '_\t'
else :
toPrint += splited[int(mcfMCDr[col])] + '\t' toPrint += splited[int(mcfMCDr[col])] + '\t'
print(toPrint[:-1]) print(toPrint[:-1])
if "ID" not in mcfMCDr :
curID += 1 curID += 1
if "EOS" in mcfMCDr : if "EOS" in mcfMCDr and int(mcfMCDr["EOS"]) < len(splited) :
if splited[int(mcfMCDr["EOS"])] == "1" : if splited[int(mcfMCDr["EOS"])] == "1" :
print("") print("")
curID = 1 curID = 1
if __name__ == "__main__" : if __name__ == "__main__" :
main() main()
print("")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment