Skip to content
Snippets Groups Projects
Commit 5cf44f70 authored by Franck Dary's avatar Franck Dary
Browse files

Added tokenization

parent a3a763db
Branches
No related tags found
No related merge requests found
......@@ -6,6 +6,7 @@ DEV=$(UD_DIR)/dev.conllu
MCD=wpmlgfs.mcd
CONLLUMCD=conllu.mcd
CONLLU2MCF=$(TOOLS)/conllu2mcf.py
CONLL2TXT=$(TOOLS)/conll2text.py
#This part is for lemmatizer rules and exceptions computation
THRESHOLD=10
......@@ -23,18 +24,22 @@ $(CONLLUMCD):
# Convert the training corpus $(TRAIN) to MCF format ($@ = train.mcf).
# Side products written by this recipe: splits.txt, train.txt,
# train_noponct.mcf and train_tiny.mcf.
#
# The shuffle step emits an extra output file that is deleted right away.
# Its name is derived from $@ instead of a fixed "dummy.mcf", so that this
# recipe cannot clobber the scratch file of the test.mcf / dev.mcf recipes
# when they run concurrently under `make -j`.
#
# NOTE(review): $$'\n' reaches the shell as $'\n' (bash ANSI-C quoting for a
# newline); this presumably requires SHELL to be bash -- confirm.
train.mcf: $(TRAIN) $(CONLLUMCD)
	$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
	$(TOOLS)/conllu2splits.py $< $(CONLLUMCD) > splits.txt
	$(CONLL2TXT) $< $$'\n' > train.txt
	$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > train_noponct.mcf
	$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 $@.discard train_tiny.mcf
	rm $@.discard
# Convert the test corpus $(TEST) to MCF format ($@ = test.mcf).
# Side products written by this recipe: test.txt, test_noponct.mcf and
# test_tiny.mcf.
#
# The shuffle step emits an extra output file that is deleted right away.
# Its name is derived from $@ instead of a fixed "dummy.mcf", so that this
# recipe cannot clobber the scratch file of the train.mcf / dev.mcf recipes
# when they run concurrently under `make -j`.
#
# NOTE(review): $$'\n' reaches the shell as $'\n' (bash ANSI-C quoting for a
# newline); this presumably requires SHELL to be bash -- confirm.
test.mcf: $(TEST)
	$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
	$(CONLL2TXT) $< $$'\n' > test.txt
	$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > test_noponct.mcf
	$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.6 $@.discard test_tiny.mcf
	rm $@.discard
# Convert the development corpus $(DEV) to MCF format ($@ = dev.mcf).
# Side products written by this recipe: dev.txt, dev_noponct.mcf and
# dev_tiny.mcf.
#
# The shuffle step emits an extra output file that is deleted right away.
# Its name is derived from $@ instead of a fixed "dummy.mcf", so that this
# recipe cannot clobber the scratch file of the train.mcf / test.mcf recipes
# when they run concurrently under `make -j`.
#
# NOTE(review): $$'\n' reaches the shell as $'\n' (bash ANSI-C quoting for a
# newline); this presumably requires SHELL to be bash -- confirm.
dev.mcf: $(DEV)
	$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
	$(CONLL2TXT) $< $$'\n' > dev.txt
	$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > dev_noponct.mcf
	$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 $@.discard dev_tiny.mcf
	rm $@.discard
......@@ -52,6 +57,7 @@ $(RULES_FILENAME): $(FPLM_FILENAME)
clean:
- rm *\.mcf
- rm *\.txt
- rm *\.conll*
- rm conll*\.mcd
- rm $(RULES_FILENAME)
......
0 FORM
1 POS
2 MORPHO
3 LEMMA
4 GOV
5 LABEL
6 EOS
0 ID
1 FORM
2 POS
3 MORPHO
4 LEMMA
5 GOV
6 LABEL
7 EOS
8 TEXT
......@@ -3,6 +3,6 @@
LANG=UD_fr-GSD
MCF=../data/test.mcf
MCD=../data/wpmlgfs.mcd
ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM"
ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM --ignore TEXT"
exec ../../scripts/eval.py $LANG $MCF $MCD $* $ARGS
Name : Tagger with error correction
Name : Tagger Machine
Dicts : tagger.dicts
%CLASSIFIERS
strategy strategy.cla
tagger tagger.cla
tokenizer tokenizer.cla
signature signature.cla
error_tagger error_tagger.cla
%STATES
strategy strategy
tokenizer tokenizer
signature signature
tagger tagger
error_tagger error_tagger
%TRANSITIONS
strategy signature MOVE signature
strategy tagger MOVE tagger
tagger error_tagger *
error_tagger tagger BACK
error_tagger strategy *
strategy tokenizer MOVE tokenizer
tagger strategy *
signature strategy *
tokenizer strategy *
......@@ -3,13 +3,17 @@ Dicts : tagger.dicts
%CLASSIFIERS
strategy strategy.cla
tagger tagger.cla
tokenizer tokenizer.cla
signature signature.cla
%STATES
strategy strategy
tokenizer tokenizer
signature signature
tagger tagger
%TRANSITIONS
strategy signature MOVE signature
strategy tagger MOVE tagger
strategy tokenizer MOVE tokenizer
tagger strategy *
signature strategy *
tokenizer strategy *
Name : Strategy
Type : Information
Oracle : strategy_tagger
Oracle : strategy_tokenizer,tagger
Oracle Filename : none
......@@ -18,6 +18,15 @@ Tagger_sgn 10 Embeddings
Tagger_actions 05 Embeddings
Tagger_entropy 05 Embeddings
#########################################################################
Tokenizer_bool 02 Embeddings
Tokenizer_int 05 Embeddings
Tokenizer_letters 30 Embeddings
Tokenizer_pos 15 Embeddings
Tokenizer_form 30 Embeddings
Tokenizer_sgn 10 Embeddings
Tokenizer_actions 05 Embeddings
Tokenizer_entropy 05 Embeddings
#########################################################################
Error_Tagger_bool 02 Embeddings
Error_Tagger_int 05 Embeddings
Error_Tagger_letters 30 Embeddings
......
......@@ -19,8 +19,6 @@ b.0#FORM.U
#b.1#FORM.U
# LENGTH
b.0#FORM.LEN
# EOS
b.-2#EOS
# SUFFIXES
b.0#FORM.PART.-4.-4
b.0#FORM.PART.-3.-3
......
#Name ref/hyp dict Policy Must print?#
############################################
FORM ref form Final 1
ID hyp none FromZero 1
FORM hyp form Final 1
POS hyp pos Final 1
SGN hyp sgn Final 0
TEXT ref none Final 0
Default : IGNORECHAR
SPLITWORD des@de@les
SPLITWORD du@de@le
SPLITWORD au@à@le
SPLITWORD Au@à@le
SPLITWORD aux@à@les
SPLITWORD auxquelles@à@lesquelles
SPLITWORD Des@de@les
SPLITWORD auquel@à@lequel
SPLITWORD Du@de@le
SPLITWORD Aux@à@les
SPLITWORD duquel@de@lequel
SPLITWORD auxquels@à@lesquels
SPLITWORD desquelles@de@lesquelles
ADDCHARTOWORD
ENDWORD
Name : Tokenizer
Type : Prediction
Oracle : tokenizer
Feature Model : tokenizer.fm
Action Set : tokenizer.as
Topology : (500,RELU,0.3)
# Standard features
# FORM
b.0#FORM.fasttext
b.-1#FORM.fasttext
b.-2#FORM.fasttext
# POS
b.-1#POS
b.-2#POS
b.-3#POS
# SIGNATURES
b.-1#SGN
b.0#SGN
# UPPERCASE
b.0#FORM.U
# LENGTH
b.0#FORM.LEN
# SUFFIXES
b.0#FORM.PART.-4.-4
b.0#FORM.PART.-3.-3
b.0#FORM.PART.-2.-2
b.0#FORM.PART.-1.-1
b.0#FORM.PART.0.0
b.0#FORM.PART.1.1
b.0#FORM.PART.2.2
b.0#FORM.PART.3.3
# RAW INPUT
raw.-5
raw.-4
raw.-3
raw.-2
raw.-1
raw.0
raw.2
raw.3
raw.4
raw.5
raw.6
#Name ref/hyp dict Policy Must print?#
############################################
FORM ref form FromZero 1
ID hyp none FromZero 1
FORM hyp form FromZero 1
POS hyp pos FromZero 1
SGN hyp sgn FromZero 1
EOS ref int FromZero 1
TEXT ref none Final 0
#! /bin/bash
TRAIN=../../data/train.mcf
DEV=../../data/dev.mcf
TRAIN=../../data/train_tiny.mcf
DEV=../../data/dev_tiny.mcf
if [ "$2" == "-h" ]; then
macaon_train "-h"
......
......@@ -32,16 +32,35 @@ def main() :
output = []
previousId = -1
currentSentence = ""
for line in open(sys.argv[1], encoding="utf8") :
clean = line.strip()
if len(clean) < 2 :
continue
if line[0] == '#' :
if line.split('=')[0] == "# sent_id " :
continue
if line.split('=')[0] == "# text " :
currentSentence = line[8:].strip()
continue
columns = clean.split('\t')
if len(columns[int(conllMCDr["ID"])].split('-')) > 1 :
lineInMCF = []
for index in mcfMCD :
colName = mcfMCD[index]
while len(lineInMCF) < int(index)+1 :
lineInMCF.append("")
value = "_"
if colName == "EOS" :
if int(columns[int(conllMCDr["ID"])].split('-')[0]) < previousId :
value = "1"
previousId = int(columns[int(conllMCDr["ID"])].split('-')[0])
if mcfMCD[index] in conllMCDr :
indexInColumns = int(conllMCDr[mcfMCD[index]])
value = columns[indexInColumns]
lineInMCF[int(index)] = value;
output.append(lineInMCF)
continue
id = int(columns[int(conllMCDr["ID"])])
......@@ -50,10 +69,12 @@ def main() :
if gov == 0 :
relGov = 0
eos = "_"
textValue = "_"
if id < previousId :
eos = "1"
previousId = id
textValue = currentSentence
lineInMCF = []
for index in mcfMCD :
......@@ -63,6 +84,8 @@ def main() :
value = eos
elif colName == "GOV" :
value = relGov
elif colName == "TEXT" :
value = textValue
else :
indexInColumns = int(conllMCDr[mcfMCD[index]])
value = columns[indexInColumns]
......@@ -72,11 +95,24 @@ def main() :
lineInMCF[int(index)] = value
output.append(lineInMCF)
hasText = False
textIndex = 0
EOSIndex = int(mcfMCDr["EOS"])
if "TEXT" in mcfMCDr :
hasText = True
textIndex = int(mcfMCDr["TEXT"])
for i in range(len(output)-1) :
output[i][EOSIndex] = output[i+1][EOSIndex]
output[-1][EOSIndex] = "1"
if hasText :
for i in range(len(output)) :
if output[i][EOSIndex] != "1" :
output[i][textIndex] = "_"
outputFile = open(sys.argv[3], "w", encoding="utf8")
for outputLine in output :
for i in range(len(outputLine)) :
......
......@@ -36,9 +36,6 @@ def main() :
if len(line) < 2 or line[0] == '#' :
continue
splited = striped.split('\t')
if len(splited) != len(mcfMCD) :
print("ERROR : line \'%s\' wrong format.\n"%line)
exit(1)
toPrint = ""
......@@ -46,9 +43,17 @@ def main() :
col = conllMCD[str(ind)]
if col == "EMPTY" :
toPrint += "_\t"
elif col == "ID" :
elif col == "ID" and "ID" not in mcfMCDr :
toPrint += str(curID) + "\t"
elif col == "GOV" :
if int(mcfMCDr["GOV"]) >= len(splited) :
if "ID" in mcfMCDr :
curID = int(splited[int(mcfMCDr["ID"])].split('-')[0])
if (curID == 1) and not ("EOS" in mcfMCDr and int(mcfMCDr["EOS"]) < len(splited)) :
toPrint += str(0)+'\t'
else :
toPrint += str(1)+'\t'
else :
relInd = int(splited[int(mcfMCDr["GOV"])])
gov = 0
if relInd != 0 :
......@@ -58,17 +63,22 @@ def main() :
if col not in mcfMCDr :
print("ERROR : %s not in mcf.mcd."%col)
exit(1)
if int(mcfMCDr[col]) >= len(splited) :
toPrint += '_\t'
else :
toPrint += splited[int(mcfMCDr[col])] + '\t'
print(toPrint[:-1])
if "ID" not in mcfMCDr :
curID += 1
if "EOS" in mcfMCDr :
if "EOS" in mcfMCDr and int(mcfMCDr["EOS"]) < len(splited) :
if splited[int(mcfMCDr["EOS"])] == "1" :
print("")
curID = 1
if __name__ == "__main__" :
main()
print("")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment