Added tokenization

5cf44f70 · Franck Dary · a3a763db · 5cf44f70 · 5cf44f70 · 5cf44f70
Commit 5cf44f70 authored Sep 20, 2019 by Franck Dary
--- a/UD_fr-GSD/data/Makefile
+++ b/UD_fr-GSD/data/Makefile
@@ -6,6 +6,9 @@ DEV=$(UD_DIR)/dev.conllu
 MCD=wpmlgfs.mcd
 CONLLUMCD=conllu.mcd
 CONLLU2MCF=$(TOOLS)/conllu2mcf.py
+CONLL2TXT=$(TOOLS)/conll2text.py
+#The recipes pass $'\n' to conll2text.py; this quoting is bash-only and make defaults to /bin/sh
+SHELL=/bin/bash

 #This part is for lemmatizer rules and excpetions computation
 THRESHOLD=10
@@ -23,18 +24,22 @@ $(CONLLUMCD):

 train.mcf: $(TRAIN) $(CONLLUMCD)
 	$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
+	$(TOOLS)/conllu2splits.py $< $(CONLLUMCD) > splits.txt
+	$(CONLL2TXT) $< $$'\n' > train.txt
 	$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > train_noponct.mcf
 	$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf train_tiny.mcf
 	rm dummy.mcf

 test.mcf: $(TEST)
 	$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
+	$(CONLL2TXT) $< $$'\n' > test.txt
 	$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > test_noponct.mcf
 	$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.6 dummy.mcf test_tiny.mcf
 	rm dummy.mcf

 dev.mcf: $(DEV)
 	$(CONLLU2MCF) $< $(CONLLUMCD) $@ $(MCD)
+	$(CONLL2TXT) $< $$'\n' > dev.txt
 	$(TOOLS)/mcfRemovePonct.py $@ $(MCD) > dev_noponct.mcf
 	$(TOOLS)/mcfShuffleAndMakeDev.py $@ $(MCD) 0.2 dummy.mcf dev_tiny.mcf
 	rm dummy.mcf
@@ -52,6 +57,7 @@ $(RULES_FILENAME): $(FPLM_FILENAME)

 clean:
 	- rm *\.mcf
+	- rm *\.txt
 	- rm *\.conll*
 	- rm conll*\.mcd
 	- rm $(RULES_FILENAME)

--- a/UD_fr-GSD/data/wpmlgfs.mcd
+++ b/UD_fr-GSD/data/wpmlgfs.mcd
-0 FORM
-1 POS
-2 MORPHO
-3 LEMMA
-4 GOV
-5 LABEL
-6 EOS
+0 ID
+1 FORM
+2 POS
+3 MORPHO
+4 LEMMA
+5 GOV
+6 LABEL
+7 EOS
+8 TEXT
--- a/UD_fr-GSD/eval/eval.sh
+++ b/UD_fr-GSD/eval/eval.sh
@@ -3,6 +3,6 @@
 LANG=UD_fr-GSD
 MCF=../data/test.mcf
 MCD=../data/wpmlgfs.mcd
-ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM"
+ARGS="--keepPunct EOS --relative LABEL GOV --ignore FORM --ignore TEXT"

 exec ../../scripts/eval.py $LANG $MCF $MCD $* $ARGS
--- a/UD_fr-GSD/tagger/machine.tm
+++ b/UD_fr-GSD/tagger/machine.tm
-Name : Tagger with error correction
+Name : Tagger Machine
 Dicts : tagger.dicts
 %CLASSIFIERS
 strategy strategy.cla
 tagger tagger.cla
+tokenizer tokenizer.cla
 signature signature.cla
-error_tagger error_tagger.cla
 %STATES
 strategy strategy
+tokenizer tokenizer
 signature signature
 tagger tagger
-error_tagger error_tagger
 %TRANSITIONS
 strategy signature MOVE signature
 strategy tagger MOVE tagger
-tagger error_tagger *
-error_tagger tagger BACK
-error_tagger strategy *
+strategy tokenizer MOVE tokenizer
+tagger strategy *
 signature strategy *
+tokenizer strategy *
--- a/UD_fr-GSD/tagger/normal.tm
+++ b/UD_fr-GSD/tagger/normal.tm
@@ -3,13 +3,17 @@ Dicts : tagger.dicts
 %CLASSIFIERS
 strategy strategy.cla
 tagger tagger.cla
+tokenizer tokenizer.cla
 signature signature.cla
 %STATES
 strategy strategy
+tokenizer tokenizer
 signature signature
 tagger tagger
 %TRANSITIONS
 strategy signature MOVE signature
 strategy tagger MOVE tagger
+strategy tokenizer MOVE tokenizer
 tagger strategy *
 signature strategy *
+tokenizer strategy *
--- a/UD_fr-GSD/tagger/strategy.cla
+++ b/UD_fr-GSD/tagger/strategy.cla
 Name : Strategy
 Type : Information
-Oracle : strategy_tagger
+Oracle : strategy_tokenizer,tagger
 Oracle Filename : none
--- a/UD_fr-GSD/tagger/tagger.dicts
+++ b/UD_fr-GSD/tagger/tagger.dicts
@@ -18,6 +18,15 @@ Tagger_sgn     10        Embeddings
 Tagger_actions 05        Embeddings
 Tagger_entropy 05        Embeddings
 #########################################################################
+Tokenizer_bool    02        Embeddings
+Tokenizer_int     05        Embeddings
+Tokenizer_letters 30        Embeddings
+Tokenizer_pos     15        Embeddings
+Tokenizer_form    30        Embeddings
+Tokenizer_sgn     10        Embeddings
+Tokenizer_actions 05        Embeddings
+Tokenizer_entropy 05        Embeddings
+#########################################################################
 Error_Tagger_bool    02        Embeddings
 Error_Tagger_int     05        Embeddings
 Error_Tagger_letters 30        Embeddings

--- a/UD_fr-GSD/tagger/tagger.fm
+++ b/UD_fr-GSD/tagger/tagger.fm
@@ -19,8 +19,6 @@ b.0#FORM.U
 #b.1#FORM.U
 # UPPERCASE
 b.0#FORM.LEN
-# EOS
-b.-2#EOS
 # SUFFIXES
 b.0#FORM.PART.-4.-4
 b.0#FORM.PART.-3.-3

--- a/UD_fr-GSD/tagger/test.bd
+++ b/UD_fr-GSD/tagger/test.bd
 #Name  ref/hyp dict    Policy   Must print?#
 ############################################
-FORM   ref     form    Final    1
+ID     hyp     none    FromZero 1
+FORM   hyp     form    Final    1
 POS    hyp     pos     Final    1
 SGN    hyp     sgn     Final    0
+TEXT   ref     none    Final    0
--- a/UD_fr-GSD/tagger/tokenizer.as
+++ b/UD_fr-GSD/tagger/tokenizer.as
+Default : IGNORECHAR
+SPLITWORD des@de@les
+SPLITWORD du@de@le
+SPLITWORD au@à@le
+SPLITWORD Au@à@le
+SPLITWORD aux@à@les
+SPLITWORD auxquelles@à@lesquelles
+SPLITWORD Des@de@les
+SPLITWORD auquel@à@lequel
+SPLITWORD Du@de@le
+SPLITWORD Aux@à@les
+SPLITWORD duquel@de@lequel
+SPLITWORD auxquels@à@lesquels
+SPLITWORD desquelles@de@lesquelles
+ADDCHARTOWORD
+ENDWORD
--- a/UD_fr-GSD/tagger/tokenizer.cla
+++ b/UD_fr-GSD/tagger/tokenizer.cla
+Name : Tokenizer
+Type : Prediction
+Oracle : tokenizer
+Feature Model : tokenizer.fm
+Action Set : tokenizer.as
+Topology : (500,RELU,0.3)
--- a/UD_fr-GSD/tagger/tokenizer.fm
+++ b/UD_fr-GSD/tagger/tokenizer.fm
+# Classic features
+# FORM
+b.0#FORM.fasttext
+b.-1#FORM.fasttext
+b.-2#FORM.fasttext
+# POS
+b.-1#POS
+b.-2#POS
+b.-3#POS
+# SIGNATURES
+b.-1#SGN
+b.0#SGN
+# UPPERCASE
+b.0#FORM.U
+# LENGTH
+b.0#FORM.LEN
+# SUFFIXES
+b.0#FORM.PART.-4.-4
+b.0#FORM.PART.-3.-3
+b.0#FORM.PART.-2.-2
+b.0#FORM.PART.-1.-1
+b.0#FORM.PART.0.0
+b.0#FORM.PART.1.1
+b.0#FORM.PART.2.2
+b.0#FORM.PART.3.3
+# RAW INPUT
+# NOTE(review): raw.1 is not listed between raw.0 and raw.2 - confirm the gap is intentional
+raw.-5
+raw.-4
+raw.-3
+raw.-2
+raw.-1
+raw.0
+raw.2
+raw.3
+raw.4
+raw.5
+raw.6
--- a/UD_fr-GSD/tagger/train.bd
+++ b/UD_fr-GSD/tagger/train.bd
 #Name  ref/hyp dict    Policy   Must print?#
 ############################################
-FORM   ref     form    FromZero 1
+ID     hyp     none    FromZero 1
+FORM   hyp     form    FromZero 1
 POS    hyp     pos     FromZero 1
 SGN    hyp     sgn     FromZero 1
 EOS    ref     int     FromZero 1
+TEXT   ref     none    Final    0
--- a/scripts/train.sh
+++ b/scripts/train.sh
 #! /bin/bash

-TRAIN=../../data/train.mcf
-DEV=../../data/dev.mcf
+TRAIN=../../data/train_tiny.mcf
+DEV=../../data/dev_tiny.mcf

 if [ "$2" == "-h" ]; then
  macaon_train "-h"

--- a/tools/conllu2mcf.py
+++ b/tools/conllu2mcf.py
@@ -32,16 +32,35 @@ def main() :
  output = []
  previousId = -1

+  currentSentence = ""
  for line in open(sys.argv[1], encoding="utf8") :
    clean = line.strip()
    if len(clean) < 2 :
      continue
+    if line.split('=')[0] == "# text " :
+      currentSentence = line[8:].strip() # raw sentence text, written later to the TEXT column
+      continue
     if line[0] == '#' :
       continue

    columns = clean.split('\t')

    if len(columns[int(conllMCDr["ID"])].split('-')) > 1 :
+      lineInMCF = []
+      for index in mcfMCD :
+        colName = mcfMCD[index]
+        while  len(lineInMCF) < int(index)+1 :
+          lineInMCF.append("")
+        value = "_"
+        if colName == "EOS" :
+          if int(columns[int(conllMCDr["ID"])].split('-')[0]) < previousId :
+            value = "1"
+            previousId = int(columns[int(conllMCDr["ID"])].split('-')[0])
+        if mcfMCD[index] in conllMCDr :
+          indexInColumns = int(conllMCDr[mcfMCD[index]])
+          value = columns[indexInColumns]
+        lineInMCF[int(index)] = value;
+      output.append(lineInMCF)
      continue

    id = int(columns[int(conllMCDr["ID"])])
@@ -50,10 +69,12 @@ def main() :
    if gov == 0 :
      relGov = 0
    eos = "_"
+    textValue = "_"
    if id < previousId :
      eos = "1"

    previousId = id
+    textValue = currentSentence

    lineInMCF = []
    for index in mcfMCD :
@@ -63,6 +84,8 @@ def main() :
        value = eos
      elif colName == "GOV" :
        value = relGov
+      elif colName == "TEXT" : 
+        value = textValue
      else :
        indexInColumns = int(conllMCDr[mcfMCD[index]])
        value = columns[indexInColumns]
@@ -72,11 +95,24 @@ def main() :
      lineInMCF[int(index)] = value
    output.append(lineInMCF)

+  hasText = False
+  textIndex = 0
  EOSIndex = int(mcfMCDr["EOS"])
+
+  if "TEXT" in mcfMCDr :
+    hasText = True
+    textIndex = int(mcfMCDr["TEXT"])
+
  for i in range(len(output)-1) :
    output[i][EOSIndex] = output[i+1][EOSIndex]
+
  output[-1][EOSIndex] = "1"

+  if hasText :
+    for i in range(len(output)) :
+      if output[i][EOSIndex] != "1" :
+        output[i][textIndex] = "_"
+
  outputFile = open(sys.argv[3], "w", encoding="utf8")
  for outputLine in output :
    for i in range(len(outputLine)) :

--- a/tools/mcf2conllu.py
+++ b/tools/mcf2conllu.py
@@ -36,9 +36,6 @@ def main() :
    if len(line) < 2 or line[0] == '#' :
      continue
    splited = striped.split('\t')
-    if len(splited) != len(mcfMCD) :
-      print("ERROR : line \'%s\' wrong format.\n"%line)
-      exit(1)

    toPrint = ""

@@ -46,9 +43,17 @@ def main() :
      col = conllMCD[str(ind)]
      if col == "EMPTY" :
        toPrint += "_\t"
-      elif col == "ID" :
+      elif col == "ID" and "ID" not in mcfMCDr :
        toPrint += str(curID) + "\t"
      elif col == "GOV" :
+        if int(mcfMCDr["GOV"]) >= len(splited) :
+          if "ID" in mcfMCDr :
+            curID = int(splited[int(mcfMCDr["ID"])].split('-')[0])
+          if (curID == 1) and not ("EOS" in mcfMCDr and int(mcfMCDr["EOS"]) < len(splited)) :
+            toPrint += str(0)+'\t'
+          else :
+            toPrint += str(1)+'\t'
+        else :
          relInd = int(splited[int(mcfMCDr["GOV"])])
          gov = 0
          if relInd != 0 :
@@ -58,17 +63,22 @@ def main() :
        if col not in mcfMCDr :
          print("ERROR : %s not in mcf.mcd."%col)
          exit(1)
+        if int(mcfMCDr[col]) >= len(splited) :
+          toPrint += '_\t'
+        else :
          toPrint += splited[int(mcfMCDr[col])] + '\t'

    print(toPrint[:-1])
+    
+    if "ID" not in mcfMCDr :
      curID += 1

-    if "EOS" in mcfMCDr :
+    if "EOS" in mcfMCDr and int(mcfMCDr["EOS"]) < len(splited) :
      if splited[int(mcfMCDr["EOS"])] == "1" :
        print("")
        curID = 1

-
 if __name__ == "__main__" :
  main()
+  print("")