Added UD_any

2066e73e · Franck Dary · 42b497b6 · 2066e73e · 2066e73e · 2066e73e
Commit 2066e73e authored Oct 14, 2019 by Franck Dary
--- a/UD_any/.gitignore
+++ b/UD_any/.gitignore
+# Lemmatizer tables written by data/Makefile (FP_FILENAME, FPLM_FILENAME and
+# EXCEPTIONS_FPLM_FILENAME).
+data/fP
+data/fplm
+data/maca_trans_lemmatizer_exceptions.fplm
+# NOTE(review): presumably generated data files; their producer is not visible here - confirm.
+data/*\.mcf
+# Scratch text files written by data/Makefile (col_N.txt, ambiguities.txt, tmp.txt).
+data/*\.txt
+# Generated action sets (tokenizer.as, tagger.as, morpho.as, parser.as, lemmatizer_rules.as).
+data/*\.as
+# Concatenated treebank (all.conllu) and any other file with "conll" in its name.
+data/*conll*
+# The pattern above also matches data/conllu.mcd, which data/Makefile and
+# train.sh read as an input: re-include it so it can be tracked.
+!data/conllu.mcd
+# NOTE(review): presumably evaluation outputs; their producer is not visible here - confirm.
+eval/UD_any-GSD.res
+eval/stderr.log
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
+# Delete a target whose recipe failed, so that a truncated output is never
+# mistaken for an up-to-date file on the next run.
+.DELETE_ON_ERROR:
+# Directory holding the helper scripts called by the recipes below.
+TOOLS := ../../tools
+# Root directory of the treebanks; every <UD_ROOT>*/*.conllu file is used.
+# Override on the command line with `make UD_ROOT=/some/path/`.
+UD_ROOT := ~/Downloads/ud/ud-treebanks-all/
+CONLL2TXT := $(TOOLS)/conll2text.py
+# Column description file passed to the conversion scripts.
+MCD := conllu.mcd
+# This part is for lemmatizer rules and exceptions computation.
+# THRESHOLD and STRICT are passed to macaon_compute_l_rules as `-t` and as a bare flag.
+THRESHOLD := 10
+STRICT := -s
+FPLM_FILENAME := fplm
+FP_FILENAME := fP
+RULES_FILENAME := lemmatizer_rules.as
+EXCEPTIONS_FPLM_FILENAME := maca_trans_lemmatizer_exceptions.fplm
+# Default goal: build every generated resource, then remove the intermediate
+# files. Declared phony because no file named `all` is ever produced.
+# all.conllu is deleted at the end, so the next run recreates it and rebuilds
+# whatever depends on it.
+.PHONY: all
+all: tokenizer.as texts all.conllu columns $(FPLM_FILENAME) $(FP_FILENAME) $(RULES_FILENAME)
+	$(RM) col_*\.txt
+	$(RM) all.conllu
+# Concatenation of every .conllu file found one directory level below UD_ROOT.
+all.conllu:
+	cat $(UD_ROOT)*/*.conllu > $@
+# Tokenizer action set: a fixed default action, the output of conllu2splits.py,
+# then two fixed trailing actions. What conllu2splits.py prints on stderr is
+# kept in ambiguities.txt.
+tokenizer.as: all.conllu $(MCD)
+	echo "Default : IGNORECHAR" > $@
+	$(TOOLS)/conllu2splits.py $< $(word 2,$^) 2> ambiguities.txt >> $@
+	printf '%s\n' "ENDWORD" "ADDCHARTOWORD" >> $@
+# Write the sorted distinct values of each of the 10 tab-separated columns of
+# the merged treebank (comment lines removed) to col_<N>.txt, then let
+# getActionSets.py turn them into action sets. Declared phony because no file
+# named `columns` is produced.
+.PHONY: columns
+columns: all.conllu $(MCD)
+	for number in 1 2 3 4 5 6 7 8 9 10 ; do \
+		sed '/^#/ d' $< | cut -f$$number | sort --unique > col_$$number.txt ; \
+	done
+	./getActionSets.py $(MCD) col_*\.txt
+# Run getRawText.py on every treebank file; it writes one .txt file next to
+# each .conllu file. Declared phony because no file named `texts` is produced.
+.PHONY: texts
+texts:
+	./getRawText.py $(CONLL2TXT) $(UD_ROOT)*/*\.conllu
+# Output of conllu2fplm.py run on the merged treebank and the MCD file.
+$(FPLM_FILENAME): all.conllu $(MCD)
+	$(TOOLS)/conllu2fplm.py $^ > $@
+# Output of fplm2fP.py run on the fplm file.
+$(FP_FILENAME): $(FPLM_FILENAME)
+	$(TOOLS)/fplm2fP.py $(FPLM_FILENAME) > $@
+# Lemmatizer rules. macaon_compute_l_rules writes its rules to a scratch file
+# named after the target (and is also given $(EXCEPTIONS_FPLM_FILENAME) through
+# -e, presumably as the exceptions output - confirm). sed then prefixes every
+# line with "RULE LEMMA ON FORM " and turns the rules beginning with @@ into
+# the "Default :" entry. The scratch name ends in .txt so `make clean` still
+# removes it after a failed run.
+$(RULES_FILENAME): $(FPLM_FILENAME)
+	macaon_compute_l_rules -f $< -e $(EXCEPTIONS_FPLM_FILENAME) -r $(basename $@).tmp.txt $(STRICT) -t $(THRESHOLD)
+	sed -e 's/^/RULE LEMMA ON FORM /' -e 's/RULE LEMMA ON FORM @@/Default :  RULE LEMMA ON FORM @@/g' $(basename $@).tmp.txt > $@
+	$(RM) $(basename $@).tmp.txt
+# Remove every generated file. Declared phony so a stray file named `clean`
+# cannot disable the target. $(RM) is `rm -f`: files that are already missing
+# are skipped silently instead of producing an error that has to be ignored.
+.PHONY: clean
+clean:
+	$(RM) *\.txt *\.conll* *\.as $(RULES_FILENAME) $(EXCEPTIONS_FPLM_FILENAME) $(FP_FILENAME) $(FPLM_FILENAME)
--- a/UD_any/data/getActionSets.py
+++ b/UD_any/data/getActionSets.py
+#! /usr/bin/python3
+import sys
+def printUsageAndExit() :
+  print("USAGE : %s mcd column_1.txt columns_2.txt..."%sys.argv[0], file=sys.stderr)
+  exit(1)
+def readMCD(mcdFilename) :
+  mcd = {}
+  for line in open(mcdFilename, "r", encoding="utf8") :
+    clean = line.strip()
+    if len(line) < 2 or line[0] == '#' :
+      continue
+    splited = line.split(' ')
+    if len(splited) != 2 :
+      print("ERROR : invalid mcd line \'%s\'. Aborting"%line, file=sys.stderr)
+      exit(1)
+    mcd[splited[0].strip()] = splited[1].strip()
+  return mcd
+if __name__ == "__main__" :
+  sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
+  if len(sys.argv) < 3 :
+    printUsageAndExit()
+  conllMCD = readMCD(sys.argv[1])
+  conllMCDr = {v: k for k, v in conllMCD.items()} 
+  for colFile in sys.argv[2:] :
+    numCol = int(colFile.split('.')[0].split('_')[-1]) - 1
+    if not str(numCol) in conllMCD :
+      continue
+    nameCol = conllMCD[str(numCol)]
+    if nameCol == "POS" :
+      output = open("tagger.as", 'w', encoding='utf-8')
+      for line in open(colFile, "r", encoding='utf-8') :
+        striped = line.strip()
+        if len(striped) == 0 :
+          continue
+        print("WRITE b.0 " + striped, file=output)
+      output.close()
+    elif nameCol == "MORPHO" :
+      output = open("morpho.as", 'w', encoding='utf-8')
+      for line in open(colFile, "r", encoding='utf-8') :
+        striped = line.strip()
+        if len(striped) == 0 :
+          continue
+        print("WRITE b.0 " + striped, file=output)
+      output.close()
+    elif nameCol == "LABEL" :
+      output = open("parser.as", 'w', encoding='utf-8')
+      print("REDUCE", file=output)
+      for line in open(colFile, "r", encoding='utf-8') :
+        striped = line.strip()
+        if len(striped) == 0 or striped == "root" or striped == "_" :
+          continue
+        print("LEFT " + striped, file=output)
+        print("RIGHT " + striped, file=output)
+      print("EOS", file=output)
+      print("Default : SHIFT", file=output)
+      output.close()
--- a/UD_any/data/getRawText.py
+++ b/UD_any/data/getRawText.py
+#! /usr/bin/python3
+import sys
+import os
+import subprocess
+def printUsageAndExit() :
+  print("USAGE : %s conll2text.py file1.conllu file2.conllu..."%sys.argv[0], file=sys.stderr)
+  exit(1)
+if __name__ == "__main__" :
+  if len(sys.argv) < 3 :
+    printUsageAndExit()
+  for pathToFile in sys.argv[2:] :
+    splited = os.path.splitext(pathToFile)
+    target = splited[0] + ".txt"
+    targetFile = open(target, "w")
+    command = sys.argv[1] + " " + pathToFile
+    p = subprocess.Popen(command, stdout=targetFile, stderr=sys.stderr, shell=True)
+    p.wait()
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
+#! /bin/bash
+function has_space {
+  [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
+}
+function print_usage_and_exit {
+  >&2 echo "USAGE : language_keyword templateName expName [arguments]"
+  exit 1
+}
+LANG=UD_any
+LANGPATH=$MACAON_DIR/$LANG
+UD_ROOT=~/Downloads/ud/ud-treebanks-all/
+MCD=$LANGPATH/data/conllu.mcd
+KEYWORD=$1
+TEMPLATENAME=$2
+EXPNAME=$3
+if [ -z "$KEYWORD" ];
+then
+  >&2 echo "ERROR : missing argument 1 (keyword)"
+  print_usage_and_exit
+fi
+if [ -z "$TEMPLATENAME" ];
+then
+  >&2 echo "ERROR : missing argument 2 (templateName)"
+  print_usage_and_exit
+fi
+if [ -z "$EXPNAME" ];
+then
+  >&2 echo "ERROR : missing argument 3 (expName)"
+  print_usage_and_exit
+fi
+shift
+shift
+shift
+TRAIN=$(echo $UD_ROOT*$KEYWORD*/*train*\.conllu)
+DEV=$(echo $UD_ROOT*$KEYWORD*/*dev*\.conllu)
+TEST=$(echo $UD_ROOT*$KEYWORD*/*test*\.conllu)
+if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
+then
+  >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD
+  >&2 echo "TRAIN : " $TRAIN
+  >&2 echo "DEV : " $DEV
+  >&2 echo "TEST : " $TEST
+  print_usage_and_exit
+fi
+if test ! -f $TRAIN;
+then
+  >&2 echo "ERROR : no train file found with keyword" $KEYWORD
+  >&2 echo "$TRAIN"
+  print_usage_and_exit
+fi
+TEMPLATEPATH=$LANGPATH/$TEMPLATENAME
+mkdir -p $LANGPATH/bin
+if [ ! -d "$TEMPLATEPATH" ]; then
+  >&2 echo "ERROR : directory $TEMPLATEPATH doesn't exist"
+  print_usage_and_exit
+fi
+macaon_train --tm machine.tm --bd train.bd --mcd $MCD -T $TRAIN --dev $DEV --expName $EXPNAME --lang $LANG --templateName $TEMPLATENAME $@