diff --git a/UD_any/.gitignore b/UD_any/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f9f8e822fe2956395078574906738904b4c9daa4 --- /dev/null +++ b/UD_any/.gitignore @@ -0,0 +1,9 @@ +data/fP +data/fplm +data/maca_trans_lemmatizer_exceptions.fplm +data/*\.mcf +data/*\.txt +data/*\.as +data/*conll* +eval/UD_any-GSD.res +eval/stderr.log diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..574ebf8b1ea5d4d07fa7aac7971403e587a721dc --- /dev/null +++ b/UD_any/data/Makefile @@ -0,0 +1,55 @@ +TOOLS=../../tools +UD_ROOT=~/Downloads/ud/ud-treebanks-all/ +CONLL2TXT=$(TOOLS)/conll2text.py +MCD=conllu.mcd + +#This part is for lemmatizer rules and exceptions computation +THRESHOLD=10 +STRICT=-s +FPLM_FILENAME=fplm +FP_FILENAME=fP +RULES_FILENAME=lemmatizer_rules.as +EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm + +all: tokenizer.as texts all.conllu columns $(FPLM_FILENAME) $(FP_FILENAME) $(RULES_FILENAME) + rm col_*\.txt + rm all.conllu + +all.conllu: + cat $(UD_ROOT)*/*\.conllu > $@ + +tokenizer.as: all.conllu $(MCD) + echo "Default : IGNORECHAR" > $@ + $(TOOLS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt + echo "ENDWORD" >> $@ + echo "ADDCHARTOWORD" >> $@ + +columns: all.conllu $(MCD) + for number in 1 2 3 4 5 6 7 8 9 10 ; do \ + cat all.conllu | sed '/^#/ d' | cut -f$$number | sort --unique > col_$$number.txt ; \ + done + ./getActionSets.py $(MCD) col_*\.txt + +texts: + ./getRawText.py $(CONLL2TXT) $(UD_ROOT)*/*\.conllu + +$(FPLM_FILENAME): all.conllu $(MCD) + $(TOOLS)/conllu2fplm.py $< $(MCD) > $@ + +$(FP_FILENAME): $(FPLM_FILENAME) + $(TOOLS)/fplm2fP.py $< > $@ + +$(RULES_FILENAME): $(FPLM_FILENAME) + macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt $(STRICT) -t $(THRESHOLD) + cat tmp.txt | sed s/^/RULE\ LEMMA\ ON\ FORM\ /g | sed s/RULE\ LEMMA\ ON\ FORM\ @@/Default\ :\ \ RULE\ LEMMA\ ON\ FORM\ @@/g > $@ + rm 
def readMCD(mcdFilename) :
    """Read an MCD description file into a dictionary.

    Every meaningful line must hold exactly two whitespace-separated fields:
    the first becomes the key and the second the value. Blank lines and lines
    whose first non-blank character is '#' are ignored.

    Parameters:
        mcdFilename: path of the MCD file; it is read as UTF-8.

    Returns:
        A dict mapping the first field of each line to its second field
        (both kept as strings).

    Exits the process with status 1, after printing an error on stderr, when
    a meaningful line does not contain exactly two fields.
    """
    mcd = {}
    # 'with' guarantees the file is closed, including on the error exit below.
    with open(mcdFilename, "r", encoding="utf8") as mcdFile :
        for line in mcdFile :
            # All checks run on the stripped line, so indented comments,
            # whitespace-only lines and trailing blanks no longer abort the run.
            clean = line.strip()
            if not clean or clean.startswith('#') :
                continue
            fields = clean.split()
            if len(fields) != 2 :
                print("ERROR : invalid mcd line '%s'. Aborting"%clean, file=sys.stderr)
                sys.exit(1)
            mcd[fields[0]] = fields[1]

    return mcd
if __name__ == "__main__" :
    # Imported locally because the script does not import shlex at file level.
    import shlex

    # Expected arguments: the converter command, then one or more .conllu files.
    if len(sys.argv) < 3 :
        printUsageAndExit()

    converter = sys.argv[1]

    for pathToFile in sys.argv[2:] :
        # foo.conllu -> foo.txt, written next to the input file.
        target = os.path.splitext(pathToFile)[0] + ".txt"

        # The command still goes through the shell, so the converter argument
        # is interpreted exactly as before; only the input path is quoted, so
        # that spaces or shell metacharacters in it cannot break the command.
        command = converter + " " + shlex.quote(pathToFile)

        # 'with' closes the output file as soon as the converter has finished,
        # instead of leaving one open handle per input file.
        with open(target, "w") as targetFile :
            status = subprocess.call(command, stdout=targetFile, stderr=sys.stderr, shell=True)

        # Report a failed conversion rather than silently ignoring it; the
        # remaining files are still processed.
        if status != 0 :
            print("WARNING : '%s' exited with status %d"%(command, status), file=sys.stderr)
# Main part of the training launcher.
# Usage : language_keyword templateName expName [arguments]
# Locates the train/dev/test .conllu files of the single treebank whose
# directory name contains the keyword, then runs macaon_train on them.
# Any extra arguments are forwarded unchanged to macaon_train.

# NOTE(review): LANG is also the POSIX locale variable, so this assignment is
# likely visible to macaon_train as its locale. The name is kept to preserve
# current behaviour; confirm it is intended, otherwise rename the variable.
LANG=UD_any
LANGPATH=$MACAON_DIR/$LANG
UD_ROOT=~/Downloads/ud/ud-treebanks-all/
MCD=$LANGPATH/data/conllu.mcd
KEYWORD=$1
TEMPLATENAME=$2
EXPNAME=$3

# Without MACAON_DIR every path below would be rooted at "/".
if [ -z "$MACAON_DIR" ]; then
  >&2 echo "ERROR : environment variable MACAON_DIR is not set"
  print_usage_and_exit
fi

if [ -z "$KEYWORD" ]; then
  >&2 echo "ERROR : missing argument 1 (keyword)"
  print_usage_and_exit
fi

if [ -z "$TEMPLATENAME" ]; then
  >&2 echo "ERROR : missing argument 2 (templateName)"
  print_usage_and_exit
fi

if [ -z "$EXPNAME" ]; then
  >&2 echo "ERROR : missing argument 3 (expName)"
  print_usage_and_exit
fi

# Drop the three positional arguments; what remains goes to macaon_train.
shift 3

# Collect glob matches in arrays so that matches are counted directly, rather
# than inferred from whitespace in the expanded string.
# KEYWORD is left unquoted so it may still contain glob characters, as before.
TRAIN=( "$UD_ROOT"*$KEYWORD*/*train*.conllu )
DEV=( "$UD_ROOT"*$KEYWORD*/*dev*.conllu )
TEST=( "$UD_ROOT"*$KEYWORD*/*test*.conllu )

if [ "${#TRAIN[@]}" -gt 1 ] || [ "${#DEV[@]}" -gt 1 ] || [ "${#TEST[@]}" -gt 1 ]; then
  >&2 echo "ERROR : more than 1 match with keyword" "$KEYWORD"
  >&2 echo "TRAIN : " "${TRAIN[*]}"
  >&2 echo "DEV : " "${DEV[*]}"
  >&2 echo "TEST : " "${TEST[*]}"
  print_usage_and_exit
fi

# When nothing matches, the array holds the literal pattern, which is not a file.
if [ ! -f "${TRAIN[0]}" ]; then
  >&2 echo "ERROR : no train file found with keyword" "$KEYWORD"
  >&2 echo "${TRAIN[0]}"
  print_usage_and_exit
fi

TEMPLATEPATH=$LANGPATH/$TEMPLATENAME
mkdir -p "$LANGPATH/bin"

if [ ! -d "$TEMPLATEPATH" ]; then
  >&2 echo "ERROR : directory $TEMPLATEPATH doesn't exist"
  print_usage_and_exit
fi

# "$@" keeps each forwarded argument intact, even if it contains spaces.
macaon_train --tm machine.tm --bd train.bd --mcd "$MCD" -T "${TRAIN[0]}" --dev "${DEV[0]}" --expName "$EXPNAME" --lang "$LANG" --templateName "$TEMPLATENAME" "$@"