Skip to content
Snippets Groups Projects
Commit 2066e73e authored by Franck Dary's avatar Franck Dary
Browse files

Added UD_any

parent 42b497b6
Branches
Tags
No related merge requests found
data/fP
data/fplm
data/maca_trans_lemmatizer_exceptions.fplm
data/*\.mcf
data/*\.txt
data/*\.as
data/*conll*
eval/UD_any-GSD.res
eval/stderr.log
# Locations of the helper scripts, of the UD treebanks and of the MCD file.
TOOLS := ../../tools
UD_ROOT := ~/Downloads/ud/ud-treebanks-all/
CONLL2TXT := $(TOOLS)/conll2text.py
MCD := conllu.mcd

# Settings used for the computation of lemmatizer rules and exceptions.
THRESHOLD := 10
STRICT := -s
FPLM_FILENAME := fplm
FP_FILENAME := fP
RULES_FILENAME := lemmatizer_rules.as
EXCEPTIONS_FPLM_FILENAME := maca_trans_lemmatizer_exceptions.fplm
# Default goal: build every resource, then discard the intermediate files.
# Declared phony because no file named "all" is ever created, and $(RM)
# (rm -f) is used so that the cleanup does not fail on absent files.
.PHONY: all
all: tokenizer.as texts all.conllu columns $(FPLM_FILENAME) $(FP_FILENAME) $(RULES_FILENAME)
	$(RM) col_*\.txt
	$(RM) all.conllu
# Concatenate every .conllu file found one level below UD_ROOT.
# The target is removed when cat fails (for instance when nothing matched the
# glob) so that an empty or truncated file is never taken as up to date.
all.conllu:
	cat $(UD_ROOT)*/*\.conllu > $@ || { $(RM) $@ ; exit 1 ; }
# Tokenizer action set: a fixed header line, the output of conllu2splits.py,
# then two fixed trailing actions. The stderr of conllu2splits.py is saved in
# ambiguities.txt.
# All steps are chained in a single shell; if any of them fails the partial
# target is removed so that the next run rebuilds it.
tokenizer.as: all.conllu $(MCD)
	{ echo "Default : IGNORECHAR" > $@ && \
	  $(TOOLS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt && \
	  echo "ENDWORD" >> $@ && \
	  echo "ADDCHARTOWORD" >> $@ ; } || { $(RM) $@ ; exit 1 ; }
# For each of the 10 tab-separated columns of the corpus, write its sorted
# unique values (lines starting with '#' excluded) to col_<N>.txt, then hand
# these files to getActionSets.py.
# Phony: the rule never creates a file named "columns".
.PHONY: columns
columns: all.conllu $(MCD)
	for number in 1 2 3 4 5 6 7 8 9 10 ; do \
		sed '/^#/ d' $< | cut -f$$number | sort --unique > col_$$number.txt || exit 1 ; \
	done
	./getActionSets.py $(MCD) col_*\.txt
# Run getRawText.py with the conll2text script on every treebank file.
# Phony: the rule never creates a file named "texts".
.PHONY: texts
texts:
	./getRawText.py $(CONLL2TXT) $(UD_ROOT)*/*\.conllu
# Build the fplm file from the corpus with conllu2fplm.py.
# The shell redirection creates the target before the script runs, so the
# target is removed when the script fails.
$(FPLM_FILENAME): all.conllu $(MCD)
	$(TOOLS)/conllu2fplm.py $< $(MCD) > $@ || { $(RM) $@ ; exit 1 ; }
# Build the fP file from the fplm file with fplm2fP.py.
# The shell redirection creates the target before the script runs, so the
# target is removed when the script fails.
$(FP_FILENAME): $(FPLM_FILENAME)
	$(TOOLS)/fplm2fP.py $< > $@ || { $(RM) $@ ; exit 1 ; }
# Lemmatizer rules: macaon_compute_l_rules writes its raw rules to a scratch
# file (and is given the exceptions file name through -e); every raw rule is
# then prefixed with "RULE LEMMA ON FORM ", and rules whose text starts with
# "@@" additionally get the "Default :  " prefix.
# The scratch file is named after the target rather than a fixed tmp.txt, so
# it cannot collide with other *.txt files or with another recipe.
$(RULES_FILENAME): $(FPLM_FILENAME)
	macaon_compute_l_rules -f $< -e $(EXCEPTIONS_FPLM_FILENAME) -r $@.tmp $(STRICT) -t $(THRESHOLD)
	sed -e 's/^/RULE LEMMA ON FORM /' \
	    -e 's/RULE LEMMA ON FORM @@/Default :  RULE LEMMA ON FORM @@/g' $@.tmp > $@ \
	    || { $(RM) $@ $@.tmp ; exit 1 ; }
	$(RM) $@.tmp
# Remove the generated files.
# Phony: no file named "clean" is ever created. $(RM) is "rm -f", so absent
# files are not reported as errors; the leading '-' keeps the original
# best-effort behaviour should a removal still fail.
.PHONY: clean
clean:
	-$(RM) *\.txt *\.conll* *\.as
	-$(RM) $(RULES_FILENAME) $(EXCEPTIONS_FPLM_FILENAME)
	-$(RM) $(FP_FILENAME) $(FPLM_FILENAME)
#! /usr/bin/python3
import sys
def printUsageAndExit() :
    """Print the command line usage on stderr and exit with status 1."""
    print("USAGE : %s mcd column_1.txt columns_2.txt..."%sys.argv[0], file=sys.stderr)
    # sys.exit is used instead of the exit() helper, which is only injected
    # by the site module and is meant for interactive sessions.
    sys.exit(1)
def readMCD(mcdFilename) :
    """Read a MCD file and return its content as a dictionary.

    Each meaningful line holds exactly two whitespace-separated fields: the
    first one becomes a key of the result, the second one its value.
    Blank lines and lines whose first non-blank character is '#' are ignored.

    mcdFilename -- path of the MCD file, read as UTF-8.

    Returns a dict mapping the first field of every line to the second one
    (both kept as strings). Prints an error on stderr and exits with status 1
    when a line does not contain exactly two fields.
    """
    mcd = {}
    with open(mcdFilename, "r", encoding="utf8") as mcdFile :
        for line in mcdFile :
            clean = line.strip()
            if len(clean) == 0 or clean[0] == '#' :
                continue
            # split() without argument accepts any run of blanks or tabs
            # between the two fields.
            splited = clean.split()
            if len(splited) != 2 :
                print("ERROR : invalid mcd line \'%s\'. Aborting"%clean, file=sys.stderr)
                sys.exit(1)
            mcd[splited[0]] = splited[1]
    return mcd
def _columnIndex(colFile) :
    """Return the 0-based column index encoded in a 'col_<N>.txt' file name.

    Only the file name is looked at, so that a path such as './col_3.txt'
    is handled as well.
    """
    import os
    fileName = os.path.basename(colFile)
    return int(fileName.split('.')[0].split('_')[-1]) - 1

def _readValues(colFile) :
    """Yield every non-empty line of colFile, stripped of surrounding blanks."""
    with open(colFile, "r", encoding='utf-8') as values :
        for line in values :
            striped = line.strip()
            if len(striped) > 0 :
                yield striped

def _writeWriteActions(outputFilename, colFile) :
    """Write one 'WRITE b.0 <value>' action per value found in colFile."""
    with open(outputFilename, 'w', encoding='utf-8') as output :
        for value in _readValues(colFile) :
            print("WRITE b.0 " + value, file=output)

def _writeParserActions(outputFilename, colFile) :
    """Write the parser action set built from the labels found in colFile.

    The labels 'root' and '_' do not produce LEFT/RIGHT actions.
    """
    with open(outputFilename, 'w', encoding='utf-8') as output :
        print("REDUCE", file=output)
        for value in _readValues(colFile) :
            if value == "root" or value == "_" :
                continue
            print("LEFT " + value, file=output)
            print("RIGHT " + value, file=output)
        print("EOS", file=output)
        print("Default : SHIFT", file=output)

# Entry point: for every column file given on the command line, look up the
# name of the corresponding column in the MCD and generate the matching
# action set file (tagger.as, morpho.as or parser.as). Columns that are
# absent from the MCD, or that have another name, are skipped.
if __name__ == "__main__" :
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
    if len(sys.argv) < 3 :
        printUsageAndExit()
    conllMCD = readMCD(sys.argv[1])
    for colFile in sys.argv[2:] :
        numCol = _columnIndex(colFile)
        if not str(numCol) in conllMCD :
            continue
        nameCol = conllMCD[str(numCol)]
        if nameCol == "POS" :
            _writeWriteActions("tagger.as", colFile)
        elif nameCol == "MORPHO" :
            _writeWriteActions("morpho.as", colFile)
        elif nameCol == "LABEL" :
            _writeParserActions("parser.as", colFile)
#! /usr/bin/python3
import sys
import os
import subprocess
def printUsageAndExit() :
    """Print the command line usage on stderr and exit with status 1."""
    print("USAGE : %s conll2text.py file1.conllu file2.conllu..."%sys.argv[0], file=sys.stderr)
    # sys.exit is used instead of the exit() helper, which is only injected
    # by the site module and is meant for interactive sessions.
    sys.exit(1)
# Entry point: run the command given as first argument on every following
# file, and store its standard output next to the input file, in a file with
# the same name but the extension replaced by ".txt".
if __name__ == "__main__" :
    import shlex
    if len(sys.argv) < 3 :
        printUsageAndExit()
    for pathToFile in sys.argv[2:] :
        target = os.path.splitext(pathToFile)[0] + ".txt"
        # The command itself is still interpreted by the shell, but the file
        # path is quoted so that blanks or shell metacharacters in it cannot
        # split or alter the command line.
        command = sys.argv[1] + " " + shlex.quote(pathToFile)
        # The context manager closes the output file once the command is over.
        with open(target, "w") as targetFile :
            subprocess.call(command, stdout=targetFile, stderr=sys.stderr, shell=True)
#! /bin/bash
# Return 0 when the first argument contains at least one whitespace
# character, 1 otherwise.
function has_space {
  case "$1" in
    *[[:space:]]*) return 0 ;;
    *) return 1 ;;
  esac
}
# Write the usage line on stderr, then terminate the script with status 1.
function print_usage_and_exit {
  printf '%s\n' "USAGE : language_keyword templateName expName [arguments]" 1>&2
  exit 1
}
# Name of the language directory under $MACAON_DIR, also given to
# macaon_train through --lang. It is kept in LANGNAME rather than LANG: LANG
# is the locale variable, and assigning it would change the locale seen by
# every child process.
LANGNAME=UD_any

if [ -z "$MACAON_DIR" ];
then
  >&2 echo "ERROR : environment variable MACAON_DIR is not set"
  exit 1
fi

LANGPATH=$MACAON_DIR/$LANGNAME
UD_ROOT=~/Downloads/ud/ud-treebanks-all/
MCD=$LANGPATH/data/conllu.mcd

# Positional arguments; anything after the third one is forwarded untouched
# to macaon_train.
KEYWORD=$1
TEMPLATENAME=$2
EXPNAME=$3

if [ -z "$KEYWORD" ];
then
  >&2 echo "ERROR : missing argument 1 (keyword)"
  print_usage_and_exit
fi

if [ -z "$TEMPLATENAME" ];
then
  >&2 echo "ERROR : missing argument 2 (templateName)"
  print_usage_and_exit
fi

if [ -z "$EXPNAME" ];
then
  >&2 echo "ERROR : missing argument 3 (expName)"
  print_usage_and_exit
fi

# The three mandatory arguments are known to be present at this point.
shift 3

# Look for the train/dev/test files of the treebank whose directory name
# contains the keyword. The expansions are deliberately left unquoted here:
# the shell has to expand the glob patterns.
TRAIN=$(echo $UD_ROOT*$KEYWORD*/*train*\.conllu)
DEV=$(echo $UD_ROOT*$KEYWORD*/*dev*\.conllu)
TEST=$(echo $UD_ROOT*$KEYWORD*/*test*\.conllu)

# Several matches are printed by echo separated by spaces, hence this test.
if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
then
  >&2 echo "ERROR : more than 1 match with keyword" "$KEYWORD"
  >&2 echo "TRAIN : " "$TRAIN"
  >&2 echo "DEV : " "$DEV"
  >&2 echo "TEST : " "$TEST"
  print_usage_and_exit
fi

# When nothing matched, TRAIN still holds the unexpanded pattern, which is
# not an existing file.
if test ! -f "$TRAIN";
then
  >&2 echo "ERROR : no train file found with keyword" "$KEYWORD"
  >&2 echo "$TRAIN"
  print_usage_and_exit
fi

TEMPLATEPATH=$LANGPATH/$TEMPLATENAME

mkdir -p "$LANGPATH/bin"

if [ ! -d "$TEMPLATEPATH" ]; then
  >&2 echo "ERROR : directory $TEMPLATEPATH doesn't exist"
  print_usage_and_exit
fi

# "$@" forwards the remaining arguments one by one, keeping those that
# contain blanks intact.
macaon_train --tm machine.tm --bd train.bd --mcd "$MCD" -T "$TRAIN" --dev "$DEV" --expName "$EXPNAME" --lang "$LANGNAME" --templateName "$TEMPLATENAME" "$@"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment