Skip to content
Snippets Groups Projects
Commit 53f196fc authored by Franck Dary
Browse files

Allowing to pass mcd to training

parent 1e001f9b
No related branches found
No related tags found
No related merge requests found
# TODO give mcd argument to all scripts
SCRIPTS=../../../../scripts SCRIPTS=../../../../scripts
CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
CONLL2LINES=$(SCRIPTS)/conllu_to_lines.sh CONLL2LINES=$(SCRIPTS)/conllu_to_lines.sh
TRAIN_FILES=$(shell find . -type f -name '*train*.conllu') TRAIN_FILES=$(shell find . -name '*train*.conllu')
DEV_FILES=$(shell find . -type f -name '*dev*.conllu') DEV_FILES=$(shell find . -name '*dev*.conllu')
TEST_FILES=$(shell find . -type f -name '*test*.conllu') TEST_FILES=$(shell find . -name '*test*.conllu')
#This part is for lemmatizer rules and exceptions computation #This part is for lemmatizer rules and exceptions computation
THRESHOLD=10 THRESHOLD=10
FPLM_FILENAME=fplm FPLM_FILENAME=fplm
all_text: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain all_text: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts transitions pretrain
rm -f all_no_test.conllu rm -f all_no_test.conllu
all_lines: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts_lines all_no_test.conllu transitions pretrain all_lines: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts_lines transitions pretrain
rm -f all_no_test.conllu rm -f all_no_test.conllu
all_no_test.conllu: tokenizer.ts: train.conllu
cat $(TRAIN_FILES) $(DEV_FILES) > $@
tokenizer.ts: all_no_test.conllu
echo "ENDWORD" > $@ echo "ENDWORD" > $@
$(SCRIPTS)/conllu2splits.py $< > splitwords.ts 2> ambiguities.txt $(SCRIPTS)/conllu2splits.py $< $(MCD) > splitwords.ts 2> ambiguities.txt
echo "SPLIT 0" >> $@ echo "SPLIT 0" >> $@
echo "SPLIT 1" >> $@ echo "SPLIT 1" >> $@
echo "SPLIT 2" >> $@ echo "SPLIT 2" >> $@
...@@ -59,8 +57,8 @@ writescore_TRT.ts: ...@@ -59,8 +57,8 @@ writescore_TRT.ts:
writescore_FIXPROP.ts: writescore_FIXPROP.ts:
echo "WRITESCORE b.0 FIXPROP" > $@ echo "WRITESCORE b.0 FIXPROP" > $@
transitions: all_no_test.conllu transitions: train.conllu
./getTransitionSets.py $< ./getTransitionSets.py $< $(MCD)
texts: texts:
./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES) ./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
...@@ -73,7 +71,7 @@ pretrain: ...@@ -73,7 +71,7 @@ pretrain:
./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 128 $$col.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 ) ; \ ./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 128 $$col.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 ) ; \
done done
$(FPLM_FILENAME): all_no_test.conllu $(FPLM_FILENAME): train.conllu
$(SCRIPTS)/conllu2fplm.py $< > $@ $(SCRIPTS)/conllu2fplm.py $< > $@
clean: clean:
......
...@@ -7,17 +7,20 @@ sys.path.insert(1, '../../../../scripts') ...@@ -7,17 +7,20 @@ sys.path.insert(1, '../../../../scripts')
from readMCD import readMCD from readMCD import readMCD
def printUsageAndExit() : def printUsageAndExit() :
print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr) print("USAGE : %s file.conllu [mcd]"%sys.argv[0], file=sys.stderr)
exit(1) exit(1)
if __name__ == "__main__" : if __name__ == "__main__" :
sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
if len(sys.argv) != 2 : if len(sys.argv) < 2 :
printUsageAndExit() printUsageAndExit()
mcd = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
if len(sys.argv) == 3 :
mcd = sys.argv[2].replace(",", " ")
col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") col2index, index2col = readMCD(mcd)
fileContent = [] fileContent = []
......
...@@ -45,9 +45,9 @@ if [ ! -d "$CORPUS" ]; then ...@@ -45,9 +45,9 @@ if [ ! -d "$CORPUS" ]; then
print_usage_and_exit print_usage_and_exit
fi fi
TRAIN=$(find $CORPUS -type f -name '*train*.conllu') TRAIN=$(find $CORPUS -name '*train*.conllu')
DEV=$(find $CORPUS -type f -name '*dev*.conllu') DEV=$(find $CORPUS -name '*dev*.conllu')
TEST=$(find $CORPUS -type f -name '*test*.conllu') TEST=$(find $CORPUS -name '*test*.conllu')
W2V=$(find $CORPUS -name '*.w2v') W2V=$(find $CORPUS -name '*.w2v')
if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST"; if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
...@@ -65,17 +65,17 @@ if [ ! -d "bin/$EXPNAME" ]; then ...@@ -65,17 +65,17 @@ if [ ! -d "bin/$EXPNAME" ]; then
cp -r $TEMPLATENAME bin/$EXPNAME cp -r $TEMPLATENAME bin/$EXPNAME
cp -r "data" bin/$EXPNAME/. cp -r "data" bin/$EXPNAME/.
if [ -f "$TRAIN" ]; then if [ -f "$TRAIN" ]; then
cp -P $TRAIN bin/$EXPNAME/data/train.conllu ln -s $(readlink -f $TRAIN) bin/$EXPNAME/data/train.conllu
fi fi
if [ -f "$DEV" ]; then if [ -f "$DEV" ]; then
cp -P $DEV bin/$EXPNAME/data/dev.conllu ln -s $(readlink -f $DEV) bin/$EXPNAME/data/dev.conllu
fi fi
if [ -f "$TEST" ]; then if [ -f "$TEST" ]; then
cp -P $TEST bin/$EXPNAME/data/test.conllu ln -s $(readlink -f $TEST) bin/$EXPNAME/data/test.conllu
fi fi
if [ ! -z "$W2V" ]; then if [ ! -z "$W2V" ]; then
mkdir -p bin/$EXPNAME/data/W2V/ mkdir -p bin/$EXPNAME/data/W2V/
cp -P $W2V bin/$EXPNAME/data/W2V/ ln -s $(readlink -f $W2V) bin/$EXPNAME/data/W2V/
fi fi
fi fi
...@@ -36,6 +36,21 @@ shift ...@@ -36,6 +36,21 @@ shift
shift shift
shift shift
MCD="ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL"
NO=""
for arg in "$@"
do
if [ "$NO" = "1" ]
then
MCD="$arg"
NO=""
fi
if [ "$arg" = "--mcd" ]
then
NO="1"
fi
done
if [ ! -d "$EXPPATH" ]; then if [ ! -d "$EXPPATH" ]; then
>&2 echo "ERROR : directory $EXPPATH doesn't exist" >&2 echo "ERROR : directory $EXPPATH doesn't exist"
print_usage_and_exit print_usage_and_exit
...@@ -48,7 +63,7 @@ then ...@@ -48,7 +63,7 @@ then
fi fi
CURDIR=$(pwd) CURDIR=$(pwd)
cd $EXPPATH"/"data && make -s clean && PRETRAINED_COLS=$PRETRAINED make $TARGET -s cd $EXPPATH"/"data && make -s clean && PRETRAINED_COLS=$PRETRAINED MCD=$MCD make $TARGET -s
cd $CURDIR cd $CURDIR
TRAIN=$EXPPATH"/data/train.conllu" TRAIN=$EXPPATH"/data/train.conllu"
......
...@@ -731,13 +731,15 @@ def main() : ...@@ -731,13 +731,15 @@ def main() :
help="Comma separated list of column names for which to enumerate errors (e.g. \"UPOS,FEATS\").") help="Comma separated list of column names for which to enumerate errors (e.g. \"UPOS,FEATS\").")
parser.add_argument("--extra", "-x", default="", parser.add_argument("--extra", "-x", default="",
help="Comma separated list of column names for which to compute score (e.g. \"TIME,EOS\").") help="Comma separated list of column names for which to compute score (e.g. \"TIME,EOS\").")
parser.add_argument("--mcd", default="ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC",
help="Comma separated list of column.")
args = parser.parse_args() args = parser.parse_args()
errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',') errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',')
global col2index global col2index
global index2col global index2col
col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") col2index, index2col = readMCD(args.mcd.replace(",", " "))
# Evaluate # Evaluate
gold_ud, evaluations = evaluate_wrapper(args) gold_ud, evaluations = evaluate_wrapper(args)
......
...@@ -7,7 +7,7 @@ rules = {} ...@@ -7,7 +7,7 @@ rules = {}
prefix = "SPLITWORD " prefix = "SPLITWORD "
def printUsageAndExit() : def printUsageAndExit() :
print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr) print("USAGE : %s file.conllu [mcd]"%sys.argv[0], file=sys.stderr)
exit(1) exit(1)
def computeRules(sentence) : def computeRules(sentence) :
...@@ -36,10 +36,14 @@ def computeRules(sentence) : ...@@ -36,10 +36,14 @@ def computeRules(sentence) :
def main() : def main() :
sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
if len(sys.argv) != 2 : if len(sys.argv) < 2 :
printUsageAndExit() printUsageAndExit()
if len(sys.argv) == 3 :
mcd = sys.argv[2].replace(",", " ")
else :
mcd = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") col2index, index2col = readMCD(mcd)
sentence = [] sentence = []
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment