diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile index f852ead4aa20f9f348c99965dce20b9091bc72a2..fcd2ef1b4a5de00e3d51edd3fd331421bca9075b 100644 --- a/UD_any/data/Makefile +++ b/UD_any/data/Makefile @@ -1,26 +1,24 @@ +# TODO give mcd argument to all scripts SCRIPTS=../../../../scripts CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl CONLL2LINES=$(SCRIPTS)/conllu_to_lines.sh -TRAIN_FILES=$(shell find . -type f -name '*train*.conllu') -DEV_FILES=$(shell find . -type f -name '*dev*.conllu') -TEST_FILES=$(shell find . -type f -name '*test*.conllu') +TRAIN_FILES=$(shell find . -name '*train*.conllu') +DEV_FILES=$(shell find . -name '*dev*.conllu') +TEST_FILES=$(shell find . -name '*test*.conllu') #This part is for lemmatizer rules and excpetions computation THRESHOLD=10 FPLM_FILENAME=fplm -all_text: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain +all_text: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts transitions pretrain rm -f all_no_test.conllu -all_lines: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts_lines all_no_test.conllu transitions pretrain +all_lines: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts_lines transitions pretrain rm -f all_no_test.conllu -all_no_test.conllu: - cat $(TRAIN_FILES) $(DEV_FILES) > $@ - -tokenizer.ts: all_no_test.conllu +tokenizer.ts: train.conllu echo "ENDWORD" > $@ - $(SCRIPTS)/conllu2splits.py $< > splitwords.ts 2> ambiguities.txt + $(SCRIPTS)/conllu2splits.py $< $(MCD) > splitwords.ts 2> ambiguities.txt echo "SPLIT 0" >> $@ echo "SPLIT 1" >> $@ echo "SPLIT 2" >> $@ @@ -59,8 +57,8 @@ writescore_TRT.ts: writescore_FIXPROP.ts: echo "WRITESCORE b.0 FIXPROP" > $@ -transitions: all_no_test.conllu - ./getTransitionSets.py $< +transitions: train.conllu + ./getTransitionSets.py $< $(MCD) texts: ./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES) @@ -73,7 +71,7 @@ pretrain: ./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 128 $$col.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 ) ; \ done -$(FPLM_FILENAME): all_no_test.conllu +$(FPLM_FILENAME): train.conllu $(SCRIPTS)/conllu2fplm.py $< > $@ clean: diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py index 48292f2641123c6b5ec1918ea6223008e794391f..d8b0e938b35b2eb1a741067c29e48b1ecb606388 100755 --- a/UD_any/data/getTransitionSets.py +++ b/UD_any/data/getTransitionSets.py @@ -7,17 +7,20 @@ sys.path.insert(1, '../../../../scripts') from readMCD import readMCD def printUsageAndExit() : - print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr) + print("USAGE : %s file.conllu [mcd]"%sys.argv[0], file=sys.stderr) exit(1) if __name__ == "__main__" : sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) - if len(sys.argv) != 2 : + if len(sys.argv) < 2 : printUsageAndExit() + mcd = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC" + if len(sys.argv) == 3 : + mcd = sys.argv[2].replace(",", " ") - col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") + col2index, index2col = readMCD(mcd) fileContent = [] diff --git a/UD_any/prepareExperiment.sh b/UD_any/prepareExperiment.sh index e43c7fab603f6a67b0b4a3607463b2eba1404230..56ea2563386a3fd4cd0710bdbcd0aca37cd7d7bf 100755 --- a/UD_any/prepareExperiment.sh +++ b/UD_any/prepareExperiment.sh @@ -45,9 +45,9 @@ if [ ! -d "$CORPUS" ]; then print_usage_and_exit fi -TRAIN=$(find $CORPUS -type f -name '*train*.conllu') -DEV=$(find $CORPUS -type f -name '*dev*.conllu') -TEST=$(find $CORPUS -type f -name '*test*.conllu') +TRAIN=$(find $CORPUS -name '*train*.conllu') +DEV=$(find $CORPUS -name '*dev*.conllu') +TEST=$(find $CORPUS -name '*test*.conllu') W2V=$(find $CORPUS -name '*.w2v') if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST"; @@ -65,17 +65,17 @@ if [ ! -d "bin/$EXPNAME" ]; then cp -r $TEMPLATENAME bin/$EXPNAME cp -r "data" bin/$EXPNAME/. if [ -f "$TRAIN" ]; then - cp -P $TRAIN bin/$EXPNAME/data/train.conllu + ln -s $(readlink -f $TRAIN) bin/$EXPNAME/data/train.conllu fi if [ -f "$DEV" ]; then - cp -P $DEV bin/$EXPNAME/data/dev.conllu + ln -s $(readlink -f $DEV) bin/$EXPNAME/data/dev.conllu fi if [ -f "$TEST" ]; then - cp -P $TEST bin/$EXPNAME/data/test.conllu + ln -s $(readlink -f $TEST) bin/$EXPNAME/data/test.conllu fi if [ ! -z "$W2V" ]; then mkdir -p bin/$EXPNAME/data/W2V/ - cp -P $W2V bin/$EXPNAME/data/W2V/ + ln -s $(readlink -f $W2V) bin/$EXPNAME/data/W2V/ fi fi diff --git a/UD_any/train.sh b/UD_any/train.sh index ced64375ff358b3dd5c4d7311ff5919d5b61f17a..bd0e8029e250e46f8909cea24fdaff05c88be5fa 100755 --- a/UD_any/train.sh +++ b/UD_any/train.sh @@ -36,6 +36,21 @@ shift shift shift +MCD="ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL" +NO="" +for arg in "$@" +do + if [ "$NO" = "1" ] + then + MCD="$arg" + NO="" + fi + if [ "$arg" = "--mcd" ] + then + NO="1" + fi +done + if [ ! -d "$EXPPATH" ]; then >&2 echo "ERROR : directory $EXPPATH doesn't exist" print_usage_and_exit @@ -48,7 +63,7 @@ then fi CURDIR=$(pwd) -cd $EXPPATH"/"data && make -s clean && PRETRAINED_COLS=$PRETRAINED make $TARGET -s +cd $EXPPATH"/"data && make -s clean && PRETRAINED_COLS=$PRETRAINED MCD=$MCD make $TARGET -s cd $CURDIR TRAIN=$EXPPATH"/data/train.conllu" diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py index 8dd5ea89e0a23d232d1c79a1b42bcb9cd4426de7..51d2c974d53c887822072979b6406e1de54d0842 100755 --- a/scripts/conll18_ud_eval.py +++ b/scripts/conll18_ud_eval.py @@ -731,13 +731,15 @@ def main() : help="Comma separated list of column names for which to enumerate errors (e.g. \"UPOS,FEATS\").") parser.add_argument("--extra", "-x", default="", help="Comma separated list of column names for which to compute score (e.g. \"TIME,EOS\").") + parser.add_argument("--mcd", default="ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC", + help="Comma separated list of column.") args = parser.parse_args() errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',') global col2index global index2col - col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") + col2index, index2col = readMCD(args.mcd.replace(",", " ")) # Evaluate gold_ud, evaluations = evaluate_wrapper(args) diff --git a/scripts/conllu2splits.py b/scripts/conllu2splits.py index 76a8cec1bdbf9f9df784fd2e22f809f9a23a2c60..87bbff04adc48c976569bf9571d79c5cc01c690d 100755 --- a/scripts/conllu2splits.py +++ b/scripts/conllu2splits.py @@ -7,7 +7,7 @@ rules = {} prefix = "SPLITWORD " def printUsageAndExit() : - print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr) + print("USAGE : %s file.conllu [mcd]"%sys.argv[0], file=sys.stderr) exit(1) def computeRules(sentence) : @@ -36,10 +36,14 @@ def computeRules(sentence) : def main() : sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) - if len(sys.argv) != 2 : + if len(sys.argv) < 2 : printUsageAndExit() + if len(sys.argv) == 3 : + mcd = sys.argv[2].replace(",", " ") + else : + mcd = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC" - col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") + col2index, index2col = readMCD(mcd) sentence = []