From 53f196fc94d835b5c34600fd822c113e1f755b48 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 12 Jan 2022 14:15:25 +0100
Subject: [PATCH] Allow passing mcd to training

---
 UD_any/data/Makefile             | 24 +++++++++++-------------
 UD_any/data/getTransitionSets.py |  9 ++++++---
 UD_any/prepareExperiment.sh      | 14 +++++++-------
 UD_any/train.sh                  | 17 ++++++++++++++++-
 scripts/conll18_ud_eval.py       |  4 +++-
 scripts/conllu2splits.py         | 10 +++++++---
 6 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index f852ead..fcd2ef1 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -1,26 +1,24 @@
+# TODO: pass $(MCD) to the remaining scripts as well (e.g. conllu2fplm.py is still called without it)
 SCRIPTS=../../../../scripts
 CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
 CONLL2LINES=$(SCRIPTS)/conllu_to_lines.sh
 
-TRAIN_FILES=$(shell find . -type f -name '*train*.conllu')
-DEV_FILES=$(shell find . -type f -name '*dev*.conllu')
-TEST_FILES=$(shell find . -type f -name '*test*.conllu')
+TRAIN_FILES=$(shell find . -name '*train*.conllu')
+DEV_FILES=$(shell find . -name '*dev*.conllu')
+TEST_FILES=$(shell find . -name '*test*.conllu')
 
 #This part is for lemmatizer rules and excpetions computation
 THRESHOLD=10
 FPLM_FILENAME=fplm
 
-all_text: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
+all_text: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts transitions pretrain
 	rm -f all_no_test.conllu
-all_lines: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts_lines all_no_test.conllu transitions pretrain
+all_lines: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts_lines transitions pretrain
 	rm -f all_no_test.conllu
 
-all_no_test.conllu:
-	cat $(TRAIN_FILES) $(DEV_FILES) > $@
-
-tokenizer.ts: all_no_test.conllu
+tokenizer.ts: train.conllu
 	echo "ENDWORD" > $@
-	$(SCRIPTS)/conllu2splits.py $< > splitwords.ts 2> ambiguities.txt
+	$(SCRIPTS)/conllu2splits.py $< $(MCD) > splitwords.ts 2> ambiguities.txt
 	echo "SPLIT 0" >> $@
 	echo "SPLIT 1" >> $@
 	echo "SPLIT 2" >> $@
@@ -59,8 +57,8 @@ writescore_TRT.ts:
 writescore_FIXPROP.ts:
 	echo "WRITESCORE b.0 FIXPROP" > $@
  
-transitions: all_no_test.conllu
-	./getTransitionSets.py $<
+transitions: train.conllu
+	./getTransitionSets.py $< $(MCD)
 
 texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
@@ -73,7 +71,7 @@ pretrain:
 	./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 128 $$col.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 ) ; \
 	done
 
-$(FPLM_FILENAME): all_no_test.conllu
+$(FPLM_FILENAME): train.conllu
 	$(SCRIPTS)/conllu2fplm.py $< > $@
 
 clean:
diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py
index 48292f2..d8b0e93 100755
--- a/UD_any/data/getTransitionSets.py
+++ b/UD_any/data/getTransitionSets.py
@@ -7,17 +7,20 @@ sys.path.insert(1, '../../../../scripts')
 from readMCD import readMCD
 
 def printUsageAndExit() :
-  print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr)
+  print("USAGE : %s file.conllu [mcd]"%sys.argv[0], file=sys.stderr)
   exit(1)
 
 if __name__ == "__main__" :
 
   sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
 
-  if len(sys.argv) != 2 :
+  if len(sys.argv) not in (2, 3) : # extra arguments are a usage error, not silently ignored
+    printUsageAndExit()
+  mcd = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
+  if len(sys.argv) == 3 :
+    mcd = sys.argv[2].replace(",", " ")
 
-  col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
+  col2index, index2col = readMCD(mcd)
 
   fileContent = []
 
diff --git a/UD_any/prepareExperiment.sh b/UD_any/prepareExperiment.sh
index e43c7fa..56ea256 100755
--- a/UD_any/prepareExperiment.sh
+++ b/UD_any/prepareExperiment.sh
@@ -45,9 +45,9 @@ if [ ! -d "$CORPUS" ]; then
   print_usage_and_exit
 fi
 
-TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
-DEV=$(find $CORPUS -type f -name '*dev*.conllu')
-TEST=$(find $CORPUS -type f -name '*test*.conllu')
+TRAIN=$(find $CORPUS -name '*train*.conllu')
+DEV=$(find $CORPUS -name '*dev*.conllu')
+TEST=$(find $CORPUS -name '*test*.conllu')
 W2V=$(find $CORPUS -name '*.w2v')
 
 if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
@@ -65,17 +65,17 @@ if [ ! -d "bin/$EXPNAME" ]; then
 	cp -r $TEMPLATENAME bin/$EXPNAME
 	cp -r "data" bin/$EXPNAME/.
 	if [ -f "$TRAIN" ]; then
-		cp -P $TRAIN bin/$EXPNAME/data/train.conllu
+		ln -s "$(readlink -f "$TRAIN")" "bin/$EXPNAME/data/train.conllu"
 	fi
 	if [ -f "$DEV" ]; then
-		cp -P $DEV bin/$EXPNAME/data/dev.conllu
+		ln -s "$(readlink -f "$DEV")" "bin/$EXPNAME/data/dev.conllu"
 	fi
 	if [ -f "$TEST" ]; then
-		cp -P $TEST bin/$EXPNAME/data/test.conllu
+		ln -s "$(readlink -f "$TEST")" "bin/$EXPNAME/data/test.conllu"
 	fi
 	if [ ! -z "$W2V" ]; then
 		mkdir -p bin/$EXPNAME/data/W2V/
-		cp -P $W2V bin/$EXPNAME/data/W2V/
+		ln -s $(readlink -f $W2V) bin/$EXPNAME/data/W2V/ # unquoted on purpose: find may have returned several paths
 	fi
 fi
 
diff --git a/UD_any/train.sh b/UD_any/train.sh
index ced6437..bd0e802 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -36,6 +36,21 @@ shift
 shift
 shift
 
+MCD="ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL"
+NO=""
+# Scan the remaining arguments for "--mcd <columns>"; the last occurrence wins.
+for arg in "$@"
+do
+  if [ "$NO" = "1" ]
+  then
+    MCD="$arg"
+    NO=""
+  elif [ "$arg" = "--mcd" ]
+  then
+    NO="1"
+  fi
+done
+
 if [ ! -d "$EXPPATH" ]; then
   >&2 echo "ERROR : directory $EXPPATH doesn't exist"
   print_usage_and_exit
@@ -48,7 +63,7 @@ then
 fi
 
 CURDIR=$(pwd)
-cd $EXPPATH"/"data && make -s clean && PRETRAINED_COLS=$PRETRAINED make $TARGET -s
+cd $EXPPATH"/"data && make -s clean && PRETRAINED_COLS=$PRETRAINED MCD=$MCD make $TARGET -s
 cd $CURDIR
 
 TRAIN=$EXPPATH"/data/train.conllu"
diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py
index 8dd5ea8..51d2c97 100755
--- a/scripts/conll18_ud_eval.py
+++ b/scripts/conll18_ud_eval.py
@@ -731,13 +731,15 @@ def main() :
     help="Comma separated list of column names for which to enumerate errors (e.g. \"UPOS,FEATS\").")
   parser.add_argument("--extra", "-x", default="",
     help="Comma separated list of column names for which to compute score (e.g. \"TIME,EOS\").")
+  parser.add_argument("--mcd", default="ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC",
+    help="Comma separated list of column names, in file order (default: the ten CoNLL-U columns).")
   args = parser.parse_args()
 
   errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',')
 
   global col2index
   global index2col
-  col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
+  col2index, index2col = readMCD(args.mcd.replace(",", " "))
 
   # Evaluate
   gold_ud, evaluations = evaluate_wrapper(args)
diff --git a/scripts/conllu2splits.py b/scripts/conllu2splits.py
index 76a8cec..87bbff0 100755
--- a/scripts/conllu2splits.py
+++ b/scripts/conllu2splits.py
@@ -7,7 +7,7 @@ rules = {}
 prefix = "SPLITWORD "
 
 def printUsageAndExit() :
-  print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr)
+  print("USAGE : %s file.conllu [mcd]"%sys.argv[0], file=sys.stderr)
   exit(1)
 
 def computeRules(sentence) :
@@ -36,10 +36,14 @@ def computeRules(sentence) :
 def main() :
   sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
 
-  if len(sys.argv) != 2 :
+  if len(sys.argv) not in (2, 3) : # extra arguments are a usage error, not silently ignored
+    printUsageAndExit()
+  if len(sys.argv) == 3 :
+    mcd = sys.argv[2].replace(",", " ")
+  else :
+    mcd = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
 
-  col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
+  col2index, index2col = readMCD(mcd)
 
   sentence = []
 
-- 
GitLab