diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 7f76b6509954ac1cca4d0945fc510be2ef0f27ec..22240cf1eefd07b58e5206f43f5a566762269807 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -10,9 +10,9 @@ TEST_FILES=$(shell find . -type f -name '*test*.conllu')
 THRESHOLD=10
 FPLM_FILENAME=fplm
 
-all_text: writescore_TIME.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
+all_text: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
 	rm -f all_no_test.conllu
-all_lines: writescore_TIME.ts tokenizer.ts segmenter.ts texts_lines all_no_test.conllu transitions pretrain
+all_lines: writescore_FFD.ts tokenizer.ts segmenter.ts texts_lines all_no_test.conllu transitions pretrain
 	rm -f all_no_test.conllu
 
 all_no_test.conllu:
@@ -44,8 +44,20 @@ segmenter.ts:
 	echo "NOTEOS b.0" >> $@
 	sed -i -e 's/^/<segmenter> /' $@
 
-writescore_TIME.ts:
-	echo "WRITESCORE b.0 TOTAL_FIXATION_DURATION" > $@
+writescore_NFIX.ts:
+	echo "WRITESCORE b.0 NFIX" > $@
+
+writescore_FFD.ts:
+	echo "WRITESCORE b.0 FFD" > $@
+
+writescore_GPT.ts:
+	echo "WRITESCORE b.0 GPT" > $@
+
+writescore_TRT.ts:
+	echo "WRITESCORE b.0 RTR" > $@
+
+writescore_FIXPROP.ts:
+	echo "WRITESCORE b.0 FIXPROP" > $@
 
 transitions: all_no_test.conllu
 	./getTransitionSets.py $<
diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py
index 2b760bdf32bd516b4536f60f8bab6febdff49417..ee553acdac36d0a1f68b700764a6b5d0ae3b09ed 100755
--- a/scripts/conll18_ud_eval.py
+++ b/scripts/conll18_ud_eval.py
@@ -246,6 +246,7 @@ def load_conllu(file) :
 
     # Load the CoNLL-U file
     index, sentence_start = 0, None
+    id_starts_at_zero = False
    while True :
        line = file.readline()
        if not line :
@@ -337,9 +338,11 @@ def load_conllu(file) :
        else :
            try :
                word_id = int(columns[col2index["ID"]]) if "ID" in col2index else "_"
+               if word_id == 0 :
+                   id_starts_at_zero = True
            except :
                raise UDError("Cannot parse word ID '{}'".format(_encode(columns[col2index["ID"]])))
-           if word_id != len(ud.words) - sentence_start + 1 :
+           if word_id != len(ud.words) - sentence_start + (0 if id_starts_at_zero else 1) :
                raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(
                    _encode(columns[col2index["ID"]]), _encode(columns[col2index["FORM"]]),
                    len(ud.words) - sentence_start + 1))
diff --git a/scripts/conllu2splits.py b/scripts/conllu2splits.py
index d4d939c770e5c2139750b00595de8b398bfcb423..76a8cec1bdbf9f9df784fd2e22f809f9a23a2c60 100755
--- a/scripts/conllu2splits.py
+++ b/scripts/conllu2splits.py
@@ -53,6 +53,9 @@ def main() :
            sentence = []
            continue
 
+       if "ID" not in col2index or "FORM" not in col2index :
+           break
+
        idId = int(col2index["ID"])
        idForm = int(col2index["FORM"])
 
diff --git a/scripts/launchSlurmArray.py b/scripts/launchSlurmArray.py
index ef6ac5d2f5262645987961c086888f04ca3b7c98..69d9eb1a3ce4c473a631757e2a87f924c5fa8799 100644
--- a/scripts/launchSlurmArray.py
+++ b/scripts/launchSlurmArray.py
@@ -69,7 +69,6 @@ eval "${{commands[$SLURM_ARRAY_TASK_ID]}}"
 #SBATCH --cpus-per-task=1
 #SBATCH --hint=nomultithread
 #SBATCH --partition={}
-#SBATCH --exclude=sensei1,lifnode1,asfalda1
 #SBATCH --time={}:00:00
 
 module purge