diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile index ba996647fd7ab8eee980cea24703a76039743d66..86d3e5cc58a2c97ee696abef0c521a34d2ff5b13 100644 --- a/UD_any/data/Makefile +++ b/UD_any/data/Makefile @@ -2,7 +2,6 @@ include ../config SCRIPTS=../../../../scripts CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl -MCD=conllu.mcd TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu') DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu') @@ -12,16 +11,16 @@ TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu') THRESHOLD=10 FPLM_FILENAME=fplm -all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns pretrain +all: tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain rm -f col_*\.txt rm -f all_no_test.conllu all_no_test.conllu: cat $(TRAIN_FILES) $(DEV_FILES) > $@ -tokenizer.ts: all_no_test.conllu $(MCD) +tokenizer.ts: all_no_test.conllu echo "ENDWORD" > $@ - $(SCRIPTS)/conllu2splits.py $< $(MCD) > splitwords.ts 2> ambiguities.txt + $(SCRIPTS)/conllu2splits.py $< > splitwords.ts 2> ambiguities.txt echo "SPLIT 0" >> $@ echo "SPLIT 1" >> $@ echo "SPLIT 2" >> $@ @@ -40,11 +39,8 @@ segmenter.ts: echo "NOTHING" >> $@ sed -i -e 's/^/<segmenter> /' $@ -columns: all_no_test.conllu $(MCD) - for number in 1 2 3 4 5 6 7 8 9 10 ; do \ - cat all_no_test.conllu | sed '/^#/ d' | cut -f$$number | sort --unique > col_$$number.txt ; \ - done - ./getTransitionSets.py $(MCD) col_*\.txt +transitions: all_no_test.conllu + ./getTransitionSets.py $< texts: ./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES) @@ -52,8 +48,8 @@ texts: pretrain: texts ./pretrainEmbeddings.py $(shell find $(CORPUS) -type f -name '*train*.txt') 64 -$(FPLM_FILENAME): all_no_test.conllu $(MCD) - $(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@ +$(FPLM_FILENAME): all_no_test.conllu + $(SCRIPTS)/conllu2fplm.py $< > $@ clean: - rm -f *\.txt diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py index 8041e43157e3cbb2a4143f2005c1adf68b546ee6..39bf020b1ad63736dad953507f5841593e6cdc24 100755 --- a/UD_any/data/getTransitionSets.py +++ b/UD_any/data/getTransitionSets.py @@ -7,125 +7,118 @@ sys.path.insert(1, '../../../../scripts') from readMCD import readMCD def printUsageAndExit() : - print("USAGE : %s mcd column_1.txt columns_2.txt..."%sys.argv[0], file=sys.stderr) + print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr) exit(1) if __name__ == "__main__" : sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) - if len(sys.argv) < 3 : + if len(sys.argv) != 2 : printUsageAndExit() - conllMCD, conllMCDr = readMCD(sys.argv[1]) + col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") - for colFile in sys.argv[2:] : - numCol = int(colFile.split('.')[0].split('_')[-1]) -1 - if not numCol in conllMCD : + fileContent = [] + + for line in open(sys.argv[1], "r", encoding="utf8") : + if len(line.strip()) < 3 : + continue + if line.strip()[0] == '#' : + splited = line.split("global.columns =") + if len(splited) > 1 : + col2index, index2col = readMCD(splited[-1].strip()) continue - nameCol = conllMCD[numCol] - - if nameCol == "UPOS" : - output = open("tagger.ts", 'w', encoding='utf-8') - for line in open(colFile, "r", encoding='utf-8') : - striped = line.strip() - if len(striped) == 0 : - continue - print("<tagger> WRITE b.0 UPOS " + striped, file=output) - output.close() - - elif nameCol == "XPOS" : - output = open("taggerx.ts", 'w', encoding='utf-8') - for line in open(colFile, "r", encoding='utf-8') : - striped = line.strip() - if len(striped) == 0 : - continue - print("WRITE b.0 XPOS " + striped, file=output) - output.close() - - elif nameCol == "FEATS" : - output = open("morpho_whole.ts", 'w', encoding='utf-8') - for line in open(colFile, "r", encoding='utf-8') : - striped = line.strip() - if len(striped) == 0 : - continue - print("<morpho> WRITE b.0 FEATS " + striped, file=output) - output.close() - output = open("morpho_parts.ts", 'w', encoding='utf-8') - allParts = set() - allPartsList = [] - for line in open(colFile, "r", encoding='utf-8') : - striped = line.strip() - if len(striped) == 0 : - continue - parts = striped.split('|') - for part in parts : - allParts.add(part) - for part in allParts : - allPartsList.append(part) - allPartsList.sort() - for part in allPartsList : - print("<morpho> ADD b.0 FEATS " + part, file=output) - print("<morpho> NOTHING", file=output) - output.close() - - elif nameCol == "DEPREL" : - output = open("parser_eager_rel_strict.ts", 'w', encoding='utf-8') - print("<parser> REDUCE_strict", file=output) - labels = set() - labelsList = [] - for line in open(colFile, "r", encoding='utf-8') : - striped = line.strip() - if len(striped) == 0 or striped == "root" or striped == "_" : - continue - label = striped - if label not in labels : - labels.add(striped) - labelsList.append(striped) - labelsList.sort() - for label in labelsList : - print("<parser> eager_LEFT_rel " + label, file=output) - print("<parser> eager_RIGHT_rel " + label, file=output) - print("<parser> eager_SHIFT", file=output) - output.close() - - output = open("parser_eager_rel_relaxed.ts", 'w', encoding='utf-8') - print("<parser> REDUCE_relaxed", file=output) - for label in labelsList : - print("<parser> eager_LEFT_rel " + label, file=output) - print("<parser> eager_RIGHT_rel " + label, file=output) - print("<parser> eager_SHIFT", file=output) - output.close() - - output = open("parser_eager_strict.ts", 'w', encoding='utf-8') - print("<parser> REDUCE_strict", file=output) - print("<parser> eager_LEFT", file=output) - print("<parser> eager_RIGHT", file=output) - print("<parser> eager_SHIFT", file=output) - output.close() - - output = open("parser_eager_relaxed.ts", 'w', encoding='utf-8') - print("<parser> REDUCE_relaxed", file=output) - print("<parser> eager_LEFT", file=output) - print("<parser> eager_RIGHT", file=output) - print("<parser> eager_SHIFT", file=output) - output.close() - - output = open("parser_standard_rel.ts", 'w', encoding='utf-8') - for label in labelsList : - print("<parser> standard_LEFT_rel " + label, file=output) - print("<parser> standard_RIGHT_rel " + label, file=output) - print("<parser> standard_SHIFT", file=output) - output.close() - - output = open("parser_standard.ts", 'w', encoding='utf-8') - print("<parser> standard_LEFT_rel", file=output) - print("<parser> standard_RIGHT_rel", file=output) - print("<parser> standard_SHIFT", file=output) - output.close() - - output = open("deprel.ts", 'w', encoding='utf-8') - for label in labelsList : - print("deprel " + label, file=output) - output.close() + + columns = line.strip().split('\t') + fileContent.append(columns) + + if "UPOS" in col2index : + values = [] + for columns in fileContent : + values.append(columns[col2index["UPOS"]]) + values = sorted(set(values)) + output = open("tagger.ts", 'w', encoding='utf-8') + for value in values : + print("<tagger> WRITE b.0 UPOS " + value, file=output) + output.close() + + if "XPOS" in col2index : + values = [] + for columns in fileContent : + values.append(columns[col2index["XPOS"]]) + values = sorted(set(values)) + output = open("taggerx.ts", 'w', encoding='utf-8') + for value in values : + print("<taggerx> WRITE b.0 XPOS " + value, file=output) + output.close() + + if "FEATS" in col2index : + values = [] + for columns in fileContent : + values.append(columns[col2index["FEATS"]]) + values = sorted(set(values)) + parts = [] + for value in values : + for part in value.split("|") : + parts.append(part) + parts = sorted(set(parts)) + output = open("morpho_whole.ts", 'w', encoding='utf-8') + for value in values : + print("<morpho> WRITE b.0 FEATS " + value, file=output) + output.close() + output = open("morpho_parts.ts", 'w', encoding='utf-8') + for value in parts : + print("<morpho> ADD b.0 FEATS " + value, file=output) + print("<morpho> NOTHING", file=output) + output.close() + + if "DEPREL" in col2index : + labelsList = [] + for columns in fileContent : + label = columns[col2index["DEPREL"]] + if not (label == "_" or label == "root") : + labelsList.append(label) + labelsList = sorted(set(labelsList)) + output = open("parser_eager_rel_strict.ts", 'w', encoding='utf-8') + print("<parser> REDUCE_strict", file=output) + for label in labelsList : + print("<parser> eager_LEFT_rel " + label, file=output) + print("<parser> eager_RIGHT_rel " + label, file=output) + print("<parser> eager_SHIFT", file=output) + output.close() + output = open("parser_eager_rel_relaxed.ts", 'w', encoding='utf-8') + print("<parser> REDUCE_relaxed", file=output) + for label in labelsList : + print("<parser> eager_LEFT_rel " + label, file=output) + print("<parser> eager_RIGHT_rel " + label, file=output) + print("<parser> eager_SHIFT", file=output) + output.close() + output = open("parser_eager_strict.ts", 'w', encoding='utf-8') + print("<parser> REDUCE_strict", file=output) + print("<parser> eager_LEFT", file=output) + print("<parser> eager_RIGHT", file=output) + print("<parser> eager_SHIFT", file=output) + output.close() + output = open("parser_eager_relaxed.ts", 'w', encoding='utf-8') + print("<parser> REDUCE_relaxed", file=output) + print("<parser> eager_LEFT", file=output) + print("<parser> eager_RIGHT", file=output) + print("<parser> eager_SHIFT", file=output) + output.close() + output = open("parser_standard_rel.ts", 'w', encoding='utf-8') + for label in labelsList : + print("<parser> standard_LEFT_rel " + label, file=output) + print("<parser> standard_RIGHT_rel " + label, file=output) + print("<parser> standard_SHIFT", file=output) + output.close() + output = open("parser_standard.ts", 'w', encoding='utf-8') + print("<parser> standard_LEFT_rel", file=output) + print("<parser> standard_RIGHT_rel", file=output) + print("<parser> standard_SHIFT", file=output) + output.close() + output = open("deprel.ts", 'w', encoding='utf-8') + for label in labelsList : + print("deprel " + label, file=output) + output.close() diff --git a/UD_any/evaluate.sh b/UD_any/evaluate.sh index 59bbd185a83c0061bc92391d5cf0394576fcf79c..4fcaa3823119bd83ef4199a6276179efbaf7ca35 100755 --- a/UD_any/evaluate.sh +++ b/UD_any/evaluate.sh @@ -40,17 +40,15 @@ DEV=$(find $CORPUS -type f -name '*dev*.conllu') DEVRAW=$(find $CORPUS -type f -name '*dev*.txt') TEST=$(find $CORPUS -type f -name '*test*.conllu') TESTRAW=$(find $CORPUS -type f -name '*test*.txt') -MCD=$(find $CORPUS -type f -name '*.mcd') REF=$TEST REFRAW=$TESTRAW -if has_space "$REF" || has_space "$REFRAW" || has_space "$MCD"; +if has_space "$REF" || has_space "$REFRAW"; then >&2 echo "ERROR : more than 1 match" >&2 echo "REF : " $REF >&2 echo "REFRAW : " $REFRAW - >&2 echo "MCD : " $MCD print_usage_and_exit fi @@ -67,21 +65,16 @@ then print_usage_and_exit fi -if test -z $MCD; -then - MCD=$EXPPATH"/data/*\.mcd" -fi - EVALCONLL="../scripts/conll18_ud_eval.py" OUTPUT=$EXPPATH"/predicted_eval.tsv" if [ "$MODE" = "tsv" ]; then -macaon decode --model $EXPPATH --mcd $MCD --inputTSV $REF $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1 +macaon decode --model $EXPPATH --inputTSV $REF $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1 exit 0 fi if [ "$MODE" = "txt" ]; then -macaon decode --model $EXPPATH --mcd $MCD --inputTXT $REFRAW $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1 +macaon decode --model $EXPPATH --inputTXT $REFRAW $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1 exit 0 fi diff --git a/UD_any/train.sh b/UD_any/train.sh index 8747a2812d90e277f4e9a7aa4778edc56022b9d4..3bc401baa918c82e11d2cb5a97f5df8f59473d55 100755 --- a/UD_any/train.sh +++ b/UD_any/train.sh @@ -43,16 +43,14 @@ DEV=$(find $CORPUS -type f -name '*dev*.conllu') DEVRAW=$(find $CORPUS -type f -name '*dev*.txt') TEST=$(find $CORPUS -type f -name '*test*.conllu') TESTRAW=$(find $CORPUS -type f -name '*test*.txt') -MCD=$(find $CORPUS -type f -name '*.mcd') W2V=$(find $CORPUS -type f -name '*.w2v') -if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST" || has_space "$MCD"; +if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST"; then >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD >&2 echo "TRAIN : " $TRAIN >&2 echo "DEV : " $DEV >&2 echo "TEST : " $TEST - >&2 echo "MCD : " $MCD print_usage_and_exit fi @@ -72,13 +70,6 @@ then fi fi -if test -z $MCD; -then - MCD=$EXPPATH"/data/*\.mcd" -fi - ->&2 echo "Using MCD :" $MCD - if test -f $W2V; then >&2 echo "Using W2V :" $W2V @@ -86,12 +77,12 @@ then fi if [ "$MODE" = "tsv" ]; then -macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1 +macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1 exit 0 fi if [ "$MODE" = "txt" ]; then -macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1 +macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1 exit 0 fi diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py index 2554a2be6e90f252fe421defd7d0b0e47ec1f111..08662415c7475dbd28fd3be611f93cc73aafadc1 100755 --- a/scripts/conll18_ud_eval.py +++ b/scripts/conll18_ud_eval.py @@ -92,6 +92,8 @@ from __future__ import division from __future__ import print_function +from readMCD import readMCD + import argparse import io import os @@ -101,7 +103,8 @@ import unittest import math # CoNLL-U column names -ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10) +col2index = {} +index2col = {} # Content and functional relations CONTENT_DEPRELS = { @@ -125,10 +128,10 @@ UNIVERSAL_FEATURES = { ################################################################################ def filter_columns(columns) : res = [] - indexes = [0, 1, 3, 6, 7] - lengths = [4, 8, 8, 4, 8] + cols = [("ID",4), ("FORM",8), ("UPOS",8), ("HEAD",4), ("DEPREL", 8)] + contents = [(columns[col2index[col]], max_size) for (col, max_size) in cols if col in col2index] - for (content, max_len) in [(columns[indexes[index]], lengths[index]) for index in range(len(indexes))] : + for (content, max_len) in contents : res.append(("{:"+str(max_len)+"}").format(content if len(content) <= max_len else "{}…{}".format(content[0:math.ceil((max_len-1)/2)],content[-((max_len-1)//2):]))) return res @@ -158,6 +161,8 @@ def _encode(text) : ################################################################################ # Load given CoNLL-U file into internal representation def load_conllu(file) : + global col2index + global index2col # Internal representation classes class UDRepresentation : def __init__(self) : @@ -195,15 +200,18 @@ def load_conllu(file) : self.parent = None # List of references to UDWord instances representing functional-deprel children. self.functional_children = [] + # Only consider universal FEATS. # TODO consider all feats - self.columns[FEATS] = "|".join(sorted(feat for feat in columns[FEATS].split("|") - if feat.split("=", 1)[0] in UNIVERSAL_FEATURES)) - # Let's ignore language-specific deprel subtypes. - self.columns[DEPREL] = columns[DEPREL].split(":")[0] - # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS - self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS - self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS + if "FEATS" in col2index : + self.columns[col2index["FEATS"]] = "|".join(sorted(feat for feat in columns[col2index["FEATS"]].split("|") + if feat.split("=", 1)[0] in UNIVERSAL_FEATURES)) + if "DEPREL" in col2index : + # Let's ignore language-specific deprel subtypes. + self.columns[col2index["DEPREL"]] = columns[col2index["DEPREL"]].split(":")[0] + # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS + self.is_content_deprel = self.columns[col2index["DEPREL"]] in CONTENT_DEPRELS + self.is_functional_deprel = self.columns[col2index["DEPREL"]] in FUNCTIONAL_DEPRELS ud = UDRepresentation() ud.filename = file.name @@ -220,36 +228,41 @@ def load_conllu(file) : if sentence_start is None : # Skip comments if line.startswith("#") : + splited = line.split("global.columns =") + if len(splited) > 1 : + col2index, index2col = readMCD(splited[-1].strip()) continue # Start a new sentence sentence_start = len(ud.words) ud.sentences.append(UDSpan(index, 0)) ud.sentences_words.append(UDSpan(sentence_start, 0)) + if not line : # Add parent and children UDWord links and check there are no cycles def process_word(word) : - if word.parent == "remapping" : - raise UDError("There is a cycle in a sentence") - if word.parent is None : - head = int(word.columns[HEAD]) - if head < 0 or head > len(ud.words) - sentence_start : - raise UDError("HEAD '{}' points outside of the sentence".format(_encode(word.columns[HEAD]))) - if head : - parent = ud.words[sentence_start + head - 1] - word.parent = "remapping" - process_word(parent) - word.parent = parent + if "HEAD" in col2index : + if word.parent == "remapping" : + raise UDError("There is a cycle in a sentence") + if word.parent is None : + head = int(word.columns[col2index["HEAD"]]) + if head < 0 or head > len(ud.words) - sentence_start : + raise UDError("HEAD '{}' points outside of the sentence".format(_encode(word.columns[col2index["HEAD"]]))) + if head : + parent = ud.words[sentence_start + head - 1] + word.parent = "remapping" + process_word(parent) + word.parent = parent for word in ud.words[sentence_start:] : process_word(word) # func_children cannot be assigned within process_word # because it is called recursively and may result in adding one child twice. for word in ud.words[sentence_start:] : - if word.parent and word.is_functional_deprel : + if "HEAD" in col2index and word.parent and word.is_functional_deprel : word.parent.functional_children.append(word) # Check there is a single root node - if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1 : + if "HEAD" in col2index and len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1 : raise UDError("There are multiple roots in a sentence") # End the sentence @@ -260,53 +273,52 @@ def load_conllu(file) : # Read next token/word columns = line.split("\t") - if len(columns) < 10 : - raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(line))) # Skip empty nodes - if "." in columns[ID] : + if "ID" in col2index and "." in columns[col2index["ID"]] : continue # Delete spaces from FORM, so gold.characters == system.characters # even if one of them tokenizes the space. Use any Unicode character # with category Zs. - columns[FORM] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[FORM])) - if not columns[FORM] : - raise UDError("There is an empty FORM in the CoNLL-U file") + if "FORM" in col2index : + columns[col2index["FORM"]] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[col2index["FORM"]])) + if not columns[col2index["FORM"]] : + raise UDError("There is an empty FORM in the CoNLL-U file") # Save token - ud.characters.extend(columns[FORM]) - ud.tokens.append(UDSpan(index, index + len(columns[FORM]))) - index += len(columns[FORM]) + form_value = columns[col2index["FORM"]] if "FORM" in col2index else "_" + ud.characters.extend(form_value) + ud.tokens.append(UDSpan(index, index + len(form_value))) + index += len(form_value) # Handle multi-word tokens to save word(s) - if "-" in columns[ID] : + if "ID" in col2index and "-" in columns[col2index["ID"]] : try : - start, end = map(int, columns[ID].split("-")) + start, end = map(int, columns[col2index["ID"]].split("-")) except : - raise UDError("Cannot parse multi-word token ID '{}'".format(_encode(columns[ID]))) + raise UDError("Cannot parse multi-word token ID '{}'".format(_encode(columns[col2index["ID"]]))) for _ in range(start, end + 1) : word_line = _decode(file.readline().rstrip("\r\n")) word_columns = word_line.split("\t") - if len(word_columns) < 10 : - raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(word_line))) + ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) ud.words[-1].sentence = len(ud.sentences)-1 # Basic tokens/words else : try : - word_id = int(columns[ID]) + word_id = int(columns[col2index["ID"]]) if "ID" in col2index else "_" except : - raise UDError("Cannot parse word ID '{}'".format(_encode(columns[ID]))) + raise UDError("Cannot parse word ID '{}'".format(_encode(columns[col2index["ID"]]))) if word_id != len(ud.words) - sentence_start + 1 : raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format( - _encode(columns[ID]), _encode(columns[FORM]), len(ud.words) - sentence_start + 1)) + _encode(columns[col2index["ID"]]), _encode(columns[col2index["FORM"]]), len(ud.words) - sentence_start + 1)) try : - head_id = int(columns[HEAD]) + head_id = int(columns[col2index["HEAD"]]) if "HEAD" in col2index else 0 except : - raise UDError("Cannot parse HEAD '{}'".format(_encode(columns[HEAD]))) + raise UDError("Cannot parse HEAD '{}'".format(_encode(columns[col2index["HEAD"]]))) if head_id < 0 : raise UDError("HEAD cannot be negative") @@ -433,7 +445,7 @@ def evaluate(gold_ud, system_ud) : lcs = [[0] * (si - ss) for i in range(gi - gs)] for g in reversed(range(gi - gs)) : for s in reversed(range(si - ss)) : - if gold_words[gs + g].columns[FORM].lower() == system_words[ss + s].columns[FORM].lower() : + if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower() : lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0) lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0) lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0) @@ -454,7 +466,7 @@ def evaluate(gold_ud, system_ud) : # Store aligned words s, g = 0, 0 while g < gi - gs and s < si - ss : - if gold_words[gs + g].columns[FORM].lower() == system_words[ss + s].columns[FORM].lower() : + if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower() : alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s]) g += 1 s += 1 @@ -494,27 +506,26 @@ def evaluate(gold_ud, system_ud) : alignment = align_words(gold_ud.words, system_ud.words) # Compute the F1-scores - return { - "Tokens" : spans_score(gold_ud.tokens, system_ud.tokens), - "Sentences" : spans_score(gold_ud.sentences, system_ud.sentences), - "Words" : alignment_score(alignment), - "UPOS" : alignment_score(alignment, lambda w, _ : w.columns[UPOS]), - "XPOS" : alignment_score(alignment, lambda w, _ : w.columns[XPOS]), - "UFeats" : alignment_score(alignment, lambda w, _ : w.columns[FEATS]), - "AllTags" : alignment_score(alignment, lambda w, _ : (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])), - "Lemmas" : alignment_score(alignment, lambda w, ga : w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"), - "UAS" : alignment_score(alignment, lambda w, ga : ga(w.parent)), - "LAS" : alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[DEPREL])), - "CLAS" : alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[DEPREL]), - filter_fn=lambda w : w.is_content_deprel), - "MLAS" : alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[DEPREL], w.columns[UPOS], w.columns[FEATS], - [(ga(c), c.columns[DEPREL], c.columns[UPOS], c.columns[FEATS]) - for c in w.functional_children]), - filter_fn=lambda w : w.is_content_deprel), - "BLEX" : alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[DEPREL], - w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"), - filter_fn=lambda w : w.is_content_deprel), - } + result = {} + if "FORM" in col2index : + result["Tokens"] = spans_score(gold_ud.tokens, system_ud.tokens) + result["Words"] = alignment_score(alignment) + if "UPOS" in col2index : + result["UPOS"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["UPOS"]]) + if "XPOS" in col2index : + result["XPOS"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["XPOS"]]) + if "FEATS" in col2index : + result["UFeats"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["FEATS"]]) + if "LEMMA" in col2index : + result["Lemmas"] = alignment_score(alignment, lambda w, ga : w.columns[col2index["LEMMA"]] if ga(w).columns[col2index["LEMMA"]] != "_" else "_") + if "HEAD" in col2index : + result["UAS"] = alignment_score(alignment, lambda w, ga : ga(w.parent)) + if "DEPREL" in col2index : + result["LAS"] = alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[col2index["DEPREL"]])) + if "ID" in col2index : + result["Sentences"] = spans_score(gold_ud.sentences, system_ud.sentences) + + return result ################################################################################ @@ -546,7 +557,7 @@ class Error : self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end] self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end] # TODO : do it for other than UPOS - self.type = self.gold.columns[UPOS]+"->"+self.pred.columns[UPOS] + self.type = self.gold.columns[col2index["UPOS"]]+"->"+self.pred.columns[col2index["UPOS"]] def __str__(self) : result = [] gold_lines = [] @@ -557,7 +568,7 @@ class Error : pred_lines.append((">" if word == self.pred else " ") + " ".join(filter_columns(word.columns))) for index in range(max(len(gold_lines), len(pred_lines))) : - result.append("{} | {}".format(gold_lines[index] if index < len(gold_lines) else "", pred_lines[index] if index < len(pred_lines) else "")) + result.append("{} | {}".format(gold_lines[index] if index < len(gold_lines) else " "*len(pred_lines[index]), pred_lines[index] if index < len(pred_lines) else " "*len(gold_lines[index]))) return "\n".join(result) class Errors : @@ -635,6 +646,10 @@ def main() : errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',') + global col2index + global index2col + col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") + # Evaluate gold_ud, evaluations = evaluate_wrapper(args) errors_by_file = [] @@ -654,7 +669,7 @@ def main() : else : print("Metric | Precision | Recall | F1 Score | AligndAcc") print("-----------+-----------+-----------+-----------+-----------") - for metric in["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"] : + for metric in evaluation : if args.counts : print("{:11}|{:10} |{:10} |{:10} |{:10}".format( metric, diff --git a/scripts/conllu2fplm.py b/scripts/conllu2fplm.py index d5d16130d8567972bc06ec6dd518c3a703506240..c42ef390bd5524662b6c943540018f5bcc2162e3 100755 --- a/scripts/conllu2fplm.py +++ b/scripts/conllu2fplm.py @@ -4,7 +4,7 @@ import sys from readMCD import readMCD def printUsageAndExit() : - print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr) + print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr) exit(1) def sameLineWithoutLemma(l1, l2) : @@ -16,10 +16,10 @@ if __name__ == "__main__" : sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) - if len(sys.argv) != 3 : + if len(sys.argv) != 2 : printUsageAndExit() - conllMCD, conllMCDr = readMCD(sys.argv[2]) + col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") entriesCount = {} entriesList = [] @@ -28,15 +28,18 @@ if __name__ == "__main__" : if len(line.strip()) < 3 : continue if line.strip()[0] == '#' : + splited = line.split("global.columns =") + if len(splited) > 1 : + col2index, index2col = readMCD(splited[-1].strip()) continue columns = line.strip().split('\t') - if len(columns[int(conllMCDr["ID"])].split('-')) > 1 : + if len(columns[col2index["ID"]].split('-')) > 1 : continue entry = "" for col in ["FORM", "UPOS", "LEMMA", "FEATS"] : - entry = entry + columns[int(conllMCDr[col])] + '\t' + entry = entry + (columns[col2index[col]] if col in col2index else "_") + '\t' entry = entry[:-1] if entry not in entriesCount : diff --git a/scripts/conllu2splits.py b/scripts/conllu2splits.py index 253160cdaa4926c035233bbda6ab5e9d4fd54010..d4d939c770e5c2139750b00595de8b398bfcb423 100755 --- a/scripts/conllu2splits.py +++ b/scripts/conllu2splits.py @@ -7,7 +7,7 @@ rules = {} prefix = "SPLITWORD " def printUsageAndExit() : - print("USAGE : %s file.conllu conllu.mcd"%sys.argv[0], file=sys.stderr) + print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr) exit(1) def computeRules(sentence) : @@ -34,26 +34,28 @@ def computeRules(sentence) : rules[word[1]][rule] = 1 def main() : - sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) - if len(sys.argv) != 3 : + if len(sys.argv) != 2 : printUsageAndExit() - conllMCD, conllMCDr = readMCD(sys.argv[2]) - - idId = int(conllMCDr["ID"]) - idForm = int(conllMCDr["FORM"]) + col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC") sentence = [] for line in open(sys.argv[1], "r", encoding="utf8") : if len(line.strip()) < 2 or line[0] == '#' : + splited = line.split("global.columns =") + if len(splited) > 1 : + col2index, index2col = readMCD(splited[-1].strip()) if len(sentence) > 0 : computeRules(sentence) sentence = [] continue + idId = int(col2index["ID"]) + idForm = int(col2index["FORM"]) + splited = line.strip().split('\t') sentence += [[splited[idId], splited[idForm]]] diff --git a/scripts/readMCD.py b/scripts/readMCD.py index fa5c5a01640e55a19ad62ccfc54d5d98615ec518..2ced082d26b47f86008990e81de9fc1cf25ac020 100644 --- a/scripts/readMCD.py +++ b/scripts/readMCD.py @@ -1,10 +1,9 @@ -def readMCD(mcdFilename) : - mcd = {} - for line in open(mcdFilename, "r", encoding="utf8") : - clean = line.strip() - if len(line) == 2 or line[0] == '#' : - continue - mcd[len(mcd)] = clean +def readMCD(mcd) : + col2index = {} + index2col = {} - return mcd, {v: k for k, v in mcd.items()} + for col in mcd.split(' ') : + col2index[col] = len(col2index) + index2col[len(index2col)] = col + return col2index, index2col