diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py index 903f57d8a437739634f4457143fc225d7297c43e..2554a2be6e90f252fe421defd7d0b0e47ec1f111 100755 --- a/scripts/conll18_ud_eval.py +++ b/scripts/conll18_ud_eval.py @@ -122,6 +122,19 @@ UNIVERSAL_FEATURES = { } +################################################################################ +def filter_columns(columns) : + res = [] + indexes = [0, 1, 3, 6, 7] + lengths = [4, 8, 8, 4, 8] + + for (content, max_len) in [(columns[indexes[index]], lengths[index]) for index in range(len(indexes))] : + res.append(("{:"+str(max_len)+"}").format(content if len(content) <= max_len else "{}…{}".format(content[0:math.ceil((max_len-1)/2)],content[-((max_len-1)//2):]))) + + return res +################################################################################ + + ################################################################################ # UD Error is used when raising exceptions in this module class UDError(Exception) : @@ -526,43 +539,72 @@ def evaluate_wrapper(args) : ################################################################################ -def compute_errors(gold_file, system_file, evaluation, metric) : - class Error : - def __init__(self, gold_file, system_file, gold_word, system_word, metric) : - self.gold = gold_word - self.pred = system_word - self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end] - self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end] - # TODO : do it for other than UPOS - self.type = gold.columns[UPOS]+"->"+pred.columns[UPOS] - - class Errors : - def __init__(self, metric) : - self.types = [] - self.nb_errors = 0 - self.metric = metric - def __len__(self) : - return self.nb_errors - def add(self, error) : - self.nb_errors += 1 - for t in self.types : - if t.type == error.type : - t.add(error) - 
return - self.types.append(ErrorType(error.type)) - self.types[-1].add(error) - def sort(self) : - self.types.sort(key=len, reverse=True) - - class ErrorType : - def __init__(self, error_type) : - self.type = error_type - self.errors = [] - def __len__(self) : - return len(self.errors) - def add(self, error) : - self.errors.append(error) +class Error : + def __init__(self, gold_file, system_file, gold_word, system_word, metric) : + self.gold = gold_word + self.pred = system_word + self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end] + self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end] + # TODO : do it for other than UPOS + self.type = self.gold.columns[UPOS]+"->"+self.pred.columns[UPOS] + def __str__(self) : + result = [] + gold_lines = [] + pred_lines = [] + for word in self.gold_sentence : + gold_lines.append((">" if word == self.gold else " ") + " ".join(filter_columns(word.columns))) + for word in self.pred_sentence : + pred_lines.append((">" if word == self.pred else " ") + " ".join(filter_columns(word.columns))) + + for index in range(max(len(gold_lines), len(pred_lines))) : + result.append("{} | {}".format(gold_lines[index] if index < len(gold_lines) else "", pred_lines[index] if index < len(pred_lines) else "")) + return "\n".join(result) + +class Errors : + def __init__(self, metric, errors1=None, errors2=None) : + self.types = [] + self.nb_errors = 0 + self.metric = metric + if errors1 is not None and errors2 is not None : + for type in errors1.types : + for error in type.errors : + if not errors2.has(error) : + self.add(error) + def __len__(self) : + return self.nb_errors + def add(self, error) : + self.nb_errors += 1 + for t in self.types : + if t.type == error.type : + t.add(error) + return + self.types.append(ErrorType(error.type)) + self.types[-1].add(error) + def 
has(self, error) : + for t in self.types : + if t.type == error.type : + return t.has(error) + def sort(self) : + self.types.sort(key=len, reverse=True) + +class ErrorType : + def __init__(self, error_type) : + self.type = error_type + self.errors = [] + def __len__(self) : + return len(self.errors) + def add(self, error) : + self.errors.append(error) + def has(self, error) : + for other_error in self.errors : + if other_error.gold == error.gold : + return True + return False +################################################################################ + +################################################################################ +def compute_errors(gold_file, system_file, evaluation, metric) : errors = Errors(metric) for alignment_word in evaluation[metric][1] : gold = alignment_word.gold_word @@ -595,12 +637,16 @@ def main() : # Evaluate gold_ud, evaluations = evaluate_wrapper(args) + errors_by_file = [] + examples_list = [] - for (system_ud, evaluation) in evaluations : + for id1 in range(len(evaluations)) : + (system_ud, evaluation) = evaluations[id1] fnamelen = len(system_ud.filename) print("*"*math.ceil((80-2-fnamelen)/2),system_ud.filename,"*"*math.floor((80-2-fnamelen)/2)) # Compute errors errors_list = [compute_errors(gold_ud, system_ud, evaluation, metric) for metric in errors_metrics] + errors_by_file.append(errors_list) # Print the evaluation if args.counts : @@ -626,15 +672,61 @@ def main() : "{:10.2f}".format(100 * evaluation[metric][0].aligned_accuracy) if evaluation[metric][0].aligned_accuracy is not None else "" )) - for errors in errors_list : + for id2 in range(len(errors_list)) : + errors = errors_list[id2] errors.sort() - print("") print("Most frequent errors for metric '{}' :".format(errors.metric)) - for error_type in errors.types[:10] : + print("{:>12} {:>5} {:>6} {}\n {:->37}".format("ID", "NB", "%AGE", "GOLD->SYSTEM", "")) + + print("{:>12} {:5} {:6.2f}%".format("Total", len(errors), 100)) + for id3 in 
range(len(errors.types[:10])) : + error_type = errors.types[:10][id3] t = error_type.type nb = len(error_type) percent = 100.0*nb/len(errors) - print("{:5} {:5.2f}% {}".format(nb, percent, t)) + id = ":".join(map(str,[id1,id2,id3,"*"])) + print("{:>12} {:5} {:6.2f}% {}".format(id, nb, percent, t)) + for id4 in range(len(error_type)) : + examples_list.append((":".join(map(str,[id1,id2,id3,id4])), error_type.errors[id4])) + print("") + + for id1 in range(len(evaluations)) : + (system1_ud, evaluation) = evaluations[id1] + for id2 in range(len(evaluations)) : + if id1 == id2 : + continue + (system2_ud, evaluation) = evaluations[id2] + errors1 = errors_by_file[id1] + errors2 = errors_by_file[id2] + + if len(errors1) > 0 : + print("{} Error comparison {}".format("*"*31, "*"*31)) + print("{:>30} : {}".format("These errors are present in", system1_ud.filename)) + print("{:>30} : {}".format("and not in", system2_ud.filename)) + for id3 in range(len(errors1)) : + metric = errors1[id3].metric + errors_diff = Errors(metric, errors1[id3], errors2[id3]) + errors_diff.sort() + print("{:>12} {:5} {:6.2f}%".format("Total", len(errors_diff), 100)) + for id4 in range(len(errors_diff.types[:10])) : + error_type = errors_diff.types[:10][id4] + t = error_type.type + nb = len(error_type) + percent = 100.0*nb/len(errors_diff) + id = ":".join(map(str,["d"+str(id1),id3,id4,"*"])) + print("{:>12} {:5} {:6.2f}% {}".format(id, nb, percent, t)) + for id5 in range(len(error_type)) : + examples_list.append((":".join(map(str,["d"+str(id1),id3,id4,id5])), error_type.errors[id5])) + print("") + + if len(examples_list) > 0 : + print("{}List of all errors by their ID{}".format("*"*25,"*"*25)) + print("{}{:^30}{}\n".format("*"*25,"Format is GOLD | PREDICTED","*"*25)) + + for (id,error) in examples_list : + print("ID="+id) + print(error) + print("") ################################################################################