import sys def align(ref_words, hyp_words, sub_cost=None, ins_cost=None, del_cost=None): num_ref = len(ref_words) num_hyp = len(hyp_words) if num_hyp == 0: return num_ref, num_ref, [], [] if num_ref == 0: return 0, 0, [], [] score = [] backtrack = [] OK = 0 SUB = 1 DEL = 2 INS = 3 for i in range(num_ref + 1): score.append([0] * (num_hyp + 1)) backtrack.append([0] * (num_hyp + 1)) if del_cost == None or i == 0: score[i][0] = i else: score[i][0] = score[i - 1][0] + del_cost[i - 1][0] backtrack[i][0] = DEL for i in range(num_hyp + 1): if ins_cost == None or i == 0: score[0][i] = i else: score[0][i] = score[0][i - 1] + ins_cost[0][i - 1] backtrack[0][i] = INS for i in range(1, num_ref + 1): for j in range(1, num_hyp + 1): sub_type = OK sub_value = score[i - 1][j - 1] if ref_words[i - 1] != hyp_words[j - 1]: if sub_cost != None: sub_value += sub_cost[i - 1][j - 1] else: sub_value += 1 sub_type = SUB if del_cost != None: del_value = score[i - 1][j] + del_cost[i - 1][j - 1] else: del_value = score[i - 1][j] + 0.75 if ins_cost != None: ins_value = score[i][j - 1] + ins_cost[i - 1][j - 1] else: ins_value = score[i][j - 1] + 0.75 if sub_value <= del_value: if sub_value <= ins_value: score[i][j] = sub_value backtrack[i][j] = sub_type else: score[i][j] = ins_value backtrack[i][j] = INS else: if del_value < ins_value: score[i][j] = del_value; backtrack[i][j] = DEL else: score[i][j] = ins_value; backtrack[i][j] = INS alignment = [] i = num_ref j = num_hyp num_errors = 0 while i > 0 or j > 0: if backtrack[i][j] == OK: alignment.insert(0, [ref_words[i - 1], hyp_words[j - 1]]) i = i - 1 j = j - 1 elif backtrack[i][j] == SUB: num_errors += 1 alignment.insert(0, [ref_words[i - 1], hyp_words[j - 1]]) i = i - 1 j = j - 1 elif backtrack[i][j] == INS: num_errors += 1 alignment.insert(0, [None, hyp_words[j - 1]]) j = j - 1 elif backtrack[i][j] == DEL: num_errors += 1 alignment.insert(0, [ref_words[i - 1], None]) i = i - 1 return num_errors, num_ref, alignment, score def print_alignment(alignment): ref = [] hyp = [] for pair in alignment: if pair[0] == None: ref.append('*' * len(pair[1])) hyp.append(pair[1]) elif pair[1] == None: ref.append(pair[0]) hyp.append('*' * len(pair[0])) else: if len(pair[0]) > len(pair[1]): ref.append(pair[0]) hyp.append(pair[1] + ' ' * (len(pair[0]) - len(pair[1]))) else: ref.append(pair[0] + ' ' * (len(pair[1]) - len(pair[0]))) hyp.append(pair[1]) print ' '.join(ref) print ' '.join(hyp) def wer(ref, hyp): num_errors, num_ref, alignment, score = align(ref, hyp) return num_errors if __name__ == '__main__': ref = "hello" hyp = "hollow" num_errors, num_ref, alignment, score = align(ref, hyp) print_alignment(alignment) print "error_rate:", float(num_errors) / num_ref