levenstein.py

import sys

def align(ref_words, hyp_words, sub_cost=None, ins_cost=None, del_cost=None):
    num_ref = len(ref_words)
    num_hyp = len(hyp_words)
    if num_hyp == 0:
        return num_ref, num_ref, [], []
    if num_ref == 0:
        return 0, 0, [], []
    score = []
    backtrack = []
    OK = 0
    SUB = 1
    DEL = 2
    INS = 3
    for i in range(num_ref + 1):
        score.append([0] * (num_hyp + 1))
        backtrack.append([0] * (num_hyp + 1))
        if del_cost == None or i == 0:
            score[i][0] = i
        else:
            score[i][0] = score[i - 1][0] + del_cost[i - 1][0]
        backtrack[i][0] = DEL
    for i in range(num_hyp + 1):
        if ins_cost == None or i == 0:
            score[0][i] = i
        else:
            score[0][i] = score[0][i - 1] + ins_cost[0][i - 1]
        backtrack[0][i] = INS
    for i in range(1, num_ref + 1):
        for j in range(1, num_hyp + 1):
            sub_type = OK
            sub_value = score[i - 1][j - 1]
            if ref_words[i - 1] != hyp_words[j - 1]:
                if sub_cost != None:
                    sub_value += sub_cost[i - 1][j - 1]
                else:
                    sub_value += 1
                sub_type = SUB
            if del_cost != None:
                del_value = score[i - 1][j] + del_cost[i - 1][j - 1]
            else:
                del_value = score[i - 1][j] + 0.75
            if ins_cost != None:
                ins_value = score[i][j - 1] + ins_cost[i - 1][j - 1]
            else:
                ins_value = score[i][j - 1] + 0.75
            if sub_value <= del_value:
                if sub_value <= ins_value:
                    score[i][j] = sub_value
                    backtrack[i][j] = sub_type
                else:
                    score[i][j] = ins_value
                    backtrack[i][j] = INS
            else:
                if del_value < ins_value:
                    score[i][j] = del_value;
                    backtrack[i][j] = DEL
                else:
                    score[i][j] = ins_value;
                    backtrack[i][j] = INS
    alignment = []
    i = num_ref
    j = num_hyp
    num_errors = 0
    while i > 0 or j > 0:
        if backtrack[i][j] == OK:
            alignment.insert(0, [ref_words[i - 1], hyp_words[j - 1]])
            i = i - 1
            j = j - 1
        elif backtrack[i][j] == SUB:
            num_errors += 1
            alignment.insert(0, [ref_words[i - 1], hyp_words[j - 1]])
            i = i - 1
            j = j - 1
        elif backtrack[i][j] == INS:
            num_errors += 1
            alignment.insert(0, [None, hyp_words[j - 1]])
            j = j - 1
        elif backtrack[i][j] == DEL:
            num_errors += 1
            alignment.insert(0, [ref_words[i - 1], None])
            i = i - 1

    return num_errors, num_ref, alignment, score

def print_alignment(alignment):
    ref = []
    hyp = []
    for pair in alignment:
        if pair[0] == None:
            ref.append('*' * len(pair[1]))
            hyp.append(pair[1])
        elif pair[1] == None:
            ref.append(pair[0])
            hyp.append('*' * len(pair[0]))
        else:
            if len(pair[0]) > len(pair[1]):
                ref.append(pair[0])
                hyp.append(pair[1] + ' ' * (len(pair[0]) - len(pair[1])))
            else:
                ref.append(pair[0] + ' ' * (len(pair[1]) - len(pair[0])))
                hyp.append(pair[1])
    print ' '.join(ref)
    print ' '.join(hyp)

def wer(ref, hyp):
    num_errors, num_ref, alignment, score = align(ref, hyp)
    return num_errors

if __name__ == '__main__':
    ref = "hello"
    hyp = "hollow"
    num_errors, num_ref, alignment, score = align(ref, hyp)
    print_alignment(alignment)
    print "error_rate:", float(num_errors) / num_ref