Skip to content
Snippets Groups Projects
levenstein.py 3.72 KiB
Newer Older
  • Learn to ignore specific revisions
  • Benoit Favre's avatar
    Benoit Favre committed
    import sys
    
    def align(ref_words, hyp_words, sub_cost=None, ins_cost=None, del_cost=None):
        num_ref = len(ref_words)
        num_hyp = len(hyp_words)
        if num_hyp == 0:
            return num_ref, num_ref, [], []
        if num_ref == 0:
            return 0, 0, [], []
        score = []
        backtrack = []
        OK = 0
        SUB = 1
        DEL = 2
        INS = 3
        for i in range(num_ref + 1):
            score.append([0] * (num_hyp + 1))
            backtrack.append([0] * (num_hyp + 1))
            if del_cost == None or i == 0:
                score[i][0] = i
            else:
                score[i][0] = score[i - 1][0] + del_cost[i - 1][0]
            backtrack[i][0] = DEL
        for i in range(num_hyp + 1):
            if ins_cost == None or i == 0:
                score[0][i] = i
            else:
                score[0][i] = score[0][i - 1] + ins_cost[0][i - 1]
            backtrack[0][i] = INS
        for i in range(1, num_ref + 1):
            for j in range(1, num_hyp + 1):
                sub_type = OK
                sub_value = score[i - 1][j - 1]
                if ref_words[i - 1] != hyp_words[j - 1]:
                    if sub_cost != None:
                        sub_value += sub_cost[i - 1][j - 1]
                    else:
                        sub_value += 1
                    sub_type = SUB
                if del_cost != None:
                    del_value = score[i - 1][j] + del_cost[i - 1][j - 1]
                else:
                    del_value = score[i - 1][j] + 0.75
                if ins_cost != None:
                    ins_value = score[i][j - 1] + ins_cost[i - 1][j - 1]
                else:
                    ins_value = score[i][j - 1] + 0.75
                if sub_value <= del_value:
                    if sub_value <= ins_value:
                        score[i][j] = sub_value
                        backtrack[i][j] = sub_type
                    else:
                        score[i][j] = ins_value
                        backtrack[i][j] = INS
                else:
                    if del_value < ins_value:
                        score[i][j] = del_value;
                        backtrack[i][j] = DEL
                    else:
                        score[i][j] = ins_value;
                        backtrack[i][j] = INS
        alignment = []
        i = num_ref
        j = num_hyp
        num_errors = 0
        while i > 0 or j > 0:
            if backtrack[i][j] == OK:
                alignment.insert(0, [ref_words[i - 1], hyp_words[j - 1]])
                i = i - 1
                j = j - 1
            elif backtrack[i][j] == SUB:
                num_errors += 1
                alignment.insert(0, [ref_words[i - 1], hyp_words[j - 1]])
                i = i - 1
                j = j - 1
            elif backtrack[i][j] == INS:
                num_errors += 1
                alignment.insert(0, [None, hyp_words[j - 1]])
                j = j - 1
            elif backtrack[i][j] == DEL:
                num_errors += 1
                alignment.insert(0, [ref_words[i - 1], None])
                i = i - 1
    
        return num_errors, num_ref, alignment, score
    
    def print_alignment(alignment):
        ref = []
        hyp = []
        for pair in alignment:
            if pair[0] == None:
                ref.append('*' * len(pair[1]))
                hyp.append(pair[1])
            elif pair[1] == None:
                ref.append(pair[0])
                hyp.append('*' * len(pair[0]))
            else:
                if len(pair[0]) > len(pair[1]):
                    ref.append(pair[0])
                    hyp.append(pair[1] + ' ' * (len(pair[0]) - len(pair[1])))
                else:
                    ref.append(pair[0] + ' ' * (len(pair[1]) - len(pair[0])))
                    hyp.append(pair[1])
        print ' '.join(ref)
        print ' '.join(hyp)
    
    def wer(ref, hyp):
        num_errors, num_ref, alignment, score = align(ref, hyp)
        return num_errors
    
    if __name__ == '__main__':
        ref = "hello"
        hyp = "hollow"
        num_errors, num_ref, alignment, score = align(ref, hyp)
        print_alignment(alignment)
        print "error_rate:", float(num_errors) / num_ref