Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import sys
def align(ref_words, hyp_words, sub_cost=None, ins_cost=None, del_cost=None):
num_ref = len(ref_words)
num_hyp = len(hyp_words)
if num_hyp == 0:
return num_ref, num_ref, [], []
if num_ref == 0:
return 0, 0, [], []
score = []
backtrack = []
OK = 0
SUB = 1
DEL = 2
INS = 3
for i in range(num_ref + 1):
score.append([0] * (num_hyp + 1))
backtrack.append([0] * (num_hyp + 1))
if del_cost == None or i == 0:
score[i][0] = i
else:
score[i][0] = score[i - 1][0] + del_cost[i - 1][0]
backtrack[i][0] = DEL
for i in range(num_hyp + 1):
if ins_cost == None or i == 0:
score[0][i] = i
else:
score[0][i] = score[0][i - 1] + ins_cost[0][i - 1]
backtrack[0][i] = INS
for i in range(1, num_ref + 1):
for j in range(1, num_hyp + 1):
sub_type = OK
sub_value = score[i - 1][j - 1]
if ref_words[i - 1] != hyp_words[j - 1]:
if sub_cost != None:
sub_value += sub_cost[i - 1][j - 1]
else:
sub_value += 1
sub_type = SUB
if del_cost != None:
del_value = score[i - 1][j] + del_cost[i - 1][j - 1]
else:
del_value = score[i - 1][j] + 0.75
if ins_cost != None:
ins_value = score[i][j - 1] + ins_cost[i - 1][j - 1]
else:
ins_value = score[i][j - 1] + 0.75
if sub_value <= del_value:
if sub_value <= ins_value:
score[i][j] = sub_value
backtrack[i][j] = sub_type
else:
score[i][j] = ins_value
backtrack[i][j] = INS
else:
if del_value < ins_value:
score[i][j] = del_value;
backtrack[i][j] = DEL
else:
score[i][j] = ins_value;
backtrack[i][j] = INS
alignment = []
i = num_ref
j = num_hyp
num_errors = 0
while i > 0 or j > 0:
if backtrack[i][j] == OK:
alignment.insert(0, [ref_words[i - 1], hyp_words[j - 1]])
i = i - 1
j = j - 1
elif backtrack[i][j] == SUB:
num_errors += 1
alignment.insert(0, [ref_words[i - 1], hyp_words[j - 1]])
i = i - 1
j = j - 1
elif backtrack[i][j] == INS:
num_errors += 1
alignment.insert(0, [None, hyp_words[j - 1]])
j = j - 1
elif backtrack[i][j] == DEL:
num_errors += 1
alignment.insert(0, [ref_words[i - 1], None])
i = i - 1
return num_errors, num_ref, alignment, score
def print_alignment(alignment):
ref = []
hyp = []
for pair in alignment:
if pair[0] == None:
ref.append('*' * len(pair[1]))
hyp.append(pair[1])
elif pair[1] == None:
ref.append(pair[0])
hyp.append('*' * len(pair[0]))
else:
if len(pair[0]) > len(pair[1]):
ref.append(pair[0])
hyp.append(pair[1] + ' ' * (len(pair[0]) - len(pair[1])))
else:
ref.append(pair[0] + ' ' * (len(pair[1]) - len(pair[0])))
hyp.append(pair[1])
print ' '.join(ref)
print ' '.join(hyp)
def wer(ref, hyp):
num_errors, num_ref, alignment, score = align(ref, hyp)
return num_errors
if __name__ == '__main__':
ref = "hello"
hyp = "hollow"
num_errors, num_ref, alignment, score = align(ref, hyp)
print_alignment(alignment)
print "error_rate:", float(num_errors) / num_ref