Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from xml.etree import ElementTree as ET
from collections import defaultdict
# Size of the context window, in tokens. Section histories are pre-filled with
# this many integer placeholders, and history overlap scores are divided by it.
history_length = 5
# Inverted index shared by the whole module: history token -> list of the
# numbers of the keywords that were registered with that token in their history.
_index = defaultdict(list)
class Keyword(object):
    """One actionable keyword together with the tokens that preceded it.

    Creating an instance also registers it in the module-level ``_index``,
    which maps every history token to the numbers of the keywords that have
    that token in their history.

    Attributes:
        num: number of this keyword, as given by the caller.
        history: the tokens that preceded the keyword (stored as given).
        word: the keyword itself.
        action: the action associated with the keyword.
        section: ``int(section) - 1`` for the ``section`` argument.
    """

    def __init__(self, num, history, word, action, section):
        """Store the keyword fields and index the keyword by its history.

        Raises:
            TypeError, ValueError: if ``section`` cannot be converted by
                ``int()``.
        """
        self.history = history
        self.word = word
        self.num = num
        self.action = action
        self.section = int(section) - 1
        # Iterate over distinct tokens so that a token repeated in the history
        # registers this keyword only once, and use a loop name that does not
        # shadow the ``word`` parameter.
        for token in set(history):
            _index[token].append(num)

    def __repr__(self):
        return 'Keyword(num=%r, word=%r, action=%r, section=%r, history=%r)' % (
            self.num, self.word, self.action, self.section, self.history)
class SLU(object):
    """Keyword spotter built from an XML script of sections and sequences.

    ``self.words`` holds one ``Keyword`` per ``<keyword>`` element that has a
    non-blank ``action`` attribute, in document order. Keyword numbers are
    positions in ``self.words``.

    NOTE(review): keywords are indexed in the module-level ``_index``, which
    is shared by every instance, while keyword numbers are per instance.
    Creating more than one ``SLU`` in a process therefore looks unsafe.
    """

    def __init__(self, xml_filename):
        """Parse *xml_filename* and collect its actionable keywords."""
        self.words = []
        self.parse_xml(xml_filename)

    def parse_xml(self, filename):
        """Append a ``Keyword`` to ``self.words`` for each actionable keyword.

        The running token history is reset at the start of every
        ``<section>`` and carried across the ``<sequence>`` elements inside
        it. Each keyword is recorded with the ``history_length - 1`` tokens
        that precede it and with the last token of its own text as the word.

        Args:
            filename: path or file object accepted by ``ElementTree.parse``.
        """
        root = ET.parse(filename)
        for section in root.findall(".//section"):
            # Integer placeholders guarantee that the slices below always find
            # enough context, even for a keyword at the start of a section.
            history = list(range(history_length - 1, -1, -1))
            for sequence in section.findall('./sequence'):
                # ElementTree uses None (not '') for absent text and tail.
                history.extend((sequence.text or '').split())
                for node in sequence:
                    if node.tag == 'keyword':
                        history.extend((node.text or '').split())
                        action = node.get('action')
                        # A missing attribute is treated like a blank one.
                        if action is not None and action.strip() != '':
                            self.words.append(Keyword(
                                len(self.words),
                                history[-history_length:-1],
                                history[-1],
                                action,
                                section.get("id")))
                    # Text that follows any child element belongs to the
                    # sequence and so to the running history.
                    history.extend((node.tail or '').split())

    def process(self, history, target, expected=-1, section=-1):
        """Return the known keyword that best matches *target* in context.

        Candidates are the keywords sharing at least one token with the last
        ``history_length`` tokens of *history*. Each candidate is scored as

            (shared history tokens) / history_length
            + (characters shared by the keyword and target) / len(target)

        and the highest score wins. Candidates are visited in increasing
        keyword number, so without *expected* a tie goes to the earliest
        keyword.

        Args:
            history: sequence of tokens heard before *target*.
            target: the token to match; an empty target scores 0 on the
                character part instead of dividing by zero.
            expected: keyword number used to break ties between equally
                scored candidates (the closest number wins), or -1 to disable
                it. Distance is taken as the absolute difference of keyword
                numbers; the original code never defined it, so this is the
                reviewer's reading of the intent.
            section: if not -1, only keywords of this section are considered.

        Returns:
            The best matching keyword, or None when nothing qualifies.
        """
        padding = list(range(history_length - 1, -1, -1))
        context = (padding + list(history))[-history_length:]

        candidates = set()
        for token in context:
            # .get() rather than indexing: indexing a defaultdict would add an
            # empty entry for every unknown token that is ever queried.
            candidates.update(_index.get(token, ()))

        context_set = set(context)
        target_chars = set(target)
        best = None
        best_value = 0
        best_distance = None
        for num in sorted(candidates):
            keyword = self.words[num]
            if section != -1 and keyword.section != section:
                continue
            history_score = (len(context_set & set(keyword.history))
                             / float(history_length))
            if target:
                target_score = (len(set(keyword.word) & target_chars)
                                / float(len(target)))
            else:
                target_score = 0.0
            value = history_score + target_score
            distance = abs(keyword.num - expected) if expected != -1 else None
            if value > best_value:
                # A strictly better score also resets the tie-break distance.
                best, best_value, best_distance = keyword, value, distance
            elif (expected != -1 and value == best_value
                  and (best_distance is None or distance < best_distance)):
                best, best_distance = keyword, distance
        return best

    def last_in_section(self, word):
        """Return True if *word* is the final keyword of its section."""
        return (word.num == len(self.words) - 1
                or self.words[word.num + 1].section != word.section)

    def expected_at_section_start(self, section):
        """Return the number of the first keyword in *section*, or -1."""
        for word in self.words:
            if word.section == section:
                return word.num
        return -1
if __name__ == '__main__':
    # Manual smoke test: load the script and look up one keyword in context.
    slu = SLU('data/homeostasis_25nov.xml')
    history = 'open technical'.split()
    word = 'characteristics'
    found = slu.process(history, word)
    if found is None:
        # process() returns None when no keyword qualifies.
        print('no keyword matched %r after %r' % (word, history))
    else:
        # A single formatted string prints the same text on Python 2 and 3.
        print('%s %s %s %s %s' % (
            found.word, found.section, found.action, found.history, word))