# -*- coding: utf-8 -*-
import sys, re, os
import summarizer as sz
#if len(sys.argv) < 2:
#    print >>sys.stderr, 'usage: cat <synopses> | %s <tsv+>' % sys.argv[0]
#    sys.exit(1)
# remove accents from the text and normalize '+' to '-'
def unidecode(text):
    text = text.decode('utf8')
    text = re.sub(u'[éÉèÈêÊëË]', 'e', text)
    text = re.sub(u'[ïÏîÎìÌ]', 'i', text)
    text = re.sub(u'[öÖôÔòÒ]', 'o', text)
    text = re.sub(u'[àÀäÄâÂ]', 'a', text)
    text = re.sub(u'[çÇ]', 'c', text)
    text = re.sub(u'[üÜûÛùÙ]', 'u', text)
    text = text.replace('+', '-')
    return text.encode('utf8')
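# example: unidecode('aéroport Créteil') returns 'aeroport Creteil'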
# words which can match interchangeably
# FIXME: 'bus' and 'rer' appear twice below; a dict literal keeps only the
# last value per key, so 'bus' -> 'autobus' and 'rer' -> 'ligne' win
equivalent_words = {
    'sucy': 'sussis', 'bus': 'ligne', 'rer a': 'rer', 'rer b': 'rer',
    'rer c': 'rer', 'cdg': 'charles-de-gaulle', 'rer': 'train',
    'rer': 'ligne', 'bus': 'autobus', 'square': 'place', 'ligne': 'bus',
    'cles': 'clefs', 'anthony': 'antony', 'station-la': 'sation',
    'roissy': 'aeroport', 'cour de vincennes': 'cours de vincennes',
    'une': 'un'}
# multi-words from the corpus, used to retokenize the synopses
with open('multiword-lexicon.txt') as fp:
    multiwords = [unidecode(x.strip()).lower() for x in fp.readlines()]
topics = {}
with open('topics.csv') as fp:
    for line in fp.readlines():
        tokens = line.strip().split(';')
        # first field is the synopsis file name, the rest are topic labels ('NO' = no topic)
        topics[tokens[0]] = [x for x in tokens[1:] if x != 'NO']
from speaker_type import SpeakerType
speaker_type = SpeakerType('mapping-by-id.txt')
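# SpeakerType presumably maps a (show, speaker id) pair to a coarse speaker
# category via mapping-by-id.txt; only its resolve() method is used below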
# to detect slots in annotated synopses
pattern = re.compile(r'<a class="instance" variable="([^"]*)" style="color:[^"]*" title="[^"]*" href="#">(.*?)<')
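# e.g. '<a class="instance" variable="lieu" style="color:#f00" title="t" href="#">gare de Lyon<'
# matches with groups ('lieu', 'gare de Lyon'); the variable name is illustrative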
def output_phrases(sentences, show):
    from collections import defaultdict
    seen = defaultdict(int)
    tab_features = []
    for sentence_num, sentence in enumerate(sentences):
        for word in sentence:
            if word.postag.lower().startswith('n'):
                #label = word.variable[0][2:] if len(word.variable) > 0 and not word.has_parent(sentence, lambda x: len(x.variable) > 0) else 'O'
                label = 'O'
                parent = sentence[word.parent].lemma if word.parent >= 0 else 'ROOT'
                parent_pos = sentence[word.parent].postag if word.parent >= 0 else 'ROOT'
                features = [show, word.text, word.postag, word.lemma, word.named_entity[2:], parent, parent_pos, word.dep_label]
                # get all children that don't depend on a verb
                phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL') # or x.text == "jusqu'à"
                features.append(' '.join([x.text for x in phrase]))
                features.append(' '.join([x.postag for x in phrase]))
                features.append(seen['ne:' + word.named_entity[2:]])
                features.append(seen['word:' + word.lemma])
                features.append('-1' if word.parent < word.local_id else '+1')
                features.append(' '.join(topics[word.filename]) if word.filename in topics else '')
                features.append(len(phrase))
                # relative position of the sentence in the synopsis; float() is
                # required under Python 2, where int / int truncates to zero
                features.append(float(sentence_num) / len(sentences))
                features.append(speaker_type.resolve(show, word.speaker))
                features.append(':'.join([str(word.global_id), str(word.local_id)]))
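                # columns so far: show, word, POS, lemma, NE type, parent lemma,
                # parent POS, dependency label, phrase, phrase POS tags,
                # NE-type count so far, lemma count so far, head direction,
                # show topics, phrase length, sentence position, speaker type,
                # and global:local word id; the gold label is appended below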
                ### Uncomment to run
                toprint = ','.join([str(x).replace(',', '<comma>') for x in features + [label]]) + '.'
                tab_features.append(toprint)
                # print tab_features
                seen['ne:' + word.named_entity[2:]] += 1
                seen['word:' + word.lemma] += 1
    with open("source/icsiboost.test", "w") as fic:
        for line in tab_features:
            fic.write(line)
            fic.write("\n")
    return tab_features
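# a minimal, commented-out sketch of a driver; the summarizer module's loading
# API is not shown in this file, so sz.load() below is hypothetical
#if __name__ == '__main__':
#    for show, sentences in sz.load(sys.argv[1:]):  # hypothetical loader
#        output_phrases(sentences, show)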