Skip to content
Snippets Groups Projects
Commit fffe9643 authored by Benoit Favre's avatar Benoit Favre
Browse files

initial commit

parents
No related branches found
No related tags found
No related merge requests found
#-*- coding: utf-8 -*-
"""Demo driver: summarize one RATP conversation with a fixed threshold."""
import os
import summarizer

# Conversation identifier (transcript file name in the corpus).
convID = "20091112_RATP_SCD_0001.tsv"
# Summarization threshold.
seuil = 0.1
# NOTE(review): 'conv' is never defined in this script -- running it raises
# NameError.  Per the original comment it should hold the parsed conversation
# (something like summarizer.Word objects); restore its construction first.
# conv ~ summarizer.Word()
# print(x) with a single argument is valid in both Python 2 and 3; the
# original bare "print x" statement is a SyntaxError under Python 3.
print(summarizer.summarize(conv, seuil, convID))
This diff is collapsed.
This diff is collapsed.
# -*- coding: utf-8 -*-
import sys, re, os
import summarizer as sz
#if len(sys.argv) < 2:
# print >>sys.stderr, 'usage: cat <synopses> | %s <tsv+>' % sys.argv[0]
# sys.exit(1)
def unidecode(text):
    """Strip French accents from UTF-8 *text* and normalize '+' to '-'.

    Takes encoded bytes, works on the decoded string, and returns the
    result re-encoded as UTF-8 bytes.
    """
    # (character class, replacement) pairs, applied in order.
    accent_rules = [
        (u'[éÉèÈêÊëË]', 'e'),
        (u'[ïÏîÎìÌ]', 'i'),
        (u'[öÖôÔòÒ]', 'o'),
        (u'[àÀäÄâÂ]', 'a'),
        (u'[çÇ]', 'c'),
        (u'[üÜûÛùÙ]', 'u'),
    ]
    decoded = text.decode('utf8')
    for char_class, replacement in accent_rules:
        decoded = re.sub(char_class, replacement, decoded)
    # '+' marks multi-word joins elsewhere in the pipeline; map it to '-'.
    decoded = decoded.replace('+', '-')
    return decoded.encode('utf8')
# Words which can match interchangeably when aligning synopses with transcripts.
# NOTE(review): the original dict literal repeated the keys 'bus' and 'rer';
# in a dict literal the last duplicate silently wins, so only the surviving
# mappings are kept here ('bus' -> 'autobus', 'rer' -> 'ligne').  Runtime
# behavior is unchanged; if the shadowed pairs ('bus' -> 'ligne',
# 'rer' -> 'train') were actually intended, this table must become
# multi-valued (e.g. dict of lists).
equivalent_words = {
    'sucy': 'sussis',
    'rer a': 'rer',
    'rer b': 'rer',
    'rer c': 'rer',
    'cdg': 'charles-de-gaulle',
    'rer': 'ligne',
    'bus': 'autobus',
    'square': 'place',
    'ligne': 'bus',
    'cles': 'clefs',
    'anthony': 'antony',
    'station-la': 'sation',
    'roissy': 'aeroport',
    'cour de vincennes': 'cours de vincennes',
    'une': 'un',
}

# Multi-words from the corpus, used to retokenize synopses; normalized the
# same way as the transcripts (accents stripped, lower-cased).
with open('multiword-lexicon.txt') as fp:
    multiwords = [unidecode(line.strip()).lower() for line in fp]

# Per-conversation topic annotations: file id -> list of topic labels.
# 'NO' entries mean "no topic" and are dropped.
topics = {}
with open('topics.csv') as fp:
    for line in fp:
        tokens = line.strip().split(';')
        topics[tokens[0]] = [x for x in tokens[1:] if x != 'NO']

from speaker_type import SpeakerType
speaker_type = SpeakerType('mapping-by-id.txt')

# Detects annotated slots in synopses: captures the variable name and the
# covered text from the <a class="instance" ...> markup.
pattern = re.compile(r'<a class="instance" variable="([^"]*)" style="color:[^"]*" title="[^"]*" href="#">(.*?)<')
def output_phrases(sentences, show):
    """Dump one icsiboost feature line per noun token found in *sentences*.

    For every word whose POS tag starts with 'n', build a comma-separated
    feature vector (commas inside values are escaped as '<comma>') followed
    by the label and a trailing '.', in icsiboost test-file format.  All
    lines are written to "source/icsiboost.test" and also returned.

    Args:
        sentences: iterable of sentences; each sentence is an indexable
            sequence of word objects exposing text, postag, lemma, parent,
            named_entity, dep_label, get_phrase, ... (project-defined type).
        show: show/conversation identifier; emitted as the first feature
            and used to resolve the speaker type.

    Returns:
        The list of formatted feature strings, one per noun token.
    """
    from collections import defaultdict
    # Running counts of named-entity types and lemmas seen so far in the show.
    seen = defaultdict(int)
    tab_features = []
    for sentence_num, sentence in enumerate(sentences):
        for word in sentence:
            if word.postag.lower().startswith('n'):
                #label = word.variable[0][2:] if len(word.variable) > 0 and not word.has_parent(sentence, lambda x: len(x.variable) > 0) else 'O'
                label = 'O'
                # Dependency parent (lemma/POS), or 'ROOT' for the head word.
                parent = sentence[word.parent].lemma if word.parent >= 0 else 'ROOT'
                parent_pos = sentence[word.parent].postag if word.parent >= 0 else 'ROOT'
                features = [show, word.text, word.postag, word.lemma, word.named_entity[2:], parent, parent_pos, word.dep_label]
                # Get all children that don't depend on a verb (or a disfluency).
                phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL')  # or x.text == "jusqu'à"
                features.append(' '.join([x.text for x in phrase]))
                features.append(' '.join([x.postag for x in phrase]))
                features.append(seen['ne:' + word.named_entity[2:]])
                features.append(seen['word:' + word.lemma])
                # Whether the parent precedes (-1) or follows (+1) this word.
                features.append('-1' if word.parent < word.local_id else '+1')
                features.append(' '.join(topics[word.filename]) if word.filename in topics else '')
                features.append(len(phrase))
                # NOTE(review): under Python 2 this is integer division and is
                # always 0; under Python 3 it is a float ratio.  Confirm which
                # value the trained model actually expects.
                features.append(sentence_num / len(sentences))
                features.append(speaker_type.resolve(show, word.speaker))
                features.append(':'.join([str(word.global_id), str(word.local_id)]))
                ### Uncomment to run (original comment: "Décommenter pour lancer")
                toprint = ','.join([str(x).replace(',', '<comma>') for x in features + [label]]) + '.'
                tab_features.append(toprint)
                # print tab_features
                seen['ne:' + word.named_entity[2:]] += 1
                seen['word:' + word.lemma] += 1
    # 'with' guarantees the handle is closed even if a write fails; the
    # original open()/close() pair leaked the descriptor on any exception.
    with open("source/icsiboost.test", "w") as fic:
        for line in tab_features:
            fic.write(line)
            fic.write("\n")
    return tab_features
O, card_type, end_stop, frequency, from, issue, item, line, location, not_transport, pass_type, retrieve_location, start_stop, time, to, towards, transport, info_target, buy.
show: ignore.
word: text.
word_pos: text.
lemma: text.
named_entity: ignore.
parent: text.
parent_pos: text.
dep_label: text.
children: text.
children_pos: text.
num_ne: ignore.
num_lemma: continuous.
parent_side: text.
topic: ignore.
length: continuous.
sentence_order: ignore.
speaker_type: ignore.
link: ignore.
This diff is collapsed.
20091112_RATP_SCD_0001.tsv,bus,nc,bus,LL,avoir,v,OBJ,un bus,det nc,0,0,-1,ITNR,2,0,None,2:10,O.
20091112_RATP_SCD_0001.tsv,rue,nc,rue,A,desservir,v,OBJ,la rue Ledru-Rollin à Bagneux,det nc np prep nc,0,0,-1,ITNR,5,0,None,2:14,O.
20091112_RATP_SCD_0001.tsv,Ledru-Rollin,np,Ledru-Rollin,A,rue,nc,MOD,Ledru-Rollin à Bagneux,np prep nc,1,0,-1,ITNR,3,0,None,2:15,O.
20091112_RATP_SCD_0001.tsv,Bagneux,nc,Bagneux,A,à,prep,OBJ,Bagneux,nc,2,0,-1,ITNR,1,0,None,2:17,O.
20091112_RATP_SCD_0001.tsv,rue,nc,rue,A,ROOT,ROOT,ROOT,la rue Ledru-Rollin à Bagneux,det nc np prep nc,3,1,-1,ITNR,5,0,None,3:1,O.
20091112_RATP_SCD_0001.tsv,Ledru,np,Ledru,A,ROOT,ROOT,NOLINK,Ledru,np,4,0,-1,ITNR,1,0,None,3:2,O.
20091112_RATP_SCD_0001.tsv,Ledru-Rollin,np,Ledru-Rollin,A,rue,nc,MOD,Ledru-Rollin à Bagneux,np prep nc,5,1,-1,ITNR,3,0,None,3:3,O.
20091112_RATP_SCD_0001.tsv,Bagneux,nc,Bagneux,A,à,prep,OBJ,Bagneux,nc,6,1,-1,ITNR,1,0,None,3:5,O.
20091112_RATP_SCD_0001.tsv,Paris,nc,Paris,A,de,prep,OBJ,Paris,nc,7,0,-1,ITNR,1,0,None,6:1,O.
This diff is collapsed.
def load(filename):
    """Load a tab-separated speaker mapping file.

    Each line carries at least three tab-separated fields:
    show, speaker_id, speaker_type (extra fields are ignored).

    Returns a dict mapping (show, speaker_id) -> speaker_type.
    """
    entries = {}
    with open(filename) as handle:
        for raw_line in handle:
            show_name, spk_id, spk_type = raw_line.strip().split('\t')[:3]
            entries[(show_name, spk_id)] = spk_type
    return entries
class SpeakerType:
    """Resolves a speaker's type from a (show, speaker_id) mapping file."""

    def __init__(self, mapping_filename):
        # Mapping from (show, speaker_id) to speaker type, loaded once.
        self.mapping = load(mapping_filename)

    def resolve(self, show, speaker_id):
        """Return the type for (show, speaker_id), or None when unknown."""
        return self.mapping.get((show, speaker_id))
if __name__ == '__main__':
    # CLI: argv[1] is the mapping file; read "show speaker_id" pairs from
    # stdin and print each resolved speaker type (None when unmapped).
    import sys
    speaker_type = SpeakerType(sys.argv[1])
    for line in sys.stdin:
        # print(x) with a single argument behaves identically in Python 2
        # and 3; the original bare "print x" statement is a SyntaxError
        # under Python 3.
        print(speaker_type.resolve(*line.strip().split()))
à+cause+d'
à+côté+d'
afin+d'
afin+qu'
alors+qu'
à+moins+qu'
à+partir+d'
après+qu'
à+propos+d'
à+savoir+qu'
au+bout+d'
au+centre+d'
au+lieu+d'
au+milieu+d'
au+moment+d'
au+moyen+d'
au+nom+d'
auprès+d'
au+sujet+d'
autant+d'
avant+d'
avant+qu'
beaucoup+d'
bien+qu'
bon+app'
c'
ce+qu'
combien+d'
d'
de+parce+qu'
depuis+qu'
des+fois+qu'
dès+qu'
du+côté+d'
du+fait+qu'
en+début+d'
en+face+d'
en+fait+d'
en+fin+d'
en+milieu+d'
en+tant+qu'
en+train+d'
est-ce+qu'
est+ce+qu'
étant+donné+qu'
j'
jusqu'
l'
le+temps+qu'
loin+d'
lors+d'
lorsqu'
m'
maintenant+qu'
malgré+qu'
manip'
mat'
même+s'
n'
où+est-ce+qu'
parce+qu'
pas+mal+d'
pendant+qu'
peu+d'
plein+d'
plus+qu'
plutôt+qu'
pour+qu'
près+d'
puisqu'
qu'
quand+est-ce+qu'
qu'est-ce+qu'
quoi+qu'
s'
sachant+qu'
sauf+qu'
si+bien+qu'
surtout+qu'
t'
tandis+qu'
tant+qu'
tels+qu'
toute+l'
tout+l'
une+fois+qu'
un+peu+d'
vélib'
Vélib'
vu+qu'
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment