Commit fffe9643 authored by Benoit Favre

initial commit

# -*- coding: utf-8 -*-
import os
import summarizer

convID = "20091112_RATP_SCD_0001.tsv"
seuil = 0.1  # threshold passed to summarizer.summarize()

# conv ~ summarizer.Word(): the parsed conversation; the loader is not part of
# this commit, so conv is left undefined here.
print summarizer.summarize(conv, seuil, convID)
# -*- coding: utf-8 -*-
import sys, re, os
import summarizer as sz

#if len(sys.argv) < 2:
#    print >>sys.stderr, 'usage: cat <synopses> | %s <tsv+>' % sys.argv[0]
#    sys.exit(1)

# remove accents from text, normalize +
def unidecode(text):
    text = text.decode('utf8')
    text = re.sub(u'[éÉèÈêÊëË]', 'e', text)
    text = re.sub(u'[ïÏîÎìÌ]', 'i', text)
    text = re.sub(u'[öÖôÔòÒ]', 'o', text)
    text = re.sub(u'[àÀäÄâÂ]', 'a', text)
    text = re.sub(u'[çÇ]', 'c', text)
    text = re.sub(u'[üÜûÛùÙ]', 'u', text)
    text = text.replace('+', '-')
    return text.encode('utf8')

# words which can match interchangeably (note: duplicate keys such as 'bus' and
# 'rer' are overridden by their last occurrence when the dict literal is built)
equivalent_words = {'sucy': 'sussis', 'bus': 'ligne', 'rer a': 'rer', 'rer b': 'rer', 'rer c': 'rer', 'cdg': 'charles-de-gaulle', 'rer': 'train', 'rer': 'ligne', 'bus': 'autobus', 'square': 'place', 'ligne': 'bus', 'cles': 'clefs', 'anthony': 'antony', 'station-la': 'sation', 'roissy': 'aeroport', 'cour de vincennes': 'cours de vincennes', 'une': 'un'}

# multi-words from the corpus in order to retokenize synopses
with open('multiword-lexicon.txt') as fp:
    multiwords = [unidecode(x.strip()).lower() for x in fp.readlines()]

# topics per conversation, one semicolon-separated line per file: filename;topic;topic;...
topics = {}
with open('topics.csv') as fp:
    for line in fp.readlines():
        tokens = line.strip().split(';')
        topics[tokens[0]] = [x for x in tokens[1:] if x != 'NO']

from speaker_type import SpeakerType
speaker_type = SpeakerType('mapping-by-id.txt')

# to detect slots in annotated synopses
pattern = re.compile(r'<a class="instance" variable="([^"]*)" style="color:[^"]*" title="[^"]*" href="#">(.*?)<')

def output_phrases(sentences, show):
    from collections import defaultdict
    seen = defaultdict(int)
    tab_features = []
    for sentence_num, sentence in enumerate(sentences):
        for word in sentence:
            if word.postag.lower().startswith('n'):
                #label = word.variable[0][2:] if len(word.variable) > 0 and not word.has_parent(sentence, lambda x: len(x.variable) > 0) else 'O'
                label = 'O'
                parent = sentence[word.parent].lemma if word.parent >= 0 else 'ROOT'
                parent_pos = sentence[word.parent].postag if word.parent >= 0 else 'ROOT'
                features = [show, word.text, word.postag, word.lemma, word.named_entity[2:], parent, parent_pos, word.dep_label]
                # get all children that don't depend on a verb
                phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL') # or x.text == "jusqu'à"
                features.append(' '.join([x.text for x in phrase]))
                features.append(' '.join([x.postag for x in phrase]))
                features.append(seen['ne:' + word.named_entity[2:]])
                features.append(seen['word:' + word.lemma])
                features.append('-1' if word.parent < word.local_id else '+1')
                features.append(' '.join(topics[word.filename]) if word.filename in topics else '')
                features.append(len(phrase))
                features.append(sentence_num / len(sentences))  # integer division under Python 2, so always 0
                features.append(speaker_type.resolve(show, word.speaker))
                features.append(':'.join([str(word.global_id), str(word.local_id)]))
                ### uncomment to run
                toprint = ','.join([str(x).replace(',', '<comma>') for x in features + [label]]) + '.'
                tab_features.append(toprint)
                # print tab_features
                seen['ne:' + word.named_entity[2:]] += 1
                seen['word:' + word.lemma] += 1
    fic = open("source/icsiboost.test", "w")
    for line in tab_features:
        fic.write(line)
        fic.write("\n")
    fic.close()
    return tab_features
O, card_type, end_stop, frequency, from, issue, item, line, location, not_transport, pass_type, retrieve_location, start_stop, time, to, towards, transport, info_target, buy.
show: ignore.
word: text.
word_pos: text.
lemma: text.
named_entity: ignore.
parent: text.
parent_pos: text.
dep_label: text.
children: text.
children_pos: text.
num_ne: ignore.
num_lemma: continuous.
parent_side: text.
topic: ignore.
length: continuous.
sentence_order: ignore.
speaker_type: ignore.
link: ignore.
20091112_RATP_SCD_0001.tsv,bus,nc,bus,LL,avoir,v,OBJ,un bus,det nc,0,0,-1,ITNR,2,0,None,2:10,O.
20091112_RATP_SCD_0001.tsv,rue,nc,rue,A,desservir,v,OBJ,la rue Ledru-Rollin à Bagneux,det nc np prep nc,0,0,-1,ITNR,5,0,None,2:14,O.
20091112_RATP_SCD_0001.tsv,Ledru-Rollin,np,Ledru-Rollin,A,rue,nc,MOD,Ledru-Rollin à Bagneux,np prep nc,1,0,-1,ITNR,3,0,None,2:15,O.
20091112_RATP_SCD_0001.tsv,Bagneux,nc,Bagneux,A,à,prep,OBJ,Bagneux,nc,2,0,-1,ITNR,1,0,None,2:17,O.
20091112_RATP_SCD_0001.tsv,rue,nc,rue,A,ROOT,ROOT,ROOT,la rue Ledru-Rollin à Bagneux,det nc np prep nc,3,1,-1,ITNR,5,0,None,3:1,O.
20091112_RATP_SCD_0001.tsv,Ledru,np,Ledru,A,ROOT,ROOT,NOLINK,Ledru,np,4,0,-1,ITNR,1,0,None,3:2,O.
20091112_RATP_SCD_0001.tsv,Ledru-Rollin,np,Ledru-Rollin,A,rue,nc,MOD,Ledru-Rollin à Bagneux,np prep nc,5,1,-1,ITNR,3,0,None,3:3,O.
20091112_RATP_SCD_0001.tsv,Bagneux,nc,Bagneux,A,à,prep,OBJ,Bagneux,nc,6,1,-1,ITNR,1,0,None,3:5,O.
20091112_RATP_SCD_0001.tsv,Paris,nc,Paris,A,de,prep,OBJ,Paris,nc,7,0,-1,ITNR,1,0,None,6:1,O.
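To make the sample lines above easier to read, here is a small sketch that is not part of the commit: it pairs the fields of one generated icsiboost.test line with the column names declared in the .names stem above. Splitting on ',' is safe because output_phrases() rewrites embedded commas as '<comma>', and the trailing period is the terminator it appends to every line.

# Pair one generated test line with the feature names from the .names file above.
names = ['show', 'word', 'word_pos', 'lemma', 'named_entity', 'parent',
         'parent_pos', 'dep_label', 'children', 'children_pos', 'num_ne',
         'num_lemma', 'parent_side', 'topic', 'length', 'sentence_order',
         'speaker_type', 'link', 'label']
line = '20091112_RATP_SCD_0001.tsv,bus,nc,bus,LL,avoir,v,OBJ,un bus,det nc,0,0,-1,ITNR,2,0,None,2:10,O.'
values = line.rstrip('.').split(',')
for name, value in zip(names, values):
    print '%s = %s' % (name, value)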
def load(filename):
    # mapping file: one tab-separated "show speaker_id speaker_type" line per speaker
    output = {}
    with open(filename) as fp:
        for line in fp:
            show, speaker_id, speaker_type = line.strip().split('\t')[:3]
            output[(show, speaker_id)] = speaker_type
    return output

class SpeakerType:
    def __init__(self, mapping_filename):
        self.mapping = load(mapping_filename)

    def resolve(self, show, speaker_id):
        # return the speaker type for (show, speaker_id), or None if unmapped
        key = (show, speaker_id)
        if key in self.mapping:
            return self.mapping[key]
        return None

if __name__ == '__main__':
    import sys
    speaker_type = SpeakerType(sys.argv[1])
    for line in sys.stdin:
        print speaker_type.resolve(*line.strip().split())
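A short usage sketch for the module above; the mapping file name, speaker ids and speaker types are made up for illustration, only the tab-separated layout read by load() is taken from the code.

from speaker_type import SpeakerType

# Hypothetical mapping file: one tab-separated "show speaker_id speaker_type"
# line per speaker (the ids and types here are invented for the example).
with open('example-mapping.txt', 'w') as fp:
    fp.write('20091112_RATP_SCD_0001.tsv\tspk1\tagent\n')
    fp.write('20091112_RATP_SCD_0001.tsv\tspk2\tcaller\n')

st = SpeakerType('example-mapping.txt')
print st.resolve('20091112_RATP_SCD_0001.tsv', 'spk1')  # agent
print st.resolve('20091112_RATP_SCD_0001.tsv', 'spk3')  # None: speaker not in the mapping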
à+cause+d'
à+côté+d'
afin+d'
afin+qu'
alors+qu'
à+moins+qu'
à+partir+d'
après+qu'
à+propos+d'
à+savoir+qu'
au+bout+d'
au+centre+d'
au+lieu+d'
au+milieu+d'
au+moment+d'
au+moyen+d'
au+nom+d'
auprès+d'
au+sujet+d'
autant+d'
avant+d'
avant+qu'
beaucoup+d'
bien+qu'
bon+app'
c'
ce+qu'
combien+d'
d'
de+parce+qu'
depuis+qu'
des+fois+qu'
dès+qu'
du+côté+d'
du+fait+qu'
en+début+d'
en+face+d'
en+fait+d'
en+fin+d'
en+milieu+d'
en+tant+qu'
en+train+d'
est-ce+qu'
est+ce+qu'
étant+donné+qu'
j'
jusqu'
l'
le+temps+qu'
loin+d'
lors+d'
lorsqu'
m'
maintenant+qu'
malgré+qu'
manip'
mat'
même+s'
n'
où+est-ce+qu'
parce+qu'
pas+mal+d'
pendant+qu'
peu+d'
plein+d'
plus+qu'
plutôt+qu'
pour+qu'
près+d'
puisqu'
qu'
quand+est-ce+qu'
qu'est-ce+qu'
quoi+qu'
s'
sachant+qu'
sauf+qu'
si+bien+qu'
surtout+qu'
t'
tandis+qu'
tant+qu'
tels+qu'
toute+l'
tout+l'
une+fois+qu'
un+peu+d'
vélib'
Vélib'
vu+qu'
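The extraction script above loads this lexicon through its unidecode() helper, so the accented, '+'-joined entries end up lowercased, accent-stripped and joined with '-'. The sketch below is not part of the commit; it re-applies the same substitutions (only the subset needed for these three entries) to show the normalized form.

# -*- coding: utf-8 -*-
import re

def normalize(entry):
    # same transformations as unidecode() in the extraction script:
    # strip accents (subset shown) and rewrite '+' as '-'
    entry = entry.decode('utf8')
    entry = re.sub(u'[éÉèÈêÊëË]', 'e', entry)
    entry = re.sub(u'[àÀäÄâÂ]', 'a', entry)
    entry = entry.replace('+', '-')
    return entry.encode('utf8').lower()

for raw in ["à+cause+d'", "est-ce+qu'", "étant+donné+qu'"]:
    print normalize(raw)
# a-cause-d'
# est-ce-qu'
# etant-donne-qu'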