Commit fffe9643 authored by Benoit Favre

initial commit

# -*- coding: utf-8 -*-
import os
import summarizer

convID = "20091112_RATP_SCD_0001.tsv"
seuil = 0.1  # threshold passed to summarizer.summarize()

# conv ~ summarizer.Word(): the parsed conversation; the loader is not part of
# this commit, so conv is left undefined here.
print summarizer.summarize(conv, seuil, convID)
# -*- coding: utf-8 -*-
import sys, re, os
import summarizer as sz

#if len(sys.argv) < 2:
#    print >>sys.stderr, 'usage: cat <synopses> | %s <tsv+>' % sys.argv[0]
#    sys.exit(1)

# remove accents from text, normalize +
def unidecode(text):
    text = text.decode('utf8')
    text = re.sub(u'[éÉèÈêÊëË]', 'e', text)
    text = re.sub(u'[ïÏîÎìÌ]', 'i', text)
    text = re.sub(u'[öÖôÔòÒ]', 'o', text)
    text = re.sub(u'[àÀäÄâÂ]', 'a', text)
    text = re.sub(u'[çÇ]', 'c', text)
    text = re.sub(u'[üÜûÛùÙ]', 'u', text)
    text = text.replace('+', '-')
    return text.encode('utf8')

# words which can match interchangeably (note: duplicate keys such as 'bus' and
# 'rer' are overridden by their last occurrence when the dict literal is built)
equivalent_words = {'sucy': 'sussis', 'bus': 'ligne', 'rer a': 'rer', 'rer b': 'rer', 'rer c': 'rer', 'cdg': 'charles-de-gaulle', 'rer': 'train', 'rer': 'ligne', 'bus': 'autobus', 'square': 'place', 'ligne': 'bus', 'cles': 'clefs', 'anthony': 'antony', 'station-la': 'sation', 'roissy': 'aeroport', 'cour de vincennes': 'cours de vincennes', 'une': 'un'}

# multi-words from the corpus in order to retokenize synopses
with open('multiword-lexicon.txt') as fp:
    multiwords = [unidecode(x.strip()).lower() for x in fp.readlines()]

# topics per conversation, one semicolon-separated line per file: filename;topic;topic;...
topics = {}
with open('topics.csv') as fp:
    for line in fp.readlines():
        tokens = line.strip().split(';')
        topics[tokens[0]] = [x for x in tokens[1:] if x != 'NO']

from speaker_type import SpeakerType
speaker_type = SpeakerType('mapping-by-id.txt')

# to detect slots in annotated synopses
pattern = re.compile(r'<a class="instance" variable="([^"]*)" style="color:[^"]*" title="[^"]*" href="#">(.*?)<')

def output_phrases(sentences, show):
    from collections import defaultdict
    seen = defaultdict(int)
    tab_features = []
    for sentence_num, sentence in enumerate(sentences):
        for word in sentence:
            if word.postag.lower().startswith('n'):
                #label = word.variable[0][2:] if len(word.variable) > 0 and not word.has_parent(sentence, lambda x: len(x.variable) > 0) else 'O'
                label = 'O'
                parent = sentence[word.parent].lemma if word.parent >= 0 else 'ROOT'
                parent_pos = sentence[word.parent].postag if word.parent >= 0 else 'ROOT'
                features = [show, word.text, word.postag, word.lemma, word.named_entity[2:], parent, parent_pos, word.dep_label]
                # get all children that don't depend on a verb
                phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL') # or x.text == "jusqu'à"
                features.append(' '.join([x.text for x in phrase]))
                features.append(' '.join([x.postag for x in phrase]))
                features.append(seen['ne:' + word.named_entity[2:]])
                features.append(seen['word:' + word.lemma])
                features.append('-1' if word.parent < word.local_id else '+1')
                features.append(' '.join(topics[word.filename]) if word.filename in topics else '')
                features.append(len(phrase))
                features.append(sentence_num / len(sentences))  # integer division under Python 2, so always 0
                features.append(speaker_type.resolve(show, word.speaker))
                features.append(':'.join([str(word.global_id), str(word.local_id)]))
                ### uncomment to run
                toprint = ','.join([str(x).replace(',', '<comma>') for x in features + [label]]) + '.'
                tab_features.append(toprint)
                # print tab_features
                seen['ne:' + word.named_entity[2:]] += 1
                seen['word:' + word.lemma] += 1
    fic = open("source/icsiboost.test", "w")
    for line in tab_features:
        fic.write(line)
        fic.write("\n")
    fic.close()
    return tab_features
O, card_type, end_stop, frequency, from, issue, item, line, location, not_transport, pass_type, retrieve_location, start_stop, time, to, towards, transport, info_target, buy.
show: ignore.
word: text.
word_pos: text.
lemma: text.
named_entity: ignore.
parent: text.
parent_pos: text.
dep_label: text.
children: text.
children_pos: text.
num_ne: ignore.
num_lemma: continuous.
parent_side: text.
topic: ignore.
length: continuous.
sentence_order: ignore.
speaker_type: ignore.
link: ignore.
20091112_RATP_SCD_0001.tsv,bus,nc,bus,LL,avoir,v,OBJ,un bus,det nc,0,0,-1,ITNR,2,0,None,2:10,O.
20091112_RATP_SCD_0001.tsv,rue,nc,rue,A,desservir,v,OBJ,la rue Ledru-Rollin à Bagneux,det nc np prep nc,0,0,-1,ITNR,5,0,None,2:14,O.
20091112_RATP_SCD_0001.tsv,Ledru-Rollin,np,Ledru-Rollin,A,rue,nc,MOD,Ledru-Rollin à Bagneux,np prep nc,1,0,-1,ITNR,3,0,None,2:15,O.
20091112_RATP_SCD_0001.tsv,Bagneux,nc,Bagneux,A,à,prep,OBJ,Bagneux,nc,2,0,-1,ITNR,1,0,None,2:17,O.
20091112_RATP_SCD_0001.tsv,rue,nc,rue,A,ROOT,ROOT,ROOT,la rue Ledru-Rollin à Bagneux,det nc np prep nc,3,1,-1,ITNR,5,0,None,3:1,O.
20091112_RATP_SCD_0001.tsv,Ledru,np,Ledru,A,ROOT,ROOT,NOLINK,Ledru,np,4,0,-1,ITNR,1,0,None,3:2,O.
20091112_RATP_SCD_0001.tsv,Ledru-Rollin,np,Ledru-Rollin,A,rue,nc,MOD,Ledru-Rollin à Bagneux,np prep nc,5,1,-1,ITNR,3,0,None,3:3,O.
20091112_RATP_SCD_0001.tsv,Bagneux,nc,Bagneux,A,à,prep,OBJ,Bagneux,nc,6,1,-1,ITNR,1,0,None,3:5,O.
20091112_RATP_SCD_0001.tsv,Paris,nc,Paris,A,de,prep,OBJ,Paris,nc,7,0,-1,ITNR,1,0,None,6:1,O.
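To make the sample lines above easier to read, here is a small sketch that is not part of the commit: it pairs the fields of one generated icsiboost.test line with the column names declared in the .names stem above. Splitting on ',' is safe because output_phrases() rewrites embedded commas as '<comma>', and the trailing period is the terminator it appends to every line.

# Pair one generated test line with the feature names from the .names file above.
names = ['show', 'word', 'word_pos', 'lemma', 'named_entity', 'parent',
         'parent_pos', 'dep_label', 'children', 'children_pos', 'num_ne',
         'num_lemma', 'parent_side', 'topic', 'length', 'sentence_order',
         'speaker_type', 'link', 'label']
line = '20091112_RATP_SCD_0001.tsv,bus,nc,bus,LL,avoir,v,OBJ,un bus,det nc,0,0,-1,ITNR,2,0,None,2:10,O.'
values = line.rstrip('.').split(',')
for name, value in zip(names, values):
    print '%s = %s' % (name, value)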
def load(filename):
    # mapping file: one tab-separated "show speaker_id speaker_type" line per speaker
    output = {}
    with open(filename) as fp:
        for line in fp:
            show, speaker_id, speaker_type = line.strip().split('\t')[:3]
            output[(show, speaker_id)] = speaker_type
    return output

class SpeakerType:
    def __init__(self, mapping_filename):
        self.mapping = load(mapping_filename)

    def resolve(self, show, speaker_id):
        # return the speaker type for (show, speaker_id), or None if unmapped
        key = (show, speaker_id)
        if key in self.mapping:
            return self.mapping[key]
        return None

if __name__ == '__main__':
    import sys
    speaker_type = SpeakerType(sys.argv[1])
    for line in sys.stdin:
        print speaker_type.resolve(*line.strip().split())
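A short usage sketch for the module above; the mapping file name, speaker ids and speaker types are made up for illustration, only the tab-separated layout read by load() is taken from the code.

from speaker_type import SpeakerType

# Hypothetical mapping file: one tab-separated "show speaker_id speaker_type"
# line per speaker (the ids and types here are invented for the example).
with open('example-mapping.txt', 'w') as fp:
    fp.write('20091112_RATP_SCD_0001.tsv\tspk1\tagent\n')
    fp.write('20091112_RATP_SCD_0001.tsv\tspk2\tcaller\n')

st = SpeakerType('example-mapping.txt')
print st.resolve('20091112_RATP_SCD_0001.tsv', 'spk1')  # agent
print st.resolve('20091112_RATP_SCD_0001.tsv', 'spk3')  # None: speaker not in the mapping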
à+cause+d'
à+côté+d'
afin+d'
afin+qu'
alors+qu'
à+moins+qu'
à+partir+d'
après+qu'
à+propos+d'
à+savoir+qu'
au+bout+d'
au+centre+d'
au+lieu+d'
au+milieu+d'
au+moment+d'
au+moyen+d'
au+nom+d'
auprès+d'
au+sujet+d'
autant+d'
avant+d'
avant+qu'
beaucoup+d'
bien+qu'
bon+app'
c'
ce+qu'
combien+d'
d'
de+parce+qu'
depuis+qu'
des+fois+qu'
dès+qu'
du+côté+d'
du+fait+qu'
en+début+d'
en+face+d'
en+fait+d'
en+fin+d'
en+milieu+d'
en+tant+qu'
en+train+d'
est-ce+qu'
est+ce+qu'
étant+donné+qu'
j'
jusqu'
l'
le+temps+qu'
loin+d'
lors+d'
lorsqu'
m'
maintenant+qu'
malgré+qu'
manip'
mat'
même+s'
n'
où+est-ce+qu'
parce+qu'
pas+mal+d'
pendant+qu'
peu+d'
plein+d'
plus+qu'
plutôt+qu'
pour+qu'
près+d'
puisqu'
qu'
quand+est-ce+qu'
qu'est-ce+qu'
quoi+qu'
s'
sachant+qu'
sauf+qu'
si+bien+qu'
surtout+qu'
t'
tandis+qu'
tant+qu'
tels+qu'
toute+l'
tout+l'
une+fois+qu'
un+peu+d'
vélib'
Vélib'
vu+qu'
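The extraction script above loads this lexicon through its unidecode() helper, so the accented, '+'-joined entries end up lowercased, accent-stripped and joined with '-'. The sketch below is not part of the commit; it re-applies the same substitutions (only the subset needed for these three entries) to show the normalized form.

# -*- coding: utf-8 -*-
import re

def normalize(entry):
    # same transformations as unidecode() in the extraction script:
    # strip accents (subset shown) and rewrite '+' as '-'
    entry = entry.decode('utf8')
    entry = re.sub(u'[éÉèÈêÊëË]', 'e', entry)
    entry = re.sub(u'[àÀäÄâÂ]', 'a', entry)
    entry = entry.replace('+', '-')
    return entry.encode('utf8').lower()

for raw in ["à+cause+d'", "est-ce+qu'", "étant+donné+qu'"]:
    print normalize(raw)
# a-cause-d'
# est-ce-qu'
# etant-donne-qu'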