Tuning > ~85+

d45d5604 · Jeremy Trione · 7f19adc3 · d45d5604
Commit d45d5604 authored 9 years ago by Jeremy Trione
--- a/map_slots_to_conversations.py
+++ b/map_slots_to_conversations.py
@@ -22,7 +22,7 @@ def unidecode(text):
    return text.encode('utf8')

 # words which can match interchangeably
-equivalent_words = {'sucy': 'sussis', 'bus': 'ligne', 'rer a': 'rer', 'rer b': 'rer', 'rer c': 'rer', 'cdg': 'charles-de-gaulle', 'rer': 'train', 'rer': 'ligne', 'bus': 'autobus', 'square': 'place'}
+equivalent_words = {'sucy': 'sussis', 'bus': 'ligne', 'rer a': 'rer', 'rer b': 'rer', 'rer c': 'rer', 'cdg': 'charles-de-gaulle', 'rer': 'train', 'rer': 'ligne', 'bus': 'autobus', 'square': 'place', 'ligne': 'bus', 'cles': 'clefs', 'anthony': 'antony', 'station-la': 'sation', 'roissy': 'aeroport', 'cour de vincennes': 'cours de vincennes', 'une': 'un'}

 # multi-words from the corpus in order to retokenize synopses
 with open('multiword-lexicon.txt') as fp:
@@ -95,6 +95,10 @@ def approx_word_match(orig_word1, orig_word2):
        if len(variants1 & variants2) > 0:
            print >>sys.stderr, 'WARNING: approx match "%s" <> "%s"' % (orig_word1, orig_word2)
            return True
+    ### TEST ###
+    if (len(word1) >= 9 and len(word2) >= 6 and word2 in word1) or (len(word2) >= 9 and len(word1) >= 6 and word1 in word2):
+        return True
+    ############
    return False

 # faster match for two sequences of same length
@@ -108,7 +112,7 @@ def exact_match(sequence1, sequence2):
    return (0, len(sequence1))

 # check if a string represents a number
-numbers = '(zero|une|un|deux|trois|quatre|cinq|six|sept|huit|neuf|dix|onze|douze|treize|quatorze|quinze|seize|((vingt|vingts|trente|quarante|cinquante|soixante)( et un)?)|cent|cents|mille|mille)'
+numbers = '(zero|une|un|deux|trois|quatre|cinq|six|sept|huit|neuf|dix|onze|douze|treize|quatorze|quinze|seize|((vingt|vingts|trente|quarante|cinquante|soixante)( et un| et une)?)|cent|cents|mille|mille)'
 def is_number(word):
    if re.match('^[0-9]+$', word):
        return True
@@ -122,15 +126,15 @@ def same_number(word1, word2):
        word1 = str(letter_to_number.convert(word1))
    if re.match('^' + numbers + '( ' + numbers +')*$', word2.replace('-', ' ').replace('+', ' ')):
        word2 = str(letter_to_number.convert(word2))
-    #print '   same?', word1, word2
+##    print '   same?', word1, word2
    return word1 == word2

 # common words for which we want to reduce the mismatch cost
-stopwords = set('je tu il elle ils elles nous vous avec sans pour or et ni car mais ou donc a le les la l\' un une des ce cette ces mon ma mes vos votre ton ta tes dans de par'.split())
+stopwords = set('je tu il elle ils elles nous vous avec sans pour or et ni car mais ou donc a le les la l\' un une des ce cette ces mon ma mes vos votre ton ta tes dans de par vers part de'.split())

 # the cost of a substitution looks at approximations, stopwords and numbers, it gets very expensive for content words
 def sub_cost(word1, word2):
-    #if word2 == '89':
+#    if word2 == '21':
 #        print 'NUMBER', word1, is_number(word1), word2, is_number(word2), same_number(word1, word2)
    if approx_word_match(word1, word2):
        return 0
@@ -160,7 +164,7 @@ def sequence_match(sequence1, sequence2):
    value = float(score) / len(sequence2)
    #print [unidecode(x.text).lower() for x in sequence1], [unidecode(x.text).lower() for x in sequence2], score, value
    #levenstein.print_alignment(alignment, sys.stderr)
-    if value < .5:
+    if value < .6:
        print >>sys.stderr, 'WARNING: match through alignment score=', score, num_ref
        levenstein.print_alignment(alignment, sys.stderr)

@@ -187,7 +191,7 @@ def match(sentence, variable, words):
    result = False
    for i in range(len(sentence)):
        found = exact_match(sentence[i: i + len(words)], words) 
-        #print 'DEBUG', found, i, [x.text for x in words], [x.text for x in sentence[i: i + len(words)]]
+##        print 'DEBUG', found, i, [x.text for x in words], [x.text for x in sentence[i: i + len(words)]]
        if not found:
            found = sequence_match(sentence[i: i + len(words) + 2], words)
        if found:
@@ -227,7 +231,7 @@ def output_phrases(sentences, show):
                parent_pos = sentence[word.parent].postag if word.parent >= 0 else 'ROOT'
                features = [show, word.text, word.postag, word.lemma, word.named_entity[2:], parent, parent_pos, word.dep_label]
                # get all children that don't depend on a verb
-                phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL')
+                phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL') # or x.text == "jusqu'à"
                features.append(' '.join([x.text for x in phrase]))
                features.append(' '.join([x.postag for x in phrase]))
                features.append(seen['ne:' + word.named_entity[2:]])
@@ -237,6 +241,7 @@ def output_phrases(sentences, show):
                features.append(len(phrase))
                features.append(sentence_num / len(sentences))
                features.append(speaker_type.resolve(show, word.speaker))
+### Uncommented in order to run
                print ','.join([str(x).replace(',', '<comma>') for x in features + [label]]) + '.'

                seen['ne:' + word.named_entity[2:]] += 1
@@ -277,7 +282,3 @@ for filename in sys.argv[1:]:

 print >>sys.stderr, 'matched variables: %.2f%%' % (100 * float(num_matched) / num_variables)

-
-
-    
-