diff --git a/map_slots_to_conversations.py b/map_slots_to_conversations.py
index bd2361471cdab332b6d451886fd2d42a76290f79..c257b2d917f32fcaead1f1d34655f5e68f3d268a 100644
--- a/map_slots_to_conversations.py
+++ b/map_slots_to_conversations.py
@@ -22,7 +22,7 @@ def unidecode(text):
     return text.encode('utf8')
 
 # words which can match interchangeably
-equivalent_words = {'sucy': 'sussis', 'bus': 'ligne', 'rer a': 'rer', 'rer b': 'rer', 'rer c': 'rer', 'cdg': 'charles-de-gaulle', 'rer': 'train', 'rer': 'ligne', 'bus': 'autobus', 'square': 'place'}
+equivalent_words = {'sucy': 'sussis', 'bus': 'ligne', 'rer a': 'rer', 'rer b': 'rer', 'rer c': 'rer', 'cdg': 'charles-de-gaulle', 'rer': 'train', 'rer': 'ligne', 'bus': 'autobus', 'square': 'place', 'ligne': 'bus', 'cles': 'clefs', 'anthony': 'antony', 'station-la': 'sation', 'roissy': 'aeroport', 'cour de vincennes': 'cours de vincennes', 'une': 'un'}
 
 # multi-words from the corpus in order to retokenize synopses
 with open('multiword-lexicon.txt') as fp:
@@ -95,6 +95,10 @@ def approx_word_match(orig_word1, orig_word2):
         if len(variants1 & variants2) > 0:
             print >>sys.stderr, 'WARNING: approx match "%s" <> "%s"' % (orig_word1, orig_word2)
             return True
+    ### TEST ###
+    if (len(word1) >= 9 and len(word2) >= 6 and word2 in word1) or (len(word2) >= 9 and len(word1) >= 6 and word1 in word2):
+        return True
+    ############
     return False
 
 # faster match for two sequences of same length
@@ -108,7 +112,7 @@ def exact_match(sequence1, sequence2):
     return (0, len(sequence1))
 
 # check if a string represents a number
-numbers = '(zero|une|un|deux|trois|quatre|cinq|six|sept|huit|neuf|dix|onze|douze|treize|quatorze|quinze|seize|((vingt|vingts|trente|quarante|cinquante|soixante)( et un)?)|cent|cents|mille|mille)'
+numbers = '(zero|une|un|deux|trois|quatre|cinq|six|sept|huit|neuf|dix|onze|douze|treize|quatorze|quinze|seize|((vingt|vingts|trente|quarante|cinquante|soixante)( et un| et une)?)|cent|cents|mille|mille)'
 def is_number(word):
     if re.match('^[0-9]+$', word):
         return True
@@ -122,16 +126,16 @@ def same_number(word1, word2):
         word1 = str(letter_to_number.convert(word1))
     if re.match('^' + numbers + '( ' + numbers +')*$', word2.replace('-', ' ').replace('+', ' ')):
         word2 = str(letter_to_number.convert(word2))
-    #print ' same?', word1, word2
+##    print ' same?', word1, word2
     return word1 == word2
 
 # common words for which we want to reduce the mismatch cost
-stopwords = set('je tu il elle ils elles nous vous avec sans pour or et ni car mais ou donc a le les la l\' un une des ce cette ces mon ma mes vos votre ton ta tes dans de par'.split())
+stopwords = set('je tu il elle ils elles nous vous avec sans pour or et ni car mais ou donc a le les la l\' un une des ce cette ces mon ma mes vos votre ton ta tes dans de par vers part de'.split())
 
 # the cost of a substitution looks at approximations, stopwords and numbers, it gets very expensive for content words
 def sub_cost(word1, word2):
-    #if word2 == '89':
-    #    print 'NUMBER', word1, is_number(word1), word2, is_number(word2), same_number(word1, word2)
+#    if word2 == '21':
+#        print 'NUMBER', word1, is_number(word1), word2, is_number(word2), same_number(word1, word2)
     if approx_word_match(word1, word2):
         return 0
     elif word1 in stopwords and word2 in stopwords:
@@ -160,7 +164,7 @@ def sequence_match(sequence1, sequence2):
     value = float(score) / len(sequence2)
     #print [unidecode(x.text).lower() for x in sequence1], [unidecode(x.text).lower() for x in sequence2], score, value
     #levenstein.print_alignment(alignment, sys.stderr)
-    if value < .5:
+    if value < .6:
         print >>sys.stderr, 'WARNING: match through alignment score=', score, num_ref
         levenstein.print_alignment(alignment, sys.stderr)
 
@@ -187,7 +191,7 @@ def match(sentence, variable, words):
     result = False
     for i in range(len(sentence)):
         found = exact_match(sentence[i: i + len(words)], words)
-        #print 'DEBUG', found, i, [x.text for x in words], [x.text for x in sentence[i: i + len(words)]]
+##        print 'DEBUG', found, i, [x.text for x in words], [x.text for x in sentence[i: i + len(words)]]
         if not found:
             found = sequence_match(sentence[i: i + len(words) + 2], words)
         if found:
@@ -227,7 +231,7 @@ def output_phrases(sentences, show):
         parent_pos = sentence[word.parent].postag if word.parent >= 0 else 'ROOT'
         features = [show, word.text, word.postag, word.lemma, word.named_entity[2:], parent, parent_pos, word.dep_label]
         # get all children that don't depend on a verb
-        phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL')
+        phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL') # or x.text == "jusqu'à"
         features.append(' '.join([x.text for x in phrase]))
         features.append(' '.join([x.postag for x in phrase]))
         features.append(seen['ne:' + word.named_entity[2:]])
@@ -237,6 +241,7 @@ def output_phrases(sentences, show):
         features.append(len(phrase))
         features.append(sentence_num / len(sentences))
         features.append(speaker_type.resolve(show, word.speaker))
+### Uncommented to run
         print ','.join([str(x).replace(',', '<comma>') for x in features + [label]]) + '.'
         seen['ne:' + word.named_entity[2:]] += 1
 
@@ -277,7 +282,3 @@ for filename in sys.argv[1:]:
 
 
     print >>sys.stderr, 'matched variables: %.2f%%' % (100 * float(num_matched) / num_variables)
-
-
-
-