Skip to content
Snippets Groups Projects
Commit d45d5604 authored by Jeremy Trione's avatar Jeremy Trione
Browse files

Tuning > ~85+

parent 7f19adc3
No related branches found
No related tags found
No related merge requests found
......@@ -22,7 +22,7 @@ def unidecode(text):
return text.encode('utf8')
# words which can match interchangeably
equivalent_words = {'sucy': 'sussis', 'bus': 'ligne', 'rer a': 'rer', 'rer b': 'rer', 'rer c': 'rer', 'cdg': 'charles-de-gaulle', 'rer': 'train', 'rer': 'ligne', 'bus': 'autobus', 'square': 'place'}
equivalent_words = {'sucy': 'sussis', 'bus': 'ligne', 'rer a': 'rer', 'rer b': 'rer', 'rer c': 'rer', 'cdg': 'charles-de-gaulle', 'rer': 'train', 'rer': 'ligne', 'bus': 'autobus', 'square': 'place', 'ligne': 'bus', 'cles': 'clefs', 'anthony': 'antony', 'station-la': 'sation', 'roissy': 'aeroport', 'cour de vincennes': 'cours de vincennes', 'une': 'un'}
# multi-words from the corpus in order to retokenize synopses
with open('multiword-lexicon.txt') as fp:
......@@ -95,6 +95,10 @@ def approx_word_match(orig_word1, orig_word2):
if len(variants1 & variants2) > 0:
print >>sys.stderr, 'WARNING: approx match "%s" <> "%s"' % (orig_word1, orig_word2)
return True
### TEST ###
if (len(word1) >= 9 and len(word2) >= 6 and word2 in word1) or (len(word2) >= 9 and len(word1) >= 6 and word1 in word2):
return True
############
return False
# faster match for two sequences of same length
......@@ -108,7 +112,7 @@ def exact_match(sequence1, sequence2):
return (0, len(sequence1))
# check if a string represents a number
numbers = '(zero|une|un|deux|trois|quatre|cinq|six|sept|huit|neuf|dix|onze|douze|treize|quatorze|quinze|seize|((vingt|vingts|trente|quarante|cinquante|soixante)( et un)?)|cent|cents|mille|mille)'
numbers = '(zero|une|un|deux|trois|quatre|cinq|six|sept|huit|neuf|dix|onze|douze|treize|quatorze|quinze|seize|((vingt|vingts|trente|quarante|cinquante|soixante)( et un| et une)?)|cent|cents|mille|mille)'
def is_number(word):
if re.match('^[0-9]+$', word):
return True
......@@ -122,15 +126,15 @@ def same_number(word1, word2):
word1 = str(letter_to_number.convert(word1))
if re.match('^' + numbers + '( ' + numbers +')*$', word2.replace('-', ' ').replace('+', ' ')):
word2 = str(letter_to_number.convert(word2))
#print ' same?', word1, word2
## print ' same?', word1, word2
return word1 == word2
# common words for which we want to reduce the mismatch cost
stopwords = set('je tu il elle ils elles nous vous avec sans pour or et ni car mais ou donc a le les la l\' un une des ce cette ces mon ma mes vos votre ton ta tes dans de par'.split())
stopwords = set('je tu il elle ils elles nous vous avec sans pour or et ni car mais ou donc a le les la l\' un une des ce cette ces mon ma mes vos votre ton ta tes dans de par vers part de'.split())
# the cost of a substitution looks at approximations, stopwords and numbers; it gets very expensive for content words
def sub_cost(word1, word2):
#if word2 == '89':
# if word2 == '21':
# print 'NUMBER', word1, is_number(word1), word2, is_number(word2), same_number(word1, word2)
if approx_word_match(word1, word2):
return 0
......@@ -160,7 +164,7 @@ def sequence_match(sequence1, sequence2):
value = float(score) / len(sequence2)
#print [unidecode(x.text).lower() for x in sequence1], [unidecode(x.text).lower() for x in sequence2], score, value
#levenstein.print_alignment(alignment, sys.stderr)
if value < .5:
if value < .6:
print >>sys.stderr, 'WARNING: match through alignment score=', score, num_ref
levenstein.print_alignment(alignment, sys.stderr)
......@@ -187,7 +191,7 @@ def match(sentence, variable, words):
result = False
for i in range(len(sentence)):
found = exact_match(sentence[i: i + len(words)], words)
#print 'DEBUG', found, i, [x.text for x in words], [x.text for x in sentence[i: i + len(words)]]
## print 'DEBUG', found, i, [x.text for x in words], [x.text for x in sentence[i: i + len(words)]]
if not found:
found = sequence_match(sentence[i: i + len(words) + 2], words)
if found:
......@@ -227,7 +231,7 @@ def output_phrases(sentences, show):
parent_pos = sentence[word.parent].postag if word.parent >= 0 else 'ROOT'
features = [show, word.text, word.postag, word.lemma, word.named_entity[2:], parent, parent_pos, word.dep_label]
# get all children that don't depend on a verb
phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL')
phrase = word.get_phrase(sentence, blocker=lambda x: x.postag.startswith('v') or x.disfluency != 'NULL') # or x.text == "jusqu'à"
features.append(' '.join([x.text for x in phrase]))
features.append(' '.join([x.postag for x in phrase]))
features.append(seen['ne:' + word.named_entity[2:]])
......@@ -237,6 +241,7 @@ def output_phrases(sentences, show):
features.append(len(phrase))
features.append(sentence_num / len(sentences))
features.append(speaker_type.resolve(show, word.speaker))
### Uncommented in order to run
print ','.join([str(x).replace(',', '<comma>') for x in features + [label]]) + '.'
seen['ne:' + word.named_entity[2:]] += 1
......@@ -277,7 +282,3 @@ for filename in sys.argv[1:]:
print >>sys.stderr, 'matched variables: %.2f%%' % (100 * float(num_matched) / num_variables)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment