Minor fixes

fa7bedb1 · Jeremy Auguste · cb26cf7b · fa7bedb1 · fa7bedb1 · fa7bedb1
Commit fa7bedb1 authored 9 years ago by Jeremy Auguste
--- a/src/alignments.cc
+++ b/src/alignments.cc
@@ -3,7 +3,6 @@
 #include "modules/sequence_extractors.hh"
 #include "modules/sequence_alignment.hh"
 #include "modules/similarity_functions.hh"
-#include "modules/decision_resolver.hh"

 #include <getopt.h>
 #include <cmath>
@@ -27,7 +26,9 @@ std::pair<decoda::Extractor, decoda::Similarity> ExtractorFromString(std::string
  if (str == "speaker")
    return {decoda::Speaker, decoda::speaker_similarity};
  if (str == "responsetime")
-    return {decoda::ResponseTime, decoda::response_time_similarity};
+    return {decoda::ResponseTime, decoda::continuous_similarity};
+  if (str == "length")
+    return {decoda::TurnLength, decoda::identity_similarity};

  throw std::invalid_argument(str + " is not a valid extractor");
 }
@@ -52,7 +53,7 @@ void HelpMessage(std::ostream &output, const std::string prog_name) {
  output << "\t 1: Builds an alignment dictionnary" << std::endl;
  output << "\t 2: Builds vectors for each dialogue" << std::endl;
  output << "Align Methods: 'smithwaterman' or 'needlemanwunsch'" << std::endl;
-    output << "Extractors: 'dialogueacts', 'polarity', 'speaker' and 'responsetime'"
+  output << "Extractors: 'dialogueacts', 'polarity', 'speaker' and 'length'"
 		 << std::endl;
 }

@@ -63,7 +64,7 @@ void BuildAffinityMatrix(std::vector<decoda::Sequence> &sequences,
 						 std::ostream &output) {
  int nb_dialogues = sequences.size();

-  if (!verbose)
+  if (verbose)
    std::cerr << "Building affinity matrix..." << std::endl;
  
  int count = 0;
@@ -181,7 +182,7 @@ void BuildAlignmentDictionnary(std::vector<decoda::Sequence> &sequences,
 							   std::ostream &output, bool perfect_match) {
  size_t nb_dialogues = sequences.size();

-  if (!verbose)
+  if (verbose)
    std::cerr << "Building dictionnary..." << std::endl;

  std::map<decoda::Sequence, int> dictionnary;
@@ -210,7 +211,7 @@ void BuildAlignmentDictionnary(std::vector<decoda::Sequence> &sequences,
  if (verbose > 1)
    std::cerr << std::endl << "Done." << std::endl;

-  if (!verbose)
+  if (verbose)
    std::cerr << "Dictionnary size: " << dictionnary.size() << std::endl;

  for (auto &kv : dictionnary) {
@@ -219,34 +220,56 @@ void BuildAlignmentDictionnary(std::vector<decoda::Sequence> &sequences,
 }

 void AddAlignmentToVectors(decoda::Alignment alignment,
-			   std::map<decoda::Sequence, std::set<int>> &vectors,
+						   std::vector<decoda::Similarity> &similarity_funcs,
+						   std::map<std::string, std::set<int>> &vectors,
 						   int dlg1, int dlg2) {
-  decoda::Sequence sequence;
-  
+  std::string sequence1 = "";
+  std::string sequence2 = "";
  
  int s1_idx = alignment.topleft.first;
  int s2_idx = alignment.topleft.second;
+  bool first = true;
  
  for (auto &action : alignment.cigar) {
    switch (action) {
-    case 'M': {
-      sequence.push_back(alignment.a[s1_idx]);
-      s1_idx++;
-      s2_idx++;
-      break;
-    }
+    case 'M':
    case 'X': {
-      std::vector<decoda::ExtractorTypes> mismatch;
      size_t n_extractors = alignment.a[s1_idx].size();
-      mismatch.resize(n_extractors);
-      for (size_t k = 0; k < n_extractors; ++k) {
-    	if (alignment.a[s1_idx][k] == alignment.b[s2_idx][k]) {
-    	  mismatch[k] = alignment.a[s1_idx][k];
+      if (!first) {
+		sequence1 += ";";
+		sequence2 += ";";
+      }
+      first = false;
+      sequence1 += "(";
+      sequence2 += "(";
+      for (size_t k = 0; k < n_extractors; k++) {
+		if (similarity_funcs[k] == decoda::continuous_similarity) {
+		  double sim = decoda::continuous_similarity(alignment.a[s1_idx][k], alignment.b[s2_idx][k]);
+		  if (sim >= 0.0) {
+			long int_a = std::lround(boost::lexical_cast<double>(alignment.a[s1_idx][k]));
+			long int_b = std::lround(boost::lexical_cast<double>(alignment.b[s2_idx][k]));
+
+			sequence1 += std::to_string(int_a);
+			sequence2 += std::to_string(int_b);
+		  }
 		} else {
-    	  mismatch[k] = "";
+		  if (alignment.a[s1_idx][k] == alignment.b[s2_idx][k]) {
+			std::stringstream ss;
+			ss << alignment.a[s1_idx][k];
+			sequence1 += ss.str();
+			sequence2 += ss.str();
 		  }
 		}
-      sequence.push_back(mismatch);
+	
+		if (k != n_extractors-1) {
+		  sequence1 += ",";
+		  sequence2 += ",";
+		}
+      }
+
+      sequence1 += ")";
+      sequence2 += ")";
+      
      s1_idx++;
      s2_idx++;
      break;
@@ -262,36 +285,109 @@ void AddAlignmentToVectors(decoda::Alignment alignment,
    }
  }

-  vectors[sequence].insert(dlg1);
-  vectors[sequence].insert(dlg2);
+  vectors[sequence1].insert(dlg1);
+  vectors[sequence1].insert(dlg2);
+  vectors[sequence2].insert(dlg1);
+  vectors[sequence2].insert(dlg2);
 }

 void AddSubAlignmentsToVectors(decoda::Alignment alignment,
-			       std::map<decoda::Sequence, std::set<int>> &vectors,
+							   std::vector<decoda::Similarity> &similarity_funcs,
+							   std::map<std::string, std::set<int>> &vectors,
 							   int dlg1, int dlg2) {
  int s1_idx = alignment.topleft.first;
+  int s2_idx = alignment.topleft.second;
+	
+  std::string sequence1 = "";
+  std::string sequence2 = "";
+  bool first = true;
  
-  decoda::Sequence subsequence;
  for (auto &action : alignment.cigar) {
-    if (action == 'M') {
-      subsequence.push_back(alignment.a[s1_idx]);
-      s1_idx++;
+    if (action == 'M' || action == 'X') {
+      size_t n_extractors = alignment.a[s1_idx].size();
+      std::string substr1 = "(";
+      std::string substr2 = "(";
+      bool completed = true;
+
+      for (size_t k = 0; k < n_extractors; k++) {
+		if (similarity_funcs[k] == decoda::continuous_similarity) {
+		  double sim = decoda::continuous_similarity(alignment.a[s1_idx][k], alignment.b[s2_idx][k]);
+		  if (sim >= 0.0) {
+			long int_a = std::lround(boost::lexical_cast<double>(alignment.a[s1_idx][k]));
+			long int_b = std::lround(boost::lexical_cast<double>(alignment.b[s2_idx][k]));
+
+			substr1 += std::to_string(int_a);
+			substr2 += std::to_string(int_b);
 		  } else {
-      if (!subsequence.empty()) {
-	vectors[subsequence].insert(dlg1);
-	vectors[subsequence].insert(dlg2);
+			completed = false;
+			break;
+		  }
+		} else {
+		  if (alignment.a[s1_idx][k] == alignment.b[s2_idx][k]) {
+			std::stringstream ss;
+			ss << alignment.a[s1_idx][k];
+			substr1 += ss.str();
+			substr2 += ss.str();
+		  } else {
+			completed = false;
+			break;
+		  }
+		}
 	
-	subsequence.clear();
+		if (k != n_extractors-1) {
+		  substr1 += ",";
+		  substr2 += ",";
 		}
-      if (action == 'D' || action == 'X') {
+      }
+      substr1 += ")";
+      substr2 += ")";
+
+      if (completed) {
+		if (!first) {
+		  sequence1 += ";";
+		  sequence2 += ";";
+		}
+		first = false;
+		sequence1 += substr1;
+		sequence2 += substr2;
+      } else {
+		if (!sequence1.empty()) {
+		  vectors[sequence1].insert(dlg1);
+		  vectors[sequence1].insert(dlg2);
+		  vectors[sequence2].insert(dlg1);
+		  vectors[sequence2].insert(dlg2);
+		  sequence1.clear();
+		  sequence2.clear();
+		}
+		first = true;
+      }
+
+      s1_idx++;
+      s2_idx++;
+    } else {
+      if (!sequence1.empty()) {
+		vectors[sequence1].insert(dlg1);
+		vectors[sequence1].insert(dlg2);
+		vectors[sequence2].insert(dlg1);
+		vectors[sequence2].insert(dlg2);
+		first = true;
+		sequence1.clear();
+		sequence2.clear();
+      }
+      if (action == 'D') {
 		s1_idx++;
      }
+      if (action == 'I') {
+		s2_idx++;
+      }
    }
  }

-  if (!subsequence.empty()) {
-    vectors[subsequence].insert(dlg1);
-    vectors[subsequence].insert(dlg2);
+  if (!sequence1.empty()) {
+    vectors[sequence1].insert(dlg1);
+    vectors[sequence1].insert(dlg2);
+    vectors[sequence2].insert(dlg1);
+    vectors[sequence2].insert(dlg2);
  }
 }

@@ -302,10 +398,10 @@ void BuildDialogueVectors(std::vector<decoda::Sequence> &sequences,

  size_t nb_dialogues = sequences.size();

-  if (!verbose)
+  if (verbose)
    std::cerr << "Building vectors..." << std::endl;

-  std::map<decoda::Sequence, std::set<int>> vectors;
+  std::map<std::string, std::set<int>> vectors;
  int count = 0;
  for (size_t i = 0; i < nb_dialogues; ++i) {
    for (size_t j = i+1; j < nb_dialogues; ++j) {
@@ -321,9 +417,9 @@ void BuildDialogueVectors(std::vector<decoda::Sequence> &sequences,
 																		 min_score);
      for (decoda::Alignment &alignment : alignments) {
 		if (perfect_match) {
-	  AddSubAlignmentsToVectors(alignment, vectors, i, j);
+		  AddSubAlignmentsToVectors(alignment, similarity_funcs, vectors, i, j);
 		} else {
-	  AddAlignmentToVectors(alignment, vectors, i, j);
+		  AddAlignmentToVectors(alignment, similarity_funcs, vectors, i, j);
 		}
      }
    }
@@ -333,7 +429,7 @@ void BuildDialogueVectors(std::vector<decoda::Sequence> &sequences,
    std::cerr << std::endl << "Done." << std::endl;

  for (auto &kv : vectors) {
-    output << SequenceToString(kv.first);
+    output << kv.first;
    for (int value : vectors[kv.first]) {
      output << "\t" << value;
    }
@@ -477,14 +573,14 @@ int main(int argc, char *argv[]) {
    extractors_map.insert({argv[k], ExtractorFromString(argv[k])});
  }

-  if (!verbose)
+  if (verbose)
    std::cerr << "Parsing dialogues..." << std::endl;
  decoda::DialogueParser parser(stem_dir);

  parser.ReadAll();

  if (dialoguemap_set) {
-    if (!verbose)
+    if (verbose)
      std::cerr << "Outputing dialogue map..." << std::endl;
    std::ofstream odialoguemap(dialoguemap);

@@ -495,31 +591,31 @@ int main(int argc, char *argv[]) {
    odialoguemap.close();
  }

-  if (!verbose)
+  if (verbose)
    std::cerr << "Number of dialogues: " << parser.dialogues.size() << std::endl;

  std::vector<decoda::Extractor> extractors;
  std::vector<decoda::Similarity> similarity_funcs;

-  if (!verbose) {
+  if (verbose) {
    if (mode == 0)
      std::cerr << "Alignment method being used: " << align_method << std::endl;
    else if (mode == 1)
      std::cerr << "Alignment method being used: WatermanEggert" << std::endl;
  }

-  if (!verbose)
+  if (verbose)
    std::cerr << "Extractors being used: ";
  for (auto &kv : extractors_map) {
-    if (!verbose)
+    if (verbose)
      std::cerr << kv.first << " ";
    extractors.push_back(kv.second.first);
    similarity_funcs.push_back(kv.second.second);
  }
-  if (!verbose)
+  if (verbose)
    std::cerr << std::endl;

-  if (!verbose) {
+  if (verbose) {
    std::cerr << "Gap penality affine function: " << init_gap << " + " << extend_gap << "*k" << std::endl;
  }
  

--- a/src/modules/decision_resolver.hh
+++ b/src/modules/decision_resolver.hh
@@ -8,18 +8,21 @@

 namespace decoda {

-  typedef std::string (*Resolver)(ExtractorTypes&, ExtractorTypes&);
+  typedef std::string (*Resolver)(ExtractorTypes&, ExtractorTypes&, bool);

-  std::string BinaryDecisionResolver(decoda::ExtractorTypes &a, decoda::ExtractorTypes &b) {
+  std::string BinaryDecisionResolver(decoda::ExtractorTypes &a, decoda::ExtractorTypes &b,
+				     bool strict) {
    if (a == b) return boost::lexical_cast<std::string>(a);
    return "";
  }

-  std::string ContinuousDecisionResolver(decoda::ExtractorTypes &a, decoda::ExtractorTypes &b) {
+  std::string ContinuousDecisionResolver(decoda::ExtractorTypes &a, decoda::ExtractorTypes &b,
+					 bool strict=false) {
    double double_a = boost::lexical_cast<double>(a);
    double double_b = boost::lexical_cast<double>(b);
-
-    return std::to_string(std::lround(std::abs(double_a - double_b)));
+    long value = std::lround(std::abs(double_a - double_b));
+    if (!strict)
+      return std::to_string(value);
  }
  
 }

--- a/src/modules/dialogue_sequencer.cc
+++ b/src/modules/dialogue_sequencer.cc
 #include "dialogue_sequencer.hh"
+#include <exception>

 namespace decoda {
  DialogueSequencer::DialogueSequencer(std::vector<Extractor> extractors) : extractors(extractors) {}

  Sequence DialogueSequencer::Transform(Dialogue &dialogue) {
    Sequence sequence;
+	size_t size = 0;
 	
    for (Extractor extractor : extractors) {
      std::vector<ExtractorTypes> extracted = extractor(dialogue);

      if (sequence.empty()) {
 		sequence.resize(extracted.size());
+		size = extracted.size();
+      }
+
+	  if (size != extracted.size()) {
+		throw std::runtime_error("Extractors don't extract sequences of the same length !");
 	  }
      
      for (size_t k = 0; k < extracted.size(); ++k) {

--- a/src/modules/sequence_extractors.cc
+++ b/src/modules/sequence_extractors.cc
@@ -4,7 +4,6 @@
 #include <exception>

 namespace decoda {
-  
  std::vector<ExtractorTypes> DialogueActs(Dialogue &dialogue) {
 	std::vector<ExtractorTypes> partial_sequence;

@@ -20,6 +19,32 @@ namespace decoda {
 	return partial_sequence;
  }

+  std::vector<ExtractorTypes> TurnLength(Dialogue &dialogue) {
+	std::vector<ExtractorTypes> partial_sequence;
+
+	int length = 0;
+	for (Turn &turn : dialogue) {
+	  if (turn[0][kAct] == "//") {
+		continue;
+	  }
+	  length = turn.size();
+	  length = (length-1) / 3;
+	  if (length > 5)
+		length = 5;
+	  partial_sequence.push_back(length);
+	  // length = 0;
+	}
+
+	// if (length != 0) {
+	//   length = (length-1) / 3;
+	//   if (length > 5)
+	// 	length = 5;
+	//   partial_sequence.push_back(length);
+	// }
+
+	return partial_sequence;
+  }
+
  std::vector<ExtractorTypes> TurnMeanPolarity(Dialogue &dialogue) {
 	std::vector<ExtractorTypes> partial_sequence;
 		

--- a/src/modules/sequence_extractors.hh
+++ b/src/modules/sequence_extractors.hh
@@ -14,6 +14,7 @@ namespace decoda {
  typedef std::vector<ExtractorTypes> (*Extractor)(Dialogue &dialogue);
  
  std::vector<ExtractorTypes> DialogueActs(Dialogue &dialogue);
+  std::vector<ExtractorTypes> TurnLength(Dialogue &dialogue);
  std::vector<ExtractorTypes> TurnMeanPolarity(Dialogue &dialogue);
  std::vector<ExtractorTypes> TurnMeanPolarityDiscrete(Dialogue &dialogue);
  std::vector<ExtractorTypes> Speaker(Dialogue &dialogue);

--- a/src/modules/similarity_functions.cc
+++ b/src/modules/similarity_functions.cc
@@ -13,16 +13,6 @@ namespace decoda {
    return -1.9;
  }

-  double response_time_similarity(ExtractorTypes &a, ExtractorTypes &b) {
-    double double_a = boost::get<double>(a);
-    double double_b = boost::get<double>(b);
-
-    if (std::abs(double_a - double_b) <= 0.1) {
-      return 2;
-    }
-    return -1;
-  }
-
  double continuous_similarity(ExtractorTypes &a, ExtractorTypes &b) {
    double double_a = boost::get<double>(a);
    double double_b = boost::get<double>(b);