Skip to content
Snippets Groups Projects
Commit 631434c5 authored by Franck Dary's avatar Franck Dary
Browse files

Improved the way we delete duplicate examples

parent 85818a96
No related branches found
No related tags found
No related merge requests found
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
#include "string.h" #include "string.h"
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <set>
Fann_file::Fann_file(const char * filename, unsigned int classifier_id) Fann_file::Fann_file(const char * filename, unsigned int classifier_id)
{ {
...@@ -166,7 +165,8 @@ void Fann_file::over_sample(float ratio) ...@@ -166,7 +165,8 @@ void Fann_file::over_sample(float ratio)
void Fann_file::delete_duplicates() void Fann_file::delete_duplicates()
{ {
std::vector<unsigned int> new_order; std::vector<unsigned int> new_order;
std::set<std::string> examples; std::map<std::string, int> examples;
int max_distance = 3;
for(int index : order) for(int index : order)
{ {
...@@ -175,10 +175,10 @@ void Fann_file::delete_duplicates() ...@@ -175,10 +175,10 @@ void Fann_file::delete_duplicates()
for(int feat : features[index]) for(int feat : features[index])
example += "," + std::to_string(feat); example += "," + std::to_string(feat);
if(examples.count(example)) if(examples.count(example) && (index - examples[example] < max_distance))
continue; continue;
examples.insert(example); examples[example] = index;
new_order.push_back(index); new_order.push_back(index);
} }
......
...@@ -158,8 +158,8 @@ void train_nn(context * ctx) ...@@ -158,8 +158,8 @@ void train_nn(context * ctx)
Fann_file fann_train(ctx->fann_filename, classif_nb); Fann_file fann_train(ctx->fann_filename, classif_nb);
Fann_file fann_dev(ctx->fann_dev_filename, classif_nb); Fann_file fann_dev(ctx->fann_dev_filename, classif_nb);
if(!strcmp("TAGGER", classif->oracle_name)||!strcmp("PARSER", classif->oracle_name))
fann_train.delete_duplicates(); fann_train.delete_duplicates();
fann_dev.delete_duplicates();
fann_train.over_sample(0.0); //The ratio between the nb of examples of the majority class and of the minority class will not be inferior to this value fann_train.over_sample(0.0); //The ratio between the nb of examples of the majority class and of the minority class will not be inferior to this value
auto first_example_batch = fann_train.get_batch(1); auto first_example_batch = fann_train.get_batch(1);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment