Skip to content
Snippets Groups Projects
Commit 53a616c9 authored by Alexis Nasr
Browse files

First operational version of the rule-based lemmatizer

parent cf836cbd
No related branches found
No related tags found
No related merge requests found
...@@ -35,7 +35,7 @@ word *word_read(FILE *f, mcd *mcd_struct) ...@@ -35,7 +35,7 @@ word *word_read(FILE *f, mcd *mcd_struct)
/* look for a valid word */ /* look for a valid word */
while(fgets(buffer, 10000, f)){ while(fgets(buffer, 10000, f)){
/* printf("buffer = %s\n", buffer); */ // printf("buffer = %s\n", buffer);
/* ignore empty lines */ /* ignore empty lines */
if((buffer[0] == '\n')) continue; if((buffer[0] == '\n')) continue;
/* lines beginning with ## are comments */ /* lines beginning with ## are comments */
...@@ -63,6 +63,7 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct) ...@@ -63,6 +63,7 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct)
/* if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1) && (strcmp(token, "_"))){ */ /* if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1) && (strcmp(token, "_"))){ */
if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1)){ if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1)){
w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col); w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col);
// printf("col = %d val = %d\n", col, w->wf_array[mcd_struct->wf[col]]);
} }
if(mcd_struct->wf[col] == MCD_WF_FORM){ if(mcd_struct->wf[col] == MCD_WF_FORM){
w->form = strdup(token); w->form = strdup(token);
......
...@@ -28,10 +28,11 @@ int get_feat_value_cff(feat_model *fm, config *c, dico *dico_features, int feat_ ...@@ -28,10 +28,11 @@ int get_feat_value_cff(feat_model *fm, config *c, dico *dico_features, int feat_
} }
if(mode == LOOKUP_MODE){ if(mode == LOOKUP_MODE){
if(fm->string) if(fm->string){
/* printf("fmstring = %s\n", fm->string); */ // printf("fmstring = %s\n", fm->string);
return dico_string2int(dico_features, fm->string); return dico_string2int(dico_features, fm->string);
} }
}
return dico_add(dico_features, fm->string); return dico_add(dico_features, fm->string);
} }
...@@ -50,8 +51,9 @@ feat_vec *config2feat_vec_cff(feat_model *fm, config *c, dico *dico_features, fe ...@@ -50,8 +51,9 @@ feat_vec *config2feat_vec_cff(feat_model *fm, config *c, dico *dico_features, fe
{ {
int i; int i;
feat_vec_empty(fv); feat_vec_empty(fv);
for(i=0; i < fm->nbelem; i++) for(i=0; i < fm->nbelem; i++){
feat_vec_add(fv, get_feat_value_cff(fm, c, dico_features, i, mode)); feat_vec_add(fv, get_feat_value_cff(fm, c, dico_features, i, mode));
}
return fv; return fv;
} }
......
...@@ -101,7 +101,7 @@ context *context_new(void) ...@@ -101,7 +101,7 @@ context *context_new(void)
ctx->json_filename = NULL; ctx->json_filename = NULL;
ctx->dnn_model_filename = NULL; ctx->dnn_model_filename = NULL;
ctx->l_rules_filename = NULL;
return ctx; return ctx;
} }
...@@ -194,7 +194,7 @@ context *context_read_options(int argc, char *argv[]) ...@@ -194,7 +194,7 @@ context *context_read_options(int argc, char *argv[])
ctx->program_name = strdup(argv[0]); ctx->program_name = strdup(argv[0]);
static struct option long_options[24] = static struct option long_options[26] =
{ {
{"help", no_argument, 0, 'h'}, {"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'}, {"verbose", no_argument, 0, 'v'},
...@@ -219,13 +219,15 @@ context *context_read_options(int argc, char *argv[]) ...@@ -219,13 +219,15 @@ context *context_read_options(int argc, char *argv[])
{"f2p", required_argument, 0, 'P'}, {"f2p", required_argument, 0, 'P'},
{"traces", required_argument, 0, 'T'}, {"traces", required_argument, 0, 'T'},
{"json", required_argument, 0, 'J'}, {"json", required_argument, 0, 'J'},
{"dnn_model", required_argument, 0, 'N'} {"dnn_model", required_argument, 0, 'N'},
{"l_rules", required_argument, 0, 'l'},
{"fplm", required_argument, 0, 'w'}
}; };
optind = 0; optind = 0;
opterr = 0; opterr = 0;
while ((c = getopt_long (argc, argv, "hvdcSTm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:J:N:", long_options, &option_index)) != -1){ while ((c = getopt_long (argc, argv, "hvdcSTm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:J:N:w:l:", long_options, &option_index)) != -1){
switch (c) switch (c)
{ {
case 'h': case 'h':
...@@ -255,6 +257,9 @@ context *context_read_options(int argc, char *argv[]) ...@@ -255,6 +257,9 @@ context *context_read_options(int argc, char *argv[])
case 'x': case 'x':
ctx->cff_filename = strdup(optarg); ctx->cff_filename = strdup(optarg);
break; break;
case 'w':
ctx->fplm_filename = strdup(optarg);
break;
case 'u': case 'u':
ctx->feature_cutoff = atoi(optarg); ctx->feature_cutoff = atoi(optarg);
break; break;
...@@ -270,6 +275,9 @@ context *context_read_options(int argc, char *argv[]) ...@@ -270,6 +275,9 @@ context *context_read_options(int argc, char *argv[])
case 'f': case 'f':
ctx->fann_filename = strdup(optarg); ctx->fann_filename = strdup(optarg);
break; break;
case 'l':
ctx->l_rules_filename = strdup(optarg);
break;
case 's': case 's':
ctx->sent_nb = atoi(optarg); ctx->sent_nb = atoi(optarg);
break; break;
......
...@@ -14,6 +14,13 @@ ...@@ -14,6 +14,13 @@
#define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab"
#define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model" #define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model"
/* Default file names of the lemmatizer resources (column description,
   features model, vocabularies, model, l_rules and exceptions). */
#define DEFAULT_MULTI_COL_DESC_LEMMATIZER_FILENAME "maca_trans_lemmatizer.mcd"
#define DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME "maca_trans_lemmatizer.fm"
#define DEFAULT_VOCABS_LEMMATIZER_FILENAME "maca_trans_lemmatizer.vocab"
#define DEFAULT_MODEL_LEMMATIZER_FILENAME "maca_trans_lemmatizer.model"
#define DEFAULT_RULES_LEMMATIZER_FILENAME "maca_trans_lemmatizer_rules.txt"
#define DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME "maca_trans_lemmatizer_exceptions.fplm"
#define DEFAULT_MULTI_COL_DESC_MORPHO_FILENAME "maca_trans_morpho.mcd" #define DEFAULT_MULTI_COL_DESC_MORPHO_FILENAME "maca_trans_morpho.mcd"
#define DEFAULT_FEATURES_MODEL_MORPHO_FILENAME "maca_trans_morpho.fm" #define DEFAULT_FEATURES_MODEL_MORPHO_FILENAME "maca_trans_morpho.fm"
#define DEFAULT_VOCABS_MORPHO_FILENAME "maca_trans_morpho.vocab" #define DEFAULT_VOCABS_MORPHO_FILENAME "maca_trans_morpho.vocab"
...@@ -82,6 +89,7 @@ typedef struct { ...@@ -82,6 +89,7 @@ typedef struct {
char *json_filename; char *json_filename;
char *dnn_model_filename; char *dnn_model_filename;
char *l_rules_filename;
} context; } context;
......
...@@ -9,6 +9,8 @@ ...@@ -9,6 +9,8 @@
#include"dico.h" #include"dico.h"
#include"config.h" #include"config.h"
#include"fplm.h" #include"fplm.h"
#include"l_rule.h"
#include"config2feat_vec.h"
void maca_lemmatizer_help_message(context *ctx) void maca_lemmatizer_help_message(context *ctx)
{ {
...@@ -20,29 +22,58 @@ void maca_lemmatizer_help_message(context *ctx) ...@@ -20,29 +22,58 @@ void maca_lemmatizer_help_message(context *ctx)
context_mcd_help_message(ctx); context_mcd_help_message(ctx);
} }
/* When ctx->help is set, print the lemmatizer help message and terminate
   the process with exit status 1; otherwise do nothing. */
void maca_lemmatizer_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  maca_lemmatizer_help_message(ctx);
  exit(1);
}
/* Give a default value to every resource filename of ctx that is still unset.
   Each default is ctx->maca_data_path immediately followed by the matching
   DEFAULT_*_FILENAME macro, stored in ctx as a strdup'ed copy.
   When ctx->verbose is set, the filenames are printed on stderr. */
void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx) void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx)
{ {
char absolute_filename[500]; char absolute_filename[500];
/* NOTE(review): strcpy/strcat below do not check the 500 byte capacity of
   absolute_filename; a long ctx->maca_data_path overflows it. */
if(!ctx->perc_model_filename){
strcpy(absolute_filename, ctx->maca_data_path);
strcat(absolute_filename, DEFAULT_MODEL_LEMMATIZER_FILENAME);
ctx->perc_model_filename = strdup(absolute_filename);
}
if(!ctx->vocabs_filename){
strcpy(absolute_filename, ctx->maca_data_path);
strcat(absolute_filename, DEFAULT_VOCABS_LEMMATIZER_FILENAME);
ctx->vocabs_filename = strdup(absolute_filename);
}
if(!ctx->features_model_filename){
strcpy(absolute_filename, ctx->maca_data_path);
strcat(absolute_filename, DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME);
ctx->features_model_filename = strdup(absolute_filename);
}
if(!ctx->fplm_filename){ if(!ctx->fplm_filename){
strcpy(absolute_filename, ctx->maca_data_path); strcpy(absolute_filename, ctx->maca_data_path);
strcat(absolute_filename, DEFAULT_FPLM_FILENAME); strcat(absolute_filename, DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME);
ctx->fplm_filename = strdup(absolute_filename); ctx->fplm_filename = strdup(absolute_filename);
} }
if(!ctx->l_rules_filename){
strcpy(absolute_filename, ctx->maca_data_path);
strcat(absolute_filename, DEFAULT_RULES_LEMMATIZER_FILENAME);
ctx->l_rules_filename = strdup(absolute_filename);
}
/* verbose mode: report the filenames on stderr */
if(ctx->verbose){ if(ctx->verbose){
fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename); fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename);
fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
fprintf(stderr, "rules filename = %s\n", ctx->l_rules_filename);
fprintf(stderr, "exceptions filename = %s\n", ctx->fplm_filename);
} }
} }
/* When ctx->help is set, print the lemmatizer help message and terminate
   the process with exit status 1; otherwise do nothing. */
void maca_lemmatizer_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  maca_lemmatizer_help_message(ctx);
  exit(1);
}
/* a bit messy */ /* a bit messy */
void print_word(word *w, mcd *mcd_struct, char *lemma) void print_word(word *w, mcd *mcd_struct, char *lemma)
...@@ -75,6 +106,90 @@ void print_word(word *w, mcd *mcd_struct, char *lemma) ...@@ -75,6 +106,90 @@ void print_word(word *w, mcd *mcd_struct, char *lemma)
} }
/*
 * Entry point of the rule based lemmatizer.
 *
 * Words are read through a config built on ctx->input_filename (stdin when
 * no input file is given). Every word is printed with a lemma chosen as
 * follows:
 *   1. the lemma found in the input, unless it is empty or "_";
 *   2. otherwise the lemma returned by the exceptions (fplm) lookup on
 *      form and pos;
 *   3. otherwise the result of applying to the form the l_rule whose code
 *      is predicted by the feature table; the form itself is used when no
 *      rule string or no transformed lemma is available.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  context *ctx = context_read_options(argc, argv);
  feat_vec *fv = feat_vec_new(10);
  word *b0;
  char lemma[200];
  char form[200];
  char pos[200];
  char *lemma_from_fplm;
  config *c;
  int l_rule_code;
  char *l_rule;
  float max;

  maca_lemmatizer_check_options(ctx);
  maca_lemmatizer_set_linguistic_resources_filenames(ctx);

  dico *d_l_rules = dico_read(ctx->l_rules_filename, 0.5);
  fplm_struct *exceptions = fplm_load_file(ctx->fplm_filename, ctx->debug_mode);
  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;

  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);

  c = config_new(f, ctx->mcd_struct, 5);

  while(!config_is_terminal(c)){
    b0 = word_buffer_b0(c->bf);

    /* start every word from empty strings: the buffers are read right below,
       so they must never hold uninitialised bytes or the previous word's
       values if word_sprint_col_n leaves one of them untouched
       (NOTE(review): confirm its behaviour for a column absent from the mcd) */
    lemma[0] = '\0';
    form[0] = '\0';
    pos[0] = '\0';
    word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
    word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
    word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));

    if(lemma[0] != '\0' && strcmp(lemma, "_")){
      /* the lemma is specified in the input: keep it */
      print_word(b0, ctx->mcd_struct, lemma);
    }
    else if((lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose)) != NULL){
      /* the lemma is found in the exceptions file */
      print_word(b0, ctx->mcd_struct, lemma_from_fplm);
    }
    else{
      /* neither in the input nor in the exceptions file: predict an l_rule */
      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
      l_rule_code = feature_table_argmax(fv, ft, &max);
      l_rule = dico_int2string(d_l_rules, l_rule_code);

      /* apply the rule only if the predicted code maps to a rule string */
      char *transformed_lemma = (l_rule != NULL)? apply_l_rule(form, l_rule) : NULL;
      if(transformed_lemma == NULL && ctx->verbose)
        fprintf(stderr, "no lemma produced by l_rule code %d, form used as lemma\n", l_rule_code);
      print_word(b0, ctx->mcd_struct, (transformed_lemma != NULL)? transformed_lemma : form);
      free(transformed_lemma); /* free(NULL) is a no-op */

      if(ctx->debug_mode){
        /* NOTE(review): prints exactly 10 entries; assumes the array returned
           by feature_table_get_vcode_array holds at least 10 - confirm */
        vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
        for(int i=0; i < 10; i++){
          fprintf(stderr, "%d\t", i);
          fprintf(stderr, "%s\t%.4f\n", dico_int2string(d_l_rules, vcode_array[i].class_code), vcode_array[i].score);
        }
        free(vcode_array);
      }
    }
    word_buffer_move_right(c->bf);
  }

  /* NOTE(review): fv and d_l_rules are not released here */
  config_free(c);
  if (ctx->input_filename) fclose(f);
  context_free(ctx);
  fplm_free(exceptions);
  feature_table_free(ft);
  return 0;
}
#if 0
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
context *ctx = context_read_options(argc, argv); context *ctx = context_read_options(argc, argv);
...@@ -118,3 +233,4 @@ int main(int argc, char *argv[]) ...@@ -118,3 +233,4 @@ int main(int argc, char *argv[])
return 0; return 0;
} }
#endif
...@@ -37,6 +37,54 @@ int movement_lemmatizer(config *c, int feats) ...@@ -37,6 +37,54 @@ int movement_lemmatizer(config *c, int feats)
return 1; return 1;
} }
/* Return a newly allocated string made of dir immediately followed by name
   (no separator is inserted), or NULL when the allocation fails.
   The buffer is sized from the two arguments, so there is no length limit.
   The caller owns the returned string. */
static char *decode_lemmatizer_default_path(const char *dir, const char *name)
{
  size_t dir_len = strlen(dir);
  size_t name_len = strlen(name);
  /* explicit cast kept in case this file is also compiled as C++ */
  char *path = (char *)malloc(dir_len + name_len + 1);

  if(path){
    memcpy(path, dir, dir_len);
    memcpy(path + dir_len, name, name_len + 1); /* also copies the final '\0' */
  }
  return path;
}

/* Give a default value to every lemmatizer resource filename of ctx that is
   still unset (perceptron model, vocabularies, features model, exceptions
   and l_rules). Each default is ctx->maca_data_path followed by the matching
   DEFAULT_*_LEMMATIZER_FILENAME macro; ctx->maca_data_path must not be NULL.
   A filename is left NULL if its allocation fails.
   When ctx->verbose is set, the filenames are printed on stderr. */
void decode_lemmatizer_set_linguistic_resources_filenames(context *ctx)
{
  if(!ctx->perc_model_filename){
    ctx->perc_model_filename = decode_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_MODEL_LEMMATIZER_FILENAME);
  }

  if(!ctx->vocabs_filename){
    ctx->vocabs_filename = decode_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_VOCABS_LEMMATIZER_FILENAME);
  }

  if(!ctx->features_model_filename){
    ctx->features_model_filename = decode_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME);
  }

  if(!ctx->fplm_filename){
    ctx->fplm_filename = decode_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME);
  }

  if(!ctx->l_rules_filename){
    ctx->l_rules_filename = decode_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_RULES_LEMMATIZER_FILENAME);
  }

  if(ctx->verbose){
    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename);
    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
    fprintf(stderr, "rules filename = %s\n", ctx->l_rules_filename);
    fprintf(stderr, "exceptions filename = %s\n", ctx->fplm_filename);
    fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
  }
}
void maca_trans_lemmatizer_mcf2cff_help_message(context *ctx) void maca_trans_lemmatizer_mcf2cff_help_message(context *ctx)
{ {
...@@ -92,7 +140,7 @@ void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fp ...@@ -92,7 +140,7 @@ void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fp
//printf("form = %s pos = %s lemma = %s\n", form, pos, lemma); //printf("form = %s pos = %s lemma = %s\n", form, pos, lemma);
lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, 0); lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose);
if(lemma_from_fplm){ if(lemma_from_fplm){
// printf("exception\n"); // printf("exception\n");
...@@ -106,13 +154,14 @@ void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fp ...@@ -106,13 +154,14 @@ void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fp
l_rule_code = dico_string2int(d_l_rules, l_rule); l_rule_code = dico_string2int(d_l_rules, l_rule);
free(l_rule);
if(l_rule_code != -1){ if(l_rule_code != -1){
// if(strcmp(l_rule, "@@")){
// fprintf(stdout, "rule exists\n"); // fprintf(stdout, "rule exists\n");
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
fprintf(output_file, "%d", l_rule_code); fprintf(output_file, "%d", l_rule_code);
feat_vec_print(output_file, fv); feat_vec_print(output_file, fv);
// }
} }
else{ else{
// fprintf(stdout, "rule does not exist\n"); // fprintf(stdout, "rule does not exist\n");
...@@ -120,6 +169,8 @@ void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fp ...@@ -120,6 +169,8 @@ void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fp
} }
word_buffer_move_right(c->bf); word_buffer_move_right(c->bf);
// movement_lemmatizer(c, l_rule); // movement_lemmatizer(c, l_rule);
free(l_rule);
} }
} }
...@@ -130,11 +181,11 @@ int main(int argc, char *argv[]) ...@@ -130,11 +181,11 @@ int main(int argc, char *argv[])
dico *d_l_rules; dico *d_l_rules;
fplm_struct *exceptions; fplm_struct *exceptions;
ctx = context_read_options(argc, argv); ctx = context_read_options(argc, argv);
maca_trans_lemmatizer_mcf2cff_check_options(ctx);
// exceptions = fplm_load_file(ctx->fplm_filename, ctx->verbose); // decode_lemmatizer_set_linguistic_resources_filenames(ctx);
exceptions = fplm_load_file((char *)"exceptions.fplm", ctx->verbose); maca_trans_lemmatizer_mcf2cff_check_options(ctx);
d_l_rules = dico_read((char *)"rules", 0.5); exceptions = fplm_load_file(ctx->fplm_filename, ctx->verbose);
d_l_rules = dico_read(ctx->l_rules_filename, 0.5);
ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment