Skip to content
Snippets Groups Projects
Commit db3b262b authored by Alexis Nasr
Browse files

modified lemmatizer in such a way that it checks the applicability of a rule before applying it

parent 2d5f281e
No related branches found
No related tags found
No related merge requests found
......@@ -3,5 +3,6 @@
char *apply_l_rule(char *form, char *l_rule);
char *compute_l_rule(char *lemma, char *form, int strict);
int l_rule_is_applicable(char *form, char *l_rule);
#endif
/*
 * Tell whether the lemmatization rule l_rule can be applied to form.
 *
 * The rule is scanned from index 1 for the first '@'; the characters
 * between index 1 and that separator are the suffix pattern that the end
 * of form must match.  A '*' in the pattern matches any character.
 * (Index 0 is never inspected -- presumably a leading '@'; confirm
 * against the code that produces the rules.)  The text after the
 * separator is not examined here.  If no separator is found, everything
 * from index 1 to the end of the rule is used as the pattern.
 *
 * form   : NUL-terminated word form.
 * l_rule : NUL-terminated rule string.
 *
 * Returns 1 when the rule is applicable, 0 otherwise.  A rule whose
 * suffix pattern is longer than form is never applicable.
 */
int l_rule_is_applicable(char *form, char *l_rule)
{
  size_t form_length = strlen(form);
  size_t l_rule_length = strlen(l_rule);
  size_t sep_index = 1;
  size_t form_suffix_length;
  size_t i, j;

  /* locate the separator that closes the form-suffix pattern */
  while (sep_index < l_rule_length && l_rule[sep_index] != '@')
    sep_index++;

  form_suffix_length = sep_index - 1;

  /* the form is too short to carry this suffix: without this guard the
     comparison below would start before the beginning of form */
  if (form_suffix_length > form_length)
    return 0;

  /* compare the pattern with the last form_suffix_length chars of form */
  for (j = 1, i = form_length - form_suffix_length; j < sep_index; i++, j++) {
    if (l_rule[j] != '*' && l_rule[j] != form[i])
      return 0;
  }
  return 1;
}
......
......@@ -121,6 +121,97 @@ int main(int argc, char *argv[])
float max;
maca_lemmatizer_check_options(ctx);
maca_lemmatizer_set_linguistic_resources_filenames(ctx);
dico *d_l_rules = dico_read(ctx->l_rules_filename, 0.5);
fplm_struct *exceptions = fplm_load_file(ctx->fplm_filename, ctx->debug_mode);
FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
c = config_new(f, ctx->mcd_struct, 5);
while(!config_is_terminal(c)){
b0 = word_buffer_b0(c->bf);
word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));
// fprintf(stderr, "form = %s pos = %s lemma = %s\n", b0->form, pos, lemma);
// if lemma is not specified in input it is looked up in exceptions file
if(strlen(lemma) && strcmp(lemma, "_"))
print_word(b0, ctx->mcd_struct, lemma);
else{
lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose);
if(lemma_from_fplm){
// printf("lemma %s found in exceptions file\n", lemma_from_fplm);
print_word(b0, ctx->mcd_struct, lemma_from_fplm);
}
// if lemma is not found in exception file, predict an l_rule
else{
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
// feat_vec_print_string(fv, ctx->d_perceptron_features);
// feat_vec_print(stdout, fv);
vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
if(ctx->debug_mode){
for(int i=0; i < 10; i++){
l_rule = dico_int2string(d_l_rules, vcode_array[i].class_code);
fprintf(stderr, "%d", i);
if(l_rule_is_applicable(form, l_rule)) fprintf(stderr, "*");
fprintf(stderr, "\t%s\t%.4f\n", l_rule, vcode_array[i].score);
}
}
int i;
for(i=0; i < 10; i++){
l_rule = dico_int2string(d_l_rules, vcode_array[i].class_code);
if(l_rule_is_applicable(form, l_rule)){
char *transformed_lemma = apply_l_rule(form, l_rule);
// printf("transformed_lemma = %s\n", transformed_lemma);
// print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma));
print_word(b0, ctx->mcd_struct, transformed_lemma);
free(transformed_lemma);
break;
}
}
/* no rule applied */
if(i == 10){
print_word(b0, ctx->mcd_struct, form);
}
free(vcode_array);
}
}
word_buffer_move_right(c->bf);
}
config_free(c);
if (ctx->input_filename) fclose(f);
context_free(ctx);
fplm_free(exceptions);
feature_table_free(ft);
return 0;
}
#if 0
int main(int argc, char *argv[])
{
context *ctx = context_read_options(argc, argv);
feat_vec *fv = feat_vec_new(10);
word *b0;
char lemma[200];
char form[200];
char pos[200];
char *lemma_from_fplm;
config *c;
int l_rule_code;
char *l_rule;
float max;
maca_lemmatizer_check_options(ctx);
maca_lemmatizer_set_linguistic_resources_filenames(ctx);
......@@ -189,6 +280,20 @@ int main(int argc, char *argv[])
return 0;
}
#endif
#if 0
int main(int argc, char *argv[])
{
......
......@@ -17,19 +17,21 @@ void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
{
int i;
word *w;
char lower_form[100];
char lower_form[1000];
for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){
w = word_buffer_get_word_n(bf, i);
if(word_get_signature(w) != -1) break;
w->signature = form2pos_get_signature(f2p, w->form);
if(w->signature == -1){
if(w->form){
strcpy(lower_form, w->form);
to_lower_string(lower_form);
w->signature = form2pos_get_signature(f2p, lower_form);
}
}
}
}
#endif
#if 0
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment