Skip to content
Snippets Groups Projects
Commit cf836cbd authored by Alexis Nasr
Browse files

still working on predictive lemmatizer

parent 74ad3a64
No related branches found
No related tags found
No related merge requests found
...@@ -108,5 +108,6 @@ char *fplm_lookup_lemma(fplm_struct *fplm, char *form, char *pos, int verbose) ...@@ -108,5 +108,6 @@ char *fplm_lookup_lemma(fplm_struct *fplm, char *form, char *pos, int verbose)
if(verbose) if(verbose)
fprintf(stderr, "cannot find an entry for %s %s\n", form, pos); fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
return form; return NULL;
// return form;
} }
...@@ -38,6 +38,12 @@ target_link_libraries(transparse perceptron) ...@@ -38,6 +38,12 @@ target_link_libraries(transparse perceptron)
#target_link_libraries(maca_trans_parser_nn maca_common) #target_link_libraries(maca_trans_parser_nn maca_common)
#install (TARGETS maca_trans_parser_nn DESTINATION bin) #install (TARGETS maca_trans_parser_nn DESTINATION bin)
add_executable(maca_trans_lemmatizer_mcf2cff ./src/maca_trans_lemmatizer_mcf2cff.c)
target_link_libraries(maca_trans_lemmatizer_mcf2cff perceptron)
target_link_libraries(maca_trans_lemmatizer_mcf2cff transparse)
target_link_libraries(maca_trans_lemmatizer_mcf2cff maca_common)
install (TARGETS maca_trans_lemmatizer_mcf2cff DESTINATION bin)
add_executable(maca_trans_tagger_mcf2cff ./src/maca_trans_tagger_mcf2cff.c) add_executable(maca_trans_tagger_mcf2cff ./src/maca_trans_tagger_mcf2cff.c)
target_link_libraries(maca_trans_tagger_mcf2cff perceptron) target_link_libraries(maca_trans_tagger_mcf2cff perceptron)
target_link_libraries(maca_trans_tagger_mcf2cff transparse) target_link_libraries(maca_trans_tagger_mcf2cff transparse)
......
...@@ -82,6 +82,7 @@ int main(int argc, char *argv[]) ...@@ -82,6 +82,7 @@ int main(int argc, char *argv[])
char lemma[200]; char lemma[200];
char form[200]; char form[200];
char pos[200]; char pos[200];
char *lemma_from_fplm;
config *c; config *c;
fplm_struct *fplm; fplm_struct *fplm;
FILE *f; FILE *f;
...@@ -101,8 +102,13 @@ int main(int argc, char *argv[]) ...@@ -101,8 +102,13 @@ int main(int argc, char *argv[])
/* if lemma is not specified in input it is looked up */ /* if lemma is not specified in input it is looked up */
if(strlen(lemma) && strcmp(lemma, "_")) if(strlen(lemma) && strcmp(lemma, "_"))
print_word(b0, ctx->mcd_struct, lemma); print_word(b0, ctx->mcd_struct, lemma);
else{
lemma_from_fplm = fplm_lookup_lemma(fplm, form, pos, ctx->verbose);
if(lemma_from_fplm)
print_word(b0, ctx->mcd_struct, lemma_from_fplm);
else else
print_word(b0, ctx->mcd_struct, fplm_lookup_lemma(fplm, form, pos, ctx->verbose)); print_word(b0, ctx->mcd_struct, form);
}
word_buffer_move_right(c->bf); word_buffer_move_right(c->bf);
} }
config_free(c); config_free(c);
......
...@@ -11,8 +11,10 @@ ...@@ -11,8 +11,10 @@
#include"dico_vec.h" #include"dico_vec.h"
#include"word_emb.h" #include"word_emb.h"
#include"config2feat_vec.h" #include"config2feat_vec.h"
#include"fplm.h"
#include"l_rule.h"
int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules) /*int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules, fplm_struct *exceptions)
{ {
char lemma[200]; char lemma[200];
char form[200]; char form[200];
...@@ -26,7 +28,7 @@ int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules) ...@@ -26,7 +28,7 @@ int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules)
l_rule_code = dico_string2int(d_l_rule, l_rule); l_rule_code = dico_string2int(d_l_rule, l_rule);
free(l_rule); free(l_rule);
return l_rule_code; return l_rule_code;
} }*/
int movement_lemmatizer(config *c, int feats) int movement_lemmatizer(config *c, int feats)
{ {
...@@ -66,28 +68,58 @@ void maca_trans_lemmatizer_mcf2cff_check_options(context *ctx) ...@@ -66,28 +68,58 @@ void maca_trans_lemmatizer_mcf2cff_check_options(context *ctx)
} }
} }
void generate_training_file(FILE *output_file, context *ctx) void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fplm_struct *exceptions)
{ {
config *c; config *c;
feat_vec *fv = feat_vec_new(feature_types_nb); feat_vec *fv = feat_vec_new(feature_types_nb);
FILE *conll_file = myfopen(ctx->input_filename, "r"); FILE *conll_file = myfopen(ctx->input_filename, "r");
int l_rule; char *l_rule;
int l_rule_code;
char lemma[200];
char form[200];
char pos[200];
char *lemma_from_fplm;
int strict = 1;
/* dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); */ /* dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); */
c = config_new(conll_file, ctx->mcd_struct, 5); c = config_new(conll_file, ctx->mcd_struct, 5);
while(!config_is_terminal(c)){ while(!config_is_terminal(c)){
word_sprint_col_n(lemma, word_buffer_b0(c->bf), mcd_get_lemma_col(ctx->mcd_struct));
word_sprint_col_n(form, word_buffer_b0(c->bf), mcd_get_form_col(ctx->mcd_struct));
word_sprint_col_n(pos, word_buffer_b0(c->bf), mcd_get_pos_col(ctx->mcd_struct));
// printf("form = %s pos = %s lemma = %s\n", form, pos, lemma);
lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, 0);
if(lemma_from_fplm){
// printf("exception\n");
word_buffer_move_right(c->bf);
continue;
}
l_rule = oracle_lemmatizer(c); l_rule = compute_l_rule(lemma, form, strict);
//printf("rule = %s\n", l_rule);
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
l_rule_code = dico_string2int(d_l_rules, l_rule);
free(l_rule);
fprintf(output_file, "%d", l_rule); if(l_rule_code != -1){
// fprintf(stdout, "rule exists\n");
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
fprintf(output_file, "%d", l_rule_code);
feat_vec_print(output_file, fv); feat_vec_print(output_file, fv);
movement_lemmatizer(c, l_rule); }
else{
// fprintf(stdout, "rule does not exist\n");
}
word_buffer_move_right(c->bf);
// movement_lemmatizer(c, l_rule);
} }
} }
...@@ -96,10 +128,14 @@ int main(int argc, char *argv[]) ...@@ -96,10 +128,14 @@ int main(int argc, char *argv[])
context *ctx; context *ctx;
FILE *output_file; FILE *output_file;
dico *d_l_rules; dico *d_l_rules;
fplm_struct *exceptions;
ctx = context_read_options(argc, argv); ctx = context_read_options(argc, argv);
maca_trans_lemmatizer_mcf2cff_check_options(ctx); maca_trans_lemmatizer_mcf2cff_check_options(ctx);
d_l_rules = dico_read("l_rules", "r"); // exceptions = fplm_load_file(ctx->fplm_filename, ctx->verbose);
exceptions = fplm_load_file((char *)"exceptions.fplm", ctx->verbose);
d_l_rules = dico_read((char *)"rules", 0.5);
ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
if(ctx->mode == TRAIN_MODE){ if(ctx->mode == TRAIN_MODE){
...@@ -128,7 +164,7 @@ int main(int argc, char *argv[]) ...@@ -128,7 +164,7 @@ int main(int argc, char *argv[])
else else
output_file = stdout; output_file = stdout;
generate_training_file(output_file, ctx); generate_training_file(output_file, ctx, d_l_rules, exceptions);
if(ctx->mode == TRAIN_MODE){ if(ctx->mode == TRAIN_MODE){
/* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment