/* maca_trans_lemmatizer.c */
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include<ctype.h>
#include"context.h"
#include"dico.h"
#include"config.h"
#include"fplm.h"
#include"l_rule.h"
#include"config2feat_vec.h"
/*
 * Emit the lemmatizer usage text.
 *
 * Calls, in order, the general, beam and conll help printers of the context
 * module, writes the heading "INPUT" to stderr, then calls the input and mcd
 * help printers.
 */
void maca_lemmatizer_help_message(context *ctx)
{
  /* first group of help sections */
  context_general_help_message(ctx);
  context_beam_help_message(ctx);
  context_conll_help_message(ctx);

  /* sections listed under the INPUT heading */
  fputs("INPUT\n", stderr);
  context_input_help_message(ctx);
  context_mcd_help_message(ctx);
}
/*
 * Return a freshly allocated string made of `dir` immediately followed by
 * `name`. No separator is inserted, exactly like the strcpy/strcat pair this
 * helper replaces. The buffer is sized from the two inputs, so arbitrarily
 * long paths are handled. The caller owns the returned string.
 *
 * Terminates the process with status 1 if memory cannot be obtained.
 */
static char *maca_lemmatizer_default_path(const char *dir, const char *name)
{
  size_t dir_len = strlen(dir);
  size_t name_len = strlen(name);
  char *path = malloc(dir_len + name_len + 1);

  if(!path){
    fprintf(stderr, "maca_lemmatizer: out of memory while building a default filename\n");
    exit(1);
  }
  memcpy(path, dir, dir_len);
  memcpy(path + dir_len, name, name_len + 1); /* +1 copies the terminating NUL */
  return path;
}

/*
 * Fill in every resource filename that is still unset in `ctx` with its
 * default, built as ctx->maca_data_path followed by the matching DEFAULT_*
 * constant. Filenames that are already set are left untouched.
 *
 * Fields handled: perc_model_filename, vocabs_filename,
 * features_model_filename, fplm_filename (exceptions) and l_rules_filename.
 *
 * When ctx->verbose is set, the resulting filenames are reported on stderr.
 */
void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx)
{
  if(!ctx->perc_model_filename){
    ctx->perc_model_filename =
      maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_MODEL_LEMMATIZER_FILENAME);
  }
  if(!ctx->vocabs_filename){
    ctx->vocabs_filename =
      maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_VOCABS_LEMMATIZER_FILENAME);
  }
  if(!ctx->features_model_filename){
    ctx->features_model_filename =
      maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME);
  }
  if(!ctx->fplm_filename){
    ctx->fplm_filename =
      maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME);
  }
  if(!ctx->l_rules_filename){
    ctx->l_rules_filename =
      maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_RULES_LEMMATIZER_FILENAME);
  }

  if(ctx->verbose){
    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
    /* mcd_filename gets no default in this function, so it may be unset:
       passing NULL to %s is undefined behaviour */
    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
    fprintf(stderr, "rules filename = %s\n", ctx->l_rules_filename);
    fprintf(stderr, "exceptions filename = %s\n", ctx->fplm_filename);
  }
}
/*
 * Act on the parsed options: when ctx->help is set, print the usage message
 * and terminate the process with exit status 1. Otherwise return without
 * doing anything.
 */
void maca_lemmatizer_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  maca_lemmatizer_help_message(ctx);
  exit(1);
}
/*
 * Write one word to stdout as a single tab-separated line carrying `lemma`.
 *
 *  - If the mcd declares no lemma column (mcd_get_lemma_col() returns -1),
 *    the raw input w->input is printed, followed by a tab and the lemma.
 *  - Otherwise one field is emitted per column found in w->input: the lemma
 *    column is replaced by `lemma`, every other column is delegated to
 *    word_print_col_n(). If the input has no column at the lemma index, the
 *    lemma is appended as an extra trailing field.
 *
 * Columns are counted as maximal runs of non-tab characters, which is what
 * strtok(..., "\t") yields (consecutive tabs do not produce empty columns).
 * The scan is done in place with strspn/strcspn: no copy of the input is
 * allocated and strtok's hidden static state is not touched.
 */
void print_word(word *w, mcd *mcd_struct, char *lemma)
{
  /* NOTE(review): assumes mcd_get_lemma_col() is a plain getter whose result
     does not change while this word is printed, so it is read once */
  const int lemma_col = mcd_get_lemma_col(mcd_struct);
  const char *cursor;
  int col_nb = 0;

  if(lemma_col == -1){
    printf("%s\t%s\n", w->input, lemma);
    return;
  }

  cursor = w->input;
  cursor += strspn(cursor, "\t");          /* skip leading tabs, as strtok does */
  while(*cursor != '\0'){
    if(col_nb != 0){
      printf("\t");
    }
    if(col_nb == lemma_col){
      printf("%s", lemma);
    }
    else{
      word_print_col_n(stdout, w, col_nb);
    }
    col_nb++;
    cursor += strcspn(cursor, "\t");       /* step over the current column */
    cursor += strspn(cursor, "\t");        /* and the tab run that follows it */
  }

  /* the input stopped before the lemma column: append the lemma */
  if(col_nb <= lemma_col){
    printf("\t%s", lemma);
  }
  printf("\n");
}
/*
 * Lemmatizer entry point.
 *
 * Reads the options, resolves the resource filenames, loads the rule
 * dictionary, the exceptions (fplm) file, the feature model, the vocabularies
 * and the perceptron feature table, then processes the input word by word
 * (from ctx->input_filename if set, otherwise stdin). For each word b0:
 *
 *   1. if the input already carries a lemma (non-empty and not "_"), it is
 *      printed unchanged;
 *   2. otherwise, if fplm_lookup_lemma() finds a lemma for (form, pos), that
 *      lemma is printed;
 *   3. otherwise a feature vector is built, the ranked class array is fetched
 *      from the feature table, and the first of the N_BEST_L_RULES candidates
 *      whose rule is applicable to the form is applied; if none is
 *      applicable, the form itself is printed as the lemma.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  /* how many top-ranked rule candidates are inspected for each word */
  enum { N_BEST_L_RULES = 10 };

  context *ctx = context_read_options(argc, argv);
  feat_vec *fv = feat_vec_new(10);
  word *b0;
  char lemma[200];
  char form[200];
  char pos[200];
  char *lemma_from_fplm;
  config *c;
  char *l_rule;

  maca_lemmatizer_check_options(ctx);
  maca_lemmatizer_set_linguistic_resources_filenames(ctx);

  dico *d_l_rules = dico_read(ctx->l_rules_filename, 0.5);
  fplm_struct *exceptions = fplm_load_file(ctx->fplm_filename, ctx->debug_mode);
  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;

  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);

  c = config_new(f, ctx->mcd_struct, 5);

  while(!config_is_terminal(c)){
    b0 = word_buffer_b0(c->bf);
    /* NOTE(review): the three buffers are 200 bytes each and no size is
       passed to word_sprint_col_n - confirm it cannot write past them */
    word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
    word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
    word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));

    /* a lemma already present in the input is kept as is */
    if(strlen(lemma) && strcmp(lemma, "_")){
      print_word(b0, ctx->mcd_struct, lemma);
    }
    else{
      /* no lemma in the input: look it up in the exceptions file */
      lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose);
      if(lemma_from_fplm){
        print_word(b0, ctx->mcd_struct, lemma_from_fplm);
      }
      /* not in the exceptions file either: predict an l_rule */
      else{
        config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
        /* NOTE(review): the loops below read N_BEST_L_RULES entries; this
           assumes the returned array always has at least that many - confirm
           against feature_table_get_vcode_array */
        vcode *vcode_array = feature_table_get_vcode_array(fv, ft);

        if(ctx->debug_mode){
          /* dump the candidates; '*' marks the rules applicable to the form */
          for(int i=0; i < N_BEST_L_RULES; i++){
            l_rule = dico_int2string(d_l_rules, vcode_array[i].class_code);
            fprintf(stderr, "%d", i);
            if(l_rule && l_rule_is_applicable(form, l_rule)) fprintf(stderr, "*");
            /* never hand a NULL pointer to %s */
            fprintf(stderr, "\t%s\t%.4f\n", l_rule ? l_rule : "(null)", vcode_array[i].score);
          }
        }

        /* apply the best-ranked rule that is applicable to this form */
        int i;
        for(i=0; i < N_BEST_L_RULES; i++){
          l_rule = dico_int2string(d_l_rules, vcode_array[i].class_code);
          /* skip class codes that have no rule string in the dictionary */
          if(l_rule && l_rule_is_applicable(form, l_rule)){
            char *transformed_lemma = apply_l_rule(form, l_rule);
            print_word(b0, ctx->mcd_struct, transformed_lemma);
            free(transformed_lemma);
            break;
          }
        }
        /* no rule applied: fall back to the form itself */
        if(i == N_BEST_L_RULES){
          print_word(b0, ctx->mcd_struct, form);
        }
        free(vcode_array);
      }
    }
    word_buffer_move_right(c->bf);
  }

  /* NOTE(review): fv and d_l_rules are not released in this function -
     check whether the project offers matching free routines */
  config_free(c);
  if (ctx->input_filename) fclose(f);
  context_free(ctx);
  fplm_free(exceptions);
  feature_table_free(ft);
  return 0;
}
#if 0
/*
 * DISABLED (compiled out by the enclosing #if 0): alternative version of
 * main().
 *
 * Same set-up and same per-word flow as the active main() (keep an input
 * lemma, else look it up in the exceptions file, else predict a rule), but
 * the prediction branch takes the single best class from
 * feature_table_argmax() and applies the corresponding rule directly with
 * apply_l_rule(), without calling l_rule_is_applicable(). The ranked vcode
 * array is fetched only in debug mode, to print the first 10 candidates.
 */
int main(int argc, char *argv[])
{
context *ctx = context_read_options(argc, argv);
feat_vec *fv = feat_vec_new(10);
word *b0;
char lemma[200];
char form[200];
char pos[200];
char *lemma_from_fplm;
config *c;
int l_rule_code;
char *l_rule;
float max;
maca_lemmatizer_check_options(ctx);
maca_lemmatizer_set_linguistic_resources_filenames(ctx);
/* load the rule dictionary, the exceptions file, the models and open the input */
dico *d_l_rules = dico_read(ctx->l_rules_filename, 0.5);
fplm_struct *exceptions = fplm_load_file(ctx->fplm_filename, ctx->debug_mode);
FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
c = config_new(f, ctx->mcd_struct, 5);
/* one iteration per word, until the configuration is terminal */
while(!config_is_terminal(c)){
b0 = word_buffer_b0(c->bf);
word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));
// fprintf(stderr, "form = %s pos = %s lemma = %s\n", b0->form, pos, lemma);
// if lemma is not specified in input it is looked up in exceptions file
if(strlen(lemma) && strcmp(lemma, "_"))
print_word(b0, ctx->mcd_struct, lemma);
else{
lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose);
if(lemma_from_fplm){
// printf("lemma %s found in exceptions file\n", lemma_from_fplm);
print_word(b0, ctx->mcd_struct, lemma_from_fplm);
}
// if lemma is not found in exception file, predict an l_rule
else{
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
// feat_vec_print_string(fv, ctx->d_perceptron_features);
// feat_vec_print(stdout, fv);
/* take the single best class and apply its rule unconditionally */
l_rule_code = feature_table_argmax(fv, ft, &max);
//fprintf(stderr, "lrule code %d predicted\n", l_rule_code);
l_rule = dico_int2string(d_l_rules, l_rule_code);
// printf("lrule %s predicted\n", l_rule);
char *transformed_lemma = apply_l_rule(form, l_rule);
// printf("transformed_lemma = %s\n", transformed_lemma);
print_word(b0, ctx->mcd_struct, transformed_lemma);
free(transformed_lemma);
/* debug only: list the first 10 ranked candidates with their scores */
if(ctx->debug_mode){
vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
for(int i=0; i < 10; i++){
fprintf(stderr, "%d\t", i);
fprintf(stderr, "%s\t%.4f\n", dico_int2string(d_l_rules, vcode_array[i].class_code), vcode_array[i].score);
}
free(vcode_array);
}
}
}
word_buffer_move_right(c->bf);
}
/* release what was acquired; the input is closed only if it is not stdin */
config_free(c);
if (ctx->input_filename) fclose(f);
context_free(ctx);
fplm_free(exceptions);
feature_table_free(ft);
return 0;
}
#endif
#if 0
/*
 * DISABLED (compiled out by the enclosing #if 0): alternative version of
 * main() that uses no prediction model.
 *
 * Only the exceptions (fplm) file is loaded. For each word, a lemma already
 * present in the input (non-empty and not "_") is kept; otherwise the lemma
 * returned by fplm_lookup_lemma() for (form, pos) is printed, and if there is
 * none the form itself is printed as the lemma.
 */
int main(int argc, char *argv[])
{
context *ctx = context_read_options(argc, argv);
word *b0;
char lemma[200];
char form[200];
char pos[200];
char *lemma_from_fplm;
config *c;
fplm_struct *fplm;
FILE *f;
maca_lemmatizer_check_options(ctx);
maca_lemmatizer_set_linguistic_resources_filenames(ctx);
fplm = fplm_load_file(ctx->fplm_filename, ctx->debug_mode);
/* read from the named input file if one was given, otherwise from stdin */
f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
c = config_new(f, ctx->mcd_struct, 5);
/* one iteration per word, until the configuration is terminal */
while(!config_is_terminal(c)){
b0 = word_buffer_b0(c->bf);
word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));
/* if lemma is not specified in input it is looked up */
if(strlen(lemma) && strcmp(lemma, "_"))
print_word(b0, ctx->mcd_struct, lemma);
else{
lemma_from_fplm = fplm_lookup_lemma(fplm, form, pos, ctx->verbose);
if(lemma_from_fplm)
print_word(b0, ctx->mcd_struct, lemma_from_fplm);
else
/* no entry in the exceptions file: the form is used as the lemma */
print_word(b0, ctx->mcd_struct, form);
}
word_buffer_move_right(c->bf);
}
/* release what was acquired; the input is closed only if it is not stdin */
config_free(c);
if (ctx->input_filename) fclose(f);
context_free(ctx);
fplm_free(fplm);
return 0;
}
#endif