/* maca_trans_lemmatizer_mcf2cff.c */
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"movement_tagger.h"
#include"oracle_tagger.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"word_emb.h"
#include"config2feat_vec.h"
#include"fplm.h"
#include"l_rule.h"
/*int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules, fplm_struct *exceptions)
{
char lemma[200];
char form[200];
char *l_rule;
int l_rule_code;
word_sprint_col_n(lemma, word_buffer_b0(c->bf), mcd_get_lemma_col(ctx->mcd_struct));
word_sprint_col_n(form, word_buffer_b0(c->bf), mcd_get_form_col(ctx->mcd_struct));
l_rule = compute_l_rule(lemma, form, 1);
l_rule_code = dico_string2int(d_l_rule, l_rule);
free(l_rule);
return l_rule_code;
}*/
/*
 * Advance the word buffer of configuration c by one position.
 *
 * feats is accepted but not used: the statement that stored it on the
 * current word is disabled.
 *
 * Always returns 1.
 */
int movement_lemmatizer(config *c, int feats)
{
  const int status = 1;

  (void)feats; /* intentionally unused */
  word_buffer_move_right(c->bf);

  return status;
}
/* Return s for diagnostics, or the literal "(null)" when s is NULL,
   so that a NULL pointer is never handed to a %s conversion. */
static const char *lemmatizer_printable(const char *s)
{
  return s ? s : "(null)";
}

/*
 * Build a newly allocated string made of dir immediately followed by name
 * (no separator is inserted).
 *
 * The result is sized from the actual lengths, so it cannot overflow
 * whatever the length of dir.
 *
 * Returns NULL when dir or name is NULL or when allocation fails.
 * The caller owns the returned string.
 */
static char *lemmatizer_default_path(const char *dir, const char *name)
{
  size_t dir_len;
  size_t name_len;
  char *path;

  if(!dir || !name)
    return NULL;

  dir_len = strlen(dir);
  name_len = strlen(name);

  path = (char *)malloc(dir_len + name_len + 1);
  if(!path)
    return NULL;

  memcpy(path, dir, dir_len);
  memcpy(path + dir_len, name, name_len + 1); /* copies the terminating NUL too */
  return path;
}

/*
 * Fill in every lemmatizer resource filename of ctx that is still unset
 * with ctx->maca_data_path followed by the corresponding DEFAULT_* name.
 * Filenames that are already set are left untouched.
 *
 * Fields handled: perc_model_filename, vocabs_filename,
 * features_model_filename, fplm_filename, l_rules_filename.
 *
 * If ctx->maca_data_path is NULL or memory cannot be allocated, the
 * affected field stays NULL.
 *
 * When ctx->verbose is set, the resulting filenames are written to stderr;
 * unset ones are shown as "(null)".
 */
void decode_lemmatizer_set_linguistic_resources_filenames(context *ctx)
{
  if(!ctx->perc_model_filename)
    ctx->perc_model_filename = lemmatizer_default_path(ctx->maca_data_path, DEFAULT_MODEL_LEMMATIZER_FILENAME);

  if(!ctx->vocabs_filename)
    ctx->vocabs_filename = lemmatizer_default_path(ctx->maca_data_path, DEFAULT_VOCABS_LEMMATIZER_FILENAME);

  if(!ctx->features_model_filename)
    ctx->features_model_filename = lemmatizer_default_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME);

  if(!ctx->fplm_filename)
    ctx->fplm_filename = lemmatizer_default_path(ctx->maca_data_path, DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME);

  if(!ctx->l_rules_filename)
    ctx->l_rules_filename = lemmatizer_default_path(ctx->maca_data_path, DEFAULT_RULES_LEMMATIZER_FILENAME);

  if(ctx->verbose){
    /* mcd_filename and f2p_filename are never given a default here, so they
       may legitimately be NULL: route every value through the NULL-safe helper. */
    fprintf(stderr, "perc_model_filename = %s\n", lemmatizer_printable(ctx->perc_model_filename));
    fprintf(stderr, "vocabs_filename = %s\n", lemmatizer_printable(ctx->vocabs_filename));
    fprintf(stderr, "mcd_filename = %s\n", lemmatizer_printable(ctx->mcd_filename));
    fprintf(stderr, "perc_features_model_filename = %s\n", lemmatizer_printable(ctx->features_model_filename));
    fprintf(stderr, "rules filename = %s\n", lemmatizer_printable(ctx->l_rules_filename));
    fprintf(stderr, "exceptions filename = %s\n", lemmatizer_printable(ctx->fplm_filename));
    fprintf(stderr, "f2p_filename = %s\n", lemmatizer_printable(ctx->f2p_filename));
  }
}
/*
 * Print the usage message of this program.
 *
 * The message is assembled from the generic context_*_help_message()
 * helpers, interleaved with section headers written to stderr
 * ("INPUT", "IN TEST MODE", "OUTPUT", "IN TRAIN MODE").
 */
void maca_trans_lemmatizer_mcf2cff_help_message(context *ctx)
{
  /* options common to all modes */
  context_general_help_message(ctx);
  context_mode_help_message(ctx);
  context_sent_nb_help_message(ctx);
  context_mcd_help_message(ctx);

  fputs("INPUT\n", stderr);
  context_conll_help_message(ctx);

  fputs("IN TEST MODE\n", stderr);
  context_vocabs_help_message(ctx);

  fputs("OUTPUT\n", stderr);
  context_cff_help_message(ctx);

  fputs("IN TRAIN MODE\n", stderr);
  context_vocabs_help_message(ctx);
}
/*
 * Validate the command line options stored in ctx.
 *
 * The help message is printed and the program exits with status 1 when
 *   - help was requested, or
 *   - no input file was given, or
 *   - neither a cff nor a fann output filename was given.
 *
 * The check on mcd_filename is deliberately disabled.
 */
void maca_trans_lemmatizer_mcf2cff_check_options(context *ctx)
{
  const int no_input = !ctx->input_filename;
  const int no_output = !ctx->cff_filename && !ctx->fann_filename;
  /* const int no_mcd = !ctx->mcd_filename; */

  if(ctx->help || no_input || no_output){
    maca_trans_lemmatizer_mcf2cff_help_message(ctx);
    exit(1);
  }
}
/*
 * Read the corpus ctx->input_filename word by word and write one training
 * example per usable word to output_file.
 *
 * For every word:
 *   - its form, pos and lemma columns are extracted;
 *   - if (form, pos) is found in the exceptions table, the word is skipped;
 *   - otherwise the lemmatization rule turning form into lemma is computed
 *     and looked up in d_l_rules; when the rule is known (code != -1) the
 *     rule code followed by the feature vector of the current configuration
 *     is written to output_file. Words whose rule is unknown are skipped.
 *
 * output_file : stream receiving the examples (not closed here)
 * ctx         : context providing input filename, mcd structure, feature
 *               model, feature dictionary, mode and verbosity
 * d_l_rules   : dictionary mapping rule strings to integer codes
 * exceptions  : form/pos exception table consulted before computing a rule
 */
void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fplm_struct *exceptions)
{
  config *c;
  feat_vec *fv = feat_vec_new(feature_types_nb);
  FILE *conll_file = myfopen(ctx->input_filename, "r");
  char *l_rule;
  int l_rule_code;
  char lemma[200];
  char form[200];
  char pos[200];
  const int strict = 1;

  c = config_new(conll_file, ctx->mcd_struct, 5);

  while(!config_is_terminal(c)){
    word_sprint_col_n(lemma, word_buffer_b0(c->bf), mcd_get_lemma_col(ctx->mcd_struct));
    word_sprint_col_n(form, word_buffer_b0(c->bf), mcd_get_form_col(ctx->mcd_struct));
    word_sprint_col_n(pos, word_buffer_b0(c->bf), mcd_get_pos_col(ctx->mcd_struct));

    /* words listed in the exceptions table produce no training example */
    if(!fplm_lookup_lemma(exceptions, form, pos, ctx->verbose)){
      l_rule = compute_l_rule(lemma, form, strict);
      l_rule_code = dico_string2int(d_l_rules, l_rule);

      /* -1: the rule is not in the dictionary, nothing is emitted */
      if(l_rule_code != -1){
        config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
        fprintf(output_file, "%d", l_rule_code);
        feat_vec_print(output_file, fv);
      }
      free(l_rule);
    }

    word_buffer_move_right(c->bf);
  }

  /* The configuration is not used past this point, so nothing reads from
     the input stream any more: close it instead of leaking the handle. */
  fclose(conll_file);

  /* NOTE(review): fv and c are not released here either; their release
     functions are not visible from this file - TODO confirm and free them. */
}
/*
 * Program entry point.
 *
 * Reads the options, loads the exceptions table, the lemmatization rules
 * dictionary and the feature model, prepares the vocabularies and the
 * perceptron feature dictionary according to the mode (train or test),
 * then generates the training file on the cff output file, or on stdout
 * when no cff filename was given. In train mode the vocabularies are
 * written to ctx->vocabs_filename afterwards.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  context *ctx;
  FILE *out;
  dico *rules;
  fplm_struct *fplm_exceptions;
  int writes_to_file;

  ctx = context_read_options(argc, argv);
  /* resolution of default resource filenames is currently disabled */
  // decode_lemmatizer_set_linguistic_resources_filenames(ctx);
  maca_trans_lemmatizer_mcf2cff_check_options(ctx);

  /* linguistic resources */
  fplm_exceptions = fplm_load_file(ctx->fplm_filename, ctx->verbose);
  rules = dico_read(ctx->l_rules_filename, 0.5);
  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);

  if(ctx->mode == TRAIN_MODE){
    /* train mode: vocabularies come from the corpus and the perceptron
       feature dictionary is created from scratch */
    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
  }
  else if(ctx->mode == TEST_MODE){
    /* test mode: vocabularies are read from file and the perceptron
       feature dictionary is taken from them */
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
  }

  /* the feature dictionary is stored alongside the other vocabularies */
  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);

  /* output goes to the cff file when one was given, to stdout otherwise */
  writes_to_file = ctx->cff_filename ? 1 : 0;
  if(writes_to_file)
    out = myfopen(ctx->cff_filename, "w");
  else
    out = stdout;

  generate_training_file(out, ctx, rules, fplm_exceptions);

  if(ctx->mode == TRAIN_MODE){
    /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */
    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
  }

  if(writes_to_file)
    fclose(out);

  context_free(ctx);
  return 0;
}