Skip to content
Snippets Groups Projects
Select Git revision
  • ac6fa7a2170892dc969d256fa4b9636f2ad2cbe4
  • main default protected
2 results

README.md

Blame
  • maca_trans_lemmatizer_mcf2cff.c 6.74 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include<unistd.h>
    #include<getopt.h>
    #include"movement_tagger.h"
    #include"oracle_tagger.h"
    #include"feat_fct.h"
    #include"context.h"
    #include"feat_vec.h"
    #include"dico_vec.h"
    #include"word_emb.h"
    #include"config2feat_vec.h"
    #include"fplm.h"
    #include"l_rule.h"
    
    /*int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules, fplm_struct *exceptions)
    {
      char lemma[200];
      char form[200];
      char *l_rule;
      int l_rule_code;
    
      word_sprint_col_n(lemma, word_buffer_b0(c->bf), mcd_get_lemma_col(ctx->mcd_struct));
      word_sprint_col_n(form, word_buffer_b0(c->bf), mcd_get_form_col(ctx->mcd_struct));
    
      l_rule = compute_l_rule(lemma, form, 1);
      l_rule_code = dico_string2int(d_l_rules, l_rule);
      free(l_rule);
      return l_rule_code;   
      }*/
    
    /*
     * Lemmatizer movement: calls word_buffer_move_right() on the word buffer
     * of configuration `c`.
     *
     * c     : configuration whose buffer (c->bf) is advanced.
     * feats : accepted for interface compatibility; not used here.
     *
     * Always returns 1.
     */
    int movement_lemmatizer(config *c, int feats)
    {
      (void)feats;

      word_buffer_move_right(c->bf);
      return 1;
    }
    /*
     * Return a newly allocated string made of `dir` immediately followed by
     * `name` (no separator is inserted, exactly like the strcpy/strcat pair
     * this helper replaces). The buffer is sized from the actual lengths, so
     * an arbitrarily long data path cannot overflow it.
     *
     * Prints an error and exits with status 1 if the allocation fails.
     * The caller owns the returned string.
     */
    static char *lemmatizer_default_filename(const char *dir, const char *name)
    {
      size_t dir_len = strlen(dir);
      size_t name_len = strlen(name);
      char *path = (char *)malloc(dir_len + name_len + 1);

      if(!path){
        fprintf(stderr, "ERROR: out of memory while building a default filename\n");
        exit(1);
      }
      memcpy(path, dir, dir_len);
      memcpy(path + dir_len, name, name_len + 1); /* + 1 copies the terminator */
      return path;
    }

    /*
     * Passing a NULL pointer to a "%s" conversion is undefined behaviour;
     * substitute a printable placeholder for unset filenames.
     */
    static const char *lemmatizer_printable(const char *s)
    {
      return s ? s : "(null)";
    }

    /*
     * Fill in every lemmatizer resource filename of `ctx` that is still unset
     * (model, vocabularies, features model, exceptions, rules) with the
     * corresponding DEFAULT_*_LEMMATIZER_FILENAME appended to
     * ctx->maca_data_path. Filenames that are already set are left untouched.
     *
     * When ctx->verbose is set, the resulting filenames are dumped on stderr.
     *
     * NOTE(review): assumes ctx->maca_data_path is non-NULL whenever a default
     * has to be built - confirm against the code that fills the context.
     */
    void decode_lemmatizer_set_linguistic_resources_filenames(context *ctx)
    {
      if(!ctx->perc_model_filename)
        ctx->perc_model_filename = lemmatizer_default_filename(ctx->maca_data_path, DEFAULT_MODEL_LEMMATIZER_FILENAME);

      if(!ctx->vocabs_filename)
        ctx->vocabs_filename = lemmatizer_default_filename(ctx->maca_data_path, DEFAULT_VOCABS_LEMMATIZER_FILENAME);

      if(!ctx->features_model_filename)
        ctx->features_model_filename = lemmatizer_default_filename(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME);

      if(!ctx->fplm_filename)
        ctx->fplm_filename = lemmatizer_default_filename(ctx->maca_data_path, DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME);

      if(!ctx->l_rules_filename)
        ctx->l_rules_filename = lemmatizer_default_filename(ctx->maca_data_path, DEFAULT_RULES_LEMMATIZER_FILENAME);

      if(ctx->verbose){
        fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
        fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
        /* mcd_filename is not defaulted by this function, so it may be unset */
        fprintf(stderr, "mcd_filename = %s\n", lemmatizer_printable(ctx->mcd_filename));
        fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);

        fprintf(stderr, "rules filename = %s\n", ctx->l_rules_filename);
        fprintf(stderr, "exceptions filename = %s\n", ctx->fplm_filename);

        /* f2p_filename is not defaulted by this function either */
        fprintf(stderr, "f2p_filename = %s\n", lemmatizer_printable(ctx->f2p_filename));
      }
    }
    
    /*
     * Print the usage text of the program: the generic option groups provided
     * by the context module, followed by the INPUT and OUTPUT sections.
     * Section titles are written to stderr.
     */
    void maca_trans_lemmatizer_mcf2cff_help_message(context *ctx)
    {
      /* options common to the tools sharing the context module */
      context_general_help_message(ctx);
      context_mode_help_message(ctx);
      context_sent_nb_help_message(ctx);
      context_mcd_help_message(ctx);

      /* input section */
      fputs("INPUT\n", stderr);
      context_conll_help_message(ctx);
      fputs("IN TEST MODE\n", stderr);
      context_vocabs_help_message(ctx);

      /* output section */
      fputs("OUTPUT\n", stderr);
      context_cff_help_message(ctx);
      fputs("IN TRAIN MODE\n", stderr);
      context_vocabs_help_message(ctx);
    }
    
    /*
     * Validate the command line options stored in `ctx`.
     *
     * The options are accepted when an input filename is given, help was not
     * requested, and at least one of cff_filename / fann_filename is set.
     * Otherwise the help message is printed and the program exits with
     * status 1. The mcd filename is deliberately not required.
     */
    void maca_trans_lemmatizer_mcf2cff_check_options(context *ctx)
    {
      int has_output = ctx->cff_filename || ctx->fann_filename;

      if(ctx->input_filename && !ctx->help && has_output)
        return;

      maca_trans_lemmatizer_mcf2cff_help_message(ctx);
      exit(1);
    }
    
    /*
     * Read the corpus named by ctx->input_filename word by word and write one
     * training example per retained word to `output_file`.
     *
     * For each word the lemma, form and pos columns are extracted. Then:
     *   - if fplm_lookup_lemma() finds the (form, pos) pair in `exceptions`,
     *     the word is skipped;
     *   - otherwise the lemmatization rule is computed from (lemma, form) and
     *     looked up in `d_l_rules`; when the lookup does not return -1, the
     *     rule code followed by the feature vector of the current
     *     configuration is written to `output_file`.
     *
     * output_file : stream receiving the examples (not closed here).
     * ctx         : program context (input filename, mcd, features model, ...).
     * d_l_rules   : dictionary mapping rule strings to integer codes.
     * exceptions  : form/pos exception lexicon.
     *
     * The input stream opened here is closed before returning.
     *
     * NOTE(review): `fv` and `c` are not released; the matching release
     * functions are not visible from this file - confirm and add them.
     * NOTE(review): word_sprint_col_n() receives no buffer size, so this
     * presumably relies on every column fitting in 200 bytes - verify.
     */
    void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fplm_struct *exceptions)
    {
      feat_vec *fv = feat_vec_new(feature_types_nb);
      FILE *conll_file = myfopen(ctx->input_filename, "r");
      config *c = config_new(conll_file, ctx->mcd_struct, 5);
      char lemma[200];
      char form[200];
      char pos[200];
      int strict = 1;

      while(!config_is_terminal(c)){
        char *l_rule;
        int l_rule_code;

        word_sprint_col_n(lemma, word_buffer_b0(c->bf), mcd_get_lemma_col(ctx->mcd_struct));
        word_sprint_col_n(form, word_buffer_b0(c->bf), mcd_get_form_col(ctx->mcd_struct));
        word_sprint_col_n(pos, word_buffer_b0(c->bf), mcd_get_pos_col(ctx->mcd_struct));

        /* words found in the exception lexicon produce no training example */
        if(fplm_lookup_lemma(exceptions, form, pos, ctx->verbose)){
          word_buffer_move_right(c->bf);
          continue;
        }

        l_rule = compute_l_rule(lemma, form, strict);
        l_rule_code = dico_string2int(d_l_rules, l_rule);

        /* only rules present in the dictionary are emitted */
        if(l_rule_code != -1){
          config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
          fprintf(output_file, "%d", l_rule_code);
          feat_vec_print(output_file, fv);
        }

        word_buffer_move_right(c->bf);
        free(l_rule);
      }

      /* the corpus stream was opened in this function and is not used after the loop */
      fclose(conll_file);
    }
    
    int main(int argc, char *argv[])
    {
      context *ctx;
      FILE *output_file;
      dico *d_l_rules;
      fplm_struct *exceptions;
      ctx = context_read_options(argc, argv);
    
      //  decode_lemmatizer_set_linguistic_resources_filenames(ctx);
      maca_trans_lemmatizer_mcf2cff_check_options(ctx);
      exceptions = fplm_load_file(ctx->fplm_filename, ctx->verbose);
      d_l_rules = dico_read(ctx->l_rules_filename, 0.5);
    
      ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
    
      if(ctx->mode == TRAIN_MODE){
        mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
        ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
      }
      else if(ctx->mode == TEST_MODE){
        ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
        mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
      }
        
      /* in train mode create feature dictionnary for perceptron */
      if(ctx->mode == TRAIN_MODE)
        ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
      
      /* in test mode read feature dictionnary for perceptron */
      if(ctx->mode == TEST_MODE)
        ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
      
      /* add the feature dictionnary to the dico vector */
      dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);
      
      /* open output file */
      if(ctx->cff_filename)
        output_file = myfopen(ctx->cff_filename, "w");
      else
        output_file = stdout;
      
      generate_training_file(output_file, ctx, d_l_rules, exceptions);
        
      if(ctx->mode == TRAIN_MODE){
        /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */
        dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
        
      }
      
      if(ctx->cff_filename)
        fclose(output_file);
      context_free(ctx);
      return 0;
    }