Skip to content
Snippets Groups Projects
Select Git revision
  • 9162d34b4af7dd34cbeba01372d7492dfd05886d
  • master default protected
  • johannes
  • partial_parser
  • Aloui_Dary
  • ignore_punct
  • AC
  • classifier
  • fixhelp
  • libmacaon2
  • error_predictor
  • morpho
  • ssrnn
  • tfparsing
  • silvio
  • tagger_options
  • maca_trans_frame_parser
  • alexis
  • new_config
  • tagparse
  • maca_graph_parser
21 results

maca_trans_lemmatizer.c

Blame
  • maca_trans_lemmatizer.c 10.18 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include<unistd.h>
    #include<getopt.h>
    #include<ctype.h>
    
    #include"context.h"
    #include"dico.h"
    #include"config.h"
    #include"fplm.h"
    #include"l_rule.h"
    #include"config2feat_vec.h"
    
    /*
     * Write the lemmatizer usage text.
     *
     * Delegates to the shared context help printers (general, beam and conll
     * sections), then writes an "INPUT" heading on stderr followed by the
     * input and mcd sections.
     *
     * ctx: program context handed unchanged to every help printer.
     */
    void maca_lemmatizer_help_message(context *ctx)
    {
      /* option groups shared with the other tools */
      context_general_help_message(ctx);
      context_beam_help_message(ctx);
      context_conll_help_message(ctx);

      /* input related options, introduced by their own heading */
      fputs("INPUT\n", stderr);
      context_input_help_message(ctx);
      context_mcd_help_message(ctx);
    }
    
    /*
     * Return a newly allocated string holding `dir` immediately followed by
     * `name` (no separator is inserted, exactly like the strcpy/strcat pair
     * this helper replaces).
     *
     * The buffer is sized from the two operands, so an arbitrarily long data
     * path can no longer overflow a fixed-size stack buffer.  A NULL `dir` is
     * treated as the empty string.  On allocation failure an error is written
     * on stderr and the program exits with status 1.
     *
     * The caller owns the returned string.
     */
    static char *maca_lemmatizer_default_path(const char *dir, const char *name)
    {
      size_t dir_len = dir ? strlen(dir) : 0;
      size_t name_len = strlen(name);
      /* explicit cast kept on purpose: this file already casts string literals
         to (char *), which suggests it may also be built by a C++ compiler */
      char *path = (char *)malloc(dir_len + name_len + 1);

      if(!path){
        fprintf(stderr, "maca_trans_lemmatizer: out of memory\n");
        exit(1);
      }
      if(dir_len)
        memcpy(path, dir, dir_len);
      memcpy(path + dir_len, name, name_len + 1); /* + 1 copies the terminator */
      return path;
    }

    /*
     * Fill in every resource filename the user did not provide.
     *
     * For each of perc_model_filename, vocabs_filename,
     * features_model_filename, fplm_filename and l_rules_filename that is
     * still NULL, the field is set to ctx->maca_data_path concatenated with
     * the matching DEFAULT_*_LEMMATIZER_FILENAME constant.  Fields that are
     * already set are left untouched.
     *
     * When ctx->verbose is set, the resulting filenames are echoed on stderr.
     */
    void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx)
    {
      if(!ctx->perc_model_filename)
        ctx->perc_model_filename =
          maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_MODEL_LEMMATIZER_FILENAME);

      if(!ctx->vocabs_filename)
        ctx->vocabs_filename =
          maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_VOCABS_LEMMATIZER_FILENAME);

      if(!ctx->features_model_filename)
        ctx->features_model_filename =
          maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME);

      if(!ctx->fplm_filename)
        ctx->fplm_filename =
          maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME);

      if(!ctx->l_rules_filename)
        ctx->l_rules_filename =
          maca_lemmatizer_default_path(ctx->maca_data_path, DEFAULT_RULES_LEMMATIZER_FILENAME);

      if(ctx->verbose){
        fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
        fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
        /* NOTE(review): mcd_filename gets no default in this function;
           confirm it cannot be NULL when verbose output is requested */
        fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename);
        fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
        fprintf(stderr, "rules filename = %s\n", ctx->l_rules_filename);
        fprintf(stderr, "exceptions filename = %s\n", ctx->fplm_filename);
      }
    }
    
    /*
     * Validate the command line options.
     *
     * The only check performed is the help flag: when ctx->help is set, the
     * usage text is printed and the process exits with status 1.  Otherwise
     * the function returns without doing anything.
     */
    void maca_lemmatizer_check_options(context *ctx){
      if(!ctx->help)
        return;

      maca_lemmatizer_help_message(ctx);
      exit(1);
    }
    
    /*
     * Count the tab separated fields of `line`.
     *
     * Runs of consecutive tabs count as a single separator and empty fields
     * are not counted, which is the counting strtok(line, "\t") produces.
     * The line is only read: no copy is made and no global tokenizer state is
     * touched.
     */
    static int print_word_count_fields(const char *line)
    {
      int fields_nb = 0;
      int inside_field = 0;

      for(; *line; line++){
        if(*line == '\t')
          inside_field = 0;
        else if(!inside_field){
          inside_field = 1;
          fields_nb++;
        }
      }
      return fields_nb;
    }

    /*
     * Print word `w` on stdout with `lemma` as its lemma, followed by a newline.
     *
     * - If the mcd declares no lemma column (mcd_get_lemma_col() == -1), the
     *   raw input line of the word is printed, then a tab, then the lemma.
     * - Otherwise the columns of the word are printed in order, separated by
     *   tabs, with the lemma column replaced by `lemma`.  If the input has no
     *   column at the lemma position, a tab and the lemma are appended after
     *   the last column.
     *
     * w->input must be a valid string (it is not checked here).
     */
    void print_word(word *w, mcd *mcd_struct, char *lemma)
    {
      /* looked up once instead of on every column */
      int lemma_col = mcd_get_lemma_col(mcd_struct);
      int fields_nb;
      int col_nb;

      if(lemma_col == -1){
        printf("%s\t%s\n", w->input, lemma);
        return;
      }

      fields_nb = print_word_count_fields(w->input);
      for(col_nb = 0; col_nb < fields_nb; col_nb++){
        if(col_nb != 0) printf("\t");
        if(col_nb == lemma_col)
          printf("%s", lemma);
        else
          word_print_col_n(stdout, w, col_nb);
      }

      /* the input line is too short to contain a lemma column: append it */
      if(fields_nb <= lemma_col)
        printf("\t%s", lemma);
      printf("\n");
    }
    
    
    /*
     * Lemmatizer entry point.
     *
     * Reads words from ctx->input_filename (or stdin when no input file is
     * given) and prints each of them with a lemma, chosen as follows:
     *   1. a lemma already present in the input (non empty and different
     *      from "_") is kept as is;
     *   2. otherwise the (form, pos) pair is looked up in the exceptions file;
     *   3. otherwise the first applicable rule among the first N_BEST_RULES
     *      entries of the array returned by feature_table_get_vcode_array()
     *      is applied to the form;
     *   4. if none of these rules is applicable, the form itself is printed
     *      as lemma.
     *
     * Returns 0 once the whole input has been processed.
     */
    int main(int argc, char *argv[])
    {
      /* number of candidate rules examined for every word */
      enum { N_BEST_RULES = 10 };

      context *ctx = context_read_options(argc, argv);
      feat_vec *fv = feat_vec_new(10);
      word *b0;
      char lemma[200];
      char form[200];
      char pos[200];
      config *c;

      maca_lemmatizer_check_options(ctx);
      maca_lemmatizer_set_linguistic_resources_filenames(ctx);

      dico *d_l_rules = dico_read(ctx->l_rules_filename, 0.5);
      fplm_struct *exceptions = fplm_load_file(ctx->fplm_filename, ctx->debug_mode);
      FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
      ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
      ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
      mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);

      ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
      feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
      c = config_new(f, ctx->mcd_struct, 5);

      while(!config_is_terminal(c)){
        b0 = word_buffer_b0(c->bf);

        /* start from empty strings so that the tests below never read
           uninitialised memory if a column is absent and nothing is written */
        lemma[0] = form[0] = pos[0] = '\0';
        word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
        word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
        word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));

        if(strlen(lemma) && strcmp(lemma, "_")){
          /* the lemma is already specified in the input */
          print_word(b0, ctx->mcd_struct, lemma);
        }
        else{
          /* if lemma is not specified in input it is looked up in exceptions file */
          char *lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose);

          if(lemma_from_fplm){
            print_word(b0, ctx->mcd_struct, lemma_from_fplm);
          }
          else{
            /* if lemma is not found in exception file, predict an l_rule */
            config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);

            /* NOTE(review): the array is presumably ranked by decreasing score
               and is assumed to hold at least N_BEST_RULES entries - confirm
               against feature_table_get_vcode_array() */
            vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
            int rank;

            if(ctx->debug_mode){
              /* list the candidates, a '*' marks the rules applicable to the form */
              for(rank = 0; rank < N_BEST_RULES; rank++){
                char *candidate = dico_int2string(d_l_rules, vcode_array[rank].class_code);
                fprintf(stderr, "%d", rank);
                if(l_rule_is_applicable(form, candidate)) fprintf(stderr, "*");
                fprintf(stderr, "\t%s\t%.4f\n", candidate, vcode_array[rank].score);
              }
            }

            /* apply the first applicable candidate */
            for(rank = 0; rank < N_BEST_RULES; rank++){
              char *l_rule = dico_int2string(d_l_rules, vcode_array[rank].class_code);
              if(l_rule_is_applicable(form, l_rule)){
                char *transformed_lemma = apply_l_rule(form, l_rule);
                print_word(b0, ctx->mcd_struct, transformed_lemma);
                free(transformed_lemma);
                break;
              }
            }

            /* no rule applied: fall back on the form */
            if(rank == N_BEST_RULES)
              print_word(b0, ctx->mcd_struct, form);

            free(vcode_array);
          }
        }
        word_buffer_move_right(c->bf);
      }

      /* NOTE(review): fv and d_l_rules are not released here, as in the
         previous version; their release functions are not visible in this file */
      config_free(c);
      if (ctx->input_filename) fclose(f);
      context_free(ctx);
      fplm_free(exceptions);
      feature_table_free(ft);
      return 0;
    }
    
    /*
     * Disabled (#if 0) earlier version of main(), never compiled.
     *
     * It follows the same steps as the active main() (lemma from the input,
     * then lookup in the exceptions file) but, when a rule has to be
     * predicted, it applies the single rule returned by
     * feature_table_argmax() without testing l_rule_is_applicable() and
     * without falling back on the form.  In debug mode it also lists the
     * first 10 entries of the vcode array on stderr.
     */
    #if 0
    int main(int argc, char *argv[])
    {
      context *ctx = context_read_options(argc, argv);
      feat_vec *fv = feat_vec_new(10);
      word *b0;
      char lemma[200];
      char form[200];
      char pos[200];
      char *lemma_from_fplm;
      config *c;
      int l_rule_code;
      char *l_rule;
      float max;
    
      
      maca_lemmatizer_check_options(ctx);
      maca_lemmatizer_set_linguistic_resources_filenames(ctx);
    
      dico *d_l_rules = dico_read(ctx->l_rules_filename, 0.5);
      fplm_struct *exceptions = fplm_load_file(ctx->fplm_filename, ctx->debug_mode);
      FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
      ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
      ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
      mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
    
      ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
      feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
      c = config_new(f, ctx->mcd_struct, 5); 
      
      while(!config_is_terminal(c)){
        b0 = word_buffer_b0(c->bf);
        word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
        word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
        word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));
        //    fprintf(stderr, "form = %s pos = %s lemma = %s\n", b0->form, pos, lemma);
    
        // if lemma is not specified in input it is looked up in exceptions file
        if(strlen(lemma) && strcmp(lemma, "_"))
          print_word(b0, ctx->mcd_struct, lemma);
        else{
          lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose);
          if(lemma_from_fplm){
    	//	printf("lemma %s found in exceptions file\n", lemma_from_fplm);
    	print_word(b0, ctx->mcd_struct, lemma_from_fplm);
          }
        // if lemma is not found in exception file, predict an l_rule 
          else{
    	config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
    	//	feat_vec_print_string(fv, ctx->d_perceptron_features);
    	//	feat_vec_print(stdout, fv);
    
    	l_rule_code = feature_table_argmax(fv, ft, &max);
    	//fprintf(stderr, "lrule code %d predicted\n", l_rule_code);
    	l_rule = dico_int2string(d_l_rules, l_rule_code);
    	//	printf("lrule %s predicted\n", l_rule);
    	char *transformed_lemma = apply_l_rule(form, l_rule);
    
    	//	printf("transformed_lemma = %s\n", transformed_lemma);
    	print_word(b0, ctx->mcd_struct, transformed_lemma);
    	
    	free(transformed_lemma);
    	if(ctx->debug_mode){
    	  vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
    	  for(int i=0; i < 10; i++){
    	    fprintf(stderr, "%d\t", i);
    	    fprintf(stderr, "%s\t%.4f\n", dico_int2string(d_l_rules, vcode_array[i].class_code), vcode_array[i].score);
    	  }
    	  free(vcode_array);
    	}
          }
        }
        word_buffer_move_right(c->bf);
      }
      config_free(c); 
      if (ctx->input_filename) fclose(f);
      context_free(ctx);
      fplm_free(exceptions);
      feature_table_free(ft);
    
    
      return 0;
    }
    
    #endif
    
    
    
    
    
    
    
    
    
    
    
    
    
    /*
     * Disabled (#if 0) earliest version of main(), never compiled.
     *
     * It uses no prediction model: a lemma already present in the input is
     * kept, otherwise the (form, pos) pair is looked up in the exceptions
     * file, and when that lookup fails the form itself is printed as lemma.
     */
    #if 0
     int main(int argc, char *argv[])
    {
      context *ctx = context_read_options(argc, argv);
      word *b0;
      char lemma[200];
      char form[200];
      char pos[200];
      char *lemma_from_fplm;
      config *c;
      fplm_struct *fplm;
      FILE *f;
      
      maca_lemmatizer_check_options(ctx);
      maca_lemmatizer_set_linguistic_resources_filenames(ctx);
      fplm = fplm_load_file(ctx->fplm_filename, ctx->debug_mode);
      f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
      c = config_new(f, ctx->mcd_struct, 5); 
    
      while(!config_is_terminal(c)){
        b0 = word_buffer_b0(c->bf);
        word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
        word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
        word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));
    
        /* if lemma is not specified in input it is looked up */
        if(strlen(lemma) && strcmp(lemma, "_"))
          print_word(b0, ctx->mcd_struct, lemma);
        else{
          lemma_from_fplm = fplm_lookup_lemma(fplm, form, pos, ctx->verbose);
          if(lemma_from_fplm)
    	print_word(b0, ctx->mcd_struct, lemma_from_fplm);
          else
    	print_word(b0, ctx->mcd_struct, form);
        }
        word_buffer_move_right(c->bf);
      }
      config_free(c); 
      if (ctx->input_filename) fclose(f);
      context_free(ctx);
      fplm_free(fplm);
      return 0;
    }
    
    #endif