Skip to content
Snippets Groups Projects
Select Git revision
  • 44a039c79ef050493161ffb1d3ad425c7c7bfb71
  • master default protected
  • johannes
  • partial_parser
  • Aloui_Dary
  • ignore_punct
  • AC
  • classifier
  • fixhelp
  • libmacaon2
  • error_predictor
  • morpho
  • ssrnn
  • tfparsing
  • silvio
  • tagger_options
  • maca_trans_frame_parser
  • alexis
  • new_config
  • tagparse
  • maca_graph_parser
21 results

maca_tm_decoder.c

Blame
  • user avatar
    Alexis Nasr authored
    modified maca_tm_decoder so that it does not lemmatize when word does not have a POS (hack git status)
    44a039c7
    History
    maca_tm_decoder.c 10.31 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include<unistd.h>
    #include<getopt.h>
    #include<ctype.h>
    #include"context.h"
    #include"feat_fct.h"
    #include"feature_table.h"
    #include"dico.h"
    #include"form2pos.h"
    #include"simple_decoder_tagger.h"
    #include"config2feat_vec.h"
    #include"movements.h"
    #include"config2feat_vec.h"
    #include"dico.h"
    #include"mcd.h"
    #include"fplm.h"
    
    /*
     * Print the complete usage message of the decoder.
     *
     * The text is assembled by calling the context_*_help_message helpers in a
     * fixed order; the only text produced directly here is the "INPUT" section
     * header, which is written to stderr.
     *
     * ctx : program context, forwarded unchanged to every helper.
     */
    void maca_tm_decoder_help_message(context *ctx)
    {
      /* sections printed before the INPUT header */
      context_general_help_message(ctx);
      context_beam_help_message(ctx);
      context_conll_help_message(ctx);

      /* header introducing the remaining sections */
      fputs("INPUT\n", stderr);

      /* sections printed after the INPUT header */
      context_input_help_message(ctx);
      context_mcd_help_message(ctx);
      context_model_help_message(ctx);
      context_vocabs_help_message(ctx);
      context_features_model_help_message(ctx);
      context_f2p_filename_help_message(ctx);
    }
    
    /*
     * Validate the command line options.
     *
     * The only condition checked is the help flag: when ctx->help is set, the
     * usage message is printed and the process terminates with status 1.
     * No resource filename is mandatory at this point.
     *
     * ctx : program context holding the parsed options.
     */
    void maca_tm_decoder_check_options(context *ctx){
      /* nothing to report: let the caller carry on */
      if(!ctx->help)
        return;

      maca_tm_decoder_help_message(ctx);
      exit(1);
    }
    
    /*
     * Return a newly allocated string holding data_path immediately followed by
     * basename (no separator is inserted).
     *
     * The buffer is sized from the two operands, so the result is never
     * truncated whatever the length of data_path. The program exits with
     * status 1 if the allocation fails. The caller owns the returned string.
     *
     * NOTE(review): data_path is assumed to be non-NULL, as it already had to be
     * for the previous strcpy-based code -- confirm against context_read_options.
     */
    static char *maca_tm_decoder_default_resource_path(const char *data_path, const char *basename)
    {
      size_t size = strlen(data_path) + strlen(basename) + 1;
      char *path = malloc(size);

      if(path == NULL){
        fprintf(stderr, "maca_tm_decoder: out of memory while building a resource filename\n");
        exit(1);
      }
      snprintf(path, size, "%s%s", data_path, basename);
      return path;
    }

    /*
     * Fill in the resource filenames that were not given on the command line.
     *
     * For each of f2p_filename, tm_filename and fplm_filename that is still
     * NULL, the default file name is appended to ctx->maca_data_path and stored
     * in the context. When the default f2p file is selected it is also loaded
     * into ctx->f2p.
     *
     * NOTE(review): ctx->f2p is only loaded here when the default filename is
     * used; presumably a user supplied f2p file is loaded elsewhere -- confirm.
     *
     * ctx : program context; its filename fields are updated in place.
     */
    void maca_tm_decoder_set_linguistic_resources_filenames(context *ctx)
    {
      if(!ctx->f2p_filename){
        ctx->f2p_filename = maca_tm_decoder_default_resource_path(ctx->maca_data_path, DEFAULT_F2P_FILENAME);
        ctx->f2p = form2pos_read(ctx->f2p_filename);
      }

      if(!ctx->tm_filename){
        ctx->tm_filename = maca_tm_decoder_default_resource_path(ctx->maca_data_path, DEFAULT_TRANS_MACHINE_TAGPARSER_FILENAME);
      }

      if(!ctx->fplm_filename){
        ctx->fplm_filename = maca_tm_decoder_default_resource_path(ctx->maca_data_path, DEFAULT_FPLM_FILENAME);
      }

      if(ctx->verbose){
        /* mcd_filename and input_filename have no default here and may be NULL
           (a NULL input_filename means stdin); never hand NULL to %s */
        fprintf(stderr, "tm_filename = %s\n", ctx->tm_filename);
        fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
        fprintf(stderr, "mcf_filename = %s\n", ctx->input_filename ? ctx->input_filename : "(null)");
        fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
        fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename);
      }

    }
    
    
    /*
     * Return a newly allocated string holding data_path immediately followed by
     * basename (no separator is inserted).
     *
     * The buffer is sized from the two operands, so long data paths cannot
     * overflow it. The program exits with status 1 if the allocation fails.
     * The caller owns the returned string.
     *
     * NOTE(review): data_path is assumed to be non-NULL, as it already had to be
     * for the previous strcpy-based code -- confirm against context_read_options.
     */
    static char *maca_tm_decoder_old_default_path(const char *data_path, const char *basename)
    {
      size_t size = strlen(data_path) + strlen(basename) + 1;
      char *path = malloc(size);

      if(path == NULL){
        fprintf(stderr, "maca_tm_decoder: out of memory while building a resource filename\n");
        exit(1);
      }
      snprintf(path, size, "%s%s", data_path, basename);
      return path;
    }

    /*
     * Former version of the resource filename setup, kept alongside the current
     * one.
     *
     * For each of perc_model_filename, vocabs_filename, features_model_filename
     * and f2p_filename that is still NULL, the default file name is appended to
     * ctx->maca_data_path and stored in the context. When the default f2p file
     * is selected it is also loaded into ctx->f2p. mcd_filename gets no default.
     *
     * ctx : program context; its filename fields are updated in place.
     */
    void maca_tm_decoder_set_linguistic_resources_filenames_old(context *ctx)
    {
      if(!ctx->perc_model_filename){
        ctx->perc_model_filename = maca_tm_decoder_old_default_path(ctx->maca_data_path, DEFAULT_MODEL_TAGGER_FILENAME);
      }

      if(!ctx->vocabs_filename){
        ctx->vocabs_filename = maca_tm_decoder_old_default_path(ctx->maca_data_path, DEFAULT_VOCABS_TAGGER_FILENAME);
      }

      if(!ctx->features_model_filename){
        ctx->features_model_filename = maca_tm_decoder_old_default_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_TAGGER_FILENAME);
      }

      if(!ctx->f2p_filename){
        ctx->f2p_filename = maca_tm_decoder_old_default_path(ctx->maca_data_path, DEFAULT_F2P_FILENAME);
        ctx->f2p = form2pos_read(ctx->f2p_filename);
      }

      if(ctx->verbose){
        fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
        fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
        /* mcd_filename has no default in this function and may still be NULL */
        fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
        fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
        fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
      }
    }
    
    
    #if 1
    /*
     * Attach a form2pos signature to the most recent words of the buffer.
     *
     * The buffer is walked from its last element towards the first and the walk
     * stops at the first word whose signature is already set (different
     * from -1). For every word visited, the signature of the form is looked up
     * as is; when that lookup yields -1 it is retried on a lowercased copy of
     * the form.
     *
     * bf  : word buffer whose words are updated in place.
     * f2p : form-to-pos resource used for the lookups.
     */
    void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
    {
      int i;
      word *w;
      char *lower_form;

      for(i = word_buffer_get_nbelem(bf) - 1; i >= 0; i--){
        w = word_buffer_get_word_n(bf, i);
        if(word_get_signature(w) != -1) break;
        w->signature = form2pos_get_signature(f2p, w->form);
        if(w->signature == -1){
          /* the copy is sized on the form itself: forms come from the input
             text and must not be squeezed into a fixed size stack buffer */
          lower_form = strdup(w->form);
          if(lower_form == NULL) continue; /* out of memory: leave the signature unset */
          to_lower_string(lower_form);
          w->signature = form2pos_get_signature(f2p, lower_form);
          free(lower_form);
        }
      }
    }
    #endif
    
    /*
     * Disabled variant of add_signature_to_words_in_word_buffer (excluded from
     * compilation by the #if 0 below).
     * It takes an extra dico_pos argument: besides setting the signature of each
     * word, it sets the part of speech of the word when
     * form2pos_word_is_non_ambiguous() succeeds for its form, using the code
     * that dico_pos gives to the returned pos string. Unlike the variant that is
     * compiled, it does not retry the lookup on a lowercased form.
     */
    #if 0
    void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p, dico *dico_pos)
    {
      int i;
      word *w;
      int signature;
      char *pos;
      for(i = word_buffer_get_nbelem(bf) - 1; i >=0  ; i--){
        w = word_buffer_get_word_n(bf, i);
        if(word_get_signature(w) != -1) break;
        signature = form2pos_get_signature(f2p, w->form);
        w->signature = signature;
        if(form2pos_word_is_non_ambiguous(f2p, w->form, &pos)){
          /* printf("%s non ambigu code = %d \n", pos, dico_string2int(dico_pos, pos)); */
          word_set_pos(w, dico_string2int(dico_pos, pos));
          
        }
      }
    }
    #endif
    
    
    /*
     * Print one tab terminated column: the string that dictionary d associates
     * with code, or "_" when code is -1 or when the dictionary has no string
     * for it.
     */
    static void print_tagparser_column(dico *d, int code)
    {
      char *str = NULL;

      if(code != -1)
        str = dico_int2string(d, code);
      printf("%s\t", (str != NULL)? str : "_");
    }

    /*
     * Print every word of the buffer of configuration c on stdout, one word per
     * line, as tab separated columns:
     *   input, pos, morphological features, lemma, governor, label, and a final
     *   1/0 flag telling whether the sentence segmentation mark of the word is 1.
     * Columns whose code is unset are printed as "_".
     *
     * c           : configuration whose word buffer is printed.
     * dico_labels : dictionary used to decode the label column.
     * dico_pos    : dictionary used to decode the pos column.
     * dico_forms  : dictionary used to decode the lemma column.
     * dico_morpho : dictionary used to decode the morphological features column.
     */
    void print_word_buffer_tagparser(config *c, dico *dico_labels, dico *dico_pos, dico *dico_forms, dico *dico_morpho)
    {
      word_buffer *buffer = config_get_buffer(c);
      int n;

      for(n = 0; n < buffer->nbelem; n++){
        word *w = word_buffer_get_word_n(buffer, n);

        printf("%s\t", word_get_input(w));
        print_tagparser_column(dico_pos, word_get_pos(w));
        print_tagparser_column(dico_morpho, word_get_feats(w));
        print_tagparser_column(dico_forms, word_get_lemma(w));
        printf("%d\t", word_get_gov(w));
        print_tagparser_column(dico_labels, word_get_label(w));

        /* last column, followed by the end of line */
        puts((word_get_sent_seg(w) == 1)? "1" : "0");
      }
    }
    
    
    /*
     * Decode the input with the transition machine stored in ctx->machine and
     * print the resulting word buffer on stdout.
     *
     * The input is read from ctx->input_filename, or from stdin when that field
     * is NULL. Until the configuration is terminal, each iteration:
     *   - adds signatures to the newly read words when ctx->f2p is available;
     *   - takes the forced transitions of the current state, if any;
     *   - asks the classifier of the current state for its best movement and
     *     applies it;
     *   - when the machine is in the tagger state after the movement, looks up
     *     the lemma of the previous word (form + pos) and stores its code, or -1
     *     when the word has no pos string;
     *   - when the movement could not be applied, tries a SHIFT instead, and
     *     when the SHIFT cannot be applied either, empties the stack with ROOT
     *     movements.
     *
     * ctx : program context (machine, mcd structure, resources, flags).
     */
    void maca_tm_decoder(context *ctx)
    {
      FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
      config *c = config_new(f, ctx->mcd_struct, 5);
      int mvt_code;
      int root_label;
      classifier *classif;
      dico *dico_pos = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"POS");
      dico *dico_forms = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"FORM");
      dico *dico_labels = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"LABEL");
      dico *dico_morpho = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"FEATS");
      int result;
      fplm_struct *fplm = fplm_load(ctx->fplm_filename, ctx->debug_mode);
      char form_str[200];
      char *pos_str;
      char *lemma_str = NULL;
      int lemma_code;
      tm_state *current_state = NULL;
      int parser_state_nb = tm_get_parser_state(ctx->machine);
      int tagger_state_nb = tm_get_tagger_state(ctx->machine);
      mvt_tagset *std_mvt_tagset = mvt_tagset_std();

      /* fall back on label 0 when the root label is unknown to the dictionary */
      root_label = dico_string2int(dico_labels, ctx->root_label);
      if(root_label == -1) root_label = 0;

      while(!config_is_terminal(c)){
        if(ctx->f2p)
          add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);

        while(tm_state_num_has_forced_transition(ctx->machine, c->current_state_nb)){
          tm_take_forced_transition(c, ctx->machine, std_mvt_tagset);
        }

        /* horrible trick : when at the end of buffer, skip tagger mode stay in parser mode */
        if((c->current_state_nb == tagger_state_nb) && word_buffer_end(config_get_buffer(c)))
          c->current_state_nb = parser_state_nb;

        current_state = ctx->machine->state_array[c->current_state_nb];
        classif = current_state->classif;
        mvt_code = classifier_argmax(classif, c);
        if(ctx->debug_mode){
          fprintf(stdout, "***********************************\n");
          fprintf(stdout, "%s   ", ctx->machine->state_array[c->current_state_nb]->name);
          config_print(stdout, c);
          classifier_print_vcode_array(stdout, classif, c, 4);
        }
        result = movement_apply(c, mvt_code, classifier_get_output_tagset(classif), root_label, ctx->machine);

        /* in tagger state , look for lemma */
        if(c->current_state_nb == tagger_state_nb){
          /* it is bm1 rather than b0 because the machine changed state after applying the pos movement */
          word_sprint_col_n(form_str, word_buffer_bm1(config_get_buffer(c)), mcd_get_form_col(ctx->mcd_struct));
          pos_str = dico_int2string(dico_pos, word_get_pos(word_buffer_bm1(config_get_buffer(c))));
          /* no lemmatization for a word without pos: its lemma code stays -1 */
          lemma_code = -1;
          if(pos_str){
            lemma_str = fplm_lookup_lemma(fplm, form_str, pos_str, ctx->verbose);
            lemma_code = dico_string2int(dico_forms, lemma_str);
          }
          word_set_lemma(word_buffer_bm1(config_get_buffer(c)), lemma_code);
        }

        if(result == 0){
          if(ctx->debug_mode) fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n");
          mvt_code = mvt_tagset_get_code(classifier_get_output_tagset(classif), MVT_SHIFT, 0);
          /* result must be refreshed with the outcome of the SHIFT: testing the
             stale value of the failed movement would empty the stack even when
             the SHIFT succeeded */
          result = movement_apply(c, mvt_code, classifier_get_output_tagset(classif), -1, ctx->machine);
          if(result == 0){ /* SHIFT failed no more words to read, let's get out of here ! */
            if(ctx->debug_mode) fprintf(stdout, "WARNING : cannot exectue a SHIFT emptying stack !\n");
            while(!stack_is_empty(config_get_stack(c))){
              mvt_code = mvt_tagset_get_code(classifier_get_output_tagset(classif), MVT_ROOT, 0);
              movement_apply(c, mvt_code, classifier_get_output_tagset(classif), root_label, ctx->machine);
            }
          }
        }
      }
      print_word_buffer_tagparser(c, dico_labels, dico_pos, dico_forms, dico_morpho);
      config_free(c);
      /* stdin is not ours to close */
      if(ctx->input_filename) fclose(f);
    }
    
    
    
    /*
     * Program entry point.
     *
     * Reads the options, prints the help message and exits when requested,
     * completes the resource filenames with their defaults, loads the
     * transition machine, links the mcd structure to the dictionaries of the
     * machine, runs the decoder and releases the context.
     *
     * Returns 0 on success, 1 when the transition machine could not be loaded.
     */
    int main(int argc, char *argv[])
    {
      context *ctx = context_read_options(argc, argv);
      maca_tm_decoder_check_options(ctx);
      maca_tm_decoder_set_linguistic_resources_filenames(ctx);

      ctx->machine = tm_load(ctx->tm_filename, ctx->maca_data_path, ctx->verbose);
      /* NOTE(review): how tm_load reports a failure is not visible from this
         file; this check is defensive, the machine is dereferenced just below */
      if(ctx->machine == NULL){
        fprintf(stderr, "maca_tm_decoder: cannot load transition machine %s\n", ctx->tm_filename);
        return 1;
      }
      mcd_link_to_dico(ctx->mcd_struct, ctx->machine->d_tapes, ctx->verbose);

      maca_tm_decoder(ctx);
      context_free(ctx);
      return 0;
    }