Skip to content
Snippets Groups Projects
Select Git revision
  • e2a9c33e0921d55f2bd6ebad98b2b31c95495643
  • main default protected
  • V1
3 results

CITATION.cff

Blame
  • maca_trans_parser_mcf2cff.c 6.40 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include<unistd.h>
    #include<getopt.h>
    #include"movement_parser.h"
    #include"oracle_parser.h"
    #include"feat_fct.h"
    #include"context.h"
    #include"feat_vec.h"
    #include"dico_vec.h"
    #include"word_emb.h"
    #include"config2feat_vec.h"
    
    /*
     * Emit the usage text of the program.
     *
     * The section headings ("INPUT", "OUTPUT", ...) are written to stderr;
     * the option descriptions themselves are produced by the
     * context_*_help_message helpers, each of which receives ctx.
     */
    void maca_trans_parser_conll2cff_help_message(context *ctx)
    {
      /* options that are not tied to the input or output section */
      context_general_help_message(ctx);
      context_mode_help_message(ctx);
      context_sent_nb_help_message(ctx);

      /* input section: the conll file, plus the vocabularies in test mode */
      fputs("INPUT\n", stderr);
      context_conll_help_message(ctx);
      fputs("IN TEST MODE\n", stderr);
      context_vocabs_help_message(ctx);

      /* output section: the cff file, plus the vocabularies in train mode */
      fputs("OUTPUT\n", stderr);
      context_cff_help_message(ctx);
      fputs("IN TRAIN MODE\n", stderr);
      context_vocabs_help_message(ctx);
    }
    
    /*
     * Check the options stored in ctx.
     *
     * The options are accepted when an input file name is present and help
     * was not requested.  Otherwise the usage message is printed and the
     * program terminates with exit status 1.
     *
     * The additional checks on mcd_filename and on cff_filename /
     * fann_filename are disabled (they were commented out in this function).
     */
    void maca_trans_parser_conll2cff_check_options(context *ctx)
    {
      /* nothing to report: return to the caller */
      if(ctx->input_filename && !ctx->help)
        return;

      maca_trans_parser_conll2cff_help_message(ctx);
      exit(1);
    }
    
    /*
     * Generate the training data in stream mode.
     *
     * The file named by ctx->input_filename is opened twice: one handle feeds
     * the parser configuration, the other is used to read the reference
     * sentences given to the oracle.  For every configuration visited, the
     * oracle movement code is written to output_file, followed by the feature
     * vector printed by feat_vec_print.
     *
     * Processing stops once ctx->sent_nb sentences have been completed or when
     * no further reference sentence can be read.
     *
     * output_file : stream receiving the generated data (not closed here)
     * ctx         : program context (features model, dictionaries, options)
     */
    void generate_training_file_stream(FILE *output_file, context *ctx)
    {
      config *c;
      int mvt_code;
      char mvt_type;
      int mvt_label;
      /* NOTE(review): fv is never released; no feat_vec destructor is visible
         in this file -- release it here if the feat_vec module provides one */
      feat_vec *fv = feat_vec_new(feature_types_nb);
      sentence *ref = NULL;
      int sentence_nb = 0;
      int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label);
      FILE *conll_file = myfopen(ctx->input_filename, "r");
      FILE *conll_file_ref = myfopen(ctx->input_filename, "r");

      c = config_initial(conll_file, ctx->mcd_struct, 5);

      /* the sentence limit is tested before reading, so that no reference
         sentence is read (and then lost) once the limit has been reached */
      while((sentence_nb < ctx->sent_nb) && (ref = sentence_read(conll_file_ref, ctx->mcd_struct))){
        while(1){
          config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);

          mvt_code = oracle_parser(c, ref);
          mvt_type = movement_type(mvt_code);
          mvt_label = movement_label(mvt_code);

          fprintf(output_file, "%d", mvt_code);
          feat_vec_print(output_file, fv);

          if(queue_is_empty(c->bf)){
            /* the reference is no longer needed: release it before leaving */
            sentence_free(ref);
            break;
          }

          if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */
            /* create the root arc */
            movement_right_arc(c, mvt_label, 0);

            /* shift dummy word in stack */
            movement_shift(c, 1, 0);

            /* empty depset */
            depset_free(c->ds);
            c->ds = depset_new();
            sentence_free(ref);
            sentence_nb++;

            c->current_index = queue_renumber_words(c->bf);
            break;
          }

          /* NOTE(review): a movement type other than the three below leaves
             the configuration untouched and the loop simply iterates again */
          if(mvt_type == MVT_LEFT)
            movement_left_arc(c, mvt_label, 0);
          else if(mvt_type == MVT_RIGHT)
            movement_right_arc(c, mvt_label, 0);
          else if(mvt_type == MVT_SHIFT)
            movement_shift(c, 1, 0);
        }
      }

      /* release what this function acquired */
      config_free(c);
      fclose(conll_file);
      fclose(conll_file_ref);
    }
    
    /*
     * Generate the training data in buffer mode (one sentence at a time).
     *
     * The file named by ctx->input_filename is opened twice: one handle fills
     * the buffer of the parser configuration, the other is used to read the
     * reference sentences given to the oracle.  For each sentence, movements
     * chosen by the oracle are applied until the configuration is terminal;
     * for every configuration visited, the movement code is written to
     * output_file, followed by the feature vector printed by feat_vec_print.
     * A fresh configuration is created for every sentence.
     *
     * Processing stops once ctx->sent_nb sentences have been processed or when
     * no further reference sentence can be read.
     *
     * output_file : stream receiving the generated data (not closed here)
     * ctx         : program context (features model, dictionaries, options)
     */
    void generate_training_file_buffer(FILE *output_file, context *ctx)
    {
      config *c;
      int mvt_code;
      char mvt_type;
      int mvt_label;
      /* NOTE(review): fv is never released; no feat_vec destructor is visible
         in this file -- release it here if the feat_vec module provides one */
      feat_vec *fv = feat_vec_new(feature_types_nb);
      sentence *ref = NULL;
      int sentence_nb = 0;
      FILE *conll_file = myfopen(ctx->input_filename, "r");
      FILE *conll_file_ref = myfopen(ctx->input_filename, "r");

      c = config_initial(conll_file, ctx->mcd_struct, 0);

      /* the sentence limit is tested before reading, so that no reference
         sentence is read (and then lost) once the limit has been reached */
      while((sentence_nb < ctx->sent_nb) && (ref = sentence_read(conll_file_ref, ctx->mcd_struct))){
        queue_read_sentence(c->bf, conll_file, ctx->mcd_struct);

        while(!config_is_terminal(c)){
          config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);

          mvt_code = oracle_parser(c, ref);
          mvt_type = movement_type(mvt_code);
          mvt_label = movement_label(mvt_code);

          fprintf(output_file, "%d", mvt_code);
          feat_vec_print(output_file, fv);

          /* NOTE(review): a movement type other than the three below leaves
             the configuration untouched and the loop simply iterates again */
          if(mvt_type == MVT_LEFT)
            movement_left_arc(c, mvt_label, 0);
          else if(mvt_type == MVT_RIGHT)
            movement_right_arc(c, mvt_label, 0);
          else if(mvt_type == MVT_SHIFT)
            movement_shift(c, 0, 0);
        }

        /* the reference sentence was only needed by the oracle */
        sentence_free(ref);

        /* start the next sentence from a fresh configuration */
        config_free(c);
        c = config_initial(conll_file, ctx->mcd_struct, 0);
        sentence_nb++;
      }

      /* release the last configuration and the two input handles */
      config_free(c);
      fclose(conll_file);
      fclose(conll_file_ref);
    }
    
    /*
     * Program entry point.
     *
     * Reads the options, loads the features model, prepares the vocabularies
     * (built from the corpus in train mode, read from ctx->vocabs_filename in
     * test mode), then generates the output in stream or buffer mode.  In
     * train mode the vocabularies are written to ctx->vocabs_filename
     * afterwards.
     *
     * Returns 0 on success, 1 when a required dictionary is missing or when
     * closing the output file fails.
     */
    int main(int argc, char *argv[])
    {
      context *ctx;
      FILE *output_file;
      int status = 0;

      ctx = context_read_options(argc, argv);
      maca_trans_parser_conll2cff_check_options(ctx);

      ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);

      if(ctx->mode == TRAIN_MODE){
        /* train mode: the vocabularies are extracted from the input corpus */
        mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
        ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
      }
      else if(ctx->mode == TEST_MODE){
        /* test mode: the vocabularies are read back from vocabs_filename */
        ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
        mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
      }

      ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
      if(ctx->dico_labels == NULL){
        fprintf(stderr, "cannot find label names\n");
        return 1;
      }

      /* presumably one left arc and one right arc per label, plus shift --
         TODO confirm against the movement encoding */
      ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1;

      feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);

      /* in train mode create the feature dictionary for the perceptron and
         register it in the dico vector so that it is saved with the others */
      if(ctx->mode == TRAIN_MODE){
        ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
        dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);
      }

      /* in test mode the feature dictionary is looked up in the dico vector;
         it is therefore already part of it and must not be added a second
         time.  The lookup can fail, exactly like the LABEL lookup above. */
      if(ctx->mode == TEST_MODE){
        ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
        if(ctx->d_perceptron_features == NULL){
          fprintf(stderr, "cannot find perceptron features dictionary\n");
          return 1;
        }
      }

      /* open output file (stdout when no cff file name was given) */
      if(ctx->cff_filename)
        output_file = myfopen(ctx->cff_filename, "w");
      else
        output_file = stdout;

      if(ctx->stream_mode)
        generate_training_file_stream(output_file, ctx);
      else
        generate_training_file_buffer(output_file, ctx);

      /* in train mode save the vocabularies, feature dictionary included */
      if(ctx->mode == TRAIN_MODE)
        dico_vec_print(ctx->vocabs_filename, ctx->vocabs);

      /* fclose flushes the buffered output: a failure here means the
         generated file is incomplete, so it must not go unnoticed */
      if(ctx->cff_filename && fclose(output_file) != 0){
        fprintf(stderr, "error while writing %s\n", ctx->cff_filename);
        status = 1;
      }

      context_free(ctx);
      return status;
    }