Skip to content
Snippets Groups Projects
Select Git revision
  • 3abce919af2dc0760efc61a8d3ae5343ef647201
  • master default protected
  • johannes
  • partial_parser
  • Aloui_Dary
  • ignore_punct
  • AC
  • classifier
  • fixhelp
  • libmacaon2
  • error_predictor
  • morpho
  • ssrnn
  • tfparsing
  • silvio
  • tagger_options
  • maca_trans_frame_parser
  • alexis
  • new_config
  • tagparse
  • maca_graph_parser
21 results

maca_tm_mcf2scf.c

Blame
  • user avatar
    Alexis Nasr authored
    3abce919
    History
    maca_tm_mcf2scf.c 6.80 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include<unistd.h>
    #include<getopt.h>
    /* #include"movement_parser_arc_eager.h" */
    #include"movements.h"
    #include"oracle_parser_arc_eager.h"
    #include"oracle_tagger.h"
    #include"feat_fct.h"
    #include"context.h"
    #include"feat_vec.h"
    #include"dico_vec.h"
    #include"word_emb.h"
    #include"config2feat_vec.h"
    #include"classifier.h"
    
    
    /* Oracle for the morphology state of the machine.
       Reads the feats value of the word at the front of the configuration's
       buffer (b0) and returns the code that tagset associates with an
       MVT_MORPHO movement carrying that value. */
    int oracle_morpho(config *c, mvt_tagset *tagset)
    {
      int b0_feats = word_get_feats(word_buffer_b0(config_get_buffer(c)));

      return mvt_tagset_get_code(tagset, MVT_MORPHO, b0_feats);
    }
    
    
    /* Fill in the signature of the words of bf that do not have one yet.

       The buffer is scanned from its last element towards its first one and
       the scan stops at the first word whose signature is already set
       (different from -1).

       For each visited word the signature is looked up in f2p with the form
       as is; when that lookup yields -1, a second lookup is made with the
       lower-cased form.

       bf  : word buffer whose words are updated in place
       f2p : form to signature lexicon */
    void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
    {
      int i;
      word *w;
      char lower_form[100];
      size_t form_length;

      for(i = word_buffer_get_nbelem(bf) - 1; i >= 0; i--){
        w = word_buffer_get_word_n(bf, i);
        if(word_get_signature(w) != -1) break;

        w->signature = form2pos_get_signature(f2p, w->form);
        if(w->signature != -1) continue;

        /* The lower-case fallback works on a local copy of the form. A form
           that does not fit in lower_form is left without signature rather
           than written past the end of the buffer. */
        form_length = strlen(w->form);
        if(form_length >= sizeof lower_form) continue;

        memcpy(lower_form, w->form, form_length + 1);
        to_lower_string(lower_form);
        w->signature = form2pos_get_signature(f2p, lower_form);
      }
    }
    
    /* Print the usage message of the program.
       The section titles are written on stderr; the description of each
       option is delegated to the context_*_help_message() functions. */
    void maca_tm_mcf2scf_help_message(context *ctx)
    {
      /* options that are not tied to input or output */
      context_general_help_message(ctx);
      context_mode_help_message(ctx);
      context_sent_nb_help_message(ctx);
      context_tm_help_message(ctx);

      /* input section */
      fputs("INPUT\n", stderr);
      context_conll_help_message(ctx);
      fputs("IN TEST MODE\n", stderr);
      context_vocabs_help_message(ctx);

      /* output section */
      fputs("OUTPUT\n", stderr);
      context_cff_help_message(ctx);
      fputs("IN TRAIN MODE\n", stderr);
      context_vocabs_help_message(ctx);
    }
    
    /* Validate the command line options stored in ctx.
       When no input file was given, or when help was requested, the usage
       message is printed and the program exits with status 1.
       The mcd, cff and fann file names are not checked here. */
    void maca_tm_mcf2scf_check_options(context *ctx)
    {
      if(ctx->input_filename && !ctx->help)
        return;

      maca_tm_mcf2scf_help_message(ctx);
      exit(1);
    }
    
    /* Run the transition machine of ctx in oracle mode over the input mcf
       file and write one line per movement on the output.

       Two views of ctx->input_filename are used:
       - ref : the file loaded with ctx->mcd_struct, used as the reference
               by the parser oracle;
       - c   : a configuration reading the same file through a copy of the
               mcd from which the GOV, LABEL and SENT_SEG columns have been
               removed, so that the syntax present in the file is ignored.

       At each step, the oracle matching the current state of the machine
       (parser, morpho or tagger) selects a movement, the feature vector of
       the configuration is computed, the movement is applied, and a line
       "<classifier number> TAB <movement code><feature vector>" is written.

       Output goes to ctx->cff_filename when it is set, to stdout otherwise.
       The loop stops at the end of the buffer of c or after ctx->sent_nb
       sentences. The program exits with status 1 when the output file
       cannot be opened or when the current state has no oracle. */
    void generate_scf_file(context *ctx)
    {
      config *c;
      int mvt_code;
      int sentence_nb = 0;
      int root_label = 0;
      word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct);
      FILE *mcf_file = myfopen(ctx->input_filename, "r");
      tm *machine = ctx->machine;
      tm_state *current_state = NULL;
      int mvt_type;
      dico *d_synt_labels;
      classifier *classif = NULL;
      FILE *output_file;
      int parser_state_nb = tm_get_parser_state(machine);
      int morpho_state_nb = tm_get_morpho_state(machine);
      int tagger_state_nb = tm_get_tagger_state(machine);

      /* create an mcd that corresponds to ctx->mcd_struct, but without gov and label */
      /* the idea is to ignore syntax in the mcf file that will be read */
      /* it is ugly !!! */
      mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct);
      mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV);
      mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL);
      mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_SENT_SEG);

      /* open output file */
      output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w", ctx->verbose) : stdout;
      if(output_file == NULL){
        /* only reachable when cff_filename is set: stdout is never NULL */
        fprintf(stderr, "cannot open output file %s\n", ctx->cff_filename);
        exit(1);
      }

      d_synt_labels = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"LABEL");

      if(d_synt_labels == NULL){
        /* not fatal: root_label keeps its initial value 0 */
        fprintf(stderr, "cannot find syntactic label alphabet in transition machine tape dictionaries\n");
      }
      else{
        root_label = dico_string2int(d_synt_labels, (char *) ctx->root_label);
      }

      c = config_new(mcf_file, mcd_struct_hyp, 5);

      while(!word_buffer_end(config_get_buffer(c)) && (sentence_nb < ctx->sent_nb)){
        current_state = machine->state_array[c->current_state_nb];
        classif = current_state->classif;

        if(ctx->f2p)
          add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);

        /* ask the oracle of the current state for the movement to perform */
        if(c->current_state_nb == parser_state_nb){
          mvt_code = oracle_parser_arc_eager(c, ref, root_label, classifier_get_output_tagset(classif));
        }
        else if(c->current_state_nb == morpho_state_nb){
          mvt_code = oracle_morpho(c, classifier_get_output_tagset(classif));
        }
        else if(c->current_state_nb == tagger_state_nb){
          mvt_code = oracle_tagger(c, classifier_get_output_tagset(classif));
        }
        else{
          fprintf(stderr, "do not know which oracle to use for state %s\n", current_state->name);
          exit(1);
        }
        mvt_type = mvt_tagset_get_type(classifier_get_output_tagset(classif), mvt_code);

        /* the feature vector is computed before the movement modifies c */
        config2feat_vec_cff(classif->fm, c, classif->d_features, classif->fv, ctx->mode);

        if(ctx->debug_mode){
          config_print(stdout,c);
          mvt_tagset_print_mvt(stdout, classifier_get_output_tagset(classif), mvt_code);
          fprintf(stdout, "\n");
        }

        movement_apply(c, mvt_code, classifier_get_output_tagset(classif), root_label, machine);

        /* advance head in ref word buffer */
        if((mvt_type == MVT_RIGHT) || (mvt_type == MVT_SHIFT)){
          word_buffer_move_right(ref);
        }

        fprintf(output_file, "%d", current_state->classifier_nb);
        fprintf(output_file, "\t%d", mvt_code);
        feat_vec_print(output_file, classif->fv);

        /* progress report on stderr, every 100 sentences */
        if(mvt_type == MVT_EOS){
          if((++sentence_nb % 100) == 0) fprintf(stderr, "\rsentence %d", sentence_nb);
        }
      }
      fprintf(stderr, "\n");

      if(ctx->cff_filename)
        fclose(output_file);

      /* c is not used past this point, the input stream can be released */
      if(mcf_file)
        fclose(mcf_file);
    }
    
    /* Entry point.
       Reads the options, loads the transition machine, gives every
       classifier of the machine a fresh feature dictionary named after the
       classifier, then generates the output with generate_scf_file().
       In train mode, the feature dictionary and the description file of
       every classifier are printed afterwards.
       Returns 0 on success; exits with status 1 when the options are
       invalid or when the transition machine could not be loaded. */
    int main(int argc, char *argv[])
    {
      classifier *classif = NULL;
      context *ctx = context_read_options(argc, argv);
      int i;
      tm *machine;
      char string[100];

      maca_tm_mcf2scf_check_options(ctx);
      machine = tm_load(ctx->tm_filename, ctx->maca_data_path, ctx->verbose);
      if(machine == NULL){
        fprintf(stderr, "cannot load transition machine\n");
        exit(1);
      }
      ctx->machine = machine;
      mcd_link_to_dico(ctx->mcd_struct, machine->d_tapes, ctx->verbose);

      /* create perceptron features dictionnaries for all classifiers of the machine */
      for(i=0; i < machine->classif_vec->nb; i++){
        classif = machine->classif_vec->array[i];
        /* bounded copy: a classifier name longer than the local buffer is
           truncated instead of overflowing it */
        snprintf(string, sizeof string, "%s", classifier_get_name(classif));
        classifier_set_d_features(classif, dico_new(string, 1000000));
      }

      generate_scf_file(ctx);

      /* in train mode print all feature dictionnaries that have been created as well as classifiers descriptions */
      if(ctx->mode == TRAIN_MODE){
        for(i=0; i < machine->classif_vec->nb; i++){
          classif = machine->classif_vec->array[i];
          classifier_print_d_features(classif);
          classifier_print_desc_file(classif->filename, classif);
        }
      }

      context_free(ctx);
      return 0;
    }