Skip to content
Snippets Groups Projects
Select Git revision
  • bd658b7a8c7be3a92a2d3c2981ef8e847d822677
  • master default protected
  • johannes
  • partial_parser
  • Aloui_Dary
  • ignore_punct
  • AC
  • classifier
  • fixhelp
  • libmacaon2
  • error_predictor
  • morpho
  • ssrnn
  • tfparsing
  • silvio
  • tagger_options
  • maca_trans_frame_parser
  • alexis
  • new_config
  • tagparse
  • maca_graph_parser
21 results

cff2fann.c

Blame
  • user avatar
    Alexis Nasr authored
    bd658b7a
    History
    cff2fann.c 5.36 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include<unistd.h>
    #include<getopt.h>
    #include"context.h"
    #include"util.h"
    #include"cf_file.h"
    #include"feat_lib.h"
    #include"feat_types.h"
    
    
    /*
     * Print the usage text of this tool on stdout: the general help message
     * first, then an "Input:" section made of the cff, features-model and
     * vocabs help messages, all produced by the context_* helpers.
     */
    void cff2fann_help_message(context *ctx)
    {
      context_general_help_message(ctx);

      /* section title for the input-related options */
      fputs("\nInput:\n", stdout);

      context_cff_help_message(ctx);
      context_features_model_help_message(ctx);
      context_vocabs_help_message(ctx);
    }
    
    /*
     * Echo the file names found in ctx on stderr, then validate the options.
     *
     * The run is accepted only when a cff file, a vocabs file and a features
     * model file were given and help was not requested.  Otherwise the help
     * message is printed and the process exits with status 1.
     */
    void cff2fann_check_options(context *ctx)
    {
      int options_ok;

      if(ctx->cff_filename)
        fprintf(stderr, "cff filename = %s\n", ctx->cff_filename);
      if(ctx->mcd_filename)
        fprintf(stderr, "mcd filename = %s\n", ctx->mcd_filename);
      if(ctx->features_model_filename)
        fprintf(stderr, "fm filename = %s\n", ctx->features_model_filename);

      options_ok = ctx->cff_filename
        && !ctx->help
        && ctx->vocabs_filename
        && ctx->features_model_filename;

      if(options_ok)
        return;

      cff2fann_help_message(ctx);
      exit(1);
    }
    
    /*
     * Write a one-hot vector of length dim to f: "1 " at position val and
     * "0 " everywhere else (each digit is followed by a single space).
     * Nothing is written when dim <= 0; when val is outside [0, dim) the
     * output contains only zeros.
     */
    void one_hot_print(FILE *f, int val, int dim)
    {
      int pos = 0;

      while(pos < dim){
        fputs((pos == val) ? "1 " : "0 ", f);
        pos++;
      }
    }
    
    /*
     * Abort (exit status 1) if any feature of the model is made of more than
     * one element; such a feature is reported on stderr as a "complex
     * feature" together with its index in the model.
     */
    void check_feature_model(feat_model *fm)
    {
      int idx = 0;

      while(idx < fm->nbelem){
        feat_desc *desc = fm->array[idx];

        if(desc->nbelem > 1){
          fprintf(stderr, "feature %d is a complex feature, aborting\n", idx);
          exit(1);
        }
        idx++;
      }
    }
    
    /*
     * Print the two header rows of the output table on stdout.
     *
     * Row 1: "OUT" followed by the name of the first simple feature of every
     *        feature of the model.
     * Row 2: "OUT" followed by a label for the type of that simple feature
     *        (FORM, LEMMA, CPOS, POS, LABEL, INT, or UNK for any other type).
     *
     * Fields are separated by tabs.  The mcd argument is not used by the
     * active code; it is only referenced by the disabled variant kept below.
     */
    void print_header(mcd *m, feat_model *fm)
    {
      int col;

      /* row 1: feature names */
      fputs("OUT", stdout);
      for(col = 0; col < fm->nbelem; col++){
        feat_desc *desc = fm->array[col];
        simple_feat_desc *simple = desc->array[0];

        printf("\t%s", simple->name);
      }
      putchar('\n');

      /* row 2: feature types */
      fputs("OUT", stdout);
      for(col = 0; col < fm->nbelem; col++){
        feat_desc *desc = fm->array[col];
        simple_feat_desc *simple = desc->array[0];
        const char *label = "\tUNK";

        if(simple->type == FEAT_TYPE_FORM)       label = "\tFORM";
        else if(simple->type == FEAT_TYPE_LEMMA) label = "\tLEMMA";
        else if(simple->type == FEAT_TYPE_CPOS)  label = "\tCPOS";
        else if(simple->type == FEAT_TYPE_POS)   label = "\tPOS";
        else if(simple->type == FEAT_TYPE_LABEL) label = "\tLABEL";
        else if(simple->type == FEAT_TYPE_INT)   label = "\tINT";

        fputs(label, stdout);
      }
      putchar('\n');

      /* Disabled variant: a header row derived from the mcd representation
         of each column instead of the feature model.

      for(i=0; i < m->nb_col; i++){
        if(m->representation[i] == MCD_REPRESENTATION_EMB){
          printf("\tEMB");
          continue;
        }

        if(m->representation[i] == MCD_REPRESENTATION_NULL){
          continue;
        }

        if(m->representation[i] == MCD_REPRESENTATION_VOCAB){
          printf("\t%s", m->wf_str[i]);
          continue;
        }

        if(m->representation[i] == MCD_REPRESENTATION_INT){
          printf("\tINT");
          continue;
        }
      }
      printf("\n");*/
    }
    
    /*
     * Convert the cff file named by ctx->cff_filename into a tab-separated
     * table on stdout, preceded by the two header rows of print_header().
     *
     * Every input line is split on tabs and each token is read as an integer:
     *   - column 0 is printed unchanged;
     *   - every other column is looked up in the "d_perceptron_features"
     *     vocabulary; the resulting description is expected to have the form
     *     "<type>==<index>" and <index> is what gets printed.
     * When the description is missing or cannot be parsed, a warning is
     * written to stderr and -1 is printed for that column.
     *
     * A progress counter is written to stderr every 100 lines, and a trace
     * line is written to stderr for every value whose mcd column uses the
     * embedding representation.
     *
     * Exits with status 1 if the feature vocabulary cannot be obtained.
     */
    void cff2fann(context *ctx)
    {
      char buffer[10000];
      char *token;
      int col_nb;
      int feat_type;
      int mcd_col;
      mcd *m = ctx->mcd_struct;
      FILE *f;
      int val;
      dico *vocab;
      char feature_type[64];
      int feature_valindex;
      int count = 0;
      char *feat_str = NULL;

      vocab = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
      /* NOTE(review): assumes a missing vocabulary is reported as NULL -- confirm
         against dico_vec_get_dico */
      if(vocab == NULL){
        fprintf(stderr, "ERROR cannot find vocabulary d_perceptron_features\n");
        exit(1);
      }

      f = myfopen(ctx->cff_filename, "r");

      print_header(m, ctx->features_model);

      while(fgets(buffer, sizeof buffer, f)){
        token = strtok(buffer, "\t");
        col_nb = 0;
        if(count % 100 == 0)
          fprintf(stderr, "%d\r", count);
        while(token){
          val = atoi(token);
          if(col_nb == 0){
            /* first column is copied through as an integer */
            printf("%d", val);
          }
          else{
            feat_str = dico_int2string(vocab, val);
            if(feat_str == NULL){
              fprintf(stderr, "WARNING cannot find the description of feature : %d\n", val);
              feature_valindex = -1;
            }
            /* the field width keeps the type name inside feature_type[64];
               both conversions must succeed for feature_valindex to be valid */
            else if(sscanf(feat_str, "%63[^=]==%d", feature_type, &feature_valindex) != 2){
              fprintf(stderr, "WARNING cannot parse the description of feature : %s\n", feat_str);
              feature_valindex = -1;
            }
            else{
              feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1);
              mcd_col = m->wf2col[feat_type];

              /* every representation prints the same index; embeddings are
                 additionally traced on stderr */
              if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB)
                fprintf(stderr, "it is an embedding val = %d, file = %s\n", feature_valindex, m->filename[mcd_col]);
            }
            printf("\t%d", feature_valindex);
          }
          col_nb++;
          token = strtok(NULL, "\t");
        }
        printf("\n");
        count++;
      }
      fclose(f);
    }
    
    /*
     * Program entry point.
     *
     * Reads and checks the command line options, reads the vocabularies and
     * the feature model, rejects models containing complex features, stores
     * the class count reported for the cff file in ctx->mvt_nb, links the mcd
     * structure to the vocabularies and finally runs the conversion.
     * Returns 0 when the conversion has run to completion.
     */
    int main(int argc, char *argv[])
    {
      int feature_count;
      int class_count;
      context *ctx = context_read_options(argc, argv);

      cff2fann_check_options(ctx);

      /* resources named on the command line */
      ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
      ctx->features_model = feat_model_read(ctx->features_model_filename,
                                            feat_lib_build(),
                                            ctx->verbose);

      check_feature_model(ctx->features_model);

      /* only the class count is kept; the feature count is not used here */
      look_for_number_of_features_and_classes(ctx->cff_filename, &feature_count, &class_count);
      ctx->mvt_nb = class_count;

      mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, 1);

      cff2fann(ctx);

      return 0;
    }