Skip to content
Snippets Groups Projects
Select Git revision
  • 9162d34b4af7dd34cbeba01372d7492dfd05886d
  • master default protected
  • johannes
  • partial_parser
  • Aloui_Dary
  • ignore_punct
  • AC
  • classifier
  • fixhelp
  • libmacaon2
  • error_predictor
  • morpho
  • ssrnn
  • tfparsing
  • silvio
  • tagger_options
  • maca_trans_frame_parser
  • alexis
  • new_config
  • tagparse
  • maca_graph_parser
21 results

cff2fann.c

Blame
  • cff2fann.c 5.32 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include<unistd.h>
    #include<getopt.h>
    #include"context.h"
    #include"util.h"
    #include"cf_file.h"
    #include"feat_lib.h"
    #include"feat_types.h"
    
    
    /*
     * Print the usage text of the tool.
     *
     * The section headings ("INPUT", "IN TEST MODE", "OUTPUT", "IN TRAIN MODE")
     * are written to stderr; the option descriptions themselves are delegated to
     * the context_*_help_message() helpers, called in the order in which the
     * sections must appear.
     */
    void cff2fann_help_message(context *ctx)
    {
      /* options shared by every mode */
      context_general_help_message(ctx);
      context_mode_help_message(ctx);
      context_sent_nb_help_message(ctx);

      /* input section */
      fputs("INPUT\n", stderr);
      context_conll_help_message(ctx);
      fputs("IN TEST MODE\n", stderr);
      context_vocabs_help_message(ctx);

      /* output section */
      fputs("OUTPUT\n", stderr);
      context_cff_help_message(ctx);
      fputs("IN TRAIN MODE\n", stderr);
      context_vocabs_help_message(ctx);
    }
    
    /*
     * Echo the file names received through the options to stderr, then verify
     * that everything the tool needs was supplied.
     *
     * Required: cff file, mcd file, feature model file and vocabs file. The
     * vocabs file is mandatory because main() hands ctx->vocabs_filename to
     * dico_vec_read() unconditionally.
     *
     * If help was requested or a required file name is missing, the usage text
     * is printed and the process exits with status 1.
     */
    void cff2fann_check_options(context *ctx)
    {
      if(ctx->cff_filename) fprintf(stderr, "cff filename = %s\n", ctx->cff_filename);
      if(ctx->mcd_filename) fprintf(stderr, "mcd filename = %s\n", ctx->mcd_filename);
      if(ctx->features_model_filename) fprintf(stderr, "fm filename = %s\n", ctx->features_model_filename);
      if(ctx->vocabs_filename) fprintf(stderr, "vocabs filename = %s\n", ctx->vocabs_filename);

      if(ctx->help
         || !ctx->cff_filename
         || !ctx->mcd_filename
         || !ctx->features_model_filename
         || !ctx->vocabs_filename
         ){
        cff2fann_help_message(ctx);
        exit(1);
      }
    }
    
    /*
     * Write a one-hot row of `dim` entries to `f`.
     *
     * Each entry is a digit followed by a single space: "1 " at position `val`,
     * "0 " everywhere else. When `val` is outside [0, dim) the row contains only
     * zeros; when `dim` is not positive nothing is written. No newline is added.
     */
    void one_hot_print(FILE *f, int val, int dim)
    {
      int pos = 0;

      while(pos < dim){
        fputs((pos == val) ? "1 " : "0 ", f);
        pos++;
      }
    }
    
    void print_header(mcd *m, feat_model *fm)
    {
      int i;
      feat_desc *fd;
      simple_feat_desc *sfd;
    
      printf("OUT");
    
      for(i=0; i <fm->nbelem; i++){
        fd = fm->array[i];
        if(fd->nbelem > 1){
          printf("feature %d is a complex feature, skipping it\n", i);
        }
        else{
          sfd = fd->array[0];
          printf("\t%s", sfd->name);
        }
      }
    
      printf("\n");
      printf("OUT");
      for(i=0; i <fm->nbelem; i++){
        fd = fm->array[i];
        if(fd->nbelem > 1){
          printf("feature %d is a complex feature, skipping it\n", i);
        }
        else{
          sfd = fd->array[0];
          if(sfd->type == FEAT_TYPE_FORM){printf("\tFORM");continue;}
          if(sfd->type == FEAT_TYPE_LEMMA){printf("\tLEMMA");continue;}
          if(sfd->type == FEAT_TYPE_CPOS){printf("\tCPOS");continue;}
          if(sfd->type == FEAT_TYPE_POS){printf("\tPOS");continue;}
          if(sfd->type == FEAT_TYPE_LABEL){printf("\tLABEL");continue;}
          if(sfd->type == FEAT_TYPE_INT){printf("\tINT");continue;}
          printf("\tUNK");
        }
      }
      printf("\n");
      /*
      for(i=0; i < m->nb_col; i++){
        if(m->representation[i] == MCD_REPRESENTATION_EMB){
          printf("\tEMB");
          continue;
        }
        
        if(m->representation[i] == MCD_REPRESENTATION_NULL){
          continue;
        }
        
        if(m->representation[i] == MCD_REPRESENTATION_VOCAB){
          printf("\t%s", m->wf_str[i]);
          continue;
        }
        
        if(m->representation[i] == MCD_REPRESENTATION_INT){
          printf("\tINT");
          continue;
        }
      }
      printf("\n");*/
    }
    
    /*
     * Convert the cff file named by ctx->cff_filename into a tab-separated table
     * written on stdout.
     *
     * The two header lines come from print_header(). Then, for every input line
     * (fields separated by tabs):
     *   - the first field is printed as an integer,
     *   - every other field is an integer code that is turned back into a string
     *     through the "d_perceptron_features" vocabulary; that string is expected
     *     to look like "<something>==<int>" and the integer after "==" is printed.
     *
     * A line counter is shown on stderr every 100 lines.
     *
     * The process exits with status 1 (after a message on stderr) when the
     * vocabulary is missing or when a feature code cannot be decoded; previously
     * these cases dereferenced a NULL pointer or printed an uninitialised value.
     *
     * Every mcd representation produced exactly the same output, so the
     * per-feature representation lookup (an unchecked double array index) is no
     * longer performed.
     */
    void cff2fann(context *ctx)
    {
      char buffer[10000];
      char *token;
      const char *feature_str;
      int col_nb;
      int val;
      int feature_valindex;
      int count = 0;
      dico *vocab;
      FILE *f = myfopen(ctx->cff_filename, "r");

      vocab = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
      if(vocab == NULL){
        fprintf(stderr, "cff2fann: vocabulary d_perceptron_features not found\n");
        fclose(f);
        exit(1);
      }

      print_header(ctx->mcd_struct, ctx->features_model);

      while(fgets(buffer, sizeof buffer, f)){
        if(count % 100 == 0)
          fprintf(stderr, "%d\r", count);

        col_nb = 0;
        for(token = strtok(buffer, "\t"); token; token = strtok(NULL, "\t")){
          val = atoi(token);
          if(col_nb == 0){
            printf("%d", val);
          } else {
            feature_str = dico_int2string(vocab, val);
            /* "%*[^=]" skips the part before "==" without storing it, so no
               fixed-size buffer can be overrun; exactly one conversion (the
               value index) must succeed. */
            if(feature_str == NULL
               || sscanf(feature_str, "%*[^=]==%d", &feature_valindex) != 1){
              fprintf(stderr, "cff2fann: line %d, column %d: cannot decode feature code %d\n",
                      count + 1, col_nb, val);
              fclose(f);
              exit(1);
            }
            printf("\t%d", feature_valindex);
          }
          col_nb++;
        }
        printf("\n");
        count++;
      }
      fclose(f);
    }
    
    /*
     * Program entry point.
     *
     * Steps, in order: read and validate the options, load the vocabularies and
     * the feature model, count features and classes of the cff file (the class
     * count is stored in ctx->mvt_nb; the feature count is not used here), link
     * the mcd structure to the vocabularies, then run the conversion.
     *
     * Always returns 0.
     */
    int main(int argc, char *argv[])
    {
      int nb_feat;
      int nb_class;
      context *ctx = context_read_options(argc, argv);

      cff2fann_check_options(ctx);

      /* resources needed by the conversion */
      ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
      ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);

      look_for_number_of_features_and_classes(ctx->cff_filename, &nb_feat, &nb_class);
      ctx->mvt_nb = nb_class;

      mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, 1);

      cff2fann(ctx);

      return 0;
    }