Skip to content
Snippets Groups Projects
Select Git revision
  • e2c1d1b5bce4b0888b5ac717d86e176cf6426aa1
  • master default protected
2 results

batch_chairs.pt

Blame
  • mcd.c 15.92 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    
    #include "mcd.h"
    #include "util.h"
    #include "dico.h"
    #include "word_emb.h"
    
    
    /*
     * Detach word feature wf_code from mcd m.
     *
     * The wf2col entry of wf_code is reset to -1 and every per-column slot of
     * the column it was mapped to is put back to its "empty" value
     * (MCD_REPRESENTATION_NULL, -1 or NULL).
     *
     * Nothing is freed here: the string, dico and word embedding slots are only
     * overwritten with NULL.
     * NOTE(review): wf_str and filename are strdup()'ed elsewhere in this file,
     * so overwriting them looks like a leak -- confirm whether callers keep
     * their own pointers before adding free() calls.
     *
     * The call is a no-op when wf_code is negative or when the feature is not
     * currently mapped to a valid column. The upper bound of wf_code is the
     * caller's responsibility.
     */
    void mcd_remove_wf_column(mcd *m, int wf_code)
    {
      int col;

      if(wf_code < 0) return;

      col = m->wf2col[wf_code];
      m->wf2col[wf_code] = -1;

      /* -1 means "feature absent": indexing the column arrays with it would
         write outside of them */
      if((col < 0) || (col >= m->nb_col)) return;

      m->representation[col] = MCD_REPRESENTATION_NULL;
      m->wf[col] = -1;
      m->wf_str[col] = NULL;
      m->filename[col] = NULL;
      m->dico_array[col] = NULL;
      m->word_emb_array[col] = NULL;
    }
    
    /*
     * Allocate an mcd describing nb_col columns.
     *
     * All MCD_WF_NB entries of wf2col start at -1 and every column starts with
     * representation MCD_REPRESENTATION_NULL, wf code -1 and NULL string, file
     * name, dico and word embedding slots.
     * All memory is obtained through memalloc().
     */
    mcd *mcd_new(int nb_col)
    {
      mcd *res = (mcd *)memalloc(sizeof(mcd));
      int wf_code;
      int col;

      res->nb_col = nb_col;

      /* no word feature is mapped to a column yet */
      for(wf_code = 0; wf_code < MCD_WF_NB; wf_code++){
        res->wf2col[wf_code] = -1;
      }

      /* one slot per column in each of the parallel arrays */
      res->representation = (int *)memalloc(nb_col * sizeof(int));
      res->wf = (int *)memalloc(nb_col * sizeof(int));
      res->wf_str = (char **)memalloc(nb_col * sizeof(char *));
      res->filename = (char **)memalloc(nb_col * sizeof(char *));
      res->dico_array = (dico **)memalloc(nb_col * sizeof(dico *));
      res->word_emb_array = (word_emb **)memalloc(nb_col * sizeof(word_emb *));

      for(col = 0; col < nb_col; col++){
        res->representation[col] = MCD_REPRESENTATION_NULL;
        res->wf[col] = -1;
        res->wf_str[col] = NULL;
        res->filename[col] = NULL;
        res->dico_array[col] = NULL;
        res->word_emb_array[col] = NULL;
      }

      return res;
    }
    
    /*
     * Build a copy of m with the same number of columns.
     *
     * wf2col, representation and wf codes are copied by value; wf_str and
     * filename are duplicated with strdup() (a NULL entry stays NULL).
     * The dico and word embedding pointers are copied as they are, so the copy
     * refers to the same dico / word_emb objects as m.
     *
     * NOTE(review): mcd_free() calls dico_free() / word_emb_free() on the
     * entries of the mcd it receives, so freeing both m and its copy looks like
     * it would release the shared objects twice -- confirm how copies are used.
     */
    mcd *mcd_copy(mcd *m)
    {
      mcd *clone = mcd_new(m->nb_col);
      int k;

      for(k = 0; k < MCD_WF_NB; k++){
        clone->wf2col[k] = m->wf2col[k];
      }

      for(k = 0; k < m->nb_col; k++){
        clone->representation[k] = m->representation[k];
        clone->wf[k] = m->wf[k];

        /* strings are owned by each mcd: duplicate them */
        if(m->wf_str[k] != NULL)
          clone->wf_str[k] = strdup(m->wf_str[k]);
        else
          clone->wf_str[k] = NULL;

        if(m->filename[k] != NULL)
          clone->filename[k] = strdup(m->filename[k]);
        else
          clone->filename[k] = NULL;

        /* plain pointer copies (a NULL entry is copied as NULL) */
        clone->dico_array[k] = m->dico_array[k];
        clone->word_emb_array[k] = m->word_emb_array[k];
      }

      return clone;
    }
    
    /*
     * Release an mcd and everything attached to its columns.
     *
     * For every column, a non-NULL dico is passed to dico_free(), a non-NULL
     * word embedding to word_emb_free(), and the wf_str / filename strings are
     * free()'d. The per-column arrays and the structure itself are then freed.
     * Calling it with m == NULL does nothing, like free().
     */
    void mcd_free(mcd *m)
    {
      int i;

      if(m == NULL) return;

      for(i=0; i < m->nb_col; i++){
        if(m->dico_array[i]) dico_free(m->dico_array[i]);
        if(m->word_emb_array[i]) word_emb_free(m->word_emb_array[i]);
        /* free(NULL) is a no-op, no need to test the pointers first */
        free(m->wf_str[i]);
        free(m->filename[i]);
      }
      free(m->representation);
      free(m->filename);
      free(m->dico_array);
      free(m->word_emb_array);
      free(m->wf_str);
      free(m->wf);
      free(m);
    }
    
    /*
     * Used when reading a corpus file whose structure is described by mcd m:
     * returns the code associated with string str found in column col.
     *
     * The code depends on how the column is represented:
     *  - MCD_REPRESENTATION_VOCAB: dico_string2int() on the column's dico, or
     *    -1 when the column has no dico;
     *  - MCD_REPRESENTATION_EMB:   word_emb_get_code() on the column's embedding;
     *  - MCD_REPRESENTATION_INT:   atoi(str).
     * MCD_INVALID_VALUE is returned for any other representation and when col
     * is outside [0, m->nb_col).
     *
     * NOTE(review): in the EMB case the embedding pointer is passed on without
     * a NULL test; confirm that word_emb_get_code() copes with a column whose
     * embedding was never loaded.
     */

    int mcd_get_code(mcd *m, char *str, int col){
      /* same range check as mcd_get_str(): never index the arrays out of bounds */
      if((col < 0) || (col >= m->nb_col)) return MCD_INVALID_VALUE;

      if(m->representation[col] == MCD_REPRESENTATION_VOCAB)
        return (m->dico_array[col])? dico_string2int(m->dico_array[col], str) : -1;
      if(m->representation[col] == MCD_REPRESENTATION_EMB)
        return word_emb_get_code(m->word_emb_array[col], str);
      if(m->representation[col] == MCD_REPRESENTATION_INT)
        return atoi(str);
      return MCD_INVALID_VALUE;
    }
    
    /*
     * Scan an mcd file and return the largest column number found on a well
     * formed line, or -1 when there is none.
     *
     * A well formed line has four whitespace separated fields:
     *   column  word_feature  representation  filename
     * Lines that are empty or start with '#' are ignored; other lines that do
     * not yield the four fields are reported on stderr and skipped.
     * The number is returned as written in the file (no conversion is applied).
     */

    int mcd_max_column_index_in_file(char *mcd_filename)
    {
      int max_col = -1;
      FILE *f = myfopen(mcd_filename, "r");
      char buffer[1000]; /* ugly */
      int column;
      char wf[100];
      char representation[100];
      char filename[500]; /* ugly */
      int fields_number;
      int line_number = 0;

      /* NOTE(review): myfopen() presumably deals with open failures itself;
         test before the first read rather than only before fclose() */
      if(f == NULL) return max_col;

      /* a non-NULL result from fgets() means a line was read, even when it is
         the last one and has no trailing newline: testing feof() here would
         silently drop that line */
      while(fgets(buffer, sizeof(buffer), f)){
        line_number++;
        if((buffer[0] == '\n') || (buffer[0] == '#')) continue;
        /* field widths keep sscanf() inside wf, representation and filename */
        fields_number = sscanf(buffer, "%d %99s %99s %499s", &column, wf, representation, filename);
        if(fields_number != 4){
          fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename);
          continue;
        }
        if(column > max_col) max_col = column;
      }
      fclose(f);
      return max_col;
    }
    
    
    /*
     * Populate the missing vocabularies of m from a corpus file.
     *
     * Every column represented as MCD_REPRESENTATION_VOCAB that has no dico yet
     * receives the dico returned by dico_extract_from_corpus() for that column
     * of corpus_filename, named after the column's word feature string.
     * A trace line is written on stderr for each extraction.
     * Columns that already have a dico are left untouched; the column's file
     * name is not looked at.
     */

    void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename)
    {
      int col;

      for(col = 0; col < m->nb_col; col++){
        if(m->representation[col] != MCD_REPRESENTATION_VOCAB) continue;
        if(m->dico_array[col] != NULL) continue;

        m->dico_array[col] = dico_extract_from_corpus(corpus_filename, col, m->wf_str[col]);
        fprintf(stderr, "extracting dico %s \tfrom corpus\n", m->wf_str[col]);
      }
    }
    
    
    /*
     * Link the vocabularies of m to the dicos stored in vocabs.
     *
     * A column is linked when it is represented as MCD_REPRESENTATION_VOCAB,
     * its file name is "_" and it has no dico yet. Its dico then becomes the
     * result of dico_vec_get_dico() looked up with the column's word feature
     * string. With verbose set, each link is traced on stderr.
     */

    void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose)
    {
      int col;

      for(col = 0; col < m->nb_col; col++){
        /* only vocabulary columns ... */
        if(m->representation[col] != MCD_REPRESENTATION_VOCAB) continue;
        /* ... with the "_" file name ... */
        if(strcmp(m->filename[col], "_") != 0) continue;
        /* ... that have no dico yet */
        if(m->dico_array[col] != NULL) continue;

        m->dico_array[col] = dico_vec_get_dico(vocabs, m->wf_str[col]);
        if(verbose){
          fprintf(stderr, "linking to dico %s\n", m->wf_str[col]);
        }
      }
    }
    
    /* read an multi column description file and produces an mcd structure */
    
    mcd *mcd_read(char *mcd_filename, int verbose)
    {
      int column;
      char wf[100];
      char representation[100];
      char filename[500]; /* ugly */
      int fields_number;
      int line_number = 0;
      char buffer[1000]; /* ugly */
      int nb_col = mcd_max_column_index_in_file(mcd_filename);
      mcd *m = mcd_new(nb_col + 1);
      FILE *f = myfopen(mcd_filename, "r");
      /* int first = 1; */
    
      while(fgets(buffer, 1000, f)){
        line_number++;
        if(feof(f)) break;
         if((buffer[0] == '\n') || (buffer[0] == '#')) continue;
         fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename);
         if(fields_number != 4){
           /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */
           continue;
         }
         if(verbose) fprintf(stderr, "column = %d\tword feature = %s\trepresentation = %s\tfilename = %s\n", column, wf, representation, filename);
         column--; /* in the mcd file, columns begin at index 1, in internal representation, columns begin at index 0 */
         m->wf[column] = mcd_wf_code(wf);
         m->wf_str[column] = strdup(wf);
         if(m->wf[column] == -1){
           fprintf(stderr, "in line %d of mcd file %s invalid wf, I'm skipping it\n", line_number, mcd_filename);	
           continue;
         }
         
        m->wf2col[m->wf[column]] = column;
    
        if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL;
        else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB;
        else if(!strcmp(representation, "VOCAB")) m->representation[column] = MCD_REPRESENTATION_VOCAB;
        else if(!strcmp(representation, "INT")) m->representation[column] = MCD_REPRESENTATION_INT;
        else{ 
          fprintf(stderr, "in line %d of mcd file %s invalid mode of representation, I'm skipping it\n", line_number, mcd_filename);	  
          m->representation[column] = MCD_REPRESENTATION_NULL;
        }
        if(m->representation[column] != MCD_REPRESENTATION_NULL)
          m->filename[column] = strdup(filename);
        
        if(strcmp(m->filename[column], "_")){
          if(m->representation[column] == MCD_REPRESENTATION_EMB){
    	if(verbose) fprintf(stderr, "loading word embedding %s\n", m->filename[column]);
    	m->word_emb_array[column] = word_emb_load(m->filename[column]);
          }
          else if(m->representation[column] == MCD_REPRESENTATION_VOCAB){
    	if(verbose) fprintf(stderr, "loading dico %s\n", m->filename[column]);
    	  m->dico_array[column] = dico_read(m->filename[column], 0.5);
          }
        }
      }
    
      fclose(f);
      return m;
    }
    
    /*
     * Build the mcd corresponding to the conll07 format: eight columns,
     * INDEX, FORM, LEMMA, CPOS, POS, FEATS, GOV and LABEL, in that order.
     * INDEX and GOV are integer columns, the others are vocabulary columns.
     * Every column gets the "_" file name.
     */

    mcd *mcd_build_conll07(void)
    {
      /* one entry per column, in column order */
      const struct { int wf; const char *name; int repr; } cols[] = {
        { MCD_WF_ID,    "INDEX", MCD_REPRESENTATION_INT   },
        { MCD_WF_FORM,  "FORM",  MCD_REPRESENTATION_VOCAB },
        { MCD_WF_LEMMA, "LEMMA", MCD_REPRESENTATION_VOCAB },
        { MCD_WF_CPOS,  "CPOS",  MCD_REPRESENTATION_VOCAB },
        { MCD_WF_POS,   "POS",   MCD_REPRESENTATION_VOCAB },
        { MCD_WF_FEATS, "FEATS", MCD_REPRESENTATION_VOCAB },
        { MCD_WF_GOV,   "GOV",   MCD_REPRESENTATION_INT   },
        { MCD_WF_LABEL, "LABEL", MCD_REPRESENTATION_VOCAB }
      };
      const int nb_cols = (int)(sizeof(cols) / sizeof(cols[0]));
      mcd *m = mcd_new(nb_cols);
      int c;

      for(c = 0; c < nb_cols; c++){
        m->wf[c] = cols[c].wf;
        m->wf_str[c] = strdup(cols[c].name);
        m->representation[c] = cols[c].repr;
        m->filename[c] = strdup("_");
        m->wf2col[cols[c].wf] = c;
      }

      return m;
    }
    
    /*
     * Build the mcd corresponding to the wplgf format: five columns,
     * FORM, POS, LEMMA, GOV and LABEL, in that order (no index column).
     * GOV is an integer column, the others are vocabulary columns.
     * Every column gets the "_" file name.
     */

    mcd *mcd_build_wplgf(void)
    {
      /* one entry per column, in column order */
      const struct { int wf; const char *name; int repr; } cols[] = {
        { MCD_WF_FORM,  "FORM",  MCD_REPRESENTATION_VOCAB },
        { MCD_WF_POS,   "POS",   MCD_REPRESENTATION_VOCAB },
        { MCD_WF_LEMMA, "LEMMA", MCD_REPRESENTATION_VOCAB },
        { MCD_WF_GOV,   "GOV",   MCD_REPRESENTATION_INT   },
        { MCD_WF_LABEL, "LABEL", MCD_REPRESENTATION_VOCAB }
      };
      const int nb_cols = (int)(sizeof(cols) / sizeof(cols[0]));
      mcd *m = mcd_new(nb_cols);
      int c;

      for(c = 0; c < nb_cols; c++){
        m->wf[c] = cols[c].wf;
        m->wf_str[c] = strdup(cols[c].name);
        m->representation[c] = cols[c].repr;
        m->filename[c] = strdup("_");
        m->wf2col[cols[c].wf] = c;
      }

      return m;
    }
    /*
     * Build the mcd corresponding to the ifpls format: six columns,
     * INDEX, FORM, POS, LEMMA, GOV and LABEL, in that order.
     * INDEX and GOV are integer columns, the others are vocabulary columns.
     * Every column gets the "_" file name.
     */

    mcd *mcd_build_ifpls(void)
    {
      /* one entry per column, in column order */
      const struct { int wf; const char *name; int repr; } cols[] = {
        { MCD_WF_ID,    "INDEX", MCD_REPRESENTATION_INT   },
        { MCD_WF_FORM,  "FORM",  MCD_REPRESENTATION_VOCAB },
        { MCD_WF_POS,   "POS",   MCD_REPRESENTATION_VOCAB },
        { MCD_WF_LEMMA, "LEMMA", MCD_REPRESENTATION_VOCAB },
        { MCD_WF_GOV,   "GOV",   MCD_REPRESENTATION_INT   },
        { MCD_WF_LABEL, "LABEL", MCD_REPRESENTATION_VOCAB }
      };
      const int nb_cols = (int)(sizeof(cols) / sizeof(cols[0]));
      mcd *m = mcd_new(nb_cols);
      int c;

      for(c = 0; c < nb_cols; c++){
        m->wf[c] = cols[c].wf;
        m->wf_str[c] = strdup(cols[c].name);
        m->representation[c] = cols[c].repr;
        m->filename[c] = strdup("_");
        m->wf2col[cols[c].wf] = c;
      }

      return m;
    }
    
    /*
     * Build the mcd corresponding to the wplgfs format: six columns,
     * FORM, POS, LEMMA, GOV, LABEL and SENT_SEG, in that order.
     * GOV and SENT_SEG are integer columns, the others are vocabulary columns.
     * Every column gets the "_" file name.
     */
    mcd *mcd_build_wplgfs(void)
    {
      /* one entry per column, in column order */
      const struct { int wf; const char *name; int repr; } cols[] = {
        { MCD_WF_FORM,     "FORM",     MCD_REPRESENTATION_VOCAB },
        { MCD_WF_POS,      "POS",      MCD_REPRESENTATION_VOCAB },
        { MCD_WF_LEMMA,    "LEMMA",    MCD_REPRESENTATION_VOCAB },
        { MCD_WF_GOV,      "GOV",      MCD_REPRESENTATION_INT   },
        { MCD_WF_LABEL,    "LABEL",    MCD_REPRESENTATION_VOCAB },
        { MCD_WF_SENT_SEG, "SENT_SEG", MCD_REPRESENTATION_INT   }
      };
      const int nb_cols = (int)(sizeof(cols) / sizeof(cols[0]));
      mcd *m = mcd_new(nb_cols);
      int c;

      for(c = 0; c < nb_cols; c++){
        m->wf[c] = cols[c].wf;
        m->wf_str[c] = strdup(cols[c].name);
        m->representation[c] = cols[c].repr;
        m->filename[c] = strdup("_");
        m->wf2col[cols[c].wf] = c;
      }

      return m;
    }
    
    /*
     * Build the mcd corresponding to the wpmlgfs format: seven columns,
     * FORM, POS, FEATS, LEMMA, GOV, LABEL and SENT_SEG, in that order.
     * GOV and SENT_SEG are integer columns, the others are vocabulary columns.
     * Every column gets the "_" file name.
     */
    mcd *mcd_build_wpmlgfs(void)
    {
      /* one entry per column, in column order */
      const struct { int wf; const char *name; int repr; } cols[] = {
        { MCD_WF_FORM,     "FORM",     MCD_REPRESENTATION_VOCAB },
        { MCD_WF_POS,      "POS",      MCD_REPRESENTATION_VOCAB },
        { MCD_WF_FEATS,    "FEATS",    MCD_REPRESENTATION_VOCAB },
        { MCD_WF_LEMMA,    "LEMMA",    MCD_REPRESENTATION_VOCAB },
        { MCD_WF_GOV,      "GOV",      MCD_REPRESENTATION_INT   },
        { MCD_WF_LABEL,    "LABEL",    MCD_REPRESENTATION_VOCAB },
        { MCD_WF_SENT_SEG, "SENT_SEG", MCD_REPRESENTATION_INT   }
      };
      const int nb_cols = (int)(sizeof(cols) / sizeof(cols[0]));
      mcd *m = mcd_new(nb_cols);
      int c;

      for(c = 0; c < nb_cols; c++){
        m->wf[c] = cols[c].wf;
        m->wf_str[c] = strdup(cols[c].name);
        m->representation[c] = cols[c].repr;
        m->filename[c] = strdup("_");
        m->wf2col[cols[c].wf] = c;
      }

      return m;
    }
    
    
    
    /*
     * Return a new dico_vec holding the dictionaries found in an mcd structure:
     * every non-NULL entry of its dico_array is passed, in column order, to
     * dico_vec_add(). Columns without a dico are skipped.
     */

    dico_vec *mcd_build_dico_vec(mcd *mcd_struct)
    {
      dico_vec *result = dico_vec_new();
      int col = 0;

      while(col < mcd_struct->nb_col){
        if(mcd_struct->dico_array[col] != NULL){
          dico_vec_add(result, mcd_struct->dico_array[col]);
        }
        col++;
      }

      return result;
    }
    
    /*
     * Translate a word feature name, as written in an mcd file, into its
     * MCD_WF_* code. The comparison is exact (strcmp). Recognised names are
     * INDEX, FORM, LEMMA, CPOS, POS, FEATS, LABEL, STAG, GOV, SENT_SEG and the
     * single letters A to Z. Returns -1 for any other name ("INT" is not a
     * recognised name).
     */
    int mcd_wf_code(char *wf)
    {
      /* name -> code table, scanned in order */
      const struct { const char *name; int code; } known[] = {
        { "INDEX",    MCD_WF_ID       },
        { "FORM",     MCD_WF_FORM     },
        { "LEMMA",    MCD_WF_LEMMA    },
        { "CPOS",     MCD_WF_CPOS     },
        { "POS",      MCD_WF_POS      },
        { "FEATS",    MCD_WF_FEATS    },
        { "LABEL",    MCD_WF_LABEL    },
        { "STAG",     MCD_WF_STAG     },
        { "GOV",      MCD_WF_GOV      },
        { "SENT_SEG", MCD_WF_SENT_SEG },
        { "A", MCD_WF_A }, { "B", MCD_WF_B }, { "C", MCD_WF_C },
        { "D", MCD_WF_D }, { "E", MCD_WF_E }, { "F", MCD_WF_F },
        { "G", MCD_WF_G }, { "H", MCD_WF_H }, { "I", MCD_WF_I },
        { "J", MCD_WF_J }, { "K", MCD_WF_K }, { "L", MCD_WF_L },
        { "M", MCD_WF_M }, { "N", MCD_WF_N }, { "O", MCD_WF_O },
        { "P", MCD_WF_P }, { "Q", MCD_WF_Q }, { "R", MCD_WF_R },
        { "S", MCD_WF_S }, { "T", MCD_WF_T }, { "U", MCD_WF_U },
        { "V", MCD_WF_V }, { "W", MCD_WF_W }, { "X", MCD_WF_X },
        { "Y", MCD_WF_Y }, { "Z", MCD_WF_Z }
      };
      const int nb_known = (int)(sizeof(known) / sizeof(known[0]));
      int k;

      for(k = 0; k < nb_known; k++){
        if(strcmp(wf, known[k].name) == 0) return known[k].code;
      }
      return -1;
    }
    
    
    /*
     * Inverse lookup for vocabulary columns: returns the result of
     * dico_int2string() for code in the dico of column col.
     * Returns NULL when col is outside [0, m->nb_col), when the column is not
     * represented as MCD_REPRESENTATION_VOCAB, or when it has no dico.
     */
    char *mcd_get_str(mcd *m, int code, int col)
    {
      int in_range = (col >= 0) && (col < m->nb_col);

      if(!in_range) return NULL;
      if(m->representation[col] != MCD_REPRESENTATION_VOCAB) return NULL;
      if(m->dico_array[col] == NULL) return NULL;

      return dico_int2string(m->dico_array[col], code);
    }