Skip to content
Snippets Groups Projects
Select Git revision
  • 27edc713ab6ec24e3961c7d42aec61f402b9e2d5
  • master default protected
  • py
  • rmevec
  • tffm
  • approx
  • v0.1.5
  • v0.1.4
  • v0.1.3
9 results

set_smooth_mask_params.m

Blame
  • word_emb.c 5.60 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include"word_emb.h"
    #include"util.h"
    
    /* Limits used by the word2vec-format loaders of this file. */
    const long long w2v_max_size = 2000;         // size of the local buffer that receives each word read from the file
    const long long w2v_max_w = 50;              // max number of characters kept per word; extra characters are dropped
    
    
    /*
     * Load the embeddings of a word2vec binary file, keeping only the words
     * that are present in the dictionary d.
     *
     * The file starts with two integers (number of words, vector dimension),
     * followed, for every word, by the word itself, a space, and `dimension`
     * raw floats.  Newline characters met while reading a word are skipped and
     * at most w2v_max_w characters of each word are kept.
     *
     * file_name : path of the file, opened through myfopen()
     * d         : dictionary; a word is kept when dico_string2int() does not
     *             return -1 for it
     *
     * Returns a new word_emb (to be released with word_emb_free()) in which the
     * kept words are numbered 0, 1, 2... in file order.  The structure is sized
     * from the header, so it is created with the number of words announced by
     * the file, not the number of words kept.
     * Returns NULL when the header cannot be parsed.  If the file ends before
     * all announced words were read, a message is printed on stderr and the
     * words read so far are returned.
     */
    word_emb *word_emb_load_w2v_file_filtered(char *file_name, dico *d)
    {
      FILE *f;
      char word[w2v_max_size];
      long long words, size, a, b;
      word_emb *we;
      size_t k = 0;
      int word_nb = 0;
      int c;
      float w;

      f = myfopen(file_name, "rb");

      /* without a valid header neither the table size nor the record layout is known */
      if ((fscanf(f, "%lld", &words) != 1) || (fscanf(f, "%lld", &size) != 1)
          || (words < 0) || (size < 0)) {
        fprintf(stderr, "word embedding file %s ill formed\n", file_name);
        fclose(f);
        return NULL;
      }

      we = word_emb_new(size, words);
      for (b = 0; b < words; b++) {
        /* read the word: everything up to the next space, newlines skipped */
        a = 0;
        while (1) {
          c = fgetc(f);   /* kept in an int so that EOF (end of file or read error) is detected */
          if ((c == EOF) || (c == ' ')) break;
          if (c == '\n') continue;
          if (a < w2v_max_w) word[a++] = (char)c;
        }
        word[a] = 0;
        if (c == EOF) break;   /* file shorter than announced */

        if (dico_string2int(d, word) != -1) {
          /* the word is registered only once its whole vector has been read */
          if (fread(&(we->array[k]), sizeof(float), (size_t)size, f) != (size_t)size) break;
          fprintf(stderr, "found word %s\n", word);
          hash_add(we->htable, word, word_nb++);
          k += (size_t)size;
        }
        else {
          /* unknown word: consume its vector without storing it */
          for (a = 0; a < size; a++)
            if (fread(&w, sizeof(float), 1, f) != 1) break;
          if (a < size) break;
        }
      }
      if (b < words)
        fprintf(stderr, "word embedding file %s ill formed\n", file_name);
      fclose(f);
      return we;
    }
    
    /*
     * Load all the embeddings of a word2vec binary file.
     *
     * The file starts with two integers (number of words, vector dimension),
     * followed, for every word, by the word itself, a space, and `dimension`
     * raw floats.  Newline characters met while reading a word are skipped and
     * at most w2v_max_w characters of each word are kept.
     *
     * file_name : path of the file, opened through myfopen()
     *
     * Returns a new word_emb (to be released with word_emb_free()) in which the
     * words are numbered 0, 1, 2... in file order.  The index of each loaded
     * word is written on stdout as a progress indicator.
     * Returns NULL when the header cannot be parsed.  If the file ends before
     * all announced words were read, a message is printed on stderr and the
     * words read so far are returned.
     */
    word_emb *word_emb_load_w2v_file(char *file_name)
    {
      FILE *f;
      char word[w2v_max_size];
      long long words, size, a, b;
      word_emb *we;
      size_t k = 0;
      int word_nb = 0;
      int c;

      f = myfopen(file_name, "rb");

      /* without a valid header neither the table size nor the record layout is known */
      if ((fscanf(f, "%lld", &words) != 1) || (fscanf(f, "%lld", &size) != 1)
          || (words < 0) || (size < 0)) {
        fprintf(stderr, "word embedding file %s ill formed\n", file_name);
        fclose(f);
        return NULL;
      }

      we = word_emb_new(size, words);
      for (b = 0; b < words; b++) {
        /* read the word: everything up to the next space, newlines skipped */
        a = 0;
        while (1) {
          c = fgetc(f);   /* kept in an int so that EOF (end of file or read error) is detected */
          if ((c == EOF) || (c == ' ')) break;
          if (c == '\n') continue;
          if (a < w2v_max_w) word[a++] = (char)c;
        }
        word[a] = 0;
        if (c == EOF) break;   /* file shorter than announced */

        /* the word is registered only once its whole vector has been read */
        if (fread(&(we->array[k]), sizeof(float), (size_t)size, f) != (size_t)size) break;
        hash_add(we->htable, word, word_nb++);
        k += (size_t)size;
        fprintf(stdout, "\r%d", word_nb - 1);
      }
      if (b < words)
        fprintf(stderr, "word embedding file %s ill formed\n", file_name);
      fclose(f);
      return we;
    }
    
    /*
     * Allocate a word_emb able to hold nbelem vectors of dim floats each.
     *
     * dim    : number of floats per vector
     * nbelem : number of vectors; also passed to hash_new() for the word table
     *
     * The vectors are stored contiguously in we->array (vector i starts at
     * index i * dim).  The array content is whatever memalloc() returns; it is
     * not initialised here.
     * Returns the new structure, to be released with word_emb_free().
     */
    word_emb *word_emb_new(int dim, int nbelem)
    {
      word_emb *we;
      we = (word_emb *)memalloc(sizeof(word_emb));
      we->dim = dim;
      we->nbelem = nbelem;
      we->htable = hash_new(nbelem);
      /* widen before multiplying: dim * nbelem can exceed the range of int for large tables */
      we->array = (float *)memalloc((size_t)dim * (size_t)nbelem * sizeof(float));
      return we;
    }
    
    /*
     * Release a word_emb: its hash table, its vector array and the structure
     * itself.  Like free(), accepts NULL and then does nothing.
     */
    void word_emb_free(word_emb *we)
    {
      if(we == NULL) return;
      hash_free(we->htable);
      free(we->array);
      free(we);
    }
    
    
    /*
     * Count the lines of a text file.
     *
     * filename : path of the file, opened through myfopen()
     *
     * A line is counted when its terminating newline is read, so a line longer
     * than the local buffer still counts once.  A last line that is not
     * terminated by a newline is counted too.  An empty file has 0 lines.
     */
    int word_emb_number_of_lines_in_file(char *filename)
    {
      FILE *f = myfopen(filename, "r");
      char buffer[10000];
      int line_nb = 0;
      int line_complete = 1;   /* 0 while the last chunk read did not end with a newline */
      size_t len;

      while(fgets(buffer, sizeof buffer, f)){
        len = strlen(buffer);
        line_complete = (len > 0) && (buffer[len - 1] == '\n');
        if(line_complete) line_nb++;
      }
      /* unterminated last line */
      if(!line_complete) line_nb++;
      fclose(f);
      return line_nb;
    }
    
    /*
     * Count the space-separated columns of the first line of a text file.
     *
     * filename : path of the file, opened through myfopen()
     *
     * Returns 0 when no line can be read (empty file), otherwise the number of
     * columns, which is at least 1.  Only the part of the first line that fits
     * in the local buffer is examined.
     */
    int word_emb_number_of_columns_in_file(char *filename)
    {
      FILE *f = myfopen(filename, "r");
      char buffer[10000];
      int column_nb = 0;

      if(fgets(buffer, sizeof buffer, f) != NULL){
        /* the first column is counted unconditionally, as this function has always done */
        (void)strtok(buffer, " ");
        column_nb = 1;
        while(strtok(NULL , " \n") != NULL)
          column_nb++;
      }
      fclose(f);
      return column_nb;
    }
    
    /*
     * Write one embedding on f as space-separated values, with no trailing
     * space and no newline.
     *
     * f    : output stream
     * we   : embeddings
     * code : index of the vector to print; -1 prints a vector of "0.0" instead
     */
    void word_emb_print(FILE *f, word_emb *we, int code)
    {
      const int last = we->dim - 1;   /* position of the value printed without a separator */
      int pos = 0;

      if(code != -1){
        const float *vec = &we->array[code * we->dim];

        while(pos < last){
          fprintf(f, "%f ", vec[pos]);
          pos++;
        }
        fprintf(f, "%f", vec[pos]);
        return;
      }

      while(pos < last){
        fprintf(f, "0.0 ");
        pos++;
      }
      fprintf(f, "0.0");
    }
    
    /*
     * Copy one embedding into a slice of input_array.
     *
     * input_array : destination; cells first_index .. first_index + dim - 1 are written
     *               (an earlier version of this function declared it as fann_type *)
     * we          : embeddings
     * code        : index of the vector to copy; -1 fills the slice with zeros
     * first_index : position of the first cell to write
     *
     * Returns first_index + we->dim, i.e. the position following the slice.
     */
    int word_emb_fill_input_array_dnn(float *input_array, word_emb *we, int code, int first_index)
    {
      const int dim = we->dim;
      const int is_null_vector = (code == -1);
      const int offset = is_null_vector ? 0 : code * dim;
      int n;

      for(n = 0; n < dim; n++)
        input_array[first_index + n] = is_null_vector ? 0 : we->array[offset + n];

      return first_index + dim;
    }
    
    /*
     * Write all the embeddings in text form, one word per line: the word
     * followed by its values, each preceded by a space.
     *
     * we       : embeddings to write
     * filename : path of the output file, opened through myfopen(); NULL
     *            writes on stdout
     *
     * Words are written in the order in which they are met when walking the
     * buckets of the hash table.  The output file is closed before returning;
     * stdout is left open.
     */
    void word_emb_print_to_file(word_emb *we, char *filename)
    {
      cell *c = NULL;
      FILE *f = NULL;
      int i, j;
      int offset;

      if(filename == NULL)
        f = stdout;
      else
        f = myfopen(filename, "w");

      for(i=0; i < we->htable->size; i++){
        for(c=we->htable->array[i]; c != NULL; c = c->next){
          fprintf(f, "%s", c->key);
          offset = c->val * we->dim;
          for(j = 0; j < we->dim; j++){
            fprintf(f, " %f", we->array[offset + j]);
          }
          fprintf(f, "\n");
        }
      }

      /* only a file opened here is closed */
      if(filename != NULL)
        fclose(f);
    }
    
    
    /*
     * Load embeddings from a text file with one word per line: the word
     * followed by its values, separated by whitespace.
     *
     * filename : path of the file
     *
     * The number of vectors is the number of lines of the file and the
     * dimension is the number of columns of its first line minus one.  Words
     * are numbered 0, 1, 2... in file order.
     *
     * Returns a new word_emb, to be released with word_emb_free().  Loading
     * stops at the first word whose vector cannot be read entirely (a message
     * is then printed on stderr) or when the file holds fewer words than
     * lines; the words loaded so far are returned.
     */
    word_emb *word_emb_load(char *filename)
    {
      FILE *f;
      char word[300];
      int k = 0;
      int word_nb = 0;
      int i;
      int nbelem = word_emb_number_of_lines_in_file(filename);
      int dim = word_emb_number_of_columns_in_file(filename) - 1;
      word_emb *we = word_emb_new(dim, nbelem);

      fprintf(stderr, "loading word embeddings\n");
      fprintf(stderr, "we dim = %d\n", we->dim);
      fprintf(stderr, "we nbelem = %d\n", we->nbelem);

      f= myfopen(filename, "r");
      while(word_nb < nbelem){
        /* the field width keeps the word inside word[300]; anything but 1 means no word is left */
        if(fscanf(f, "%299s", word) != 1) break;

        for(i=0; i < dim; i++){
          if(fscanf(f, "%f", &(we->array[k + i])) != 1) break;
        }
        if(i < dim){
          fprintf(stderr, "word embdedding file %s ill formed\n", filename);
          break;
        }

        /* the word is registered only once its whole vector has been read */
        hash_add(we->htable, word, word_nb);
        k += dim;
        word_nb ++;
      }
      fclose(f);
      return we;
    }
    
    /*
     * Look a word up in the hash table of we.
     *
     * Returns a pointer to the first value of its vector inside we->array
     * (not a copy), or NULL when hash_lookup() does not find the word.
     */
    float *word_emb_get_vector(word_emb *we, char *word)
    {
      cell *entry = hash_lookup(we->htable, word);

      return (entry != NULL) ? we->array + entry->val * we->dim : NULL;
    }
    
    /*
     * Return what hash_get_val() reports for word in the hash table of we.
     * NOTE(review): presumably the index given to the word at load time, with a
     * sentinel for unknown words - confirm against hash_get_val().
     */
    int word_emb_get_code(word_emb *we, char *word)
    {
      int code = hash_get_val(we->htable, word);

      return code;
    }