Select Git revision
set_smooth_mask_params.m
word_emb.c 5.60 KiB
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"word_emb.h"
#include"util.h"
/* Size of the on-stack buffer into which the word2vec loaders read each word. */
const long long w2v_max_size = 2000; // max length of strings
/* Number of characters the word2vec loaders keep per word; characters past
   this limit are still consumed from the file but are not stored. */
const long long w2v_max_w = 50; // max length of vocabulary entries
/*
 * Load a word2vec binary file, keeping only the words that d knows.
 *
 * Layout read here: two integers in text form (number of words, vector
 * dimension), then for every word its characters terminated by a space,
 * followed by `dimension` raw floats. Newlines inside the word field are
 * skipped and at most w2v_max_w characters of a word are kept.
 *
 * A word is kept when dico_string2int(d, word) is not -1; kept words are
 * numbered 0, 1, 2... in file order and their vectors are stored contiguously
 * in we->array. Vectors of other words are read and discarded.
 *
 * file_name: path of the embedding file (opened with myfopen in "rb" mode).
 * d:         dictionary used as the filter.
 *
 * Returns a new word_emb allocated for the word count announced in the header
 * (even if fewer words are kept), or NULL when the header cannot be parsed.
 * If the file ends early, loading stops, a message is written to stderr and
 * the entries read so far are returned.
 */
word_emb *word_emb_load_w2v_file_filtered(char *file_name, dico *d)
{
  FILE *f;
  char word[w2v_max_size];
  long long words, size, a, b;
  word_emb *we;
  int k = 0;
  int word_nb = 0;
  int c = 0;
  int truncated = 0;
  float w;

  f = myfopen(file_name, "rb");
  if ((fscanf(f, "%lld", &words) != 1) || (fscanf(f, "%lld", &size) != 1)
      || (words < 0) || (size < 0)) {
    fprintf(stderr, "word embedding file %s: ill formed header\n", file_name);
    fclose(f);
    return NULL;
  }
  we = word_emb_new(size, words);
  for (b = 0; (b < words) && !truncated; b++) {
    a = 0;
    while (1) {
      /* keep the result in an int so that EOF (end of file or read error)
         is distinguishable from any character value */
      c = fgetc(f);
      if ((c == EOF) || (c == ' ')) break;
      if ((a < w2v_max_w) && (c != '\n')) word[a++] = (char)c;
    }
    word[a] = 0;
    if ((c == EOF) && (a == 0)) {
      /* nothing left to read although the header announced more words */
      truncated = 1;
    }
    else if (dico_string2int(d, word) != -1) {
      fprintf(stderr, "found word %s\n", word);
      /* register the word only once its whole vector has been read */
      if (fread(&(we->array[k]), sizeof(float), (size_t)size, f) != (size_t)size)
        truncated = 1;
      else {
        hash_add(we->htable, word, word_nb++);
        k += (int)size;
      }
    }
    else {
      /* unknown word: consume its vector without storing it */
      for (a = 0; (a < size) && !truncated; a++)
        if (fread(&w, sizeof(float), 1, f) != 1) truncated = 1;
    }
  }
  if (truncated)
    fprintf(stderr, "word embedding file %s: truncated, %d words loaded\n", file_name, word_nb);
  fclose(f);
  return we;
}
/*
 * Load every word of a word2vec binary file.
 *
 * Layout read here: two integers in text form (number of words, vector
 * dimension), then for every word its characters terminated by a space,
 * followed by `dimension` raw floats. Newlines inside the word field are
 * skipped and at most w2v_max_w characters of a word are kept.
 *
 * Words are numbered 0, 1, 2... in file order; the running index is printed
 * on stdout (preceded by '\r') after each word.
 *
 * file_name: path of the embedding file (opened with myfopen in "rb" mode).
 *
 * Returns a new word_emb, or NULL when the header cannot be parsed. If the
 * file ends early, loading stops, a message is written to stderr and the
 * entries read so far are returned.
 */
word_emb *word_emb_load_w2v_file(char *file_name)
{
  FILE *f;
  char word[w2v_max_size];
  long long words, size, a, b;
  word_emb *we;
  int k = 0;
  int word_nb = 0;
  int c = 0;

  f = myfopen(file_name, "rb");
  if ((fscanf(f, "%lld", &words) != 1) || (fscanf(f, "%lld", &size) != 1)
      || (words < 0) || (size < 0)) {
    fprintf(stderr, "word embedding file %s: ill formed header\n", file_name);
    fclose(f);
    return NULL;
  }
  we = word_emb_new(size, words);
  for (b = 0; b < words; b++) {
    a = 0;
    while (1) {
      /* keep the result in an int so that EOF (end of file or read error)
         is distinguishable from any character value */
      c = fgetc(f);
      if ((c == EOF) || (c == ' ')) break;
      if ((a < w2v_max_w) && (c != '\n')) word[a++] = (char)c;
    }
    word[a] = 0;
    /* stop when the file ends before the announced number of entries;
       a word is registered only once its whole vector has been read */
    if (((c == EOF) && (a == 0))
        || (fread(&(we->array[k]), sizeof(float), (size_t)size, f) != (size_t)size)) {
      fprintf(stderr, "word embedding file %s: truncated, %d words loaded\n", file_name, word_nb);
      break;
    }
    k += (int)size;
    hash_add(we->htable, word, word_nb++);
    fprintf(stdout, "\r%d", word_nb - 1);
  }
  fclose(f);
  return we;
}
/*
 * Allocate a word_emb able to hold nbelem vectors of dim floats each.
 *
 * dim:    number of floats per vector.
 * nbelem: number of vectors; also passed to hash_new() for the word table.
 *
 * Returns the new structure; release it with word_emb_free().
 */
word_emb *word_emb_new(int dim, int nbelem)
{
  word_emb *we;

  we = (word_emb *)memalloc(sizeof(word_emb));
  we->dim = dim;
  we->nbelem = nbelem;
  we->htable = hash_new(nbelem);
  /* widen before multiplying: computing dim * nbelem in int overflows for
     large embedding tables */
  we->array = (float *)memalloc((size_t)dim * (size_t)nbelem * sizeof(float));
  return we;
}
/*
 * Release a word_emb: its hash table, its vector array and the structure
 * itself. Passing NULL is allowed and does nothing, as with free().
 */
void word_emb_free(word_emb *we)
{
  if (we == NULL)
    return;
  hash_free(we->htable);
  free(we->array);
  free(we);
}
/*
 * Count the lines of a text file.
 *
 * Every newline character counts as one line, whatever the line length.
 * A final line that lacks a terminating newline is counted as well,
 * provided it contains something other than blanks.
 *
 * filename: path of the file (opened with myfopen in "r" mode and closed
 *           before returning).
 *
 * Returns the number of lines.
 */
int word_emb_number_of_lines_in_file(char *filename)
{
  FILE *f = myfopen(filename, "r");
  int line_nb = 0;
  int pending = 0; /* non-blank characters seen since the last newline */
  int c;

  while ((c = fgetc(f)) != EOF) {
    if (c == '\n') {
      line_nb++;
      pending = 0;
    }
    else if ((c != ' ') && (c != '\t') && (c != '\r'))
      pending = 1;
  }
  if (pending)
    line_nb++;
  fclose(f);
  return line_nb;
}
/*
 * Count the whitespace-separated columns on the first line of a text file.
 *
 * Space, tab and carriage return separate columns; the line may be of any
 * length.
 *
 * filename: path of the file (opened with myfopen in "r" mode and closed
 *           before returning).
 *
 * Returns the number of columns, and never less than 1: an empty or blank
 * first line yields 1.
 */
int word_emb_number_of_columns_in_file(char *filename)
{
  FILE *f = myfopen(filename, "r");
  int column_nb = 0;
  int in_token = 0;
  int c;

  while (((c = fgetc(f)) != EOF) && (c != '\n')) {
    if ((c == ' ') || (c == '\t') || (c == '\r'))
      in_token = 0;
    else if (!in_token) {
      /* first character of a new column */
      in_token = 1;
      column_nb++;
    }
  }
  fclose(f);
  return (column_nb > 0) ? column_nb : 1;
}
/*
 * Write one embedding vector to f as space-separated "%f" values, without a
 * trailing space or newline.
 *
 * f:    output stream.
 * we:   embedding table.
 * code: index of the vector in we->array, or -1 to print a vector of "0.0"
 *       values instead.
 */
void word_emb_print(FILE *f, word_emb *we, int code)
{
  const int last = we->dim - 1;
  int col = 0;

  if (code != -1) {
    const float *row = we->array + code * we->dim;

    while (col < last) {
      fprintf(f, "%f ", row[col]);
      col++;
    }
    /* last component: no separator after it */
    fprintf(f, "%f", row[col]);
    return;
  }

  while (col < last) {
    fputs("0.0 ", f);
    col++;
  }
  fputs("0.0", f);
}
/* int word_emb_fill_input_array_dnn(fann_type *input_array, word_emb *we, int code, int first_index) */
/*
 * Copy one embedding vector into a slice of input_array.
 *
 * input_array: destination buffer; positions first_index to
 *              first_index + we->dim - 1 are written.
 * we:          embedding table.
 * code:        index of the vector in we->array, or -1 to fill the slice
 *              with zeros.
 * first_index: first position written in input_array.
 *
 * Returns first_index + we->dim, i.e. the position just after the slice.
 */
int word_emb_fill_input_array_dnn(float *input_array, word_emb *we, int code, int first_index)
{
  const int dim = we->dim;
  const int end = first_index + dim;
  float *dst = input_array + first_index;
  int n;

  if (code != -1) {
    const float *src = we->array + code * dim;

    for (n = 0; n < dim; n++)
      dst[n] = src[n];
  }
  else {
    for (n = 0; n < dim; n++)
      dst[n] = 0;
  }
  return end;
}
/*
 * Dump every word of the table with its vector, one word per line:
 * the key, then each component as " %f".
 *
 * Entries are written in hash-table order (bucket by bucket, following each
 * bucket's chain), not in loading order.
 *
 * we:       embedding table.
 * filename: output path (opened with myfopen in "w" mode and closed before
 *           returning), or NULL to write to stdout.
 */
void word_emb_print_to_file(word_emb *we, char *filename)
{
  cell *c = NULL;
  FILE *f = NULL;
  int i, j;
  int offset;

  if (filename == NULL)
    f = stdout;
  else
    f = myfopen(filename, "w");
  for (i = 0; i < we->htable->size; i++) {
    for (c = we->htable->array[i]; c != NULL; c = c->next) {
      fprintf(f, "%s", c->key);
      offset = c->val * we->dim;
      for (j = 0; j < we->dim; j++) {
        fprintf(f, " %f", we->array[offset + j]);
      }
      fprintf(f, "\n");
    }
  }
  /* close only what this function opened; stdout stays usable for the caller */
  if (filename != NULL)
    fclose(f);
}
/*
 * Load embeddings from a text file with one entry per line: a word followed
 * by its vector components, all separated by whitespace.
 *
 * The number of entries is taken from the file's line count and the vector
 * dimension from the column count of the first line minus one (the word).
 * Words are numbered 0, 1, 2... in file order. Words are limited to 299
 * characters, the capacity of the local buffer.
 *
 * filename: path of the embedding file (opened with myfopen in "r" mode and
 *           closed before returning).
 *
 * Returns a new word_emb. Loading stops at the end of the file or after the
 * counted number of entries, whichever comes first. A component that cannot
 * be parsed is reported on stderr.
 */
word_emb *word_emb_load(char *filename)
{
  FILE *f;
  char word[300];
  int k = 0;
  int word_nb = 0;
  int i;
  int res;
  int nbelem = word_emb_number_of_lines_in_file(filename);
  int dim = word_emb_number_of_columns_in_file(filename) - 1;
  word_emb *we = word_emb_new(dim, nbelem);
  int eof_reached = 0;

  fprintf(stderr, "loading word embeddings\n");
  fprintf(stderr, "we dim = %d\n", we->dim);
  fprintf(stderr, "we nbelem = %d\n", we->nbelem);

  f = myfopen(filename, "r");
  while (!eof_reached && (word_nb < nbelem)) {
    /* the field width keeps an over-long token from overflowing word[] */
    res = fscanf(f, "%299s", word);
    if (res != 1)
      break; /* no further entry: do not register a stale word */
    hash_add(we->htable, word, word_nb);
    for (i = 0; i < dim; i++) {
      res = fscanf(f, "%f", &(we->array[k++]));
      if (res != 1) {
        fprintf(stderr, "word embdedding file %s ill formed\n", filename);
        if (res == EOF) {
          /* the file ended in the middle of a vector */
          eof_reached = 1;
          break;
        }
      }
    }
    word_nb++;
  }
  fclose(f);
  return we;
}
/*
 * Look up the vector of a word.
 *
 * Returns a pointer to the first component of the word's vector inside
 * we->array (not a copy), or NULL when hash_lookup() finds no entry for word.
 */
float *word_emb_get_vector(word_emb *we, char *word)
{
  cell *entry = hash_lookup(we->htable, word);

  return (entry != NULL) ? we->array + entry->val * we->dim : NULL;
}
/*
 * Return the value stored for word in the table's hash, as reported by
 * hash_get_val().
 * NOTE(review): presumably -1 for an unknown word, matching the -1 "no
 * vector" code the other functions accept; confirm against hash_get_val().
 */
int word_emb_get_code(word_emb *we, char *word)
{
  int code = hash_get_val(we->htable, word);

  return code;
}