Select Git revision
vectorize.c
form2pos.c NaN GiB
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"form2pos.h"
#include"util.h"
/*
 * Allocate a form2pos structure and fill its POS dictionary.
 *
 * nbelem   : expected number of forms; the form -> signature hash is
 *            created with size nbelem * 4.
 * pos_nb   : number of POS tags; both dictionaries are created with
 *            size pos_nb * 10.
 * pos_list : tab-separated list of POS tags. It is tokenised in place
 *            with strtok(), so the buffer is MODIFIED and must be
 *            writable. May be NULL or empty, in which case the POS
 *            dictionary is left empty.
 *
 * Returns the new structure; release it with form2pos_free().
 *
 * NOTE: strtok() keeps hidden static state, so this function must not
 * be called while the caller is in the middle of its own strtok() scan.
 */
form2pos *form2pos_new(int nbelem, int pos_nb, char *pos_list)
{
    form2pos *f2p = (form2pos *)memalloc(sizeof(form2pos));
    char *token;

    f2p->nbelem = nbelem;
    f2p->pos_nb = pos_nb;
    f2p->d_pos = dico_new((char *)"d_pos", pos_nb * 10);
    f2p->d_signature = dico_new((char *)"d_signature", pos_nb * 10);
    f2p->h_form2signature = hash_new(nbelem * 4);

    /* strtok(NULL, ...) would resume some earlier, unrelated scan */
    if(pos_list == NULL)
        return f2p;

    /* Test the token BEFORE using it: strtok() returns NULL straight away
       when the list is empty or contains only tabs. */
    for(token = strtok(pos_list, "\t"); token != NULL; token = strtok(NULL, "\t")){
        /* per the original note, dico_add() strdup's the token itself */
        dico_add(f2p->d_pos, token);
    }
    return f2p;
}
/*
 * Release a form2pos structure together with the two dictionaries and
 * the hash table it holds.
 *
 * Passing NULL is allowed and does nothing (same convention as free()),
 * which lets callers clean up unconditionally after a failed
 * form2pos_read().
 */
void form2pos_free(form2pos *f2p)
{
    if(f2p == NULL)
        return;

    dico_free(f2p->d_pos);
    dico_free(f2p->d_signature);
    hash_free(f2p->h_form2signature);
    free(f2p);
}
/*
 * Tell whether `form` has exactly one possible POS tag.
 *
 * The signature of a form is a string of digit characters, one per POS
 * code; the digits are summed and the form is considered non ambiguous
 * when the sum is exactly 1.
 *
 * f2p  : form -> signature table.
 * form : word form to look up (NULL is treated as unknown).
 * pos  : output; receives the name of the unique POS tag when the
 *        function returns 1, and NULL in every other case (unknown form
 *        included). May itself be NULL if the caller only needs the
 *        return value.
 *
 * Returns 1 if the form is known and non ambiguous, 0 otherwise.
 */
int form2pos_word_is_non_ambiguous(form2pos *f2p, char *form, char **pos)
{
    int pos_code = -1;   /* index of the last non-'0' digit seen */
    int sum = 0;
    int signature;
    char *signature_str;

    /* Give the output a defined value on every path, early returns included. */
    if(pos != NULL)
        *pos = NULL;

    signature = form2pos_get_signature(f2p, form);
    if(signature < 0)    /* NULL or unknown form: no signature to inspect */
        return 0;

    signature_str = dico_int2string(f2p->d_signature, signature);
    if(signature_str == NULL)
        return 0;

    /* printf("form = %s signature = %s\n", form, signature_str); */
    for(size_t i = 0; signature_str[i] != '\0'; i++){
        sum += signature_str[i] - '0';
        if(signature_str[i] != '0')
            pos_code = (int)i;
    }

    if(sum != 1 || pos_code < 0)
        return 0;

    if(pos != NULL)
        *pos = dico_int2string(f2p->d_pos, pos_code);
    return 1;
}
/*
 * Load a form2pos table from a text file.
 *
 * Expected layout:
 *   line 1 : number of forms
 *   line 2 : number of POS tags
 *   line 3 : tab-separated list of POS tags
 *   then   : one "<form>\t<signature>" record per entry
 *
 * Returns the new table, or NULL if the file cannot be opened or its
 * header is malformed. Reading of records stops at end of file or at the
 * first record that cannot be parsed; the records read so far are kept.
 * The result must be released with form2pos_free().
 */
form2pos *form2pos_read(char *filename)
{
    FILE *f = myfopen_no_exit(filename, "r");
    int nbelem;
    int pos_nb;
    char pos_list[10000];
    char form[300];
    char signature[200];
    form2pos *f2p = NULL;

    if(f == NULL) return NULL;

    /* read number of forms, number of pos tags, then the list of pos tags */
    if(fscanf(f, "%d\n", &nbelem) != 1
       || fscanf(f, "%d\n", &pos_nb) != 1
       || nbelem < 0 || pos_nb < 0
       || fgets(pos_list, sizeof pos_list, f) == NULL){
        fprintf(stderr, "form2pos_read: malformed header in %s\n", filename);
        fclose(f);
        return NULL;
    }

    /* fgets() keeps the end-of-line; drop it so that it does not become
       part of the last POS label. */
    pos_list[strcspn(pos_list, "\r\n")] = '\0';

    /* A list with no tag at all (empty or tabs only) cannot be tokenised. */
    if(pos_list[strspn(pos_list, "\t")] == '\0'){
        fprintf(stderr, "form2pos_read: empty pos list in %s\n", filename);
        fclose(f);
        return NULL;
    }

    /* printf("form2pos read nbelem = %d pos nb = %d pos list = %s\n", nbelem, pos_nb, pos_list); */
    f2p = form2pos_new(nbelem, pos_nb, pos_list);

    /* Loop on the fscanf() result rather than feof(): feof() only becomes
       true AFTER a read has failed, which made the old loop insert one
       bogus record at end of file and spin forever on an unparsable line.
       The field widths (299, 199) are sizeof(form) - 1 and
       sizeof(signature) - 1; keep them in sync with the buffers. */
    while(fscanf(f, "%299[^\t]\t%199s\n", form, signature) == 2){
        /* printf("form = %s signature = %s\n", form, signature); */
        hash_add(f2p->h_form2signature, form, dico_add(f2p->d_signature, signature));
    }

    fclose(f);
    return f2p;
}
/*
 * Look up the signature code associated with a word form.
 *
 * f2p  : form -> signature table.
 * form : word form to look up.
 *
 * Returns -1 when `form` is NULL; otherwise whatever hash_get_val()
 * yields for `form` in the form -> signature hash.
 */
int form2pos_get_signature(form2pos *f2p, char *form)
{
    return (form != NULL) ? hash_get_val(f2p->h_form2signature, form) : -1;
}
/*
 * Look up the signature entry of `form` for the POS tag `pos`.
 *
 * f2p  : form -> signature table.
 * form : word form to look up.
 * pos  : name of the POS tag.
 *
 * Returns -1 if either argument is NULL, if the tag or the form is
 * unknown (a message is printed on stderr for these two cases), or if
 * the stored signature is missing or too short to hold an entry for
 * this tag. Otherwise returns the CHARACTER stored in the signature at
 * the tag's index.
 *
 * NOTE(review): the value returned on success is the raw character
 * (e.g. '0' or '1'), not the integer 0 or 1, so '0' is non-zero in a
 * boolean test. This is kept unchanged for caller compatibility;
 * confirm that callers compare against '1' rather than testing for
 * truth.
 */
int form2pos_form_has_pos(form2pos *f2p, char *form, char *pos)
{
    int pos_code;
    int signature_code;
    char *signature;

    if(form == NULL || pos == NULL)
        return -1;

    pos_code = dico_string2int(f2p->d_pos, pos);
    if(pos_code == -1){
        fprintf(stderr, "cat %s unknown\n", pos);
        return -1;
    }

    signature_code = hash_get_val(f2p->h_form2signature, form);
    if(signature_code == -1){
        fprintf(stderr, "form %s unknown\n", form);
        return -1;
    }

    signature = dico_int2string(f2p->d_signature, signature_code);

    /* Never index past the end of the signature: the tag dictionary and
       the signature strings come from the same file but nothing forces
       them to have matching sizes. */
    if(signature == NULL || (size_t)pos_code >= strlen(signature))
        return -1;

    return signature[pos_code];
}