Select Git revision
staticman.yml
word.c 3.93 KiB
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<ctype.h>
#include"word.h"
#include"util.h"
word *word_new(char *input)
{
int i;
word *w = (word *) memalloc(sizeof(word));
if(input == NULL)
w->input = NULL;
else
w->input = strdup(input);
for(i=0; i < MCD_WF_NB; i++) w->wf_array[i] = -1;
w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV;
w->form = NULL;
w->index = -1;
w->signature = -1;
w->is_root = 0;
return w;
}
/*
 * Read lines from f until one describes a word, then return that word as
 * parsed by word_parse_buffer().
 *
 * Lines whose first character is a newline and lines starting with "##"
 * (comments) are skipped. Returns NULL once fgets() has nothing more to
 * deliver.
 */
word *word_read(FILE *f, mcd *mcd_struct)
{
  char line[10000];

  while (fgets(line, (int) sizeof line, f) != NULL) {
    int is_blank = (line[0] == '\n');
    int is_comment = (line[0] == '#') && (line[1] == '#');

    if (!is_blank && !is_comment)
      return word_parse_buffer(line, mcd_struct);
  }
  return NULL;
}
/*
 * Parse one tab-separated line into a new word.
 *
 * buffer:     NUL-terminated line; a trailing newline is stripped. The
 *             buffer is modified in place (by the newline removal and by
 *             strtok()).
 * mcd_struct: column description; mcd_struct->wf[col] gives the word
 *             feature stored in column col, or -1 when the column is
 *             ignored.
 *
 * For every described column the code returned by mcd_get_code() is stored
 * in w->wf_array. The FORM column additionally sets w->form (a private
 * copy of the token) and w->U1 (1 when the form starts with an upper-case
 * character, 0 otherwise). "_" tokens are encoded like any other value.
 *
 * Returns the new word (to be released with word_free()), or NULL when
 * buffer or mcd_struct is NULL. A line without any field yields a word
 * whose features all keep the defaults set by word_new().
 *
 * NOTE: strtok() treats a run of tabs as a single separator, so an empty
 * field shifts the following columns to the left.
 */
word *word_parse_buffer(char *buffer, mcd *mcd_struct)
{
  char *token;
  word *w;
  size_t len;
  int col;

  if (buffer == NULL || mcd_struct == NULL) return NULL;

  /* remove newline from buffer; an empty buffer has no last character */
  len = strlen(buffer);
  if (len > 0 && buffer[len - 1] == '\n') buffer[len - 1] = '\0';

  w = word_new(buffer);
  /* defined value even when the line carries no FORM column */
  w->U1 = 0;

  /* Stop at the first missing token (strtok() returns NULL for an empty or
     all-tab line) and never index wf[] past the described columns. */
  for (col = 0, token = strtok(buffer, "\t");
       token != NULL && col < mcd_struct->nb_col;
       col++, token = strtok(NULL, "\t")) {
    int wf = mcd_struct->wf[col];

    if (wf == -1) continue; /* column not mapped to a word feature */

    w->wf_array[wf] = mcd_get_code(mcd_struct, token, col);

    if (wf == MCD_WF_FORM) {
      free(w->form); /* do not leak if FORM is described twice */
      w->form = strdup(token);
      /* <ctype.h> functions need a value representable as unsigned char */
      w->U1 = isupper((unsigned char) token[0]) ? 1 : 0;
    }
  }
  return w;
}
/*
 * Return a deep copy of w, or NULL when w is NULL.
 *
 * The copy owns fresh duplicates of the input and form strings and carries
 * the same feature codes, U1, signature, label, index and is_root values.
 * Release it with word_free().
 *
 * NOTE(review): this function was marked "out of date, must be updated".
 * index and is_root are now copied as well; any further field declared in
 * word.h but not referenced in this file is still not copied -- confirm
 * against the struct definition.
 */
word *word_copy(word *w)
{
  word *copy;
  int i;

  if (w == NULL) return NULL;

  copy = word_new(w->input);
  for (i = 0; i < MCD_WF_NB; i++)
    copy->wf_array[i] = w->wf_array[i];
  copy->U1 = w->U1;
  copy->signature = w->signature;
  copy->label = w->label;
  copy->index = w->index;
  copy->is_root = w->is_root;
  copy->form = (w->form != NULL) ? strdup(w->form) : NULL;
  return copy;
}
/*
 * Release a word together with the input and form strings it owns.
 * Passing NULL is allowed and does nothing.
 */
void word_free(word *w)
{
  if (w != NULL) {
    /* free(NULL) is a no-op, so the strings need no individual guards */
    free(w->input);
    free(w->form);
    free(w);
  }
}
/*
 * Build a placeholder word: no input text, ID feature 0 and index 0.
 * All other fields keep the defaults set by word_new().
 *
 * mcd_struct is currently not used.
 */
word *word_create_dummy(mcd *mcd_struct)
{
  word *dummy = word_new(NULL);

  (void) mcd_struct;

  dummy->index = 0;
  dummy->wf_array[MCD_WF_ID] = 0;
  return dummy;
}
/*
 * Write a debugging dump of w to f: the raw input (when present) followed
 * by the form, lemma, pos, id and index codes, ended by a newline.
 * Does nothing when w is NULL.
 *
 * Everything goes to f; previously only the input was written to f while
 * the codes were sent to stdout, splitting one record across two streams.
 */
void word_print2(FILE *f, word *w)
{
  if (w == NULL) return;

  if (w->input) fprintf(f, "%s\t", w->input);
  fprintf(f, "form = %d\t", word_get_form(w));
  fprintf(f, "lemma = %d\t", word_get_lemma(w));
  fprintf(f, "pos = %d\t", word_get_pos(w));
  fprintf(f, "index = %d\t", word_get_id(w));
  fprintf(f, "rel index = %d\n", word_get_index(w));
}
/*
 * Write the raw input text of w to f, or the literal text NULL when the
 * word has no input. Does nothing when w itself is NULL.
 */
void word_print(FILE *f, word *w)
{
  if (w != NULL)
    fprintf(f, "%s", (w->input != NULL) ? w->input : "NULL");
}
/*
 * Tell whether w ends a sentence.
 *
 * Returns 0 when w is NULL or when mcd_get_sent_seg_col() reports -1 for
 * mcd_struct; otherwise returns the value of word_get_sent_seg(w).
 */
int word_is_eos(word *w, mcd *mcd_struct)
{
  if (w == NULL || mcd_get_sent_seg_col(mcd_struct) == -1)
    return 0;

  return word_get_sent_seg(w);
}
/*
 * Return the index of the governor of w, computed as the word's own index
 * plus its governor feature, or -1 when that feature is WORD_INVALID_GOV.
 */
int word_get_gov_index(word *w)
{
  return (word_get_gov(w) == WORD_INVALID_GOV)
    ? -1
    : (word_get_index(w)) + (word_get_gov(w));
}
/*
 * Write to f the characters of column n of the word's input line.
 * Columns are separated by single tab characters and counted from 0; the
 * tabs themselves are never written. Nothing is written when the word has
 * no input or when the line has no such column.
 */
void word_print_col_n(FILE *f, word *w, int n)
{
  const char *p = w->input;
  int field = 0;

  if (p == NULL) return;

  for (; *p != '\0'; p++) {
    if (*p == '\t')
      field++;
    else if (field == n)
      fputc(*p, f);
  }
}
/*
 * Copy column n of the word's input line into s as a NUL-terminated string.
 * Columns are separated by single tab characters and counted from 0.
 *
 * s: destination buffer. No size is passed, so the caller must make it
 *    large enough for the column plus the terminator (at most
 *    strlen(w->input) + 1 bytes).
 * w: word whose input line is read.
 * n: column number.
 *
 * s always receives a terminated string when it is non-NULL: it is empty
 * when w is NULL, when the word has no input, or when the line has no
 * column n. Previously s was left untouched for a word without input, so
 * callers could read an uninitialised buffer.
 */
void word_sprint_col_n(char *s, word *w, int n)
{
  const char *p;
  int col = 0;
  size_t j = 0;

  if (s == NULL) return;
  s[0] = '\0';
  if (w == NULL || w->input == NULL) return;

  for (p = w->input; *p != '\0'; p++) {
    if (*p == '\t') {
      col++;
      continue;
    }
    if (col == n)
      s[j++] = *p;
  }
  s[j] = '\0';
}