/* word.c */

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<ctype.h>
#include"word.h"
#include"util.h"


/*
 * Allocate a new word and give its fields a known initial value.
 *
 * input: text the word is built from. It is duplicated with strdup(), so
 *        the caller keeps ownership of its own buffer. May be NULL, in
 *        which case w->input is NULL.
 *
 * Every entry of wf_array starts at -1, except the governor entry which
 * starts at WORD_INVALID_GOV. form is NULL, index and signature are -1 and
 * is_root is 0.
 *
 * Returns the new word; release it with word_free().
 * NOTE(review): memalloc() is declared elsewhere; it is assumed here, as in
 * the rest of this file, that it does not return NULL.
 */
word *word_new(char *input)
{
  int i;
  word *w = (word *) memalloc(sizeof(word));

  /* Zero the whole structure first so that fields which are not assigned
     below (U1 and label, both read by word_copy()) never hold
     indeterminate values. */
  memset(w, 0, sizeof(word));

  if(input == NULL)
    w->input = NULL;
  else
    w->input = strdup(input);

  for(i=0; i < MCD_WF_NB; i++) w->wf_array[i] = -1;

  w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV;
  w->form = NULL;

  w->index = -1;
  w->signature = -1;
  w->is_root = 0;
  return w;
}


/*
 * Read lines from f until one holds a word, then parse it.
 * Empty lines and lines starting with "##" (comments) are skipped.
 * Returns the word built by word_parse_buffer(), or NULL once fgets()
 * has nothing more to return.
 */
word *word_read(FILE *f, mcd *mcd_struct)
{
  char line[10000];

  while(fgets(line, sizeof(line), f) != NULL){
    int is_blank = (line[0] == '\n');
    int is_comment = (line[0] == '#') && (line[1] == '#');

    /* the first line that is neither blank nor a comment is the word */
    if(!is_blank && !is_comment)
      return word_parse_buffer(line, mcd_struct);
  }
  return NULL;
}

/*
 * Parse one tab-separated line and build the corresponding word.
 *
 * buffer:     line to parse. It is modified in place: a trailing newline is
 *             removed and strtok() overwrites the tab separators.
 * mcd_struct: column description. For a column col below nb_col, wf[col]
 *             is the index in wf_array that receives the code returned by
 *             mcd_get_code(); columns whose wf entry is -1 are skipped.
 *
 * The line (without its newline) is kept in w->input. The token of the
 * column whose wf entry is MCD_WF_FORM is duplicated into w->form, and U1
 * records whether its first character is upper case.
 *
 * Columns at or beyond mcd_struct->nb_col are ignored. A line holding no
 * token yields a word that only carries its input text.
 * Note: strtok() treats a run of tabs as a single separator, so empty
 * fields do not count as columns here.
 */
word *word_parse_buffer(char *buffer, mcd *mcd_struct)
{
  char *token;
  word *w = NULL;
  int col = 0;
  size_t len = strlen(buffer);

  /* remove newline from buffer; the length test keeps an empty string
     from being indexed at position -1 */
  if((len > 0) && (buffer[len - 1] == '\n')) buffer[len - 1] = '\0';

  w = word_new(buffer);

  /* strtok() returns NULL right away when the line holds no token, so the
     loop body never sees a NULL token */
  for(token = strtok(buffer, "\t"); token != NULL; token = strtok(NULL, "\t")){
    /* columns the mcd does not describe must not index mcd_struct->wf */
    if(col >= mcd_struct->nb_col) break;

    if(mcd_struct->wf[col] != -1){
      w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col);
    }
    if(mcd_struct->wf[col] == MCD_WF_FORM){
      /* release a form set by an earlier column, if any */
      free(w->form);
      w->form = strdup(token);
      /* <ctype.h> functions need an unsigned char value */
      w->U1 = isupper((unsigned char) token[0]) ? 1 : 0;
    }
    col++;
  }

  return w;
}


/*
 * Duplicate a word.
 *
 * The copy gets its own duplicates of the input and form strings, so it can
 * be released with word_free() independently of the original. The feature
 * codes, U1, signature, label, index and is_root are copied by value.
 *
 * Returns NULL when w is NULL.
 * NOTE(review): only the fields referenced in this file are copied; check
 * the declaration of word for members added since.
 */
word *word_copy(word *w)
{
  word *copy;
  int i;

  if(w == NULL) return NULL;

  copy = word_new(w->input);

  for(i=0; i < MCD_WF_NB; i++)
    copy->wf_array[i] = w->wf_array[i];

  copy->U1 = w->U1;
  copy->signature = w->signature;
  copy->label = w->label;
  copy->form = (w->form)? strdup(w->form): NULL;
  /* word_new() resets these two, so they have to be carried over as well */
  copy->index = w->index;
  copy->is_root = w->is_root;
  return copy;
}

/*
 * Release a word together with the input and form strings it owns.
 * Does nothing when w is NULL.
 */
void word_free(word *w)
{
  if(w != NULL){
    /* free() accepts NULL, so the strings need no test of their own */
    free(w->input);
    free(w->form);
    free(w);
  }
}

/*
 * Build a word with no input text whose index and MCD_WF_ID feature are
 * both 0; every other field keeps the value given by word_new().
 * mcd_struct is currently not used.
 */
word *word_create_dummy(mcd *mcd_struct)
{
  word *dummy = word_new(NULL);

  (void) mcd_struct;

  dummy->index = 0;
  dummy->wf_array[MCD_WF_ID] = 0;

  return dummy;
}

/*
 * Write a one-line debugging description of w to f: the input text (when
 * there is one) followed by the form, lemma, pos, id and index values.
 * Does nothing when w is NULL.
 *
 * Everything goes to f, so the description stays on one stream whatever
 * stream the caller selected.
 */
void word_print2(FILE *f, word *w)
{
  if(w == NULL) return;

  if(w->input) fprintf(f, "%s\t", w->input);
  fprintf(f, "form = %d\t", word_get_form(w));
  fprintf(f, "lemma = %d\t", word_get_lemma(w));
  fprintf(f, "pos = %d\t", word_get_pos(w));
  fprintf(f, "index = %d\t", word_get_id(w));
  fprintf(f, "rel index = %d\n", word_get_index(w));
}

/*
 * Write the input text of w to f, or the literal text NULL when the word
 * has no input. Does nothing when w is NULL.
 */
void word_print(FILE *f, word *w)
{
  if(w != NULL)
    fprintf(f, "%s", (w->input != NULL) ? w->input : "NULL");
}

/*
 * Tell whether w ends a sentence.
 * Returns 0 when w is NULL or when mcd_get_sent_seg_col() reports -1 for
 * mcd_struct; otherwise returns the value of word_get_sent_seg(w).
 */
int word_is_eos(word *w, mcd *mcd_struct)
{
  if((w == NULL) || (mcd_get_sent_seg_col(mcd_struct) == -1))
    return 0;

  return word_get_sent_seg(w);
}
/*
 * Index of the governor of w, computed as word_get_index(w) plus
 * word_get_gov(w). Returns -1 when the governor is WORD_INVALID_GOV.
 */
int word_get_gov_index(word *w)
{
  return (word_get_gov(w) == WORD_INVALID_GOV)
    ? -1
    : (word_get_index(w)) + (word_get_gov(w));
}

/*
 * Write column n (counted from 0) of the input text of w to f.
 * Columns are separated by tab characters, and every tab starts a new
 * column. Nothing is written when w is NULL, when the word has no input
 * text, or when the text has no column n.
 */
void word_print_col_n(FILE *f, word *w, int n)
{
  const char *p;
  int col = 0;

  if((w == NULL) || (w->input == NULL)) return;

  /* stop at the end of the text or as soon as column n is behind us */
  for(p = w->input; (*p != '\0') && (col <= n); p++){
    if(*p == '\t')
      col++;
    else if(col == n)
      fputc(*p, f);
  }
}

/*
 * Copy column n (counted from 0) of the input text of w into s as a
 * NUL-terminated string. Columns are separated by tab characters, and every
 * tab starts a new column.
 *
 * s always receives a terminated string: it is empty when w is NULL, when
 * the word has no input text, or when the text has no column n.
 *
 * The size of s is not known here; the caller must supply a buffer that can
 * hold the column plus its terminator (strlen(w->input) + 1 bytes is always
 * enough).
 */
void word_sprint_col_n(char *s, word *w, int n)
{
  const char *p;
  int col = 0;
  int j = 0;

  if((w != NULL) && (w->input != NULL)){
    /* stop at the end of the text or as soon as column n is behind us */
    for(p = w->input; (*p != '\0') && (col <= n); p++){
      if(*p == '\t')
        col++;
      else if(col == n)
        s[j++] = *p;
    }
  }
  s[j] = '\0';
}