Skip to content
Snippets Groups Projects
Select Git revision
  • ec0e2c51c6ef8714b193b810f8e3abb99e3882ca
  • master default protected
  • johannes
  • partial_parser
  • Aloui_Dary
  • ignore_punct
  • AC
  • classifier
  • fixhelp
  • libmacaon2
  • error_predictor
  • morpho
  • ssrnn
  • tfparsing
  • silvio
  • tagger_options
  • maca_trans_frame_parser
  • alexis
  • new_config
  • tagparse
  • maca_graph_parser
21 results

word.c

Blame
  • word.c 3.93 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include<ctype.h>
    #include"word.h"
    #include"util.h"
    
    
    /*
     * Allocate a word and set every field to its "unset" default.
     *
     * input: text the word was built from, or NULL. When non-NULL it is
     *        duplicated with strdup(), so the caller keeps ownership of
     *        its own buffer.
     *
     * Returns the new word. Its input string is heap-allocated and is
     * released, together with the word, by word_free().
     * NOTE(review): the result of memalloc() is used unchecked, as before;
     * presumably memalloc() aborts on allocation failure -- confirm in util.h.
     */
    word *word_new(char *input)
    {
      int i;
      word *w = (word *) memalloc(sizeof(word));

      /* Zero the whole struct first so that fields without an explicit
         default below (U1, label) never hold indeterminate values;
         word_copy() reads both of them. */
      memset(w, 0, sizeof(word));

      if(input == NULL)
        w->input = NULL;
      else
        w->input = strdup(input);

      /* -1 marks a word feature that has not been assigned a code */
      for(i=0; i < MCD_WF_NB; i++) w->wf_array[i] = -1;

      /* the governor slot has its own "invalid" marker */
      w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV;
      w->form = NULL;

      w->index = -1;
      w->signature = -1;
      w->is_root = 0;
      return w;
    }
    
    
    /*
     * Read the next word description from file f.
     *
     * Lines are fetched one at a time with fgets(). A line whose first
     * character is a newline (empty line) and a line starting with "##"
     * (comment) are skipped. The first remaining line is handed to
     * word_parse_buffer() and its result is returned.
     *
     * Returns NULL once fgets() reports end of file or an error.
     */
    word *word_read(FILE *f, mcd *mcd_struct)
    {
      char line[10000];

      while(fgets(line, sizeof(line), f) != NULL){
        int is_blank = (line[0] == '\n');
        int is_comment = (line[0] == '#') && (line[1] == '#');

        if(!is_blank && !is_comment)
          return word_parse_buffer(line, mcd_struct);
      }
      return NULL;
    }
    
    /*
     * Parse a tab-separated line and build the corresponding word.
     *
     * buffer:     NUL-terminated line; it is MODIFIED in place (a trailing
     *             newline is removed and strtok() overwrites the tabs).
     * mcd_struct: column description. For column col, mcd_struct->wf[col]
     *             is the index in wf_array where the code of that column
     *             is stored, or -1 when the column is ignored.
     *
     * The unsplit line (without newline) is kept in w->input. The column
     * mapped to MCD_WF_FORM is also duplicated into w->form, and w->U1 is
     * set to 1 when its first character is upper case, 0 otherwise.
     * Columns beyond mcd_struct->nb_col are ignored.
     *
     * Returns the new word (possibly with no feature set when the line holds
     * no token), or NULL when buffer or mcd_struct is NULL.
     * Uses strtok(), hence not reentrant.
     */
    word *word_parse_buffer(char *buffer, mcd *mcd_struct)
    {
      char *token;
      word *w = NULL;
      int col = 0;
      size_t len;

      if(buffer == NULL || mcd_struct == NULL) return NULL;

      /* remove newline from buffer; len > 0 guards against indexing
         buffer[-1] on an empty string */
      len = strlen(buffer);
      if(len > 0 && buffer[len - 1] == '\n') buffer[len - 1] = '\0';

      /* must happen before strtok() starts cutting the buffer */
      w = word_new(buffer);

      /* strtok() returns NULL straight away for an empty or all-tab line,
         so test the token before the first use */
      for(token = strtok(buffer, "\t"); token != NULL; token = strtok(NULL, "\t")){
        /* every access to wf[col] stays inside the declared columns */
        if(col < mcd_struct->nb_col){
          if(mcd_struct->wf[col] != -1)
            w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col);

          if(mcd_struct->wf[col] == MCD_WF_FORM){
            free(w->form); /* no leak should two columns map to the form */
            w->form = strdup(token);
            /* <ctype.h> functions need a value representable as unsigned char */
            w->U1 = isupper((unsigned char) token[0]) ? 1 : 0;
          }
        }
        col++;
      }

      return w;
    }
    
    
    /*
     * Return a newly allocated deep copy of w, or NULL when w is NULL.
     *
     * The input and form strings are duplicated, so the copy can be
     * released with word_free() independently of the original. All scalar
     * fields initialised by word_new() (index, is_root, signature) as well
     * as U1, label and the whole wf_array are copied.
     */
    word *word_copy(word *w)
    {
      word *copy;
      int i;

      if(w == NULL) return NULL;

      copy = word_new(w->input);

      for(i=0; i < MCD_WF_NB; i++)
        copy->wf_array[i] = w->wf_array[i];

      copy->U1 = w->U1;
      copy->signature = w->signature;
      copy->label = w->label;
      copy->form = (w->form)? strdup(w->form): NULL;
      /* these two were previously left at the word_new() defaults */
      copy->index = w->index;
      copy->is_root = w->is_root;
      return copy;
    }
    
    /*
     * Release a word together with the input and form strings it owns.
     * A NULL word is accepted and ignored.
     */
    void word_free(word *w)
    {
      if(w != NULL){
        /* free(NULL) is a no-op, so unset strings need no special case */
        free(w->input);
        free(w->form);
        free(w);
      }
    }
    
    /*
     * Build a dummy word: no input text, index 0 and the MCD_WF_ID feature
     * set to 0. Every other field keeps the default given by word_new().
     *
     * mcd_struct is currently not used.
     */
    word *word_create_dummy(mcd *mcd_struct)
    {
      word *dummy = word_new(NULL);

      (void) mcd_struct;

      dummy->index = 0;
      dummy->wf_array[MCD_WF_ID] = 0;

      return dummy;
    }
    
    /*
     * Write a debugging description of w to stream f: the raw input text
     * (when present) followed by the form, lemma, pos, id and index values,
     * tab-separated and terminated by a newline.
     *
     * Everything is written to f; earlier only the input text went to f
     * while the remaining fields were sent to stdout.
     * Does nothing when w is NULL.
     */
    void word_print2(FILE *f, word *w)
    {
      if(w == NULL) return;

      if(w->input) fprintf(f, "%s\t", w->input);
      fprintf(f, "form = %d\t", word_get_form(w));
      fprintf(f, "lemma = %d\t", word_get_lemma(w));
      fprintf(f, "pos = %d\t", word_get_pos(w));
      fprintf(f, "index = %d\t", word_get_id(w));
      fprintf(f, "rel index = %d\n", word_get_index(w));
    }
    
    /*
     * Write the raw input text of w to stream f, or the literal text "NULL"
     * when the word has no input. Nothing is written when w itself is NULL.
     */
    void word_print(FILE *f, word *w)
    {
      if(w == NULL) return;

      fputs((w->input != NULL) ? w->input : "NULL", f);
    }
    
    /*
     * Return the sentence-segmentation value of w, as given by
     * word_get_sent_seg().
     *
     * Returns 0 when w is NULL or when mcd_get_sent_seg_col(mcd_struct)
     * yields -1 (presumably: no sentence-segmentation column is declared).
     */
    int word_is_eos(word *w, mcd *mcd_struct)
    {
      if((w != NULL) && (mcd_get_sent_seg_col(mcd_struct) != -1))
        return word_get_sent_seg(w);

      return 0;
    }
    /*
     * Return the index of the governor of w, computed as the index of w
     * plus its governor value, or -1 when the governor value is
     * WORD_INVALID_GOV.
     */
    int word_get_gov_index(word *w)
    {
      return (word_get_gov(w) == WORD_INVALID_GOV)
        ? -1
        : (word_get_index(w)) + (word_get_gov(w));
    }
    
    /*
     * Write column n (0-based, tab-separated) of the raw input text of w
     * to stream f. The tab characters themselves are never written.
     *
     * Nothing is written when w is NULL, when it has no input text, or when
     * the input has no column n (including a negative n).
     */
    void word_print_col_n(FILE *f, word *w, int n)
    {
      const char *p;
      int col = 0;

      /* same NULL tolerance as the other word_print functions */
      if(w == NULL || w->input == NULL) return;

      /* col <= n: stop as soon as column n has been passed */
      for(p = w->input; *p != '\0' && col <= n; p++){
        if(*p == '\t')
          col++;
        else if(col == n)
          fputc(*p, f);
      }
    }
    
    /*
     * Copy column n (0-based, tab-separated) of the raw input text of w
     * into s as a NUL-terminated string.
     *
     * s is set to the empty string when w is NULL, when it has no input
     * text, or when the input has no column n; previously s was left
     * untouched when the input text was missing. Nothing is done when s
     * is NULL.
     *
     * The size of s is not known here: the caller must provide room for
     * the longest possible column plus the terminating NUL.
     */
    void word_sprint_col_n(char *s, word *w, int n)
    {
      const char *p;
      int col = 0;
      int j = 0;

      if(s == NULL) return;

      /* give the caller a valid string on every return path */
      s[0] = '\0';
      if(w == NULL || w->input == NULL) return;

      /* col <= n: stop as soon as column n has been passed */
      for(p = w->input; *p != '\0' && col <= n; p++){
        if(*p == '\t')
          col++;
        else if(col == n)
          s[j++] = *p;
      }
      s[j] = '\0';
    }