/*
 * word.h
 *
 * Last change (Alexis Nasr): added a 16-bit representation for words in
 * order to accurately compute prefixes and suffixes of UTF-8 strings.
 */
#ifndef __WORD__
#define __WORD__
#include "mcd.h"
#include "char16.h"
/* Sentinel governor value. NOTE(review): presumably marks a word whose
   governor is unknown or invalid -- its uses are outside this header,
   TODO confirm against the callers. */
#define WORD_INVALID_GOV 10000
/*
 * A word of the corpus.
 *
 * The per-feature codes live in wf_array and are reached through the
 * word_get_xxx / word_set_xxx macros of this header; the remaining fields
 * hold data attached directly to the word.
 */
struct _word {
    int wf_array[MCD_WF_NB]; /* array containing the codes corresponding to the
                                different word features (indexed by the
                                MCD_WF_* constants) */
    char *input;             /* the string corresponding to the actual line in
                                the corpus file */
    int U1;                  /* does the form begin with an uppercase character */
    int signature;           /* pos tags that this form can have (represented
                                as a boolean string) */
    int label;               /* NOTE(review): distinct from
                                wf_array[MCD_WF_LABEL], which is what
                                word_get_label() reads; meaning not visible
                                here -- TODO confirm */
    char *form;              /* presumably the byte-string form of the word --
                                TODO confirm */
    char16 *form_char16;     /* 16-bit representation of the form; the
                                word_get_sN / word_get_pN macros read the
                                suffix and prefix characters from it */
    int index;               /* value returned by word_get_index() */
    int is_root;             /* presumably non-zero when the word is a root --
                                TODO confirm */
};

typedef struct _word word;
/*
#define word_get_s1(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[strlen((w)->form) - 1])
#define word_get_s2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[strlen((w)->form) - 2])
#define word_get_s3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[strlen((w)->form) - 3])
#define word_get_s4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[strlen((w)->form) - 4])
#define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5])
#define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? -1 : (w)->form[strlen((w)->form) - 6])
*/
/*
 * Suffix characters of a word, read from its 16-bit form.
 *
 * word_get_sN(w) yields the N-th character counted from the end of
 * w->form_char16 (s1 is the last one), or -1 when w is NULL, when
 * w->form_char16 is NULL, or when char16_strlen() reports fewer than N
 * characters.
 *
 * These are macros: w is evaluated several times, so it must not have side
 * effects.
 */
#define word_suffix_char16_at(w, n) \
    ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < (n))) \
        ? -1 \
        : (w)->form_char16[char16_strlen((w)->form_char16) - (n)])

#define word_get_s1(w) word_suffix_char16_at((w), 1)
#define word_get_s2(w) word_suffix_char16_at((w), 2)
#define word_get_s3(w) word_suffix_char16_at((w), 3)
#define word_get_s4(w) word_suffix_char16_at((w), 4)
#define word_get_s5(w) word_suffix_char16_at((w), 5)
#define word_get_s6(w) word_suffix_char16_at((w), 6)
/*#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0])
#define word_get_p2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[1])
#define word_get_p3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[2])
#define word_get_p4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[3])
#define word_get_p5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[4])
#define word_get_p6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[5])
*/
/*
 * Prefix characters of a word, read from its 16-bit form.
 *
 * word_get_pN(w) yields the N-th character of w->form_char16 (p1 is the
 * first one), or -1 when w is NULL, when w->form_char16 is NULL, or when
 * char16_strlen() reports fewer than N characters.
 *
 * The length guard is "length < N", matching the word_get_sN macros. The
 * previous guards were off by one ("length < N - 1", and none at all for
 * p1), so a form of exactly N - 1 characters returned the terminating 0
 * instead of -1.
 *
 * These are macros: w is evaluated several times, so it must not have side
 * effects.
 */
#define word_prefix_char16_at(w, n) \
    ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < (n))) \
        ? -1 \
        : (w)->form_char16[(n) - 1])

#define word_get_p1(w) word_prefix_char16_at((w), 1)
#define word_get_p2(w) word_prefix_char16_at((w), 2)
#define word_get_p3(w) word_prefix_char16_at((w), 3)
#define word_get_p4(w) word_prefix_char16_at((w), 4)
#define word_get_p5(w) word_prefix_char16_at((w), 5)
#define word_get_p6(w) word_prefix_char16_at((w), 6)
/*
 * NULL-safe readers for the feature codes stored in w->wf_array.
 *
 * Each word_get_xxx(w) yields w->wf_array[MCD_WF_XXX], or a fallback value
 * when w is NULL. The fallback is -1 for every feature except gov and
 * sent_seg, which fall back to 0.
 *
 * These are macros: w is evaluated twice, so it must not have side effects.
 */
#define word_wf_or_default(w, wf, dflt) (((w) == NULL) ? (dflt) : (w)->wf_array[(wf)])

/* Named features. */
#define word_get_id(w)       word_wf_or_default((w), MCD_WF_ID, -1)
#define word_get_form(w)     word_wf_or_default((w), MCD_WF_FORM, -1)
#define word_get_lemma(w)    word_wf_or_default((w), MCD_WF_LEMMA, -1)
#define word_get_cpos(w)     word_wf_or_default((w), MCD_WF_CPOS, -1)
#define word_get_pos(w)      word_wf_or_default((w), MCD_WF_POS, -1)
#define word_get_feats(w)    word_wf_or_default((w), MCD_WF_FEATS, -1)
#define word_get_gov(w)      word_wf_or_default((w), MCD_WF_GOV, 0)
#define word_get_label(w)    word_wf_or_default((w), MCD_WF_LABEL, -1)
#define word_get_stag(w)     word_wf_or_default((w), MCD_WF_STAG, -1)
#define word_get_sent_seg(w) word_wf_or_default((w), MCD_WF_SENT_SEG, 0)

/* Generic features A .. Z. */
#define word_get_A(w) word_wf_or_default((w), MCD_WF_A, -1)
#define word_get_B(w) word_wf_or_default((w), MCD_WF_B, -1)
#define word_get_C(w) word_wf_or_default((w), MCD_WF_C, -1)
#define word_get_D(w) word_wf_or_default((w), MCD_WF_D, -1)
#define word_get_E(w) word_wf_or_default((w), MCD_WF_E, -1)
#define word_get_F(w) word_wf_or_default((w), MCD_WF_F, -1)
#define word_get_G(w) word_wf_or_default((w), MCD_WF_G, -1)
#define word_get_H(w) word_wf_or_default((w), MCD_WF_H, -1)
#define word_get_I(w) word_wf_or_default((w), MCD_WF_I, -1)
#define word_get_J(w) word_wf_or_default((w), MCD_WF_J, -1)
#define word_get_K(w) word_wf_or_default((w), MCD_WF_K, -1)
#define word_get_L(w) word_wf_or_default((w), MCD_WF_L, -1)
#define word_get_M(w) word_wf_or_default((w), MCD_WF_M, -1)
#define word_get_N(w) word_wf_or_default((w), MCD_WF_N, -1)
#define word_get_O(w) word_wf_or_default((w), MCD_WF_O, -1)
#define word_get_P(w) word_wf_or_default((w), MCD_WF_P, -1)
#define word_get_Q(w) word_wf_or_default((w), MCD_WF_Q, -1)
#define word_get_R(w) word_wf_or_default((w), MCD_WF_R, -1)
#define word_get_S(w) word_wf_or_default((w), MCD_WF_S, -1)
#define word_get_T(w) word_wf_or_default((w), MCD_WF_T, -1)
#define word_get_U(w) word_wf_or_default((w), MCD_WF_U, -1)
#define word_get_V(w) word_wf_or_default((w), MCD_WF_V, -1)
#define word_get_W(w) word_wf_or_default((w), MCD_WF_W, -1)
#define word_get_X(w) word_wf_or_default((w), MCD_WF_X, -1)
#define word_get_Y(w) word_wf_or_default((w), MCD_WF_Y, -1)
#define word_get_Z(w) word_wf_or_default((w), MCD_WF_Z, -1)
/*
 * NULL-safe readers for the plain fields of a word.
 *
 * With a NULL word, word_get_input yields NULL and the integer readers
 * yield -1. w is evaluated twice, so it must not have side effects.
 */
#define word_get_input(w)     (((w) != NULL) ? (w)->input : NULL)
#define word_get_signature(w) (((w) != NULL) ? (w)->signature : -1)
#define word_get_U1(w)        (((w) != NULL) ? (w)->U1 : -1)
#define word_get_index(w)     (((w) != NULL) ? (w)->index : -1)
/*
 * Writers for the feature codes and plain fields of a word.
 *
 * Each word_set_xxx(w, val) is an assignment expression, so its value is
 * the value stored. Unlike the word_get_xxx readers, the writers do not
 * check w against NULL: w must point to a valid word.
 */
#define word_wf_assign(w, wf, val) ((w)->wf_array[(wf)] = (val))

/* Named features. */
#define word_set_id(w, val)       word_wf_assign((w), MCD_WF_ID, (val))
#define word_set_form(w, val)     word_wf_assign((w), MCD_WF_FORM, (val))
#define word_set_lemma(w, val)    word_wf_assign((w), MCD_WF_LEMMA, (val))
#define word_set_cpos(w, val)     word_wf_assign((w), MCD_WF_CPOS, (val))
#define word_set_pos(w, val)      word_wf_assign((w), MCD_WF_POS, (val))
#define word_set_feats(w, val)    word_wf_assign((w), MCD_WF_FEATS, (val))
#define word_set_gov(w, val)      word_wf_assign((w), MCD_WF_GOV, (val))
#define word_set_label(w, val)    word_wf_assign((w), MCD_WF_LABEL, (val))
#define word_set_stag(w, val)     word_wf_assign((w), MCD_WF_STAG, (val))
#define word_set_sent_seg(w, val) word_wf_assign((w), MCD_WF_SENT_SEG, (val))

/* Generic features A .. Z. */
#define word_set_A(w, val) word_wf_assign((w), MCD_WF_A, (val))
#define word_set_B(w, val) word_wf_assign((w), MCD_WF_B, (val))
#define word_set_C(w, val) word_wf_assign((w), MCD_WF_C, (val))
#define word_set_D(w, val) word_wf_assign((w), MCD_WF_D, (val))
#define word_set_E(w, val) word_wf_assign((w), MCD_WF_E, (val))
#define word_set_F(w, val) word_wf_assign((w), MCD_WF_F, (val))
#define word_set_G(w, val) word_wf_assign((w), MCD_WF_G, (val))
#define word_set_H(w, val) word_wf_assign((w), MCD_WF_H, (val))
#define word_set_I(w, val) word_wf_assign((w), MCD_WF_I, (val))
#define word_set_J(w, val) word_wf_assign((w), MCD_WF_J, (val))
#define word_set_K(w, val) word_wf_assign((w), MCD_WF_K, (val))
#define word_set_L(w, val) word_wf_assign((w), MCD_WF_L, (val))
#define word_set_M(w, val) word_wf_assign((w), MCD_WF_M, (val))
#define word_set_N(w, val) word_wf_assign((w), MCD_WF_N, (val))
#define word_set_O(w, val) word_wf_assign((w), MCD_WF_O, (val))
#define word_set_P(w, val) word_wf_assign((w), MCD_WF_P, (val))
#define word_set_Q(w, val) word_wf_assign((w), MCD_WF_Q, (val))
#define word_set_R(w, val) word_wf_assign((w), MCD_WF_R, (val))
#define word_set_S(w, val) word_wf_assign((w), MCD_WF_S, (val))
#define word_set_T(w, val) word_wf_assign((w), MCD_WF_T, (val))
#define word_set_U(w, val) word_wf_assign((w), MCD_WF_U, (val))
#define word_set_V(w, val) word_wf_assign((w), MCD_WF_V, (val))
#define word_set_W(w, val) word_wf_assign((w), MCD_WF_W, (val))
#define word_set_X(w, val) word_wf_assign((w), MCD_WF_X, (val))
#define word_set_Y(w, val) word_wf_assign((w), MCD_WF_Y, (val))
#define word_set_Z(w, val) word_wf_assign((w), MCD_WF_Z, (val))

/* Plain fields. */
#define word_set_signature(w, val) ((w)->signature = (val))
#define word_set_index(w, val)     ((w)->index = (val))
/*
 * Functions operating on words. Only the prototypes are visible in this
 * header; the notes below are inferred from the signatures and names and
 * should be confirmed against the implementation (TODO confirm).
 */

/* Returns a word built from `input` (presumably the corpus line kept in
   word->input; ownership of the string is not visible here). */
word *word_new(char *input);
/* Returns a placeholder word; takes the mcd description as its only
   argument. */
word *word_create_dummy(mcd *mcd_struct);
/* Returns a word derived from `w`; presumably a copy (depth of the copy is
   not visible here). */
word *word_copy(word *w);
/* Releases `w`; presumably also the strings it owns -- TODO confirm. */
void word_free(word *w);
/* Alternative printer of `w` on stream `f`; output format not visible
   here. */
void word_print2(FILE *f, word *w);
/* Prints `w` on stream `f`; output format not visible here. */
void word_print(FILE *f, word *w);
/* Reads a word from stream `f` using the mcd description; the end-of-input
   return value is not visible here (presumably NULL). */
word *word_read(FILE *f, mcd *mcd_struct);
/* Builds a word from the text in `buffer` using the mcd description. */
word *word_parse_buffer(char *buffer, mcd *mcd_struct);
/* Integer predicate; presumably non-zero when `w` ends a sentence. */
int word_is_eos(word *w, mcd *mcd_struct);
/* Returns an index related to the governor of `w`; the exact convention
   (absolute vs. relative) is not visible here. */
int word_get_gov_index(word *w);
/* Prints one column of `w`, selected by `n`, on stream `f`. */
void word_print_col_n(FILE *f, word *w, int n);
/* Same as word_print_col_n but writes into the string `s`; no size is
   passed, so the caller presumably must provide a large enough buffer. */
void word_sprint_col_n(char *s, word *w, int n);
#endif