maca_common: read word prefix/suffix features from a char16 copy of the form

Adds src/char16.c to the maca_common sources, stores a char16 copy of each
word form in word.form_char16 (built by utf8tochar16() when the FORM column is
parsed, released in word_free()), and makes word_get_s1..s6 / word_get_p1..p6
index that copy instead of the byte string. char16toutf8() remains a stub that
returns NULL.

utf8tochar16():
  * the buffer is sized in char16 units and the allocation result is checked;
    NULL is returned when the allocation fails;
  * the two bytes of a 2-byte sequence are widened through unsigned char before
    being packed, because plain char may be signed and bytes >= 0x80 would
    otherwise sign-extend;
  * a byte for which length() reports neither 1 nor 2 is stored as is, so no
    element of the result is left uninitialized.

word_parse_buffer():
  * utf8tochar16() is only called when strdup() returned a string;
  * isupper() receives the first byte as unsigned char.

NOTE(review): word_get_p2..p6 keep the length bounds of the macros they
replace (length < N-1), so pN on a word of exactly N-1 units yields the
terminating 0 rather than -1. Left unchanged; confirm whether this is intended.

NOTE(review): indentation and blank context lines below were rebuilt from a
copy whose whitespace had been collapsed; check the context against the tree
before applying. The post-image blob ids on the char16.c and word.c index
lines predate the utf8tochar16()/word_parse_buffer() changes listed above.

diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt
index 503e9214c5a8190648c7c237e0e7e5f96b12cd54..ea4625b2b13c5452932cd1cef17f3590794d6304 100644
--- a/maca_common/CMakeLists.txt
+++ b/maca_common/CMakeLists.txt
@@ -12,6 +12,7 @@ set(SOURCES src/util.c
  src/feat_desc.c
  src/feat_lib.c
  src/feat_model.c
+ src/char16.c
  )
 
 
diff --git a/maca_common/include/word.h b/maca_common/include/word.h
index e70677618306262579ae471d8b91b91b93ce1b64..33fd8c13605cbf2f9d47e58c57677089202a302e 100644
--- a/maca_common/include/word.h
+++ b/maca_common/include/word.h
@@ -2,6 +2,7 @@
 #define __WORD__
 
 #include "mcd.h"
+#include "char16.h"
 
 #define WORD_INVALID_GOV 10000
 
@@ -12,24 +13,40 @@ typedef struct _word {
   int signature; /* pos tags that this form can have (represented as a boolean string) */
   int label;
   char *form;
+  char16 *form_char16;
   int index;
   int is_root;
 } word;
-
+/*
 
 #define word_get_s1(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[strlen((w)->form) - 1])
 #define word_get_s2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[strlen((w)->form) - 2])
 #define word_get_s3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[strlen((w)->form) - 3])
 #define word_get_s4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[strlen((w)->form) - 4])
 #define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5])
 #define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? -1 : (w)->form[strlen((w)->form) - 6])
+*/
+#define word_get_s1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 1])
+#define word_get_s2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 2])
+#define word_get_s3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 3])
+#define word_get_s4(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 4))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 4])
+#define word_get_s5(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 5])
+#define word_get_s6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 6))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 6])
 
-#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0])
+/*#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0])
 #define word_get_p2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[1])
 #define word_get_p3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[2])
 #define word_get_p4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[3])
 #define word_get_p5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[4])
 #define word_get_p6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[5])
+*/
+
+#define word_get_p1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) )? -1 : (w)->form_char16[0])
+#define word_get_p2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[1])
+#define word_get_p3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[2])
+#define word_get_p4(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[3])
+#define word_get_p5(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 4))? -1 : (w)->form_char16[4])
+#define word_get_p6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[5])
 
 #define word_get_id(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_ID])
 #define word_get_form(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FORM])
diff --git a/maca_common/src/char16.c b/maca_common/src/char16.c
index 74fbd94fe5d69055cfbccd54f65df3f8c0a6f59b..b07d59423b73ac83263c467d4152430cbed9aeb9 100644
--- a/maca_common/src/char16.c
+++ b/maca_common/src/char16.c
@@ -36,7 +36,7 @@ int utf8_strlen(char *utf8_string)
 
 char *char16toutf8(char16 *char16_string)
 {
-
+  return NULL;
 }
 
 
@@ -49,7 +49,6 @@ int char16_strlen(char16 *string)
 
 char16 *utf8tochar16(char *utf8_string)
 {
-  char16 c16;
   int i,j;
   int utf8_length = strlen(utf8_string);
   int char16_length = 0;
@@ -57,28 +56,33 @@ char16 *utf8tochar16(char *utf8_string)
   for(i=0; i < utf8_length; i++)
     char16_length += length(utf8_string[i]);
 
-  char16_string = malloc((char16_length + 1)* sizeof(char));
+  char16_string = malloc((char16_length + 1)* sizeof(char16));
+  if(char16_string == NULL) return NULL;
   for(i=0, j=0; i < utf8_length; i++, j++){
     if(length(utf8_string[i]) == 1){
       char16_string[j] = (char16)utf8_string[i];
     }
-    if(length(utf8_string[i]) == 2){
-      char16_string[j] = utf8_string[i];
-      char16_string[j] << 8;
-      char16_string[j] += utf8_string[++i];
+    else if(length(utf8_string[i]) == 2){
+      /* plain char may be signed: widen through unsigned char so bytes >= 0x80 do not sign-extend */
+      char16_string[j] = (unsigned char)utf8_string[i];
+      char16_string[j] = char16_string[j] << 8;
+      char16_string[j] += (unsigned char)utf8_string[++i];
+    }
+    else{
+      /* length() is neither 1 nor 2: keep the raw byte so this element is never left uninitialized */
+      char16_string[j] = (unsigned char)utf8_string[i];
     }
   }
   char16_string[j] = 0;
   return char16_string;
 }
-
+/*
 int main(void)
 {
   int i;
   char string[200];
   char16 *char16_string;
   strcpy(string, "élémentaire");
-  /* strcpy(string, "konjunktúra-időszaknál"); */
 
   printf("string = %s\n", string);
   printf("length = %d\n", (int)strlen(string));
@@ -92,3 +96,4 @@ int main(void)
   printf("char16_strlen = %d\n", char16_strlen(char16_string));
 
 }
+*/
diff --git a/maca_common/src/word.c b/maca_common/src/word.c
index e18a4b62733df149caba098ac72d094f2a0f9293..d4c01a61d27af0ee0ad48cecca8a00290bf6c0d6 100644
--- a/maca_common/src/word.c
+++ b/maca_common/src/word.c
@@ -19,6 +19,7 @@ word *word_new(char *input)
   w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV;
 
   w->form = NULL;
+  w->form_char16 = NULL;
   w->index = -1;
   w->signature = -1;
 
@@ -65,6 +66,7 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct)
     }
     if(mcd_struct->wf[col] == MCD_WF_FORM){
       w->form = strdup(token);
-      w->U1 = isupper(token[0]) ? 1 : 0;
+      w->form_char16 = (w->form != NULL) ? utf8tochar16(w->form) : NULL;
+      w->U1 = isupper((unsigned char)token[0]) ? 1 : 0;
     }
     col++;
@@ -96,6 +98,7 @@ void word_free(word *w)
   if(w == NULL) return;
   if(w->input) free(w->input);
   if(w->form) free(w->form);
+  if(w->form_char16) free(w->form_char16);
   free(w);
 }
 