From 1a1934660f3a69849e837ea97a54efa94fee3e76 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Thu, 13 Apr 2017 18:55:39 +0200 Subject: [PATCH] added a 16-bit representation for words in order to accurately compute prefixes and suffixes of UTF-8 strings --- maca_common/CMakeLists.txt | 1 + maca_common/include/word.h | 21 +++++++++++++++++++-- maca_common/src/char16.c | 11 +++++------ maca_common/src/word.c | 3 +++ 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt index 503e921..ea4625b 100644 --- a/maca_common/CMakeLists.txt +++ b/maca_common/CMakeLists.txt @@ -12,6 +12,7 @@ set(SOURCES src/util.c src/feat_desc.c src/feat_lib.c src/feat_model.c + src/char16.c ) diff --git a/maca_common/include/word.h b/maca_common/include/word.h index e706776..33fd8c1 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -2,6 +2,7 @@ #define __WORD__ #include "mcd.h" +#include "char16.h" #define WORD_INVALID_GOV 10000 @@ -12,24 +13,40 @@ typedef struct _word { int signature; /* pos tags that this form can have (represented as a boolean string) */ int label; char *form; + char16 *form_char16; int index; int is_root; } word; - +/* #define word_get_s1(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[strlen((w)->form) - 1]) #define word_get_s2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[strlen((w)->form) - 2]) #define word_get_s3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[strlen((w)->form) - 3]) #define word_get_s4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[strlen((w)->form) - 4]) #define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5]) #define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? 
-1 : (w)->form[strlen((w)->form) - 6]) +*/ +#define word_get_s1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 1]) +#define word_get_s2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 2]) +#define word_get_s3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 3]) +#define word_get_s4(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 4))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 4]) +#define word_get_s5(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 5]) +#define word_get_s6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 6))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 6]) -#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0]) +/*#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0]) #define word_get_p2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[1]) #define word_get_p3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[2]) #define word_get_p4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[3]) #define word_get_p5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[4]) #define word_get_p6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[5]) +*/ + +#define word_get_p1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) )? 
-1 : (w)->form_char16[0]) +#define word_get_p2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[1]) +#define word_get_p3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[2]) +#define word_get_p4(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[3]) +#define word_get_p5(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 4))? -1 : (w)->form_char16[4]) +#define word_get_p6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[5]) #define word_get_id(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_ID]) #define word_get_form(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FORM]) diff --git a/maca_common/src/char16.c b/maca_common/src/char16.c index 74fbd94..b07d594 100644 --- a/maca_common/src/char16.c +++ b/maca_common/src/char16.c @@ -36,7 +36,7 @@ int utf8_strlen(char *utf8_string) char *char16toutf8(char16 *char16_string) { - + return NULL; } @@ -49,7 +49,6 @@ int char16_strlen(char16 *string) char16 *utf8tochar16(char *utf8_string) { - char16 c16; int i,j; int utf8_length = strlen(utf8_string); int char16_length = 0; @@ -57,28 +56,27 @@ char16 *utf8tochar16(char *utf8_string) for(i=0; i < utf8_length; i++) char16_length += length(utf8_string[i]); - char16_string = malloc((char16_length + 1)* sizeof(char)); + char16_string = malloc((char16_length + 1)* sizeof(char16)); for(i=0, j=0; i < utf8_length; i++, j++){ if(length(utf8_string[i]) == 1){ char16_string[j] = (char16)utf8_string[i]; } if(length(utf8_string[i]) == 2){ char16_string[j] = utf8_string[i]; - char16_string[j] << 8; + char16_string[j] = char16_string[j] << 8; char16_string[j] += utf8_string[++i]; } } char16_string[j] = 0; return char16_string; } - +/* int main(void) { int i; char string[200]; char16 *char16_string; 
strcpy(string, "élémentaire"); - /* strcpy(string, "konjunktúra-időszaknál"); */ printf("string = %s\n", string); printf("length = %d\n", (int)strlen(string)); @@ -92,3 +90,4 @@ int main(void) printf("char16_strlen = %d\n", char16_strlen(char16_string)); } +*/ diff --git a/maca_common/src/word.c b/maca_common/src/word.c index e18a4b6..d4c01a6 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -19,6 +19,7 @@ word *word_new(char *input) w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV; w->form = NULL; + w->form_char16 = NULL; w->index = -1; w->signature = -1; @@ -65,6 +66,7 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct) } if(mcd_struct->wf[col] == MCD_WF_FORM){ w->form = strdup(token); + w->form_char16 = utf8tochar16(w->form); w->U1 = isupper(token[0]) ? 1 : 0; } col++; @@ -96,6 +98,7 @@ void word_free(word *w) if(w == NULL) return; if(w->input) free(w->input); if(w->form) free(w->form); + if(w->form_char16) free(w->form_char16); free(w); } -- GitLab