Skip to content
Snippets Groups Projects
Commit 1a193466 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

added a 16 bits representation for words in order to accurately compute...

Added a 16-bit representation for words in order to accurately compute prefixes and suffixes of UTF-8 strings
parent c63e1d70
No related branches found
No related tags found
No related merge requests found
......@@ -12,6 +12,7 @@ set(SOURCES src/util.c
src/feat_desc.c
src/feat_lib.c
src/feat_model.c
src/char16.c
)
......
......@@ -2,6 +2,7 @@
#define __WORD__
#include "mcd.h"
#include "char16.h"
#define WORD_INVALID_GOV 10000
......@@ -12,24 +13,40 @@ typedef struct _word {
int signature; /* pos tags that this form can have (represented as a boolean string) */
int label;
char *form;
char16 *form_char16;
int index;
int is_root;
} word;
/*
#define word_get_s1(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[strlen((w)->form) - 1])
#define word_get_s2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[strlen((w)->form) - 2])
#define word_get_s3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[strlen((w)->form) - 3])
#define word_get_s4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[strlen((w)->form) - 4])
#define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5])
#define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? -1 : (w)->form[strlen((w)->form) - 6])
*/
/* Suffix accessors: word_get_sK(w) yields the K-th char16 counted from the end
   of w->form_char16 (s1 is the last one). The result is -1 when w is NULL,
   when its char16 form is NULL, or when that form holds fewer than K units. */
#define word_get_s1(w) (((w) != NULL && (w)->form_char16 != NULL && char16_strlen((w)->form_char16) >= 1) ? (w)->form_char16[char16_strlen((w)->form_char16) - 1] : -1)
#define word_get_s2(w) (((w) != NULL && (w)->form_char16 != NULL && char16_strlen((w)->form_char16) >= 2) ? (w)->form_char16[char16_strlen((w)->form_char16) - 2] : -1)
#define word_get_s3(w) (((w) != NULL && (w)->form_char16 != NULL && char16_strlen((w)->form_char16) >= 3) ? (w)->form_char16[char16_strlen((w)->form_char16) - 3] : -1)
#define word_get_s4(w) (((w) != NULL && (w)->form_char16 != NULL && char16_strlen((w)->form_char16) >= 4) ? (w)->form_char16[char16_strlen((w)->form_char16) - 4] : -1)
#define word_get_s5(w) (((w) != NULL && (w)->form_char16 != NULL && char16_strlen((w)->form_char16) >= 5) ? (w)->form_char16[char16_strlen((w)->form_char16) - 5] : -1)
#define word_get_s6(w) (((w) != NULL && (w)->form_char16 != NULL && char16_strlen((w)->form_char16) >= 6) ? (w)->form_char16[char16_strlen((w)->form_char16) - 6] : -1)
/* Byte-based first-character accessor: reads (w)->form[0], or gives -1 when w or its form is NULL.
   NOTE(review): the same macro name is defined again further down in terms of form_char16;
   presumably this is the line the commit removed. Confirm that only one definition remains. */
#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0])
/*#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0])
#define word_get_p2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[1])
#define word_get_p3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[2])
#define word_get_p4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[3])
#define word_get_p5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[4])
#define word_get_p6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[5])
*/
/* Prefix accessors: word_get_pK(w) yields the K-th char16 of w->form_char16
   (p1 is the first one). The result is -1 when w is NULL, when its char16 form
   is NULL, or when that form holds fewer than K units.
   The length bound is K rather than K-1, so the terminating 0 is never
   returned as if it were a character; this mirrors the word_get_sK suffix
   accessors, which also give -1 for words that are too short. */
#define word_get_p1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[0])
#define word_get_p2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[1])
#define word_get_p3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[2])
#define word_get_p4(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 4))? -1 : (w)->form_char16[3])
#define word_get_p5(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[4])
#define word_get_p6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 6))? -1 : (w)->form_char16[5])
/* Feature-array accessors: read the MCD_WF_ID / MCD_WF_FORM slot of
   w->wf_array, or give -1 when w is NULL. */
#define word_get_id(w) (((w) != NULL) ? (w)->wf_array[MCD_WF_ID] : -1)
#define word_get_form(w) (((w) != NULL) ? (w)->wf_array[MCD_WF_FORM] : -1)
......
......@@ -36,7 +36,7 @@ int utf8_strlen(char *utf8_string)
/* Placeholder for the char16 -> UTF-8 conversion.
   The argument is ignored and NULL is always returned. */
char *char16toutf8(char16 *char16_string)
{
  (void)char16_string; /* input is not examined */
  return NULL;
}
......@@ -49,7 +49,6 @@ int char16_strlen(char16 *string)
/* Builds a 0-terminated array of char16 from the byte string utf8_string and
   returns it. The array is obtained with malloc.
   Bytes for which length() returns 1 become one char16 each; when length()
   returns 2, that byte and the following one are packed into a single char16
   with the first byte shifted into the upper 8 bits.
   NOTE(review): this listing interleaves lines of a diff; the declaration of
   char16_string is not visible here.
   NOTE(review): utf8_string is passed to strlen() without a NULL check and the
   malloc result is used unchecked. */
char16 *utf8tochar16(char *utf8_string)
{
/* NOTE(review): c16 is not referenced in the visible lines. */
char16 c16;
int i,j;
int utf8_length = strlen(utf8_string);
int char16_length = 0;
......@@ -57,28 +56,27 @@ char16 *utf8tochar16(char *utf8_string)
/* Sum length() over every byte; the total is used as the element count for the allocation. */
for(i=0; i < utf8_length; i++)
char16_length += length(utf8_string[i]);
/* NOTE(review): this allocation is sized with sizeof(char) and its result is
   overwritten by the next statement; presumably it is the pre-change diff line. */
char16_string = malloc((char16_length + 1)* sizeof(char));
char16_string = malloc((char16_length + 1)* sizeof(char16));
/* i walks the input bytes, j the output units; both advance once per iteration. */
for(i=0, j=0; i < utf8_length; i++, j++){
if(length(utf8_string[i]) == 1){
char16_string[j] = (char16)utf8_string[i];
}
/* NOTE(review): when length() returns neither 1 nor 2, nothing is stored in
   char16_string[j], yet j still advances. Confirm whether that can occur. */
if(length(utf8_string[i]) == 2){
/* NOTE(review): utf8_string[i] is a plain char; if char is signed, bytes >= 0x80
   are negative here and in the += below. Confirm the intended packing. */
char16_string[j] = utf8_string[i];
/* NOTE(review): the next statement discards the shift result (no assignment);
   presumably the pre-change diff line. */
char16_string[j] << 8;
char16_string[j] = char16_string[j] << 8;
/* ++i consumes the second byte of the pair. */
char16_string[j] += utf8_string[++i];
}
}
/* Terminate the char16 array with 0. */
char16_string[j] = 0;
return char16_string;
}
/*
int main(void)
{
int i;
char string[200];
char16 *char16_string;
strcpy(string, "élémentaire");
// strcpy(string, "konjunktúra-időszaknál");
printf("string = %s\n", string);
printf("length = %d\n", (int)strlen(string));
......@@ -92,3 +90,4 @@ int main(void)
printf("char16_strlen = %d\n", char16_strlen(char16_string));
}
*/
......@@ -19,6 +19,7 @@ word *word_new(char *input)
w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV;
w->form = NULL;
w->form_char16 = NULL;
w->index = -1;
w->signature = -1;
......@@ -65,6 +66,7 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct)
}
if(mcd_struct->wf[col] == MCD_WF_FORM){
w->form = strdup(token);
w->form_char16 = utf8tochar16(w->form);
w->U1 = isupper(token[0]) ? 1 : 0;
}
col++;
......@@ -96,6 +98,7 @@ void word_free(word *w)
if(w == NULL) return;
if(w->input) free(w->input);
if(w->form) free(w->form);
if(w->form_char16) free(w->form_char16);
free(w);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment