diff --git a/maca_common/include/char16.h b/maca_common/include/char16.h
index a46a23425d1815fb3e704f25c4f3265d55e680a6..9aaa0d814fdb830127a22755ca86a2d9508304c2 100644
--- a/maca_common/include/char16.h
+++ b/maca_common/include/char16.h
@@ -1,7 +1,7 @@
 #ifndef __CHAR16__
 #define __CHAR16__
 
-typedef short char16;
+typedef unsigned short char16;
 
 int utf8_strlen(char *utf8_string);
 char *char16toutf8(char16 *char16_string);
diff --git a/maca_common/src/char16.c b/maca_common/src/char16.c
index 311e618726db9c7e060513b47b67d91eb2323e5a..0631fdb0bb8190ed8a160c492ee57e231b215464 100644
--- a/maca_common/src/char16.c
+++ b/maca_common/src/char16.c
@@ -2,8 +2,8 @@
 #include<stdlib.h>
 #include<string.h>
 
-typedef short char16;
-
+//#include"char16.h"
+typedef unsigned short char16;
 #define char_bit1(c) ((c) & 1)
 #define char_bit2(c) (((c) & 2) >> 1)
 #define char_bit3(c) (((c) & 4) >> 2)
@@ -34,12 +34,6 @@ int utf8_strlen(char *utf8_string)
   return l;
 }
 
-char *char16toutf8(char16 *char16_string)
-{
-  return NULL;
-}
-
-
 int char16_strlen(char16 *string)
 {
   int i=0;
@@ -47,6 +41,46 @@ int char16_strlen(char16 *string)
   return i;
 }
 
+/* Converts a char16 string, whose length is given by char16_strlen, into a
+   newly malloc'ed NUL-terminated byte string: every unit yields its high
+   byte (only when it is non-zero) followed by its low byte. The caller
+   frees the result; NULL is returned when the allocation fails.
+   NOTE(review): presumably the inverse of utf8tochar16's packing - confirm. */
+char *char16toutf8(char16 *char16_string)
+{
+  char16 c;
+  int i, j;
+  int length_char16 = char16_strlen(char16_string);
+  int length_utf8 = 0;
+  int hi,lo;
+  char *utf8_string;
+
+  for(i=0; i < length_char16; i++){
+    c = char16_string[i];
+    hi = c >> 8;
+    if(hi != 0)
+      length_utf8 += 2;
+    else
+      length_utf8 += 1;
+  }
+  /* one extra byte for the terminating NUL written after the copy loop */
+  utf8_string = (char *)malloc((length_utf8 + 1) * sizeof(char));
+  if(utf8_string == NULL)
+    return NULL;
+  j = 0;
+  for(i=0; i < length_char16; i++){
+    c = char16_string[i];
+    lo = c & 255;
+    hi = c >> 8;
+    if(hi != 0)
+      utf8_string[j++] = (char)hi;
+    utf8_string[j++] = (char)lo;
+  }
+  utf8_string[j] = 0;
+  return utf8_string;
+}
+
+
 char16 *utf8tochar16(char *utf8_string)
 {
   int i,j;
@@ -75,8 +109,9 @@ int main(void)
 {
   int i;
   char string[200];
+  char *utf8_string;
   char16 *char16_string;
-  strcpy(string, "élémentaire");
+  strcpy(string, "élèmentaire");
 
   printf("string = %s\n", string);
   printf("length = %d\n", (int)strlen(string));
@@ -88,6 +123,12 @@ int main(void)
 
   char16_string = utf8tochar16(string);
   printf("char16_strlen = %d\n", char16_strlen(char16_string));
+
+  utf8_string = char16toutf8(char16_string);
+  for(i=0; i < strlen(utf8_string); i++){
+    printf("%d\t%c\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\tl=%d\n", i, utf8_string[i], (int)utf8_string[i], char_bit1(utf8_string[i]), char_bit2(utf8_string[i]), char_bit3(utf8_string[i]), char_bit4(utf8_string[i]), char_bit5(utf8_string[i]), char_bit6(utf8_string[i]), char_bit7(utf8_string[i]), char_bit8(utf8_string[i]), length(utf8_string[i]));
+  }
+*/
 }
-*/
+
 
diff --git a/maca_tools/CMakeLists.txt b/maca_tools/CMakeLists.txt
index 9b5a40e8317025b1b7915b23b3d9b6a10a87bfb5..7ba50b065165c74f28aaa019f8cb2e895846fb56 100644
--- a/maca_tools/CMakeLists.txt
+++ b/maca_tools/CMakeLists.txt
@@ -6,3 +6,7 @@ target_link_libraries(mcf2conll transparse)
 target_link_libraries(mcf2conll maca_common)
 install (TARGETS mcf2conll DESTINATION bin)
 
+add_executable(fplm_suff ./src/fplm_suff.c)
+target_link_libraries(fplm_suff maca_common)
+install (TARGETS fplm_suff DESTINATION bin)
+
diff --git a/maca_tools/src/fplm_suff.c b/maca_tools/src/fplm_suff.c
new file mode 100644
index 0000000000000000000000000000000000000000..941949f6b7a35f7f5827daf708a134e5c7e670f3
--- /dev/null
+++ b/maca_tools/src/fplm_suff.c
@@ -0,0 +1,197 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<getopt.h>
+
+#include"util.h"
+#include"char16.h"
+
+
+/* Command line settings of the program. */
+typedef struct {
+  int help;            /* set by -h / --help */
+  int verbose;         /* set by -v / --verbose */
+  int debug_mode;      /* set by -d / --debug */
+  char *program_name;  /* strdup'ed copy of argv[0] */
+  char *fplm_filename; /* strdup'ed -f argument, NULL when absent */
+} context;
+
+/* Releases a context and the strings it owns; NULL is accepted. */
+void context_free(context *ctx)
+{
+  if(ctx){
+    free(ctx->program_name);
+    free(ctx->fplm_filename);
+    free(ctx);
+  }
+}
+
+/* Allocates a context with every option switched off. */
+context *context_new(void)
+{
+  context *ctx = (context *)memalloc(sizeof(context));
+
+  ctx->help = 0;
+  ctx->verbose = 0;
+  ctx->debug_mode = 0;
+  ctx->program_name = NULL;
+  ctx->fplm_filename = NULL;
+  return ctx;
+}
+
+/* Prints the usage message on stderr. */
+void context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-d --debug : activate debug mode\n");
+  fprintf(stderr, "\t-f --fplm : fplm filename (read from stdin if absent)\n");
+}
+
+/* Prints the usage message and exits with status 1 when help was requested. */
+void fplm_suff_check_options(context *ctx){
+  if(ctx->help){
+    context_general_help_message(ctx);
+    exit(1);
+  }
+}
+
+/* Builds a context from the command line. Options that getopt_long does
+   not recognise are skipped without a message (opterr is cleared). */
+context *context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = context_new();
+
+  ctx->program_name = strdup(argv[0]);
+
+  /* getopt_long() needs an all-zero entry to find the end of the table */
+  static struct option long_options[] =
+    {
+      {"help",    no_argument,       0, 'h'},
+      {"verbose", no_argument,       0, 'v'},
+      {"debug",   no_argument,       0, 'd'},
+      {"fplm",    required_argument, 0, 'f'},
+      {0, 0, 0, 0}
+    };
+  optind = 0;
+  opterr = 0;
+
+  while ((c = getopt_long (argc, argv, "hvdf:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'd':
+        ctx->debug_mode = 1;
+        break;
+      case 'f':
+        /* a repeated -f replaces the previous name instead of leaking it */
+        free(ctx->fplm_filename);
+        ctx->fplm_filename = strdup(optarg);
+        break;
+      default:
+        break;
+      }
+  }
+  return ctx;
+}
+
+
+/* Prints on stdout, as integers on one line, the class of a (lemma, form)
+   pair: the length of the form suffix, its code units read backwards
+   from the end of the form, the length of the lemma suffix, then its code
+   units in order. A suffix is what follows the longest common prefix of
+   the two strings. Always returns 0. */
+int compute_classe(char16 *lemma_char16, char16 *form_char16)
+{
+  int i,j,k;
+  int lemma_suffix_length;
+  int form_suffix_length;
+  int lemma_length = char16_strlen(lemma_char16);
+  int form_length = char16_strlen(form_char16);
+  int *classe;
+
+  /* after this loop i is the length of the common prefix */
+  for(i=0; (i < lemma_length) && (i < form_length); i++)
+    if(form_char16[i] != lemma_char16[i])
+      break;
+
+  lemma_suffix_length = lemma_length - i;
+  form_suffix_length = form_length - i;
+
+  classe = (int *)memalloc((lemma_suffix_length + form_suffix_length + 2) * sizeof(int));
+
+  j = 0;
+  classe[j++] = form_suffix_length;
+  for(k=0; k < form_suffix_length; k++)
+    classe[j++] = form_char16[form_length - k - 1];
+  classe[j++] = lemma_suffix_length;
+  for(k=0; k < lemma_suffix_length; k++)
+    classe[j++] = lemma_char16[i + k];
+
+  /* j counts the filled cells, so a single loop prints the whole class */
+  for(k=0; k < j; k++)
+    printf("%d ", classe[k]);
+  printf("\n");
+
+  free(classe);
+  return 0;
+}
+
+
+/* Reads tab separated "form pos lemma morpho" lines from the fplm file, or
+   from stdin when no file was given, and prints for each line the lemma
+   before and after a round trip through char16, then its suffix class. */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  char form_utf8[100];
+  char16 *form_char16;
+  char pos[100];
+  char lemma_utf8[100];
+  char *lemma_utf8_2;
+  char16 *lemma_char16;
+  char morpho[100];
+  FILE *F_fplm = stdin;
+  char buffer[1000];
+
+  fplm_suff_check_options(ctx);
+
+  if(ctx->fplm_filename)
+    F_fplm = myfopen(ctx->fplm_filename, "r");
+
+  while(fgets(buffer, sizeof(buffer), F_fplm)){
+    /* cut the line at its terminator, if any: a last line without '\n'
+       is neither truncated nor dropped */
+    buffer[strcspn(buffer, "\n")] = '\0';
+    /* the field widths keep sscanf inside the 100 byte buffers */
+    if(sscanf(buffer, "%99[^\t]\t%99[^\t]\t%99[^\t]\t%99[^\n]", form_utf8, pos, lemma_utf8, morpho) < 3){
+      fprintf(stderr, "%s: ignoring malformed line: %s\n", ctx->program_name, buffer);
+      continue;
+    }
+    lemma_char16 = utf8tochar16(lemma_utf8);
+    form_char16 = utf8tochar16(form_utf8);
+
+    lemma_utf8_2 = char16toutf8(lemma_char16);
+    if(lemma_utf8_2)
+      printf("lemma avant = %s lemme après = %s\n", lemma_utf8, lemma_utf8_2);
+    free(lemma_utf8_2);
+
+    compute_classe(lemma_char16, form_char16);
+
+    /* NOTE(review): lemma_char16 and form_char16 are not released here;
+       free them once utf8tochar16 is confirmed to return malloc'ed memory */
+  }
+  if(ctx->fplm_filename)
+    fclose(F_fplm);
+  context_free(ctx);
+  return 0;
+}