Skip to content
Snippets Groups Projects
Commit 972ca29a authored by Alexis Nasr's avatar Alexis Nasr
Browse files

fplm_suff.c: a program that computes the edit operations needed to transform a form into its lemma (WIP)

parent aa5addef
No related branches found
No related tags found
No related merge requests found
#ifndef __CHAR16__
#define __CHAR16__
typedef short char16;
typedef unsigned short char16;
int utf8_strlen(char *utf8_string);
char *char16toutf8(char16 *char16_string);
......
......@@ -2,8 +2,8 @@
#include<stdlib.h>
#include<string.h>
typedef short char16;
//#include"char16.h"
typedef unsigned short char16;
#define char_bit1(c) ((c) & 1)
#define char_bit2(c) (((c) & 2) >> 1)
#define char_bit3(c) (((c) & 4) >> 2)
......@@ -34,12 +34,6 @@ int utf8_strlen(char *utf8_string)
return l;
}
/*
 * Placeholder implementation: performs no conversion and always returns
 * NULL, whatever char16_string contains.
 */
char *char16toutf8(char16 *char16_string)
{
return NULL;
}
int char16_strlen(char16 *string)
{
int i=0;
......@@ -47,6 +41,39 @@ int char16_strlen(char16 *string)
return i;
}
/*
 * Convert a char16 string into a newly allocated, NUL-terminated byte string.
 *
 * The number of units processed is whatever char16_strlen() reports for
 * char16_string. Each unit is split into a high part (c >> 8) and a low byte
 * (c & 255): when the high part is zero only the low byte is emitted,
 * otherwise the high byte is emitted first, followed by the low byte.
 *
 * Returns a buffer obtained from malloc() that the caller must free(), or
 * NULL when char16_string is NULL or the allocation fails.
 */
char *char16toutf8(char16 *char16_string)
{
    int length_char16;
    int length_utf8 = 0;
    int i, j;
    char *utf8_string;

    if (char16_string == NULL)
        return NULL;

    length_char16 = char16_strlen(char16_string);

    /* First pass: count how many bytes the output needs. */
    for (i = 0; i < length_char16; i++)
        length_utf8 += ((char16_string[i] >> 8) != 0) ? 2 : 1;

    /* +1 leaves room for the terminator stored after the second pass. */
    utf8_string = malloc((size_t)length_utf8 + 1);
    if (utf8_string == NULL)
        return NULL;

    /* Second pass: emit the bytes. */
    j = 0;
    for (i = 0; i < length_char16; i++) {
        char16 c = char16_string[i];
        int lo = c & 255;
        int hi = c >> 8;

        /* Trace output kept as-is so the program's stdout is unchanged. */
        printf("c = %d hi = %d lo = %d\n", c, hi, lo);
        if (hi != 0)
            utf8_string[j++] = (char)hi;
        utf8_string[j++] = (char)lo;
    }
    utf8_string[j] = 0;
    return utf8_string;
}
char16 *utf8tochar16(char *utf8_string)
{
int i,j;
......@@ -75,8 +102,9 @@ int main(void)
{
int i;
char string[200];
char *utf8_string;
char16 *char16_string;
strcpy(string, "élémentaire");
strcpy(string, "élèmentaire");
printf("string = %s\n", string);
printf("length = %d\n", (int)strlen(string));
......@@ -89,5 +117,11 @@ int main(void)
char16_string = utf8tochar16(string);
printf("char16_strlen = %d\n", char16_strlen(char16_string));
utf8_string = char16toutf8(char16_string);
for(i=0; i < strlen(utf8_string); i++){
printf("%d\t%c\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\tl=%d\n", i, utf8_string[i], (int)utf8_string[i], char_bit1(utf8_string[i]), char_bit2(utf8_string[i]), char_bit3(utf8_string[i]), char_bit4(utf8_string[i]), char_bit5(utf8_string[i]), char_bit6(utf8_string[i]), char_bit7(utf8_string[i]), char_bit8(utf8_string[i]), length(utf8_string[i]));
}
*/
}
......@@ -6,3 +6,7 @@ target_link_libraries(mcf2conll transparse)
target_link_libraries(mcf2conll maca_common)
install (TARGETS mcf2conll DESTINATION bin)
# fplm_suff: executable built from src/fplm_suff.c, linked against
# maca_common and installed into bin.
add_executable(fplm_suff ./src/fplm_suff.c)
target_link_libraries(fplm_suff maca_common)
install (TARGETS fplm_suff DESTINATION bin)
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<getopt.h>
#include"util.h"
#include"char16.h"
/* Run-time settings of the program, filled in from the command line. */
typedef struct {
/* set to 1 when -h / --help is given */
int help;
/* set to 1 when -v / --verbose is given */
int verbose;
/* set to 1 when -d / --debug is given */
int debug_mode;
/* heap copy of argv[0]; released by context_free() */
char *program_name;
/* heap copy of the -f / --fplm argument, or NULL when the option is absent;
   released by context_free() */
char *fplm_filename;
} context;
/*
 * Release a context together with the two strings it owns.
 * Passing NULL is allowed and does nothing.
 */
void context_free(context *ctx)
{
    if (ctx == NULL)
        return;

    /* free(NULL) is a no-op, so unset members need no separate test. */
    free(ctx->program_name);
    free(ctx->fplm_filename);
    free(ctx);
}
context *context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->fplm_filename = NULL;
return ctx;
}
/*
 * Print the usage summary on stderr.
 * ctx->program_name is used as the command name in the usage line.
 * Every option accepted by context_read_options() is listed, including
 * -d / --debug.
 */
void context_general_help_message(context *ctx)
{
    fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
    fprintf(stderr, "Options:\n");
    fprintf(stderr, "\t-h --help : print this message\n");
    fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
    fprintf(stderr, "\t-d --debug : activate debug mode\n");
    fprintf(stderr, "\t-f --fplm : fplm filename (read from stdin if absent)\n");
}
/*
 * When help was requested, print the usage message and terminate the
 * process with exit status 1; otherwise return without doing anything.
 */
void fplm_suff_check_options(context *ctx)
{
    if (!ctx->help)
        return;

    context_general_help_message(ctx);
    exit(1);
}
/*
 * Build a context from the command line.
 *
 * Recognised options: -h/--help, -v/--verbose, -d/--debug and
 * -f/--fplm <filename>. Unrecognised options are silently ignored
 * (opterr is set to 0 and no error case is handled).
 *
 * Returns a context created by context_new(); program_name holds a copy of
 * argv[0] and fplm_filename a copy of the last -f argument (NULL if none).
 * The caller releases it with context_free().
 */
context *context_read_options(int argc, char *argv[])
{
    /* getopt_long() requires the table to end with an all-zero entry. */
    static struct option long_options[] = {
        {"help",    no_argument,       0, 'h'},
        {"verbose", no_argument,       0, 'v'},
        {"debug",   no_argument,       0, 'd'},
        {"fplm",    required_argument, 0, 'f'},
        {0, 0, 0, 0}
    };
    int c;
    int option_index = 0;
    context *ctx = context_new();

    ctx->program_name = strdup(argv[0]);

    /* Restart option scanning from the beginning and silence getopt's own
       error messages. */
    optind = 0;
    opterr = 0;

    while ((c = getopt_long(argc, argv, "hvdf:", long_options, &option_index)) != -1) {
        switch (c) {
        case 'h':
            ctx->help = 1;
            break;
        case 'v':
            ctx->verbose = 1;
            break;
        case 'd':
            ctx->debug_mode = 1;
            break;
        case 'f':
            /* -f given more than once: drop the earlier copy, keep the last. */
            free(ctx->fplm_filename);
            ctx->fplm_filename = strdup(optarg);
            break;
        default:
            /* unknown option or missing argument: ignored, as before */
            break;
        }
    }
    return ctx;
}
/*
 * Compute and print the suffix-replacement class that maps a form onto its
 * lemma.
 *
 * The longest common prefix of the two strings is skipped; what remains of
 * each string is its "suffix". The class is laid out as
 *
 *   [ form suffix length, form suffix units read from the END of the form,
 *     lemma suffix length, lemma suffix units in their natural order ]
 *
 * and is written to stdout as decimal integers, each followed by a space,
 * terminated by a newline.
 *
 * Lengths are those reported by char16_strlen(). Always returns 0.
 */
int compute_classe(char16 *lemma_char16, char16 *form_char16)
{
    int lemma_length = char16_strlen(lemma_char16);
    int form_length = char16_strlen(form_char16);
    int lemma_suffix_length;
    int form_suffix_length;
    int prefix = 0;
    int count = 0;
    int k;
    int *classe;

    /* Length of the common prefix. */
    while (prefix < lemma_length && prefix < form_length
           && form_char16[prefix] == lemma_char16[prefix])
        prefix++;

    lemma_suffix_length = lemma_length - prefix;
    form_suffix_length = form_length - prefix;

    /* Two length cells plus one cell per suffix unit. */
    classe = (int *)memalloc((lemma_suffix_length + form_suffix_length + 2) * sizeof(int));

    classe[count++] = form_suffix_length;
    for (k = 0; k < form_suffix_length; k++)
        classe[count++] = form_char16[form_length - k - 1];

    classe[count++] = lemma_suffix_length;
    for (k = 0; k < lemma_suffix_length; k++)
        classe[count++] = lemma_char16[prefix + k];

    /* The cells are already in output order, so print them sequentially. */
    for (k = 0; k < count; k++)
        printf("%d ", classe[k]);
    printf("\n");

    /* The class is only needed for printing; release it to avoid leaking one
       buffer per call. */
    free(classe);
    return 0;
}
/*
 * Entry point.
 *
 * Reads tab-separated lines "form<TAB>pos<TAB>lemma<TAB>morpho" from the file
 * given with -f (stdin when absent). For every line it converts form and
 * lemma with utf8tochar16(), converts them back with char16toutf8(), prints
 * the lemma before and after the round trip, and prints the class computed
 * by compute_classe().
 *
 * With -h the usage message is printed and the program exits with status 1.
 * Lines from which form, pos and lemma cannot all be read are skipped.
 */
int main(int argc, char *argv[])
{
    context *ctx = context_read_options(argc, argv);
    char form_utf8[100];
    char *form_utf8_2;
    char16 *form_char16;
    char pos[100];
    char lemma_utf8[100];
    char *lemma_utf8_2;
    char16 *lemma_char16;
    char morpho[100];
    FILE *F_fplm = stdin;
    char buffer[1000];

    /* Honour -h / --help (prints usage and exits). */
    fplm_suff_check_options(ctx);

    if (ctx->fplm_filename)
        F_fplm = myfopen(ctx->fplm_filename, "r");
    if (F_fplm == NULL) {
        context_free(ctx);
        return 1;
    }

    /* fgets() returning NULL is the only end-of-input test needed; testing
       feof() after a successful read would drop a final line that has no
       trailing newline. */
    while (fgets(buffer, sizeof buffer, F_fplm)) {
        /* Strip the newline only if there is one. */
        buffer[strcspn(buffer, "\n")] = '\0';

        /* Field widths keep each scanset inside its 100-byte array. */
        if (sscanf(buffer, "%99[^\t]\t%99[^\t]\t%99[^\t]\t%99[^\n]",
                   form_utf8, pos, lemma_utf8, morpho) < 3)
            continue;

        lemma_char16 = utf8tochar16(lemma_utf8);
        form_char16 = utf8tochar16(form_utf8);
        form_utf8_2 = char16toutf8(form_char16);
        lemma_utf8_2 = char16toutf8(lemma_char16);

        printf("lemma avant = %s lemme après = %s\n",
               lemma_utf8, lemma_utf8_2 ? lemma_utf8_2 : "");
        compute_classe(lemma_char16, form_char16);

        /* char16toutf8() hands back malloc'd buffers. */
        free(form_utf8_2);
        free(lemma_utf8_2);
        /* NOTE(review): lemma_char16 / form_char16 are presumably heap
           buffers owned by the caller as well, but utf8tochar16()'s
           allocation is not visible here - confirm, then free them too. */
    }

    if (ctx->fplm_filename)
        fclose(F_fplm);
    context_free(ctx);
    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment