From eb06d12ba67b175009769447d4d20468bc721479 Mon Sep 17 00:00:00 2001 From: Marjorie Armando <marjorie.ARMANDO.1@etu.univ-amu.fr> Date: Thu, 4 May 2017 22:42:21 +0200 Subject: [PATCH] can predict all targets in one, each targets, each targets then all targets in one -- made pos and person as a pattern feature --- Files/fm1.txt | 1 + Files/fm2.txt | 3 + Files/fm3.txt | 5 + Files/fm4.txt | 7 + Files/fm5.txt | 9 + Files/fm6.txt | 11 + Files/fm7.txt | 13 + maca_common/include/char16.h | 11 + maca_common/include/feat_desc.h | 5 +- maca_common/src/char16.c | 67 ++++ maca_morpho/src/fplm.h | 11 +- maca_morpho/src/fplm2cff.c | 14 +- maca_morpho/src/fplm2cff.h | 10 +- maca_morpho/src/fplm2cff_fct.c | 194 ++++++++++-- maca_morpho/src/fplm_fct.c | 80 ++--- maca_morpho/src/maca_morpho_context.c | 34 ++- maca_morpho/src/maca_morpho_context.h | 8 +- maca_morpho/src/maca_morpho_feat_fct.c | 87 +++--- maca_morpho/src/predict.c | 11 +- maca_morpho/src/predict.h | 31 +- maca_morpho/src/predict_fct.c | 406 ++++++++++++++++++++++--- maca_morpho/src/vectorize.c | 8 +- maca_morpho/src/vectorize.h | 2 +- 23 files changed, 857 insertions(+), 171 deletions(-) create mode 100644 Files/fm1.txt create mode 100644 Files/fm2.txt create mode 100644 Files/fm3.txt create mode 100644 Files/fm4.txt create mode 100644 Files/fm5.txt create mode 100644 Files/fm6.txt create mode 100644 Files/fm7.txt create mode 100644 maca_common/include/char16.h create mode 100644 maca_common/src/char16.c diff --git a/Files/fm1.txt b/Files/fm1.txt new file mode 100644 index 0000000..cd1b0f4 --- /dev/null +++ b/Files/fm1.txt @@ -0,0 +1 @@ +feat_pos s1 diff --git a/Files/fm2.txt b/Files/fm2.txt new file mode 100644 index 0000000..50530cf --- /dev/null +++ b/Files/fm2.txt @@ -0,0 +1,3 @@ +feat_person feat_pos s1 +feat_person feat_pos s2 +feat_person feat_pos s1 s2 diff --git a/Files/fm3.txt b/Files/fm3.txt new file mode 100644 index 0000000..c69aac6 --- /dev/null +++ b/Files/fm3.txt @@ -0,0 +1,5 @@ +feat_person feat_pos s1 +feat_person feat_pos s2 +feat_person feat_pos s3 +feat_person feat_pos s1 s2 +feat_person feat_pos s1 s2 s3 diff --git a/Files/fm4.txt b/Files/fm4.txt new file mode 100644 index 0000000..9b8dc89 --- /dev/null +++ b/Files/fm4.txt @@ -0,0 +1,7 @@ +s1 +s2 +s3 +s4 +s1 s2 +s1 s2 s3 +s1 s2 s3 s4 diff --git a/Files/fm5.txt b/Files/fm5.txt new file mode 100644 index 0000000..0d37c50 --- /dev/null +++ b/Files/fm5.txt @@ -0,0 +1,9 @@ +feat_person feat_pos s1 +feat_person feat_pos s2 +feat_person feat_pos s3 +feat_person feat_pos s4 +feat_person feat_pos s5 +feat_person feat_pos s1 s2 +feat_person feat_pos s1 s2 s3 +feat_person feat_pos s1 s2 s3 s4 +feat_person feat_pos s1 s2 s3 s4 s5 diff --git a/Files/fm6.txt b/Files/fm6.txt new file mode 100644 index 0000000..980c9ad --- /dev/null +++ b/Files/fm6.txt @@ -0,0 +1,11 @@ +s1 +s2 +s3 +s4 +s5 +s6 +s1 s2 +s1 s2 s3 +s1 s2 s3 s4 +s1 s2 s3 s4 s5 +s1 s2 s3 s4 s5 s6 diff --git a/Files/fm7.txt b/Files/fm7.txt new file mode 100644 index 0000000..ede6e21 --- /dev/null +++ b/Files/fm7.txt @@ -0,0 +1,13 @@ +s1 +s2 +s3 +s4 +s5 +s6 +s7 +s1 s2 +s1 s2 s3 +s1 s2 s3 s4 +s1 s2 s3 s4 s5 +s1 s2 s3 s4 s5 s6 +s1 s2 s3 s4 s5 s6 s7 diff --git a/maca_common/include/char16.h b/maca_common/include/char16.h new file mode 100644 index 0000000..9ef98d1 --- /dev/null +++ b/maca_common/include/char16.h @@ -0,0 +1,11 @@ +#ifndef __CHAR16__ +#define __CHAR16__ + +typedef short char16; + +int utf8_strlen(char *utf8_string); +char *char16toutf8(char16 *char16_string); +int char16_strlen(char16 *string); +char16 *utf8tochar16(char *utf8_string); + +#endif diff --git a/maca_common/include/feat_desc.h b/maca_common/include/feat_desc.h index ff5dc46..1fa17ce 100644 --- a/maca_common/include/feat_desc.h +++ b/maca_common/include/feat_desc.h @@ -1,15 +1,14 @@ #ifndef __FEAT_DESC__ #define __FEAT_DESC__ -#include "char16.h"/* typedef struct { char* form; char* pos; + char* morpho; }FP; -typedef int (*feat_fct) (FP *c);*/ -typedef int (*feat_fct) (void *c); +typedef int (*feat_fct) (FP *c); typedef struct { char *name; diff --git a/maca_common/src/char16.c b/maca_common/src/char16.c new file mode 100644 index 0000000..16d395f --- /dev/null +++ b/maca_common/src/char16.c @@ -0,0 +1,67 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include "char16.h" + +#define char_bit1(c) ((c) & 1) +#define char_bit2(c) (((c) & 2) >> 1) +#define char_bit3(c) (((c) & 4) >> 2) +#define char_bit4(c) (((c) & 8) >> 3) +#define char_bit5(c) (((c) & 16) >> 4) +#define char_bit6(c) (((c) & 32) >> 5) +#define char_bit7(c) (((c) & 64) >> 6) +#define char_bit8(c) (((c) & 128) >> 7) +#define length(c) ((!char_bit8((c)) || (char_bit8(c) && !char_bit7(c)))? 1 : 2) +/* +int length(char c) +{ + if(!char_bit8(c)) return 1; + if(char_bit8(c) && !char_bit7(c)) return 1; + if(char_bit7(c)) return 2; + if(char_bit6(c)) return 3; + if(char_bit5(4)) return 4; + +} +*/ +int utf8_strlen(char *utf8_string) +{ + int l = 0; + while(*utf8_string){ + l += (length(*utf8_string) == 1) ? 1 : 0; + utf8_string++; + } + return l; +} +char *char16toutf8(char16 *char16_string) +{ + return NULL; +} +int char16_strlen(char16 *string) +{ + int i=0; + while(string[i]) i++; + return i; +} +char16 *utf8tochar16(char *utf8_string) +{ + int i,j; + int utf8_length = strlen(utf8_string); + int char16_length = 0; + char16 *char16_string; + for(i=0; i < utf8_length; i++) + char16_length += length(utf8_string[i]); + + char16_string = (char16*) malloc((char16_length + 1)* sizeof(char16)); + for(i=0, j=0; i < utf8_length; i++, j++){ + if(length(utf8_string[i]) == 1){ + char16_string[j] = (char16)utf8_string[i]; + } + if(length(utf8_string[i]) == 2){ + char16_string[j] = utf8_string[i]; + char16_string[j] = char16_string[j] << 8; + char16_string[j] += utf8_string[++i]; + } + } + char16_string[j] = 0; + return char16_string; +} diff --git a/maca_morpho/src/fplm.h b/maca_morpho/src/fplm.h index 5806772..e6d539c 100644 --- a/maca_morpho/src/fplm.h +++ b/maca_morpho/src/fplm.h @@ -5,12 +5,13 @@ typedef enum { TENSE, PERSON, GENDER, NUMBER - }CLASS; + }TARGET; int read_line_fplm(FILE* fplm, char* form, char* pos, char* lemma, char* morpho); - int choose_class(char* class); - int extract_class_position(CLASS class); - void extract_morpho_feature(CLASS class, char* morpho_feature, char* morpho); - int associate_number_to_classes(FILE* classes_code, CLASS class, char* morpho_feature, int current_morpho_feature); + int extract_class_position(TARGET target); + int choose_target(char* target); + int associate_number_to_classes_separate(FILE* code_class, char* target_class, int current_target_class); + int associate_number_to_classes(FILE* code_class, char* target_class, int all_classes); + void extract_classes_from_morpho(TARGET target, char* target_class, char* morpho); #endif diff --git a/maca_morpho/src/fplm2cff.c b/maca_morpho/src/fplm2cff.c index 6547410..e9208f5 100644 --- a/maca_morpho/src/fplm2cff.c +++ b/maca_morpho/src/fplm2cff.c @@ -1,5 +1,6 @@ #include <stdlib.h> #include <stdio.h> +#include <string.h> #include "fplm2cff.h" int main(int argc, char *argv[]) @@ -7,9 +8,18 @@ int main(int argc, char *argv[]) context *ctx = context_read_options(argc, argv); if(ctx->help) fplm2cff_help_message(ctx); - create_cff(ctx); + if(ctx->target_name == NULL) + { + fprintf(stderr,"target name required\n"); + return -1; + } + if(!strcmp(ctx->target_name,"all")) + create_cff_all_classes(ctx); + else + create_cff(ctx); printf("cff.txt has been generated in the Files directory.\n"); - printf("code_class has been generated in the Files directory.\n"); + printf("The code class file has been generated in the Files directory.\n"); + context_free(ctx); return 0; } diff --git a/maca_morpho/src/fplm2cff.h b/maca_morpho/src/fplm2cff.h index ebe8910..cc1c4f1 100644 --- a/maca_morpho/src/fplm2cff.h +++ b/maca_morpho/src/fplm2cff.h @@ -9,8 +9,14 @@ #include "vectorize.h" #include "fplm.h" - void create_cff(context* ctx); - void write_cff(FILE *cff, CLASS class, FILE* code_class, char* form, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features); void fplm2cff_help_message(context *ctx); + + void create_cff_all_classes(context* ctx); + void write_cff_all_classes_non_separate(FILE *cff, FILE* code_class, FP* fp, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features); + void write_cff_all_classes_separate(FILE *cff, FILE* code_class_big, FP* fp, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features); + + void create_cff(context* ctx); + void write_cff_non_separate(FILE *cff, TARGET target, FILE* code_class, FP* fp, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features); + void write_cff_separate(FILE *cff, TARGET target, FILE* code_class, FP* fp, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features); #endif diff --git a/maca_morpho/src/fplm2cff_fct.c b/maca_morpho/src/fplm2cff_fct.c index e17d0bd..47f4952 100644 --- a/maca_morpho/src/fplm2cff_fct.c +++ b/maca_morpho/src/fplm2cff_fct.c @@ -11,10 +11,137 @@ void fplm2cff_help_message(context *ctx) context_maca_data_path_help_message(ctx); context_features_filename_help_message(ctx); context_features_model_help_message(ctx); - context_class_help_message(ctx); + context_separate_classes_help_message(ctx); + context_target_help_message(ctx); exit(1); } + /*Predict all morpho features -tense, person, gender, number- in one*/ +void create_cff_all_classes(context* ctx) +{ + FILE* fplm = NULL; + FILE* cff = NULL; + FILE* code_class = NULL; + feat_vec *fv = NULL; + dico *dico_features = NULL; + feat_model *fm = NULL; + int fields_nb; + char form[100]; + char pos[50]; + char lemma[100]; + char morpho[50]; + FP* fp = malloc(sizeof(FP)); + fp->form = malloc(sizeof(char)*100); + fp->pos = malloc(sizeof(char)*50); + fp->morpho = malloc(sizeof(char)*10); + fplm = fopen(ctx->fplm_filename,"r"); + if(fplm == NULL) + { + fprintf(stderr,"Could not open the fplm file.\n"); + exit(EXIT_FAILURE); + } + fv = feat_vec_new(10); + dico_features = dico_new("dico_features", 1000); + fm = feat_model_read(ctx->fm_filename, feat_lib_build(), ctx->verbose); + cff = fopen("../../Files/cff.txt","w"); + if(cff==NULL) + { + fprintf(stderr,"Problem with the cff file.\n"); + exit(EXIT_FAILURE); + } + code_class = fopen("../../Files/code_class","w+"); + if(code_class==NULL) + { + fprintf(stderr,"Problem with the classes_code file.\n"); + exit(EXIT_FAILURE); + } + while((fields_nb = read_line_fplm(fplm, form, pos, lemma, morpho)) != -1) + { + if(fields_nb!=4) + { + if(1) + { + fprintf(stderr, "form = %s pos = %s lemma = %s morpho = %s\n", form, pos, lemma, morpho); + fprintf(stderr, "incorrect fplm entry, skipping it\n"); + } + continue; + } + strcpy(fp->form,form); + strcpy(fp->pos,pos); + strcpy(fp->morpho,morpho); + if(ctx->separate_classes) + write_cff_all_classes_separate(cff, code_class, fp, morpho, fv, fm, dico_features); + else + write_cff_all_classes_non_separate(cff, code_class, fp, morpho, fv, fm, dico_features); + + } + if(ctx->features_filename) + dico_print(ctx->features_filename, dico_features); + free(fp->form); + free(fp->pos); + free(fp); + fclose(fplm); + fclose(cff); + fclose(code_class); +} + +void write_cff_all_classes_non_separate(FILE *cff, FILE* code_class, FP* fp, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features) +{ + fprintf(cff,"%d", associate_number_to_classes(code_class,morpho,1)); + form2fv(fp, fv, fm, dico_features, ADD_MODE); + feat_vec_print(cff, fv); +} + +void write_cff_all_classes_separate(FILE *cff, FILE* code_class_big, FP* fp, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features) +{ + char tense_class[10]; + char person_class[10]; + char gender_class[10]; + char number_class[10]; + char all_target[20]; + int i,j; + extract_classes_from_morpho(TENSE, tense_class, morpho); + extract_classes_from_morpho(PERSON, person_class, morpho); + extract_classes_from_morpho(GENDER, gender_class, morpho); + extract_classes_from_morpho(NUMBER, number_class, morpho); + if(tense_class[0] == '\0') + { + tense_class[0]='#'; + tense_class[1]='\0'; + } + if(person_class[0] == '\0') + { + person_class[0]='#'; + person_class[1]='\0'; + } + if(gender_class[0] == '\0') + { + gender_class[0]='#'; + gender_class[1]='\0'; + } + if(number_class[0] == '\0') + { + number_class[0]='#'; + number_class[1]='\0'; + } + for(i=0; i<(int)strlen(tense_class); i++) + { + for(j=0; j<(int)strlen(person_class); j++) + { + all_target[0] = tense_class[i]; + all_target[1] = person_class[j]; + all_target[2] = gender_class[0]; + all_target[3] = number_class[0]; + all_target[4] = '\0'; + + fprintf(cff,"%d", associate_number_to_classes(code_class_big, all_target, 1)); + form2fv(fp, fv, fm, dico_features, ADD_MODE); + feat_vec_print(cff, fv); + } + } +} + + /*Predict one morpho feature */ void create_cff(context* ctx) { FILE* fplm = NULL; @@ -28,7 +155,12 @@ void create_cff(context* ctx) char pos[50]; char lemma[100]; char morpho[50]; - CLASS class = choose_class(ctx->class_name); + char code_class_name[30]; + FP* fp = malloc(sizeof(FP)); + fp->form = malloc(sizeof(char)*100); + fp->pos = malloc(sizeof(char)*50); + fp->morpho = malloc(sizeof(char)*10); + TARGET target = choose_target(ctx->target_name); fplm = fopen(ctx->fplm_filename,"r"); if(fplm == NULL) { @@ -44,13 +176,15 @@ void create_cff(context* ctx) fprintf(stderr,"Problem with the cff file.\n"); exit(EXIT_FAILURE); } - code_class = fopen("../../Files/code_class","w+"); + strcpy(code_class_name,"../../Files/code_class_"); + strcat(code_class_name,ctx->target_name); + code_class = fopen(code_class_name,"w+"); if(code_class==NULL) { fprintf(stderr,"Problem with the classes_code file.\n"); exit(EXIT_FAILURE); } - fprintf(code_class,"%s\n",ctx->class_name); + fprintf(code_class,"%s\n",ctx->target_name); while((fields_nb = read_line_fplm(fplm, form, pos, lemma, morpho)) != -1) { if(fields_nb!=4) @@ -62,43 +196,57 @@ void create_cff(context* ctx) } continue; } - write_cff(cff, class, code_class, form, morpho, fv, fm, dico_features); + strcpy(fp->form,form); + strcpy(fp->pos,pos); + strcpy(fp->morpho,morpho); + if(ctx->separate_classes) + write_cff_separate(cff, target, code_class, fp, morpho, fv, fm, dico_features); + else + write_cff_non_separate(cff, target, code_class, fp, morpho, fv, fm, dico_features); } if(ctx->features_filename) dico_print(ctx->features_filename, dico_features); + free(fp->form); + free(fp->pos); + free(fp); fclose(fplm); fclose(cff); fclose(code_class); } -void write_cff(FILE *cff, CLASS class, FILE* code_class, char* form, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features) +void write_cff_non_separate(FILE *cff, TARGET target, FILE* code_class, FP* fp, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features) +{ + char target_class[10]; + extract_classes_from_morpho(target, target_class, morpho); + + /*write the class' code in cff*/ + if(target_class[0] == '\0') + fprintf(cff,"0"); + else + fprintf(cff,"%d", associate_number_to_classes(code_class,target_class,0)); + form2fv(fp, fv, fm, dico_features, ADD_MODE); + feat_vec_print(cff, fv); +} + +void write_cff_separate(FILE *cff, TARGET target, FILE* code_class, FP* fp, char* morpho, feat_vec *fv, feat_model *fm, dico *dico_features) { + int i=0; - char morpho_feature[10]; - extract_morpho_feature(class, morpho_feature, morpho); + char target_class[10]; + extract_classes_from_morpho(target, target_class, morpho); /*write the class' code in cff*/ - if(morpho_feature[0] == '\0') + if(target_class[0] == '\0') { fprintf(cff,"0"); - form2fv(form, fv, fm, dico_features, ADD_MODE); + form2fv(fp, fv, fm, dico_features, ADD_MODE); feat_vec_print(cff, fv); return; } - /*if(class == TENSE) + for(i=0; i<(int)strlen(target_class);i++) { - fprintf(cff,"%d", associate_number_to_classes(code_class,class,morpho_feature,0)); - form2fv(form, fv, fm, dico_features, ADD_MODE); + fprintf(cff,"%d", associate_number_to_classes_separate(code_class,target_class,i)); + form2fv(fp, fv, fm, dico_features, ADD_MODE); feat_vec_print(cff, fv); } - else - {*/ - for(i=0; i<(int)strlen(morpho_feature);i++) - { - fprintf(cff,"%d", associate_number_to_classes(code_class,class,morpho_feature,i)); - form2fv(form, fv, fm, dico_features, ADD_MODE); - feat_vec_print(cff, fv); - } - //} - } diff --git a/maca_morpho/src/fplm_fct.c b/maca_morpho/src/fplm_fct.c index b74ca14..f70f391 100644 --- a/maca_morpho/src/fplm_fct.c +++ b/maca_morpho/src/fplm_fct.c @@ -15,11 +15,10 @@ int read_line_fplm(FILE* fplm, char* form, char* pos, char* lemma, char* morpho) return fields_nb; } -/** Return the class' position in morpho - * (the class could be the tense, the person, the gender or the number of a word)**/ -int extract_class_position(CLASS class) +/** Return the class' position in morpho**/ +int extract_class_position(TARGET target) { - switch(class) + switch(target) { case TENSE: return 0; @@ -36,40 +35,40 @@ int extract_class_position(CLASS class) return -1; } -/** Return the class choosen by the user if their class exists**/ -int choose_class(char* class) +/** Return the target choosen by the user if their target exists**/ +int choose_target(char* target) { - if(!strcmp(class,"tense")) + if(!strcmp(target,"tense")) return TENSE; - else if(!strcmp(class,"person")) + else if(!strcmp(target,"person")) return PERSON; - else if(!strcmp(class,"gender")) + else if(!strcmp(target,"gender")) return GENDER; - else if(!strcmp(class,"number")) + else if(!strcmp(target,"number")) return NUMBER; else { - fprintf(stderr,"-c argument must be \"tense\", \"person\", \"gender\" or \"number\"\n"); + fprintf(stderr,"-t argument must be \"tense\", \"person\", \"gender\" or \"number\"\n"); exit(EXIT_FAILURE); } return -1; } -/** Extract the class we want from morpho and write it in morpho_feature**/ -void extract_morpho_feature(CLASS class, char* morpho_feature, char* morpho) +/** Extract the class we want from morpho and write it in target_class**/ +void extract_classes_from_morpho(TARGET target, char* target_class, char* morpho) { int cpt_diese = 0; int i = 0; int j; - int position = extract_class_position(class); - if(class==TENSE) + int position = extract_class_position(target); + if(target==TENSE) { while(morpho[i]!='#') { - morpho_feature[i] = morpho[i]; + target_class[i] = morpho[i]; i++; } - morpho_feature[i] = '\0'; + target_class[i] = '\0'; } else { @@ -81,38 +80,41 @@ void extract_morpho_feature(CLASS class, char* morpho_feature, char* morpho) { i++; for(j=0; morpho[i]!='#'; j++, i++) - morpho_feature[j] = morpho[i]; + target_class[j] = morpho[i]; } i++; } - morpho_feature[j] = '\0'; + target_class[j] = '\0'; } } /** Write the code_class file (use in predict to know the real class) * Return the class' code**/ -int associate_number_to_classes(FILE* classes_code, CLASS class, char* morpho_feature, int current_morpho_feature) +int associate_number_to_classes_separate(FILE* code_class, char* target_class, int current_target_class) { int code = 0; char tmp[20]; - rewind(classes_code); - fscanf(classes_code,"%s",tmp); - while(fscanf(classes_code,"%d %s\n",&code,tmp) == 2) - { - /*if(class == TENSE) - { - if(!strcmp(tmp, morpho_feature)) - return code; - } - else - {*/ - if(tmp[0] == morpho_feature[current_morpho_feature]) - return code; - //} - } - //if(class == TENSE) - //fprintf(classes_code, "%d %s\n", code+1, morpho_feature); - //else - fprintf(classes_code, "%d %c\n", code+1, morpho_feature[current_morpho_feature]); + rewind(code_class); + fscanf(code_class,"%s",tmp); //the first line is the target + while(fscanf(code_class,"%d %s\n",&code,tmp) == 2) + if(tmp[0] == target_class[current_target_class]) + return code; + fprintf(code_class, "%d %c\n", code+1, target_class[current_target_class]); + return code+1; +} + +int associate_number_to_classes(FILE* code_class, char* target_class, int all_classes) +{ + int code; + if(all_classes) code = -1; //if we want all classes in one + else code = 0; //else we start code with 1 (0 means no class) + char tmp[20]; + rewind(code_class); + if(!all_classes) + fscanf(code_class,"%s",tmp); + while(fscanf(code_class,"%d %s\n",&code,tmp) == 2) + if(!strcmp(tmp, target_class)) + return code; + fprintf(code_class, "%d %s\n", code+1, target_class); return code+1; } diff --git a/maca_morpho/src/maca_morpho_context.c b/maca_morpho/src/maca_morpho_context.c index 28fa8a0..43aa68a 100644 --- a/maca_morpho/src/maca_morpho_context.c +++ b/maca_morpho/src/maca_morpho_context.c @@ -16,7 +16,8 @@ void context_free(context *ctx) if(ctx->cfw_filename) free(ctx->cfw_filename); if(ctx->language) free(ctx->language); if(ctx->maca_data_path) free(ctx->maca_data_path); - if(ctx->class_name) free(ctx->class_name); + if(ctx->target_name) free(ctx->target_name); + if(ctx->code_class_filename) free(ctx->code_class_filename); free(ctx); } @@ -27,14 +28,15 @@ context *context_new(void) ctx->help = 0; ctx->verbose = 0; ctx->debug_mode = 0; + ctx->separate_classes = 0; ctx->program_name = NULL; ctx->fplm_filename = NULL; ctx->language = strdup("fr"); ctx->maca_data_path = NULL; ctx->features_filename = NULL; ctx->cfw_filename = NULL; - ctx->class_name = NULL; - ctx->code_class_name = NULL; + ctx->target_name = NULL; + ctx->code_class_filename = NULL; return ctx; } @@ -75,8 +77,8 @@ void context_features_model_help_message(context *ctx){ fprintf(stderr, "\t-F --feat_model <file> : feature model file name\n"); } -void context_class_help_message(context *ctx){ - fprintf(stderr, "\t-c --class <string> : tense, person, gender or number\n"); +void context_target_help_message(context *ctx){ + fprintf(stderr, "\t-t --target : enter 'tense', 'person', 'gender', 'number' or 'all'\n"); } void context_fplm_test_percent_help_message(context *ctx){ @@ -87,6 +89,10 @@ void context_code_class_help_message(context* ctx){ fprintf(stderr, "\t-y --code_class <file> : code_class file name\n"); } +void context_separate_classes_help_message(context *ctx){ + fprintf(stderr, "\t-s --separate <int> : separate the classes for tense and person if you enter 1\n"); +} + context *context_read_options(int argc, char *argv[]) { int c; @@ -95,7 +101,7 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[13] = + static struct option long_options[14] = { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, @@ -107,14 +113,15 @@ context *context_read_options(int argc, char *argv[]) {"fm", required_argument, 0, 'F'}, {"feat", required_argument, 0, 'x'}, {"weights", required_argument, 0, 'w'}, - {"class", required_argument, 0, 'c'}, + {"target", required_argument, 0, 't'}, {"percent", required_argument, 0, 'p'}, - {"code_class", required_argument, 0, 'y'} + {"code_class", required_argument, 0, 'y'}, + {"separate", required_argument, 0, 's'} }; optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "hvdf:L:M:D:F:y:x:w:c:p:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "hvdf:L:M:D:F:y:x:w:t:p:s:", long_options, &option_index)) != -1){ switch (c) { case 'd': @@ -144,15 +151,18 @@ context *context_read_options(int argc, char *argv[]) case 'w': ctx->cfw_filename = strdup(optarg); break; - case 'c': - ctx->class_name = strdup(optarg); + case 't': + ctx->target_name = strdup(optarg); break; case 'p': ctx->fplm_test_percent = atoi(optarg); break; case 'y': - ctx->code_class_name = strdup(optarg); + ctx->code_class_filename = strdup(optarg); break; + case 's': + ctx->separate_classes = atoi(optarg); + break; } } diff --git a/maca_morpho/src/maca_morpho_context.h b/maca_morpho/src/maca_morpho_context.h index cf66ad2..333e1a2 100644 --- a/maca_morpho/src/maca_morpho_context.h +++ b/maca_morpho/src/maca_morpho_context.h @@ -13,6 +13,7 @@ typedef struct { int verbose; int debug_mode; int fplm_test_percent; + int separate_classes; char *program_name; char *fplm_filename; char *language; @@ -20,8 +21,8 @@ typedef struct { char *fm_filename; char *features_filename; char *cfw_filename; - char *class_name; - char *code_class_name; + char *target_name; + char *code_class_filename; } context; @@ -37,7 +38,8 @@ void context_maca_data_path_help_message(context *ctx); void context_features_filename_help_message(context *ctx); void context_weights_matrix_filename_help_message(context *ctx); void context_features_model_help_message(context *ctx); -void context_class_help_message(context *ctx); +void context_target_help_message(context *ctx); void context_fplm_test_percent_help_message(context *ctx); void context_code_class_help_message(context* ctx); +void context_separate_classes_help_message(context *ctx); #endif diff --git a/maca_morpho/src/maca_morpho_feat_fct.c b/maca_morpho/src/maca_morpho_feat_fct.c index b9a33ea..2109f0e 100644 --- a/maca_morpho/src/maca_morpho_feat_fct.c +++ b/maca_morpho/src/maca_morpho_feat_fct.c @@ -3,102 +3,121 @@ #include <string.h> #include "feat_lib.h" #include "char16.h" -/* -char* type[22] = {"np","adj","nc","adv","prep","poncts","csu","v","vprespart","vppart", - "vinf","pres","ponctw","clr","det","coo","cln","pro","pri","prorel","clo","advneg"}; -int find_type(char* pos) +#include "fplm.h" + +char* all_pos[23] = {"np","adj","nc","adv","prep","poncts","csu","v","vprespart","vppart", + "vinf","pres","ponctw","clr","det","coo","cln","pro","pri","prorel","clo","advneg","titre"}; +char* all_person_non_separated[5] = {"1","2","3","12","13"}; + +int code_pos(char* pos) { int i; - for(i=0;i<22;i++) - if(!strcmp(pos,type[i])) + for(i=0; i<23; i++) + if(!strcmp(pos, all_pos[i])) return i; return -1; -}*/ +} +int code_person(char* class) +{ + int i; + for(i=0; i<5; i++) + if(!strcmp(class, all_person_non_separated[i])) + return i; + return -1; +} + /*patterns feature*/ -int s1(void* form) +int s1(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 1 < 0) return -1; return tmp[size - 1]; } -int s2(void* form) +int s2(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 2 < 0) return -1; return tmp[size - 2]; } -int s3(void* form) +int s3(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 3 < 0) return -1; return tmp[size - 3]; } -int s4(void* form) +int s4(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 4 < 0) return -1; return tmp[size - 4]; } -int s5(void* form) +int s5(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 5 < 0) return -1; return tmp[size - 5]; } -int s6(void* form) +int s6(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 6 < 0) return -1; return tmp[size - 6]; } -int s7(void* form) +int s7(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 7 < 0) return -1; return tmp[size - 7]; } -int s8(void* form) +int s8(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 8 < 0) return -1; return tmp[size - 8]; } -int s9(void* form) +int s9(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 9 < 0) return -1; return tmp[size - 9]; } -int s10(void* form) +int s10(FP* fp) { - char16* tmp = utf8tochar16(form); + char16* tmp = utf8tochar16(fp->form); int size = char16_strlen(tmp); if(tmp == NULL || size - 10 < 0) return -1; return tmp[size - 10]; -}/* -int code_pos(FP* fp) +} +int feat_person(FP* fp) +{ + char class[10]; + extract_classes_from_morpho(PERSON, class, fp->morpho); + return code_person(class); +} + +int feat_pos(FP* fp) { - return find_type(fp->pos); -}*/ + return code_pos(fp->pos); +} feat_lib *feat_lib_build(void) { @@ -114,10 +133,8 @@ feat_lib *feat_lib_build(void) { feat_lib_add(fl, 1, (char *)"s8", s8); feat_lib_add(fl, 1, (char *)"s9", s9); feat_lib_add(fl, 1, (char *)"s10", s10); - //feat_lib_add(fl, 1, (char *)"code_pose", code_pos); + feat_lib_add(fl, 1, (char *)"feat_pos", feat_pos); + feat_lib_add(fl, 1, (char *)"feat_person", feat_person); return fl; } - - - diff --git a/maca_morpho/src/predict.c b/maca_morpho/src/predict.c index 9fc8a1c..5d0f0bc 100644 --- a/maca_morpho/src/predict.c +++ b/maca_morpho/src/predict.c @@ -1,14 +1,19 @@ #include <stdlib.h> +#include <string.h> #include <stdio.h> #include "predict.h" - int main(int argc, char *argv[]) { context *ctx = context_read_options(argc, argv); if(ctx->help) predict_help_message(ctx); - create_predictions_file(ctx); - printf("prediction.txt has been generated in the Files directory.\n"); + if(ctx->target_name!=NULL && !strcmp(ctx->target_name, "all")) + predict_all_classes(ctx); + else if(ctx->code_class_filename == NULL) + predict_each_and_all_targets(ctx); + else + predict_target(ctx); + context_free(ctx); return 0; } diff --git a/maca_morpho/src/predict.h b/maca_morpho/src/predict.h index 28dade5..b50bb1d 100644 --- a/maca_morpho/src/predict.h +++ b/maca_morpho/src/predict.h @@ -10,10 +10,33 @@ #include "feature_table.h" #include "fplm.h" - void create_predictions_file(context* ctx); - void make_prediction(FILE* predictions, FILE* code_class, CLASS cl, int* errors, char* form, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm); - void errors_nb(FILE* code_class, CLASS class, int class_predicted, int* errors, char* morpho); - int extract_real_class(FILE* code_class, CLASS class, char* morpho_feature, int* real_class); void predict_help_message(context *ctx); + + /*Predict all classes in one*/ + void predict_all_classes(context* ctx); + /*classes are not separated*/ + void make_prediction_all_classes_non_separate(FILE* error_file, FILE* code_class, int* errors, FP* fp, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm); + void errors_nb_all_classes_non_separate(FILE* error_file, FILE* code_class, FP* fp, int class_predicted, int* errors, char* morpho); + int extract_real_class_non_separate(FILE* code_class, char* target_class, int all_classes); + /*classes are separated*/ + void make_prediction_all_classes_separate(FILE* predictions, FILE* code_class_big, int* errors, FP* fp, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm); + void errors_nb_all_classes_separate(FILE* error_file, FILE* code_class, FP* fp, int class_predicted, int* errors, char* morpho); + int extract_real_class_all_classes_separate(FILE* code_class, int* real_class, char* morpho); + + /*Predict a target*/ + void predict_target(context* ctx); + /*classes are separated*/ + void make_prediction_separate(FILE* error_file, FILE* predict_file, FILE* code_class, TARGET target, int* errors, FP* fp, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm); + void errors_nb_separate(FILE* error_file, FILE* code_class, TARGET target, int class_predicted, int* errors, char* morpho, FP* fp); + int extract_real_class_separate(FILE* code_class, char* target_class, int* real_class); + /*classes are not separated*/ + void make_prediction_non_separate(FILE* error_file, FILE* y, FILE* code_class, TARGET target, int* errors, FP* fp, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm); + void errors_nb_non_separate(FILE* error_file, FILE* code_class, TARGET target, FP* fp, int class_predicted, int* errors, char* morpho); + + /*Predict each targets then predict all classes in one*/ + void predict_each_and_all_targets(context* ctx); + void calculate_global_success_rate(context* ctx, FILE* y, FILE* y2, FILE* y3, FILE* y4,int* global_error, int* class_predicted_array,FILE* ycode_class,FILE* y2code_class,FILE* y3code_class,FILE* y4code_class,char* morpho); + void put_in_array_real_classes(int* real_classes_array, FILE* ycode_class,FILE* y2code_class,FILE* y3code_class,FILE* y4code_class,char* morpho); + void compare_predicted_and_real_class(FILE* ycode_class,FILE* y2code_class,FILE* y3code_class,FILE* y4code_class,char* morpho, int* global_error, int* class_predicted_array); #endif diff --git a/maca_morpho/src/predict_fct.c b/maca_morpho/src/predict_fct.c index 8fdbfc3..4591ffa 100644 --- a/maca_morpho/src/predict_fct.c +++ b/maca_morpho/src/predict_fct.c @@ -16,10 +16,11 @@ void predict_help_message(context *ctx) exit(1); } -void create_predictions_file(context* ctx) + /*Fonctions for all classes' prediction*/ +void predict_all_classes(context* ctx) { FILE* fplm_test = NULL; - FILE* predictions = NULL; + FILE* error_file = NULL; FILE* code_class = NULL; feature_table *cfw = NULL; feat_vec *fv = NULL; @@ -32,34 +33,225 @@ void create_predictions_file(context* ctx) char pos[50]; char lemma[100]; char morpho[50]; - char class_name[10]; - CLASS class; - code_class = fopen(ctx->code_class_name,"r"); + FP* fp = malloc(sizeof(FP)); + fp->form = malloc(sizeof(char)*100); + fp->pos = malloc(sizeof(char)*50); + fp->morpho = malloc(sizeof(char)*10); + code_class = fopen(ctx->code_class_filename,"r"); + if(code_class==NULL) + { + fprintf(stderr, "Could not open the code_class file.\n"); + exit(EXIT_FAILURE); + } + fplm_test = fopen(ctx->fplm_filename,"r"); + if(fplm_test == NULL) + { + fprintf(stderr,"Could not open input file.\nYou can generate a fplm_test file with fplm2train_test\nThe fplm_test file will be in the Files directory.\n"); + exit(EXIT_FAILURE); + } + cfw = feature_table_load(ctx->cfw_filename, ctx->verbose); + fv = feat_vec_new(10); + dico_features = dico_read(ctx->features_filename, 0.5); + fm = feat_model_read(ctx->fm_filename, feat_lib_build(), ctx->verbose); + error_file = fopen("../../Files/predict_error.txt","w"); + if(error_file==NULL) + { + fprintf(stderr,"Problem with the error file.\n"); + exit(EXIT_FAILURE); + } + while((fields_nb = read_line_fplm(fplm_test, form, pos, lemma, morpho)) != -1) + { + if(fields_nb!=4) + { + if(1) + { + fprintf(stderr, "form = %s pos = %s lemma = %s morpho = %s\n", form, pos, lemma, morpho); + fprintf(stderr, "incorrect fplm entry, skipping it\n"); + } + continue; + } + line_nb++; + strcpy(fp->form,form); + strcpy(fp->pos,pos); + strcpy(fp->morpho,morpho); + if(ctx->separate_classes) + make_prediction_all_classes_separate(error_file, code_class, &errors, fp, morpho, cfw, fv, dico_features, fm); + else + make_prediction_all_classes_non_separate(error_file, code_class, &errors, fp, morpho, cfw, fv, dico_features, fm); + } + printf("Success rate : %lf %%\n", (float)100-((float)errors*100/line_nb)); + if(ctx->features_filename) + dico_print(ctx->features_filename, dico_features); + free(fp->form); + free(fp->pos); + free(fp); + fclose(fplm_test); + fclose(error_file); + fclose(code_class); +} + + /*Fonctions for not-separated classes*/ +/** Predict a class for the current word **/ +void make_prediction_all_classes_non_separate(FILE* error_file, FILE* code_class, int* errors, FP* fp, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm) +{ + int class_predicted; + float max; + form2fv(fp, fv, fm, dico_features, LOOKUP_MODE); + class_predicted = feature_table_argmax(fv, cfw, &max); + errors_nb_all_classes_non_separate(error_file, code_class, fp,class_predicted, errors, morpho); +} + +/** Increment the number of errors if the programm has predicted the wrong class **/ +void errors_nb_all_classes_non_separate(FILE* error_file, FILE* code_class, FP* fp, int class_predicted, int* errors, char* morpho) +{ + int real_class; + real_class = extract_real_class_non_separate(code_class, morpho, 1); + if(class_predicted != real_class) + { + *errors = *errors+1; + fprintf(error_file, "form = %s | class predicted = %d | real class = %d\n", fp->form, class_predicted, real_class); + } +} + +/** Return the word's real class (also used to predict one target)**/ +int extract_real_class_non_separate(FILE* code_class, char* target_class, int all_classes) +{ + if(target_class[0]=='\0') + return 0; + return associate_number_to_classes(code_class, target_class,all_classes); +} + + /*Fonctions for separated classes*/ +void make_prediction_all_classes_separate(FILE* error_file, FILE* code_class, int* errors, FP* fp, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm) +{ + int class_predicted; + float max; + form2fv(fp, fv, fm, dico_features, LOOKUP_MODE); + class_predicted = feature_table_argmax(fv, cfw, &max); + errors_nb_all_classes_separate(error_file, code_class, fp, class_predicted, errors, morpho); +} + +void errors_nb_all_classes_separate(FILE* error_file, FILE* code_class, FP* fp, int class_predicted, int* errors, char* morpho) +{ + int i; + int size = 10; + int real_class[40]; + size = extract_real_class_all_classes_separate(code_class, real_class, morpho); + for(i=0; i<=size; i++) + if(class_predicted == real_class[i]) + return; + *errors = *errors+1; + fprintf(error_file, "form = %s | class predicted = %d | real class =", fp->form, class_predicted); + for(i=0; i<=size; i++) + fprintf(error_file, " %d", real_class[i]); + fprintf(error_file,"\n"); +} + +int extract_real_class_all_classes_separate(FILE* code_class, int* real_class, char* morpho) +{ + int size = -1; + char tense_class[10]; + char person_class[10]; + char gender_class[10]; + char number_class[10]; + char all_target[20]; + int i,j; + extract_classes_from_morpho(TENSE, tense_class, morpho); + extract_classes_from_morpho(PERSON, person_class, morpho); + extract_classes_from_morpho(GENDER, gender_class, morpho); + extract_classes_from_morpho(NUMBER, number_class, morpho); + if(tense_class[0] == '\0') + { + tense_class[0]='#'; + tense_class[1]='\0'; + } + if(person_class[0] == '\0') + { + person_class[0]='#'; + person_class[1]='\0'; + } + if(gender_class[0] == '\0') + { + gender_class[0]='#'; + gender_class[1]='\0'; + } + if(number_class[0] == '\0') + { + number_class[0]='#'; + number_class[1]='\0'; + } + for(i=0; i<(int)strlen(tense_class); i++) + { + for(j=0; j<(int)strlen(person_class); j++) + { + all_target[0] = tense_class[i]; + all_target[1] = person_class[j]; + all_target[2] = gender_class[0]; + all_target[3] = number_class[0]; + all_target[4] = '\0'; + size++; + real_class[size] = associate_number_to_classes(code_class, all_target, 1); + } + } + + return size; +} + + + /*Fonctions for a target's prediction*/ +void predict_target(context* ctx) +{ + FILE* y = NULL; + FILE* fplm_test = NULL; + FILE* error_file = NULL; + FILE* code_class = NULL; + feature_table *cfw = NULL; + feat_vec *fv = NULL; + dico *dico_features = NULL; + feat_model *fm = NULL; + int line_nb=0; + int fields_nb; + int errors = 0; + char form[100]; + char pos[50]; + char lemma[100]; + char morpho[50]; + char target_name[10]; + char predict_name[50]; + FP* fp = malloc(sizeof(FP)); + fp->form = malloc(sizeof(char)*100); + fp->pos = malloc(sizeof(char)*50); + fp->morpho = malloc(sizeof(char)*10); + TARGET target; + code_class = fopen(ctx->code_class_filename,"r"); if(code_class==NULL) { fprintf(stderr, "Could not the code_class file.\n"); exit(EXIT_FAILURE); } - if(fscanf(code_class,"%s",class_name)!=1) + if(fscanf(code_class,"%s",target_name)!=1) { fprintf(stderr, "Your code_class file is not conform.\n"); exit(EXIT_FAILURE); } + strcpy(predict_name,"../../Files/predict_"); + strcat(predict_name,target_name); + y = fopen(predict_name,"w"); fplm_test = fopen(ctx->fplm_filename,"r"); if(fplm_test == NULL) { fprintf(stderr,"Could not open input file.\nYou can generate a fplm_test file with fplm2train_test\nThe fplm_test file will be in the Files directory.\n"); exit(EXIT_FAILURE); } - class = choose_class(class_name); + target = choose_target(target_name); cfw = feature_table_load(ctx->cfw_filename, ctx->verbose); fv = feat_vec_new(10); dico_features = dico_read(ctx->features_filename, 0.5); fm = feat_model_read(ctx->fm_filename, feat_lib_build(), ctx->verbose); - predictions = fopen("../../Files/prediction.txt","w"); - if(predictions==NULL) + error_file = fopen("../../Files/predict_error.txt","w"); + if(error_file==NULL) { - fprintf(stderr,"Problem with the prediction file.\n"); + fprintf(stderr,"Problem with the error file.\n"); exit(EXIT_FAILURE); } while((fields_nb = read_line_fplm(fplm_test, form, pos, lemma, morpho)) != -1) @@ -74,71 +266,215 @@ void create_predictions_file(context* ctx) continue; } line_nb++; - make_prediction(predictions, code_class, class, &errors, form, morpho, cfw, fv, dico_features, fm); + strcpy(fp->form,form); + strcpy(fp->pos,pos); + strcpy(fp->morpho,morpho); + if(ctx->separate_classes) + make_prediction_separate(error_file, y, code_class, target, &errors, fp, morpho, cfw, fv, dico_features, fm); + else + make_prediction_non_separate(error_file, y, code_class, target, &errors, fp, morpho, cfw, fv, dico_features, fm); } - printf("Error rate : %lf %%\n", (float)errors*100/line_nb); + printf("Success rate : %lf %%\n", (float)100-((float)errors*100/line_nb)); if(ctx->features_filename) dico_print(ctx->features_filename, dico_features); + + free(fp->form); + free(fp->pos); + free(fp); + fclose(y); fclose(fplm_test); - fclose(predictions); + fclose(error_file); fclose(code_class); } -void make_prediction(FILE* predictions, FILE* code_class, CLASS cl, int* errors, char* form, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm) + /*Fonctions for separated classes*/ +void make_prediction_separate(FILE* error_file, FILE* predict_file, FILE* code_class, TARGET target, int* errors, FP* fp, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm) { - int class; + int class_predicted; float max; - fprintf(predictions, "form = %s\n", form); - form2fv(form, fv, fm, dico_features, LOOKUP_MODE); - class = feature_table_argmax(fv, cfw, &max); - feat_vec_print(predictions, fv); - fprintf(predictions, "class predicted = %d ", class); - errors_nb(code_class, cl, class, errors, morpho); + form2fv(fp, fv, fm, dico_features, LOOKUP_MODE); + class_predicted = feature_table_argmax(fv, cfw, &max); + fprintf(predict_file,"%d\n",class_predicted); + errors_nb_separate(error_file, code_class, target, class_predicted, errors, morpho, fp); } -void errors_nb(FILE* code_class, CLASS class, int class_predicted, int* errors, char* morpho) +void errors_nb_separate(FILE* error_file, FILE* code_class, TARGET target, int class_predicted, int* errors, char* morpho, FP* fp) { int i; int size = 10; int real_class[10]; - char morpho_feature[10]; - extract_morpho_feature(class,morpho_feature,morpho); - size = extract_real_class(code_class, class, morpho_feature, real_class); + char target_class[10]; + extract_classes_from_morpho(target,target_class,morpho); + size = extract_real_class_separate(code_class, target_class, real_class); for(i=0; i<=size; i++) if(class_predicted == real_class[i]) return; *errors = *errors+1; + fprintf(error_file, "form = %s | class predicted = %d | real class =", fp->form, class_predicted); + for(i=0; i<=size; i++) + fprintf(error_file, " %d",real_class[i]); + fprintf(error_file, "\n"); + } -int extract_real_class(FILE* code_class, CLASS class, char* morpho_feature, int* real_class) +int extract_real_class_separate(FILE* code_class, char* target_class, int* real_class) { int size = -1; int i; - if(morpho_feature[0]=='\0') + if(target_class[0]=='\0') { size++; real_class[size]=0; } - /*else if(class == TENSE) - { - size++; - real_class[size] = associate_number_to_classes(code_class,class,morpho_feature,0); - }*/ - // else { - for(i=0; i<(int)strlen(morpho_feature);i++) //do not parcour the array if it's the tense !! + for(i=0; i<(int)strlen(target_class); i++) { size++; - real_class[size] = associate_number_to_classes(code_class,class,morpho_feature,i); + real_class[size] = associate_number_to_classes_separate(code_class,target_class,i); } } return size; } + /*Fonctions for not-separated classes*/ +void make_prediction_non_separate(FILE* error_file, FILE* y, FILE* code_class, TARGET target, int* errors, FP* fp, char* morpho, feature_table *cfw, feat_vec *fv, dico *dico_features, feat_model *fm) +{ + int class_predicted; + float max; + form2fv(fp, fv, fm, dico_features, LOOKUP_MODE); + class_predicted = feature_table_argmax(fv, cfw, &max); + fprintf(y,"%d\n",class_predicted); + errors_nb_non_separate(error_file, code_class, target, fp, class_predicted, errors, morpho); +} +void errors_nb_non_separate(FILE* error_file, FILE* code_class, TARGET target, FP* fp, int class_predicted, int* errors, char* morpho) +{ + int real_class; + char target_class[10]; + extract_classes_from_morpho(target, target_class, morpho); + real_class = extract_real_class_non_separate(code_class, target_class, 0); + if(class_predicted != real_class) + { + *errors = *errors+1; + fprintf(error_file, "form = %s | class predicted = %d | real class = %d\n", fp->form, class_predicted, real_class); + } +} + /* Fonctions for all target's prediction after having predicted each targets */ +void predict_each_and_all_targets(context* ctx) +{ + FILE* fplm_test = fopen(ctx->fplm_filename,"r"); + FILE* y = fopen("../../Files/predict_tense","r"); + FILE* y2 = fopen("../../Files/predict_person","r"); + FILE* y3 = fopen("../../Files/predict_gender","r"); + FILE* y4 = fopen("../../Files/predict_number","r"); + FILE* ycode_class = fopen("../../Files/code_class_tense","r"); + FILE* y2code_class = fopen("../../Files/code_class_person","r"); + FILE* y3code_class = fopen("../../Files/code_class_gender","r"); + FILE* y4code_class = fopen("../../Files/code_class_number","r"); + int class_predicted_array[4]; + int fields_nb; + int global_error = 0; + int line_nb = 0; + char form[100]; + char pos[20]; + char lemma[100]; + char morpho[20]; + while((fields_nb = read_line_fplm(fplm_test, form, pos, lemma, morpho)) != -1) + { + line_nb++; + calculate_global_success_rate(ctx,y,y2,y3,y4,&global_error,class_predicted_array,ycode_class,y2code_class,y3code_class,y4code_class,morpho); + } + printf("Global success rate : %lf %%\n", 100-((float)global_error*100/line_nb)); +} +void calculate_global_success_rate(context* ctx, FILE* y, FILE* y2, FILE* y3, FILE* y4,int* global_error, int* class_predicted_array,FILE* ycode_class,FILE* y2code_class,FILE* y3code_class,FILE* y4code_class,char* morpho) +{ + fscanf(y, "%d", &class_predicted_array[0]); //tense + fscanf(y2, "%d", &class_predicted_array[1]); //person + fscanf(y3, "%d", &class_predicted_array[2]); //gender + fscanf(y4, "%d", &class_predicted_array[3]); //number + + if(ctx->separate_classes) + compare_predicted_and_real_class(ycode_class,y2code_class,y3code_class,y4code_class,morpho,global_error,class_predicted_array); + else + { + int real_classes_array[4]; + put_in_array_real_classes(real_classes_array,ycode_class,y2code_class,y3code_class,y4code_class,morpho); + for(int i=0; i<4; i++) + if(class_predicted_array[i]!=real_classes_array[i]) + { + *global_error = *global_error + 1; + return; + } + } +} +/*for not-separated classes*/ +void put_in_array_real_classes(int* real_classes_array, FILE* ycode_class,FILE* y2code_class,FILE* y3code_class,FILE* y4code_class,char* morpho) +{ + char target_class[10]; + extract_classes_from_morpho(TENSE,target_class,morpho); + real_classes_array[0] = extract_real_class_non_separate(ycode_class, target_class, 0); + extract_classes_from_morpho(PERSON,target_class,morpho); + real_classes_array[1] = extract_real_class_non_separate(y2code_class, target_class, 0); + extract_classes_from_morpho(GENDER,target_class,morpho); + real_classes_array[2] = extract_real_class_non_separate(y3code_class, target_class, 0); + extract_classes_from_morpho(NUMBER,target_class,morpho); + real_classes_array[3] = extract_real_class_non_separate(y4code_class, target_class, 0); +} +/*for separated classes*/ +void compare_predicted_and_real_class(FILE* ycode_class,FILE* y2code_class,FILE* y3code_class,FILE* y4code_class,char* morpho, int* global_error, int* class_predicted_array) +{ + char target_class[10]; + int real_class[10]; + int size = -1; + int i; + int err_glo=1; + + extract_classes_from_morpho(TENSE,target_class,morpho); + size = extract_real_class_separate(ycode_class, target_class, real_class); + for(i=0; i<=size; i++) + if(class_predicted_array[0] == real_class[i]) + err_glo=0; + if(err_glo) + { + *global_error = *global_error+1; + return; + } + extract_classes_from_morpho(PERSON,target_class,morpho); + size = extract_real_class_separate(y2code_class, target_class, real_class); + err_glo=1; + for(i=0; i<=size; i++) + if(class_predicted_array[1] == real_class[i]) + err_glo=0; + if(err_glo) + { + *global_error = *global_error+1; + return; + } + + extract_classes_from_morpho(GENDER,target_class,morpho); + size = extract_real_class_separate(y3code_class, target_class, real_class); + err_glo=1; + for(i=0; i<=size; i++) + if(class_predicted_array[2] == real_class[i]) + err_glo=0; + if(err_glo) + { + *global_error = *global_error+1; + return; + } + + extract_classes_from_morpho(NUMBER,target_class,morpho); + size = extract_real_class_separate(y4code_class, target_class, real_class); + err_glo=1; + for(i=0; i<=size; i++) + if(class_predicted_array[3] == real_class[i]) + err_glo=0; + if(err_glo) + *global_error = *global_error+1; +} diff --git a/maca_morpho/src/vectorize.c b/maca_morpho/src/vectorize.c index f7f4313..15e38eb 100644 --- a/maca_morpho/src/vectorize.c +++ b/maca_morpho/src/vectorize.c @@ -3,7 +3,7 @@ #include<string.h> #include"vectorize.h" -int get_feat_value(feat_model *fm, char *form, dico *dico_features, int feat_nb, int mode) +int get_feat_value(feat_model *fm, FP* fp, dico *dico_features, int feat_nb, int mode) { feat_desc *fd = fm->array[feat_nb]; int i; @@ -14,7 +14,7 @@ int get_feat_value(feat_model *fm, char *form, dico *dico_features, int feat_nb, fm->string[0] = '\0'; for(i=0; i < fd->nbelem; i++){ strcat(fm->string, fd->array[i]->name); - feat_val = fd->array[i]->fct(form); + feat_val = fd->array[i]->fct(fp); sprintf(str, "%d", feat_val); strcat(fm->string, str); @@ -28,11 +28,11 @@ int get_feat_value(feat_model *fm, char *form, dico *dico_features, int feat_nb, } -feat_vec *form2fv(char *form, feat_vec *fv, feat_model *fm, dico *dico_features, int mode) +feat_vec *form2fv(FP* fp, feat_vec *fv, feat_model *fm, dico *dico_features, int mode) { int i; feat_vec_empty(fv); for(i=0; i < fm->nbelem; i++) - feat_vec_add(fv, get_feat_value(fm, form, dico_features, i, mode)); + feat_vec_add(fv, get_feat_value(fm, fp, dico_features, i, mode)); return fv; } diff --git a/maca_morpho/src/vectorize.h b/maca_morpho/src/vectorize.h index c859605..7779a96 100644 --- a/maca_morpho/src/vectorize.h +++ b/maca_morpho/src/vectorize.h @@ -9,6 +9,6 @@ #define ADD_MODE 2 -feat_vec *form2fv(char *form, feat_vec *fv, feat_model *fm, dico *dico_features, int mode); +feat_vec *form2fv(FP* fp, feat_vec *fv, feat_model *fm, dico *dico_features, int mode); #endif -- GitLab