diff --git a/maca_morpho/CMakeLists.txt b/maca_morpho/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eceae0e3bd3e790cfd5fff7ea8d6cdbfd227798d
--- /dev/null
+++ b/maca_morpho/CMakeLists.txt
@@ -0,0 +1,30 @@
+set(SOURCES
+    src/maca_morpho_feat_fct.c
+    src/maca_morpho_context.c
+    src/vectorize.c
+)
+
+
+
+#compiling library
+include_directories(src)
+add_library(maca_morpho STATIC ${SOURCES})
+target_link_libraries(maca_morpho perceptron)
+target_link_libraries(maca_morpho maca_common)
+
+
+
+#compiling, linking and installing executables
+
+add_executable(fplm2cff ./src/fplm2cff.c)
+target_link_libraries(fplm2cff perceptron)
+target_link_libraries(fplm2cff maca_common)
+target_link_libraries(fplm2cff maca_morpho)
+install (TARGETS fplm2cff DESTINATION bin)
+
+add_executable(predict ./src/predict.c)
+target_link_libraries(predict perceptron)
+target_link_libraries(predict maca_common)
+target_link_libraries(predict maca_morpho)
+install (TARGETS predict DESTINATION bin)
+
diff --git a/maca_morpho/src/fplm2cff.c b/maca_morpho/src/fplm2cff.c
new file mode 100644
index 0000000000000000000000000000000000000000..f2a311888fc0e22d03b69f20b4df71870ba5cf5b
--- /dev/null
+++ b/maca_morpho/src/fplm2cff.c
@@ -0,0 +1,97 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "maca_morpho_context.h"
+#include "feat_model.h"
+#include "feat_vec.h"
+#include "dico.h"
+#include "util.h"
+#include "vectorize.h"
+
+/*
+ * Split a "feature=value" string at its first '='.
+ *
+ * feature_value : NUL-terminated input, left untouched
+ * feature       : receives the text found before the first '='
+ * value         : receives the text found after the first '=', or the empty
+ *                 string when feature_value contains no '='
+ *
+ * Each output buffer must have room for strlen(feature_value) + 1 bytes.
+ */
+void decompose_feature_value(char *feature_value, char *feature, char *value)
+{
+  const char *eq = strchr(feature_value, '=');
+  size_t feature_len = eq ? (size_t)(eq - feature_value) : strlen(feature_value);
+
+  memcpy(feature, feature_value, feature_len);
+  feature[feature_len] = '\0';
+  strcpy(value, eq ? eq + 1 : "");
+}
+
+
+/*
+ * Read the fplm file named by the context, one
+ * form<TAB>pos<TAB>lemma<TAB>morpho entry per line, and split every morpho
+ * field into its '|'-separated feature=value pairs.
+ */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  if(ctx->help){
+    context_general_help_message(ctx);
+    context_language_help_message(ctx);
+    context_fplm_help_message(ctx);
+    context_maca_data_path_help_message(ctx);
+    context_features_filename_help_message(ctx);
+    context_features_model_help_message(ctx);
+    exit(1);
+  }
+  /* fv and dico_features are not used yet: they serve the vectorisation step kept in the comment below main */
+  feat_vec *fv = feat_vec_new(10);
+  dico *dico_features = dico_new("dico_features", 1000);
+  /* feat_model *fm = feat_model_read(ctx->fm_filename, feat_lib_build(), ctx->verbose); */
+  char form[100];
+  char pos[100];
+  char lemma[100];
+  char morpho[100];
+  FILE *F_fplm = NULL;
+  char buffer[1000];
+  char feature[100];
+  char value[100];
+  char *token;
+
+  F_fplm = myfopen(ctx->fplm_filename, "r");
+
+  /* the loop ends when fgets() fails, so a last line without '\n' is still processed */
+  while(fgets(buffer, sizeof buffer, F_fplm)){
+    /* cut at the line terminator when there is one; never index before the buffer */
+    buffer[strcspn(buffer, "\n")] = '\0';
+    /* the %99 widths keep every field inside its 100-byte buffer; lines without the four fields are skipped */
+    if(sscanf(buffer, "%99[^\t]\t%99[^\t]\t%99[^\t]\t%99[^\n]", form, pos, lemma, morpho) != 4)
+      continue;
+    //printf("form = %s pos = %s lemma = %s morpho = %s\n", form, pos, lemma, morpho);
+    /* strtok() returns NULL when morpho holds no token, so test before the first use */
+    for(token = strtok(morpho, "|"); token; token = strtok(NULL, "|")){
+      //printf("token = %s\n", token);
+      decompose_feature_value(token, feature, value);
+      //printf("feature = %s value = %s\n", feature, value);
+    }
+  }
+  fclose(F_fplm);
+  context_free(ctx);
+  return 0;
+}
+
+  /*
+  while(strcmp(form, "end")){
+    fscanf(stdin, "%s", form);
+    printf("form = %s\n", form);
+    form2fv(form, fv, fm, dico_features, ADD_MODE);
+    //void feat_vec_print_string(feat_vec *fv, dico *dico_features);
+    feat_vec_print(stdout, fv);
+  }
+  //dico_print_fh(stdout, dico_features);
+  if(ctx->features_filename)
+    dico_print(ctx->features_filename, dico_features);
+  */
diff --git a/maca_morpho/src/maca_morpho_context.c b/maca_morpho/src/maca_morpho_context.c
new file mode 100644
index 0000000000000000000000000000000000000000..5a82e3cb56d9e11b4f5682e5aa3b1a1f59ccc4c6
--- /dev/null
+++ b/maca_morpho/src/maca_morpho_context.c
@@ -0,0 +1,192 @@
+#include<stdlib.h>
+#include<stdio.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include "maca_morpho_context.h"
+#include "util.h"
+
+
+void context_set_linguistic_resources_filenames(context *ctx);
+
+/* Replace *dst with a heap copy of src, releasing the string *dst held before. */
+static void context_replace_string(char **dst, const char *src)
+{
+  free(*dst);
+  *dst = strdup(src);
+}
+
+/* Release a context together with every string it owns; a NULL ctx is ignored. */
+void context_free(context *ctx)
+{
+  if(!ctx) return;
+  free(ctx->program_name);
+  free(ctx->fplm_filename);
+  free(ctx->language);
+  free(ctx->maca_data_path);
+  free(ctx->fm_filename);
+  free(ctx->features_filename);
+  free(ctx->cfw_filename);
+  free(ctx);
+}
+
+/* Allocate a context: flags cleared, strings unset, language defaulting to "fr". */
+context *context_new(void)
+{
+  context *ctx = (context *)memalloc(sizeof(context));
+
+  ctx->help = 0;
+  ctx->verbose = 0;
+  ctx->debug_mode = 0;
+  ctx->program_name = NULL;
+  ctx->fplm_filename = NULL;
+  ctx->language = strdup("fr");
+  ctx->maca_data_path = NULL;
+  ctx->fm_filename = NULL;
+  ctx->features_filename = NULL;
+  ctx->cfw_filename = NULL;
+  return ctx;
+}
+
+/* The help functions below each print their option lines on stderr. */
+void context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n");
+}
+
+void context_fplm_help_message(context *ctx){
+  fprintf(stderr, "\t-f --fplm <file> : fplm (form pos lemma morpho) file\n");
+}
+
+void context_language_help_message(context *ctx){
+  fprintf(stderr, "\t-L --language : identifier of the language to use\n");
+}
+
+void context_maca_data_path_help_message(context *ctx){
+  fprintf(stderr, "\t-M --maca_data_path : path to maca_data directory\n");
+}
+
+void context_fm_help_message(context *ctx){
+  fprintf(stderr, "\t-F --fm <file> : feature model file name\n");
+}
+
+void context_features_filename_help_message(context *ctx){
+  fprintf(stderr, "\t-x --feat <file> : features dictionary file name\n");
+}
+
+void context_weights_matrix_filename_help_message(context *ctx){
+  fprintf(stderr, "\t-w --weights <file> : weight matrix (cfw) filename\n");
+}
+
+void context_features_model_help_message(context *ctx){
+  fprintf(stderr, "\t-F --feat_model <file> : feature model file name\n");
+}
+
+/*
+ * Build a context from the command line.
+ *
+ * Unknown options are skipped without any message (opterr is 0) and the
+ * fplm file name falls back to its default location when -f is absent.
+ * The returned context is released with context_free().
+ */
+context *context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = context_new();
+
+  ctx->program_name = strdup(argv[0]);
+
+  /* getopt_long() walks this table until it meets an all-zero entry */
+  static struct option long_options[] =
+    {
+      {"help", no_argument, 0, 'h'},
+      {"verbose", no_argument, 0, 'v'},
+      {"debug", no_argument, 0, 'd'},
+      {"mcd", required_argument, 0, 'C'},
+      {"language", required_argument, 0, 'L'},
+      {"fplm", required_argument, 0, 'f'},
+      {"maca_data_path", required_argument, 0, 'D'},
+      {"fm", required_argument, 0, 'F'},
+      {"feat_model", required_argument, 0, 'F'},
+      {"feat", required_argument, 0, 'x'},
+      {"weights", required_argument, 0, 'w'},
+      {0, 0, 0, 0}
+    };
+  optind = 0;
+  opterr = 0;
+
+  while ((c = getopt_long (argc, argv, "hvdf:L:M:D:F:x:w:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'd':
+        ctx->debug_mode = 1;
+        break;
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'f':
+        context_replace_string(&ctx->fplm_filename, optarg);
+        break;
+      case 'L':
+        context_replace_string(&ctx->language, optarg);
+        break;
+      case 'M': /* short form printed by the help message; --maca_data_path maps to 'D' */
+      case 'D':
+        context_replace_string(&ctx->maca_data_path, optarg);
+        break;
+      case 'F':
+        context_replace_string(&ctx->fm_filename, optarg);
+        break;
+      case 'x':
+        context_replace_string(&ctx->features_filename, optarg);
+        break;
+      case 'w':
+        context_replace_string(&ctx->cfw_filename, optarg);
+        break;
+      default:
+        break;
+      }
+  }
+
+  context_set_linguistic_resources_filenames(ctx);
+
+  return ctx;
+}
+
+/*
+ * Give fplm_filename its default value, <data path>/<language>/bin/fplm,
+ * when no fplm file was named. The data path is ctx->maca_data_path or,
+ * failing that, the MACAON_DIR environment variable.
+ */
+void context_set_linguistic_resources_filenames(context *ctx)
+{
+  const char *data_path = ctx->maca_data_path;
+  char *filename;
+  int len;
+
+  if(!data_path){
+    data_path = getenv("MACAON_DIR");
+    if(!data_path){
+      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
+      data_path = "";
+    }
+  }
+
+  if(ctx->fplm_filename) return;
+
+  /* size the name first so that no fixed-length buffer can overflow */
+  len = snprintf(NULL, 0, "%s/%s/bin/%s", data_path, ctx->language, DEFAULT_FPLM_FILENAME);
+  if(len < 0) return;
+  filename = malloc((size_t)len + 1);
+  if(!filename) return;
+  snprintf(filename, (size_t)len + 1, "%s/%s/bin/%s", data_path, ctx->language, DEFAULT_FPLM_FILENAME);
+  ctx->fplm_filename = filename;
+}
diff --git a/maca_morpho/src/maca_morpho_context.h b/maca_morpho/src/maca_morpho_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1789a54631e3bcc3ba695573dc1cf784e177f32
--- /dev/null
+++ b/maca_morpho/src/maca_morpho_context.h
@@ -0,0 +1,36 @@
+#ifndef __MACA_MORPHO_CONTEXT__
+#define __MACA_MORPHO_CONTEXT__
+
+#include "mcd.h"
+#include <stdlib.h>
+
+/* file name appended to <data path>/<language>/bin/ when no fplm file is given */
+#define DEFAULT_FPLM_FILENAME "fplm"
+
+/* Command-line state filled by context_read_options(). */
+typedef struct {
+  int help;                /* set by -h */
+  int verbose;             /* set by -v */
+  int debug_mode;          /* set by -d */
+  char *program_name;      /* copy of argv[0] */
+  char *fplm_filename;     /* -f, or the default location */
+  char *language;          /* -L, "fr" by default */
+  char *maca_data_path;    /* -D */
+  char *fm_filename;       /* -F */
+  char *features_filename; /* -x */
+  char *cfw_filename;      /* -w */
+} context;
+
+context *context_new(void);
+void context_free(context *ctx);
+
+context *context_read_options(int argc, char *argv[]);
+void context_general_help_message(context *ctx);
+void context_language_help_message(context *ctx);
+void context_fplm_help_message(context *ctx);
+void context_maca_data_path_help_message(context *ctx);
+void context_fm_help_message(context *ctx);
+void context_features_filename_help_message(context *ctx);
+void context_weights_matrix_filename_help_message(context *ctx);
+void context_features_model_help_message(context *ctx);
+#endif
diff --git a/maca_morpho/src/maca_morpho_feat_fct.c b/maca_morpho/src/maca_morpho_feat_fct.c
new file mode 100644
index 0000000000000000000000000000000000000000..30b5ccab5b62b1350b4b33e84798eb90a8fde42f
--- /dev/null
+++ b/maca_morpho/src/maca_morpho_feat_fct.c
@@ -0,0 +1,33 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"feat_lib.h"
+
+/*
+ * Return the character found `offset` positions before the end of the string
+ * (1 is the last character), or -1 when input is NULL or has fewer than
+ * `offset` characters.
+ */
+static int suffix_char(void *input, size_t offset)
+{
+  const char *form = (const char *)input;
+  size_t len;
+
+  if(form == NULL) return -1;
+  len = strlen(form);
+  return (len < offset) ? -1 : form[len - offset];
+}
+
+/* s1: last character of the form, s2: the one before it */
+int s1(void *input){return suffix_char(input, 1);}
+int s2(void *input){return suffix_char(input, 2);}
+
+/* Build the feature library in which s1 and s2 are registered under their own names. */
+feat_lib *feat_lib_build(void)
+{
+  feat_lib *fl = feat_lib_new();
+
+  feat_lib_add(fl, 1, (char *)"s1", s1);
+  feat_lib_add(fl, 1, (char *)"s2", s2);
+  return fl;
+}
diff --git a/maca_morpho/src/predict.c b/maca_morpho/src/predict.c
new file mode 100644
index 0000000000000000000000000000000000000000..af6de629ca83b31fc962e4f478544fd4ea78081f
--- /dev/null
+++ b/maca_morpho/src/predict.c
@@ -0,0 +1,59 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "maca_morpho_context.h"
+#include "feat_model.h"
+#include "feat_vec.h"
+#include "dico.h"
+#include "util.h"
+#include "vectorize.h"
+#include "feature_table.h"
+
+/* Call the help function of every option listed below, then exit with status 1. */
+void predict_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_language_help_message(ctx);
+  context_fplm_help_message(ctx);
+  context_maca_data_path_help_message(ctx);
+  context_features_filename_help_message(ctx);
+  context_weights_matrix_filename_help_message(ctx);
+  context_features_model_help_message(ctx);
+  exit(1);
+}
+
+
+/*
+ * Read whitespace-separated forms on stdin and, for each of them, print the
+ * form, its feature vector and the class returned by feature_table_argmax().
+ * Reading stops at end of input or at the form "end".
+ */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  if(ctx->help) predict_help_message(ctx);
+  feature_table *cfw = feature_table_load(ctx->cfw_filename, ctx->verbose);
+  feat_vec *fv = feat_vec_new(10);
+  dico *dico_features = dico_read(ctx->features_filename, 0.5);
+  feat_model *fm = feat_model_read(ctx->fm_filename, feat_lib_build(), ctx->verbose);
+  char form[100];
+  int class;
+  float max;
+
+  /* read first, test next: form is never read uninitialised, %99s keeps the
+     token inside form[100] and a failed read ends the loop */
+  while(fscanf(stdin, "%99s", form) == 1 && strcmp(form, "end")){
+    printf("form = %s\n", form);
+    form2fv(form, fv, fm, dico_features, LOOKUP_MODE);
+    class = feature_table_argmax(fv, cfw, &max);
+    feat_vec_print(stdout, fv);
+    printf("class = %d\n", class);
+  }
+
+  if(ctx->features_filename)
+    dico_print(ctx->features_filename, dico_features);
+
+  context_free(ctx);
+  return 0;
+}
diff --git a/maca_morpho/src/vectorize.c b/maca_morpho/src/vectorize.c
new file mode 100644
index 
0000000000000000000000000000000000000000..f7f43136a5f0c6e1a7d3e53f3ff1ea406d589070
--- /dev/null
+++ b/maca_morpho/src/vectorize.c
@@ -0,0 +1,46 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"vectorize.h"
+
+/*
+ * Return the dictionary code of feature number feat_nb computed on form.
+ *
+ * The feature name is assembled in fm->string: for each element of the
+ * feature, its name followed by the decimal value its function returns for
+ * form. In LOOKUP_MODE the name is looked up with dico_string2int(); in any
+ * other mode it is passed to dico_add().
+ */
+int get_feat_value(feat_model *fm, char *form, dico *dico_features, int feat_nb, int mode)
+{
+  feat_desc *fd = fm->array[feat_nb];
+  int i;
+  int feat_val;
+  char str[16]; /* room for a 32-bit int in decimal, sign and NUL included; snprintf cuts anything longer */
+
+  fm->string[0] = '\0';
+  for(i=0; i < fd->nbelem; i++){
+    strcat(fm->string, fd->array[i]->name);
+    feat_val = fd->array[i]->fct(form);
+    snprintf(str, sizeof str, "%d", feat_val);
+    strcat(fm->string, str);
+  }
+  /* NOTE(review): the size of fm->string is not visible here; confirm it can hold the longest feature name */
+  if(mode == LOOKUP_MODE)
+    return dico_string2int(dico_features, fm->string);
+  return dico_add(dico_features, fm->string);
+}
+
+
+/*
+ * Empty fv, then store in it the code of every feature of fm computed on form.
+ * Returns fv.
+ */
+feat_vec *form2fv(char *form, feat_vec *fv, feat_model *fm, dico *dico_features, int mode)
+{
+  int i;
+  feat_vec_empty(fv);
+  for(i=0; i < fm->nbelem; i++)
+    feat_vec_add(fv, get_feat_value(fm, form, dico_features, i, mode));
+  return fv;
+}
diff --git a/maca_morpho/src/vectorize.h b/maca_morpho/src/vectorize.h
new file mode 100644
index 0000000000000000000000000000000000000000..c859605c68cc9cbcfdc0ad169871047acfd0bec0
--- /dev/null
+++ b/maca_morpho/src/vectorize.h
@@ -0,0 +1,15 @@
+#ifndef __VECTORIZE__
+#define __VECTORIZE__
+
+#include"dico.h"
+#include"feat_model.h"
+#include"feat_vec.h"
+
+/* values of the mode argument: look feature names up, or add them to the dictionary */
+#define LOOKUP_MODE 1
+#define ADD_MODE 2
+
+int get_feat_value(feat_model *fm, char *form, dico *dico_features, int feat_nb, int mode);
+feat_vec *form2fv(char *form, feat_vec *fv, feat_model *fm, dico *dico_features, int mode);
+
+#endif