diff --git a/CMakeLists.txt b/CMakeLists.txt
index 389bdf0e217738811da3e08daf8eaac4d5703f23..91554712dcc49e8a121a5f2b672a0e7d8d35ebb5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ add_subdirectory(maca_lexer)
 add_subdirectory(maca_trans_parser)
 add_subdirectory(maca_crf_tagger)
 add_subdirectory(maca_graph_parser)
+add_subdirectory(maca_morpho)
 
 if(MACA_EXPORT)
   add_subdirectory(maca_export)
diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt
index 503e9214c5a8190648c7c237e0e7e5f96b12cd54..e8ca2b06c3475c322f5b9b8415054a23f366ddd6 100644
--- a/maca_common/CMakeLists.txt
+++ b/maca_common/CMakeLists.txt
@@ -9,9 +9,9 @@ set(SOURCES src/util.c
   src/sentence.c
   src/word_buffer.c
   src/trie.c
-	src/feat_desc.c
-	src/feat_lib.c
-	src/feat_model.c
+  src/feat_desc.c
+  src/feat_lib.c
+  src/feat_model.c
  )
 
 
diff --git a/maca_morpho/CMakeLists.txt b/maca_morpho/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0a9b0bc7dc977135bf2d5c94dc9f12333da81c84
--- /dev/null
+++ b/maca_morpho/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(SOURCES
+  src/feat_fct.c
+  src/context.c
+)
+
+
+
+#compiling library
+include_directories(src)
+add_library(maca_morpho STATIC ${SOURCES})
+target_link_libraries(maca_morpho perceptron)
+target_link_libraries(maca_morpho maca_common)
+
+
+
+#compiling, linking and installing executables
+
+add_executable(fplm2cff ./src/fplm2cff.c)
+target_link_libraries(fplm2cff perceptron)
+target_link_libraries(fplm2cff maca_common)
+target_link_libraries(fplm2cff maca_morpho)
+install (TARGETS fplm2cff DESTINATION bin)
+
diff --git a/maca_morpho/src/context.c b/maca_morpho/src/context.c
new file mode 100644
--- /dev/null
+++ b/maca_morpho/src/context.c
@@ -0,0 +1,163 @@
+#include<stdlib.h>
+#include<stdio.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include "context.h"
+#include "util.h"
+
+
+void context_set_linguistic_resources_filenames(context *ctx);
+
+/* store a private copy of src in *dst, releasing the string previously held
+   there: an option given twice, or a default that gets overridden (language),
+   no longer leaks */
+static void context_replace_string(char **dst, const char *src)
+{
+  free(*dst);
+  *dst = strdup(src);
+}
+
+/* release a context and every string it owns (NULL is accepted) */
+void context_free(context *ctx)
+{
+  if(ctx == NULL) return;
+  free(ctx->program_name);
+  free(ctx->fplm_filename);
+  free(ctx->language);
+  free(ctx->maca_data_path);
+  free(ctx->fm_filename);
+  free(ctx);
+}
+
+/* allocate a context holding the default options: flags cleared, language
+   "fr", every other string unset */
+context *context_new(void)
+{
+  context *ctx = (context *)memalloc(sizeof(context));
+
+  ctx->help = 0;
+  ctx->verbose = 0;
+  ctx->debug_mode = 0;
+  ctx->program_name = NULL;
+  ctx->fplm_filename = NULL;
+  ctx->language = strdup("fr");
+  ctx->maca_data_path = NULL;
+  ctx->fm_filename = NULL;
+  return ctx;
+}
+
+/* the *_help_message functions each print their part of the usage text on
+   stderr */
+void context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-d --debug : activate debug mode\n");
+}
+
+void context_fplm_help_message(context *ctx){
+  fprintf(stderr, "\t-f --fplm <file> : fplm (form pos lemma morpho) file\n");
+}
+
+void context_language_help_message(context *ctx){
+  fprintf(stderr, "\t-L --language : identifier of the language to use\n");
+}
+
+void context_maca_data_path_help_message(context *ctx){
+  fprintf(stderr, "\t-M --maca_data_path : path to maca_data directory\n");
+}
+
+void context_fm_help_message(context *ctx){
+  fprintf(stderr, "\t-F --fm <file> : feature model file name\n");
+}
+
+
+/* parse the command line into a freshly allocated context (release it with
+   context_free) and derive the default fplm file name when -f is absent;
+   unknown options are skipped silently since opterr is cleared */
+context *context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = context_new();
+  /* getopt_long walks this table until it meets an all-zero entry: without
+     the terminator it reads past the end of the array */
+  static struct option long_options[] =
+    {
+      {"help", no_argument, 0, 'h'},
+      {"verbose", no_argument, 0, 'v'},
+      {"debug", no_argument, 0, 'd'},
+      {"mcd", required_argument, 0, 'C'}, /* parsed but has no handler below */
+      {"language", required_argument, 0, 'L'},
+      {"fplm", required_argument, 0, 'f'},
+      {"maca_data_path", required_argument, 0, 'D'},
+      {"fm", required_argument, 0, 'F'},
+      {0, 0, 0, 0}
+    };
+
+  context_replace_string(&ctx->program_name, argv[0]);
+  optind = 0;
+  opterr = 0;
+
+  while ((c = getopt_long (argc, argv, "hvdf:L:M:D:F:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'd':
+        ctx->debug_mode = 1;
+        break;
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'f':
+        context_replace_string(&ctx->fplm_filename, optarg);
+        break;
+      case 'L':
+        context_replace_string(&ctx->language, optarg);
+        break;
+      case 'M': /* short form announced by the help text */
+      case 'D':
+        context_replace_string(&ctx->maca_data_path, optarg);
+        break;
+      case 'F':
+        context_replace_string(&ctx->fm_filename, optarg);
+        break;
+      default: /* '?' or an option without handler: ignored */
+        break;
+      }
+  }
+
+  context_set_linguistic_resources_filenames(ctx);
+
+  return ctx;
+}
+
+/* when no fplm file was given, point fplm_filename at
+   <maca_data_path or $MACAON_DIR>/<language>/bin/DEFAULT_FPLM_FILENAME;
+   the name is built in a buffer of exactly the needed size, so long paths
+   cannot overflow a fixed array any more */
+void context_set_linguistic_resources_filenames(context *ctx)
+{
+  const char *root = ctx->maca_data_path;
+  size_t size;
+
+  if(root == NULL){
+    root = getenv("MACAON_DIR");
+    if(root == NULL){
+      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
+      root = "";
+    }
+  }
+
+  if(ctx->fplm_filename) return;
+
+  size = strlen(root) + strlen(ctx->language) + strlen(DEFAULT_FPLM_FILENAME)
+    + sizeof("//bin/");
+  ctx->fplm_filename = (char *)memalloc(size);
+  snprintf(ctx->fplm_filename, size, "%s/%s/bin/%s", root, ctx->language, DEFAULT_FPLM_FILENAME);
+}
diff --git a/maca_morpho/src/context.h b/maca_morpho/src/context.h
new file mode 100644
--- /dev/null
+++ b/maca_morpho/src/context.h
@@ -0,0 +1,41 @@
+#ifndef MACA_MORPHO_CONTEXT_H
+#define MACA_MORPHO_CONTEXT_H
+
+#include "mcd.h"
+#include <stdlib.h>
+
+#define DEFAULT_FPLM_FILENAME "fplm"
+
+
+
+/* options of the maca_morpho programs, filled by context_read_options */
+typedef struct {
+  int help;             /* set by -h */
+  int verbose;          /* set by -v */
+  int debug_mode;       /* set by -d */
+  char *program_name;   /* copy of argv[0] */
+  char *fplm_filename;  /* -f, or the default built from the data path */
+  char *language;       /* -L, "fr" by default */
+  char *maca_data_path; /* -D / -M */
+  char *fm_filename;    /* -F, NULL when absent */
+} context;
+
+
+
+context *context_new(void);
+void context_free(context *ctx);
+
+context *context_read_options(int argc, char *argv[]);
+void context_general_help_message(context *ctx);
+void context_language_help_message(context *ctx);
+void context_fplm_help_message(context *ctx);
+void context_maca_data_path_help_message(context *ctx);
+void context_fm_help_message(context *ctx);
+
+/* declared here but not defined in context.c */
+void context_conll_help_message(context *ctx);
+void context_mcd_help_message(context *ctx);
+void context_form_column_help_message(context *ctx);
+void context_pos_column_help_message(context *ctx);
+
+#endif
diff --git a/maca_morpho/src/feat_fct.c b/maca_morpho/src/feat_fct.c
new file mode 100644
--- /dev/null
+++ b/maca_morpho/src/feat_fct.c
@@ -0,0 +1,34 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"feat_lib.h"
+
+
+
+/* character found pos places before the end of the string input (pos = 1 is
+   the last character); -1 when input is NULL or shorter than pos characters,
+   instead of reading in front of the buffer */
+static int char_from_end(void *input, size_t pos)
+{
+  const char *form = (const char *)input;
+  size_t length;
+
+  if(form == NULL) return -1;
+  length = strlen(form);
+  return (length < pos)? -1 : form[length - pos];
+}
+
+/* p1: last character of the form, p2: the one before it */
+int p1(void *input){return char_from_end(input, 1);}
+int p2(void *input){return char_from_end(input, 2);}
+
+/* feature library giving access to p1 and p2 under their names */
+feat_lib *feat_lib_build(void)
+{
+  feat_lib *fl = feat_lib_new();
+
+  feat_lib_add(fl, 1, (char *)"p1", p1);
+  feat_lib_add(fl, 1, (char *)"p2", p2);
+  return fl;
+}
+
diff --git a/maca_morpho/src/fplm2cff.c b/maca_morpho/src/fplm2cff.c
new file mode 100644
--- /dev/null
+++ b/maca_morpho/src/fplm2cff.c
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "context.h"
+#include "feat_lib.h"
+#include "feat_model.h"
+#include "feat_vec.h"
+#include "dico.h"
+#include "util.h"
+
+#define LOOKUP_MODE 1
+#define TRAIN_MODE 2
+
+/* defined in feat_fct.c, which has no header: without a prototype the call
+   in main is an implicit declaration returning int */
+feat_lib *feat_lib_build(void);
+
+/* code of feature number feat_nb of the model fm computed on form; in
+   LOOKUP_MODE the code is looked up with dico_string2int, in any other mode
+   the feature is added to dico_features with dico_add */
+int get_feat_value(feat_model *fm, char *form, dico *dico_features, int feat_nb, int mode)
+{
+  feat_desc *fd = fm->array[feat_nb];
+  int i;
+  int feat_val;
+  char str[32]; /* a 32-bit int needs up to 12 bytes in decimal, the former 10 could overflow */
+
+  /* the name of the feature is built in fm->string and its value in the dictionary (dico_features) is returned */
+  /* NOTE(review): the capacity of fm->string is not visible here, the strcat calls rely on it being large enough */
+  fm->string[0] = '\0';
+  for(i=0; i < fd->nbelem; i++){
+    strcat(fm->string, fd->array[i]->name);
+    feat_val = fd->array[i]->fct(form);
+    snprintf(str, sizeof(str), "%d", feat_val);
+    strcat(fm->string, str);
+  }
+  if(mode == LOOKUP_MODE)
+    return dico_string2int(dico_features, fm->string);
+  return dico_add(dico_features, fm->string);
+}
+
+
+/* empty fv and fill it with the codes of all the features of fm computed on
+   form (TRAIN_MODE: the features are added to dico_features); returns fv */
+feat_vec *form2fv(char *form, feat_vec *fv, feat_model *fm, dico *dico_features)
+{
+  int i;
+  feat_vec_empty(fv);
+  for(i=0; i < fm->nbelem; i++)
+    feat_vec_add(fv, get_feat_value(fm, form, dico_features, i, TRAIN_MODE));
+  return fv;
+}
+
+
+/* read whitespace separated forms on stdin, print the feature vector of each
+   of them on stdout, then hand the feature dictionary to dico_print;
+   reading stops at end of input or after the form "end" */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  feat_vec *fv;
+  dico *dico_features;
+  feat_model *fm;
+  char form[100];
+
+  if(ctx->help){
+    context_general_help_message(ctx);
+    context_fplm_help_message(ctx);
+    context_language_help_message(ctx);
+    context_maca_data_path_help_message(ctx);
+    context_fm_help_message(ctx);
+    context_free(ctx);
+    return 0;
+  }
+  if(ctx->fm_filename == NULL){
+    fprintf(stderr, "%s: a feature model file is required (option -F)\n", ctx->program_name);
+    context_free(ctx);
+    return 1;
+  }
+
+  fv = feat_vec_new(10);
+  dico_features = dico_new("dico_features", 1000);
+  fm = feat_model_read(ctx->fm_filename, feat_lib_build(), ctx->verbose);
+
+  /* %99s keeps a long token inside form[100]; the return value of fscanf is
+     tested so that end of input stops the loop */
+  while(fscanf(stdin, "%99s", form) == 1){
+    printf("form = %s\n", form);
+    form2fv(form, fv, fm, dico_features);
+    feat_vec_print(stdout, fv);
+    /* the form "end" is processed like the others, then ends the loop */
+    if(!strcmp(form, "end")) break;
+  }
+  dico_print("xx", dico_features);
+
+  /* NOTE(review): fv, dico_features and fm are not released here, their
+     destructors are not visible from this file */
+  context_free(ctx);
+  return 0;
+}