diff --git a/maca_common/include/word_buffer.h b/maca_common/include/word_buffer.h index 91b216dff5257671fafd3f6ecbbeeb174e4350a8..0c30a9aaf4d4b38979f05a1300c5a173195df99f 100644 --- a/maca_common/include/word_buffer.h +++ b/maca_common/include/word_buffer.h @@ -29,11 +29,6 @@ #define word_buffer_is_last(wb) (((wb)->current_index == (wb)->nbelem - 1)? 1 : 0) #define word_buffer_is_empty(wb) (((wb)->nbelem == 0)? 1 : 0) - - - - - typedef struct { int size; /* size of the array used to store words */ int nbelem; /* number of words in the buffer */ diff --git a/maca_common/src/trie.c b/maca_common/src/trie.c index 6494c2286da458ad2cdda966ac9db6a5a7165bee..150bdae8858af9b8f214ed1b0ad6866702fe0b64 100644 --- a/maca_common/src/trie.c +++ b/maca_common/src/trie.c @@ -93,7 +93,6 @@ void trie_add_word(trie *t, int *word, int length) trie_trans *current_trans = NULL; int transition_exists = 1; int destination; - int i; while((current_index < length) && transition_exists){ transition_exists = 0; diff --git a/maca_lexer/src/context.h b/maca_lexer/src/context.h index 1ad410d176332e67904789949d0720778f6b6030..697b67ee00d7ab0f0ff12da48b664c01ada76a46 100644 --- a/maca_lexer/src/context.h +++ b/maca_lexer/src/context.h @@ -37,4 +37,9 @@ void context_mcd_help_message(context *ctx); void context_form_column_help_message(context *ctx); void context_pos_column_help_message(context *ctx); +void context_input_help_message(context *ctx); +void context_mwe_token_separator_help_message(context *ctx); +void context_mwe_filename_help_message(context *ctx); +void context_vocab_help_message(context *ctx); + #endif diff --git a/maca_lexer/src/extract_mwe_from_fplm.c b/maca_lexer/src/extract_mwe_from_fplm.c index 6dec0cbea129e1ad1f421d9f491bf138126bda1e..800bed0478d786d5df37bd72d3e0562a0b12e1ef 100644 --- a/maca_lexer/src/extract_mwe_from_fplm.c +++ b/maca_lexer/src/extract_mwe_from_fplm.c @@ -23,7 +23,6 @@ dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int deb char 
pos[1000]; char lemma[1000]; char morpho[1000]; - int num = 0; char buffer[10000]; FILE *f= myfopen(fplm_filename, "r"); int fields_nb; diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c index 04e56bfa04282a0b316774bb4f94e1c7f0e68b8b..b0966453a6050b4febc293302ccb742427884cfd 100644 --- a/maca_lexer/src/maca_lexer.c +++ b/maca_lexer/src/maca_lexer.c @@ -42,16 +42,12 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in int main(int argc, char *argv[]) { char buffer[10000]; - char *buffer_copy; - char *form; int form_code; context *ctx; - int form_column; + /* int form_column; */ FILE *f = NULL; trie *mwe_trie; dico *d_mwe_tokens = NULL; - int origin_state = 0; - int destination_state = 0; int states_array[100]; int symbols_array[100]; int path_index = 0; @@ -60,12 +56,12 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_lexer_check_options(ctx); - + /* if(ctx->form_column != -1) form_column = ctx->form_column; else form_column = ctx->mcd_struct->wf2col[MCD_WF_FORM]; - + */ if(ctx->input_filename == NULL) f = stdin; else diff --git a/maca_tokenizer/CMakeLists.txt b/maca_tokenizer/CMakeLists.txt index f100c0b072b876f9f2c202d8b4dce74b88dd0865..b2e874ca1fb06951ff72b2a18986de00673e3992 100644 --- a/maca_tokenizer/CMakeLists.txt +++ b/maca_tokenizer/CMakeLists.txt @@ -4,13 +4,13 @@ FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex set(SOURCES ./src/context.c ${FLEX_fr_tok_rules_OUTPUTS} ${FLEX_en_tok_rules_OUTPUTS}) + ##compiling library include_directories(./src) add_library(maca_tokenizer_lib STATIC ${SOURCES}) - - +#compiling, linking and installing executables include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_executable(maca_tokenizer ./src/maca_tokenizer.c) diff --git a/maca_tokenizer/src/context.c b/maca_tokenizer/src/context.c index 25a3414280e35f3ea9c704466468cf4678ea60db..9c9236bab08ac75200c457759ded3a78611046e3 100644 --- 
a/maca_tokenizer/src/context.c +++ b/maca_tokenizer/src/context.c @@ -7,7 +7,6 @@ #include "util.h" -void context_set_linguistic_resources_filenames(context *ctx); void context_free(context *ctx) { @@ -109,41 +108,13 @@ context *context_read_options(int argc, char *argv[]) } } - context_set_linguistic_resources_filenames(ctx); - - if(ctx->mcd_filename) ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); - + /* if(ctx->mcd_filename == NULL) - /* ctx->mcd_struct = mcd_build_conll07(); */ ctx->mcd_struct = mcd_build_wplgf(); - + */ return ctx; } -void context_set_linguistic_resources_filenames(context *ctx) -{ - char absolute_path[500]; - char absolute_filename[500]; - - absolute_path[0] = '\0'; - - if(ctx->maca_data_path) - strcat(absolute_path, ctx->maca_data_path); - else { - char *e = getenv("MACAON_DIR"); - if (e != NULL) { - strcat(absolute_path, e); - } else { - fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n"); - } - } - - - strcat(absolute_path, "/"); - strcat(absolute_path, ctx->language); - strcat(absolute_path, "/bin/"); - -} diff --git a/maca_tokenizer/src/context.h b/maca_tokenizer/src/context.h index f9c3ce0ff164bce21d166069ecf10398fdd416dc..ce7e8f1d68cfd1f1137a59906142e631e0343f95 100644 --- a/maca_tokenizer/src/context.h +++ b/maca_tokenizer/src/context.h @@ -29,5 +29,6 @@ void context_conll_help_message(context *ctx); void context_language_help_message(context *ctx); void context_maca_data_path_help_message(context *ctx); void context_mcd_help_message(context *ctx); +void context_input_help_message(context *ctx); #endif diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index dd6055bf731fa8edc997dc4e91a535b0a0c7ecb3..7e7fba8a832c30fd94c7eef96f23fa68212023b6 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -8,19 +8,32 @@ extern int defait_amalgames; %option noyywrap %s state_defait_amalgames -%s state_num %% if(defait_amalgames){ 
BEGIN(state_defait_amalgames); } -<state_num>[0-9]*,[0-9]* printf("%s", yytext); -[ \t]+ printf("\n"); -\. printf("\n."); -\, printf("\n,"); -' printf("'\n"); -ā printf("'\n"); -\n+ printf("\n"); +[0-9]+,[0-9]* printf("%s", yytext); +[ \t]+ printf("\n"); +\. printf("\n."); +\, printf("\n,"); +⦠printf("\nā¦"); +' printf("'\n"); +ā printf("'\n"); +-je printf("\n-je"); +-tu printf("\n-tu"); +-on printf("\n-on"); +-ce printf("\n-ce"); +-t-il printf("\n-t-il"); +-il printf("\n-il"); +-t-ils printf("\n-t-ils"); +-ils printf("\n-ils"); +-t-elle printf("\n-t-elle"); +-elle printf("\n-elle"); +-t-elles printf("\n-t-elles"); +-elles printf("\n-elles"); +\n+ printf("\n"); + <state_defait_amalgames>{ " du " printf("\nde\nle\n"); " des " printf("\nde\nles\n"); diff --git a/maca_tokenizer/src/maca_tokenizer.c b/maca_tokenizer/src/maca_tokenizer.c index d9e9a1836c48e8c1e4ec5f678a4e7f4087747dda..6768e5da317a68bb98da83331151f63d61f1ffcc 100644 --- a/maca_tokenizer/src/maca_tokenizer.c +++ b/maca_tokenizer/src/maca_tokenizer.c @@ -3,6 +3,9 @@ #include<string.h> #include"context.h" +int enlex(void); +int frlex(void); + int defait_amalgames = 0; void maca_tokenizer_help_message(context *ctx) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 85acc249c706395b5bcd9a0512765990736c8f9e..72c2d61ab47097612cca44a7d908d4406fd2129a 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -164,6 +164,9 @@ void context_f2p_filename_help_message(context *ctx){ void context_trace_mode_help_message(context *ctx){ fprintf(stderr, "\t-T --traces : activate trace mode (default is false)\n"); } +void context_debug_help_message(context *ctx){ + fprintf(stderr, "\t-d --debug : activate debug mode (default is false)\n"); +} context *context_read_options(int argc, char *argv[]) { diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 611dd10d09b88a0a5c13d0ee66cafd52a1550519..932e6717e28ebcec122951f69eb69ba05723869f 
100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -102,4 +102,7 @@ void context_ifpls_help_message(context *ctx); void context_input_help_message(context *ctx); void context_root_label_help_message(context *ctx); +void context_debug_help_message(context *ctx); + + #endif diff --git a/maca_trans_parser/src/maca_trans_parser.c b/maca_trans_parser/src/maca_trans_parser.c index 3ecbcd4a0d74980a0b13ba6f8df7195db96411a8..fa44e9b51c6be52cb019d0ca4abde650c649cd67 100644 --- a/maca_trans_parser/src/maca_trans_parser.c +++ b/maca_trans_parser/src/maca_trans_parser.c @@ -13,11 +13,12 @@ /*#include"dnn_decoder.h"*/ #include"config2feat_vec.h" -void decode_help_message(context *ctx) +void maca_trans_parser_help_message(context *ctx) { context_general_help_message(ctx); /* context_beam_help_message(ctx); */ /* context_conll_help_message(ctx); */ + context_debug_help_message(ctx); fprintf(stderr, "INPUT\n"); context_input_help_message(ctx); context_mcd_help_message(ctx); @@ -27,7 +28,7 @@ void decode_help_message(context *ctx) context_root_label_help_message(ctx); } -void decode_check_options(context *ctx){ +void maca_trans_parser_check_options(context *ctx){ if(ctx->help /*!ctx->conll_filename*/ /* || !ctx->perc_model_filename @@ -35,7 +36,7 @@ void decode_check_options(context *ctx){ || !ctx->vocabs_filename || !ctx->features_model_filename*/ ){ - decode_help_message(ctx); + maca_trans_parser_help_message(ctx); exit(1); } } @@ -83,7 +84,7 @@ int main(int argc, char *argv[]) context *ctx; ctx = context_read_options(argc, argv); - decode_check_options(ctx); + maca_trans_parser_check_options(ctx); set_linguistic_resources_filenames_parser(ctx); ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); diff --git a/maca_trans_parser/src/maca_trans_tagger_bt.c b/maca_trans_parser/src/maca_trans_tagger_bt.c new file mode 100644 index 0000000000000000000000000000000000000000..778c634e64bdd04fa5d2ecd8442ea768a8423e0b --- 
/dev/null
+++ b/maca_trans_parser/src/maca_trans_tagger_bt.c
@@ -0,0 +1,126 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"context.h"
+#include"feat_fct.h"
+#include"feature_table.h"
+#include"dico.h"
+#include"beam.h"
+#include"form2pos.h"
+#include"simple_decoder_tagger.h"
+/*#include"dnn_decoder.h"*/
+#include"config2feat_vec.h"
+
+/* Prints the help of the tagger: general, beam and conll options, then the INPUT section. */
+void decode_tagger_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_beam_help_message(ctx);
+  context_conll_help_message(ctx);
+  fprintf(stderr, "INPUT\n");
+  context_input_help_message(ctx);
+  context_mcd_help_message(ctx);
+  context_model_help_message(ctx);
+  context_vocabs_help_message(ctx);
+  context_features_model_help_message(ctx);
+  context_f2p_filename_help_message(ctx);
+}
+
+/* Prints the help message and exits with status 1 when ctx->help is set. */
+void decode_tagger_check_options(context *ctx){
+  if(ctx->help
+     /*!ctx->conll_filename*/
+     /*     || !ctx->perc_model_filename
+     || !ctx->mcd_filename
+     || !ctx->vocabs_filename
+     || !ctx->features_model_filename*/
+     ){
+    decode_tagger_help_message(ctx);
+    exit(1);
+  }
+}
+
+/*
+ * Returns a heap-allocated string made of data_path followed by default_name.
+ * The buffer is sized from the two lengths, so a long data path cannot
+ * overflow it. Exits with status 1 when data_path is NULL or when the
+ * allocation fails.
+ */
+static char *decode_tagger_default_filename(const char *data_path, const char *default_name)
+{
+  size_t length;
+  char *filename;
+
+  if(data_path == NULL){
+    fprintf(stderr, "ERROR: maca_data_path is not set, cannot locate %s\n", default_name);
+    exit(1);
+  }
+
+  length = strlen(data_path) + strlen(default_name) + 1;
+  filename = malloc(length);
+  if(filename == NULL){
+    fprintf(stderr, "ERROR: memory allocation failed\n");
+    exit(1);
+  }
+  snprintf(filename, length, "%s%s", data_path, default_name);
+  return filename;
+}
+
+/*
+ * Gives a default value, built from ctx->maca_data_path, to each of the
+ * perceptron model, vocabs, features model and f2p filenames that is still
+ * unset, then prints all the filenames on stderr in verbose mode.
+ */
+void decode_tagger_set_linguistic_resources_filenames(context *ctx)
+{
+  if(!ctx->perc_model_filename)
+    ctx->perc_model_filename = decode_tagger_default_filename(ctx->maca_data_path, DEFAULT_MODEL_TAGGER_FILENAME);
+
+  if(!ctx->vocabs_filename)
+    ctx->vocabs_filename = decode_tagger_default_filename(ctx->maca_data_path, DEFAULT_VOCABS_TAGGER_FILENAME);
+
+  /* no default is given to ctx->mcd_filename (its assignment was commented out) */
+
+  if(!ctx->features_model_filename)
+    ctx->features_model_filename = decode_tagger_default_filename(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_TAGGER_FILENAME);
+
+  if(!ctx->f2p_filename){
+    ctx->f2p_filename = decode_tagger_default_filename(ctx->maca_data_path, DEFAULT_F2P_FILENAME);
+    /* NOTE(review): f2p is read only on this default path; presumably
+       context_read_options reads it when a filename is supplied -- confirm */
+    ctx->f2p = form2pos_read(ctx->f2p_filename);
+  }
+
+  if(ctx->verbose){
+    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
+    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
+    /* mcd_filename may still be NULL, and passing NULL to %s is undefined */
+    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
+    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
+    fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
+  }
+}
+
+/*
+ * Reads the options, sets the resource filenames, loads the features model
+ * and the vocabularies, links them to the mcd structure, runs
+ * simple_decoder_tagger2 and frees the context.
+ */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  decode_tagger_check_options(ctx);
+
+  decode_tagger_set_linguistic_resources_filenames(ctx);
+  ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);
+  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+
+  simple_decoder_tagger2(ctx);
+
+  context_free(ctx);
+  return 0;
+}
+
diff --git a/perceptron/lib/include/cf_file.h b/perceptron/lib/include/cf_file.h
index 177b5e673513f5d6b577111081ab25407161cff6..d76f58f154468740c7fcd6c4329c01a6608ff750 100644
--- a/perceptron/lib/include/cf_file.h
+++ b/perceptron/lib/include/cf_file.h
@@ -3,6 +3,7 @@
 
 void look_for_number_of_features_and_classes(char *filename, int *max_feat, int *max_class);
 int look_for_number_of_features(char *filename);
+int look_for_number_of_examples(char *filename);
 int *count_occ_of_features(char *filename, int *n_feat);
 int cff_look_for_number_of_columns(char *cff_filename);
 int *cff_max_value_per_column(char *cff_filename, int n);
diff --git a/perceptron/lib/include/feat_vec.h b/perceptron/lib/include/feat_vec.h
index 4116d7652be8bdf4dd5922f936c40686cf28b67b..e1dfe919b0415ccb463762324a1c57642726250a 100644
--- a/perceptron/lib/include/feat_vec.h
+++ 
b/perceptron/lib/include/feat_vec.h @@ -13,16 +13,14 @@ typedef struct { /*#include "word_emb.h"*/ #include "mcd.h" - -void feat_vec_concat(feat_vec *fv1, feat_vec *fv2); +void feat_vec_concat(feat_vec *fv1, feat_vec *fv2); feat_vec *feat_vec_copy(feat_vec *fv); - feat_vec *feat_vec_new(int size); -void feat_vec_free(feat_vec *fv); -int feat_vec_add(feat_vec *fv, int feat); -void feat_vec_empty(feat_vec *fv); -void feat_vec_print_string(feat_vec *fv, dico *dico_features); -void feat_vec_print(FILE *f, feat_vec *fv); +void feat_vec_free(feat_vec *fv); +int feat_vec_add(feat_vec *fv, int feat); +void feat_vec_empty(feat_vec *fv); +void feat_vec_print_string(feat_vec *fv, dico *dico_features); +void feat_vec_print(FILE *f, feat_vec *fv); /* void feat_vec_print_dnn(FILE *f, feat_vec *fv, feat_model *fm, mcd *m); */ /* void feat_vec_fill_input_array_dnn(fann_type *input_array, feat_vec *fv, feat_model *fm, mcd *m); */ /* void feat_vec_fill_input_array_dnn(float *input_array, feat_vec *fv, feat_model *fm, mcd *m); */ diff --git a/perceptron/lib/include/feature_table.h b/perceptron/lib/include/feature_table.h index ef1bbe66e1b47d616b138a47daacfd153b6dd174..97abb90e06cbed385b5ebe9cd3acfcd72f8b3cc5 100644 --- a/perceptron/lib/include/feature_table.h +++ b/perceptron/lib/include/feature_table.h @@ -17,15 +17,15 @@ typedef struct { } vcode; feature_table *feature_table_load(char *filename, int verbose); -void feature_table_dump(char *filename, feature_table *ft); +void feature_table_dump(char *filename, feature_table *ft); feature_table *feature_table_new(int features_nb, int classes_nb); -void feature_table_print(char *filename, feature_table *ft); -void feature_table_print_verbose(char *filename, feature_table *ft, dico *dico_features, dico *dico_classes); -int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max); -float feature_table_entropy(feat_vec *fv, feature_table *ft); -float feature_table_diff_scores(feat_vec *fv, feature_table *ft); -float 
feature_table_argmax_1_2(feat_vec *fv, feature_table *ft, int *argmax1, float *max1, int *argmax2, float *max2);
-void feature_table_free(feature_table *ft);
-void feature_table_scores(feat_vec *fv, feature_table *ft, float *classes_score);
-vcode* feature_table_get_vcode_array(feat_vec *fv, feature_table* ft);
+void feature_table_print(char *filename, feature_table *ft);
+void feature_table_print_verbose(char *filename, feature_table *ft, dico *dico_features, dico *dico_classes);
+int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max);
+float feature_table_entropy(feat_vec *fv, feature_table *ft);
+float feature_table_diff_scores(feat_vec *fv, feature_table *ft);
+float feature_table_argmax_1_2(feat_vec *fv, feature_table *ft, int *argmax1, float *max1, int *argmax2, float *max2);
+void feature_table_free(feature_table *ft);
+void feature_table_scores(feat_vec *fv, feature_table *ft, float *classes_score);
+vcode *feature_table_get_vcode_array(feat_vec *fv, feature_table* ft);
 #endif
diff --git a/perceptron/lib/src/cf_file.c b/perceptron/lib/src/cf_file.c
index 8a463f42874d70c3c8cd65ecd789b4b2790b28f9..9afcf344fe2a8934f5dd0b71f52d968552a1c0ac 100644
--- a/perceptron/lib/src/cf_file.c
+++ b/perceptron/lib/src/cf_file.c
@@ -75,6 +75,30 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int
   fclose(f);
 }
 
+/*
+ * Returns the number of examples in filename, counted as the number of
+ * successful fgets() reads (one per line; a line longer than the buffer is
+ * read, and therefore counted, in several pieces).
+ * Returns 0 and reports on stderr when the file cannot be opened, instead of
+ * handing a NULL stream to fgets().
+ */
+int look_for_number_of_examples(char *filename)
+{
+  char buffer[10000];
+  FILE *f = filename ? fopen(filename, "r") : NULL;
+  int number = 0;
+
+  if(f == NULL){
+    fprintf(stderr, "look_for_number_of_examples: cannot open %s\n", filename ? filename : "(null)");
+    return 0;
+  }
+
+  while(fgets(buffer, sizeof buffer, f))
+    number++;
+  fclose(f);
+  return number;
+}
+
 int look_for_number_of_features(char *filename)
 {
   char buffer[10000];
diff --git a/perceptron/lib/src/feature_table.c b/perceptron/lib/src/feature_table.c
index 5c2384aa457685b2c68faa96577e8e29139a9152..248a1b32ba2298d4021728c7660e55922462cf56 100644
--- a/perceptron/lib/src/feature_table.c
+++ b/perceptron/lib/src/feature_table.c
@@ -8,8 +8,6 @@
 feature_table *feature_table_load(char *filename, int verbose)
 {
   int i;
-
-
feature_table *ft = NULL; int features_nb; int classes_nb; diff --git a/perceptron/lib/src/perceptron.c b/perceptron/lib/src/perceptron.c index 3425a3c396adf67c8d18a88f8066d911146edd2b..284c295897625ae10f41b5d35e0753767a99f74a 100644 --- a/perceptron/lib/src/perceptron.c +++ b/perceptron/lib/src/perceptron.c @@ -2,6 +2,7 @@ #include<stdlib.h> #include<string.h> #include"feature_table.h" +#include"cf_file.h" #include"util.h" void perceptron_avg(char *filename, feature_table *ft, int n_iter) @@ -21,8 +22,8 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter) feat_vec *fv = feat_vec_new(1); char *token; feature_table *ft_sum = feature_table_new(ft->features_nb, ft->classes_nb); - int counter = 1; - + int counter = 1; + for(epoch = 0; epoch < n_iter; epoch++){ fprintf(stderr, "[%d]", epoch + 1); f = fopen(filename, "r"); @@ -35,10 +36,11 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter) /* printf("token = %s\n", token); */ feat_vec_add(fv, atoi(token)); } - + + for(cla=0; cla < classes_nb; cla++) classes_score[cla] = 0; - + for(feat=0; feat < fv->nb; feat++) for(cla=0; cla < classes_nb; cla++) if(fv->t[feat] != -1) @@ -80,7 +82,7 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter) for(i=0; i< ft->features_nb; i++) for(j=0; j< ft->classes_nb; j++) - ft->table[i][j] -= 1/(float)counter * ft_sum->table[i][j]; + ft->table[i][j] -= 1/(float)counter * ft_sum->table[i][j]; free(classes_score); feat_vec_free(fv);