diff --git a/maca_common/src/trie.c b/maca_common/src/trie.c index 6494c2286da458ad2cdda966ac9db6a5a7165bee..150bdae8858af9b8f214ed1b0ad6866702fe0b64 100644 --- a/maca_common/src/trie.c +++ b/maca_common/src/trie.c @@ -93,7 +93,6 @@ void trie_add_word(trie *t, int *word, int length) trie_trans *current_trans = NULL; int transition_exists = 1; int destination; - int i; while((current_index < length) && transition_exists){ transition_exists = 0; diff --git a/maca_lexer/src/context.h b/maca_lexer/src/context.h index 1ad410d176332e67904789949d0720778f6b6030..697b67ee00d7ab0f0ff12da48b664c01ada76a46 100644 --- a/maca_lexer/src/context.h +++ b/maca_lexer/src/context.h @@ -37,4 +37,9 @@ void context_mcd_help_message(context *ctx); void context_form_column_help_message(context *ctx); void context_pos_column_help_message(context *ctx); +void context_input_help_message(context *ctx); +void context_mwe_token_separator_help_message(context *ctx); +void context_mwe_filename_help_message(context *ctx); +void context_vocab_help_message(context *ctx); + #endif diff --git a/maca_lexer/src/extract_mwe_from_fplm.c b/maca_lexer/src/extract_mwe_from_fplm.c index 6dec0cbea129e1ad1f421d9f491bf138126bda1e..800bed0478d786d5df37bd72d3e0562a0b12e1ef 100644 --- a/maca_lexer/src/extract_mwe_from_fplm.c +++ b/maca_lexer/src/extract_mwe_from_fplm.c @@ -23,7 +23,6 @@ dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int deb char pos[1000]; char lemma[1000]; char morpho[1000]; - int num = 0; char buffer[10000]; FILE *f= myfopen(fplm_filename, "r"); int fields_nb; diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c index 04e56bfa04282a0b316774bb4f94e1c7f0e68b8b..b0966453a6050b4febc293302ccb742427884cfd 100644 --- a/maca_lexer/src/maca_lexer.c +++ b/maca_lexer/src/maca_lexer.c @@ -42,16 +42,12 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in int main(int argc, char *argv[]) { char buffer[10000]; - char *buffer_copy; - char 
*form; int form_code; context *ctx; - int form_column; + /* int form_column; */ FILE *f = NULL; trie *mwe_trie; dico *d_mwe_tokens = NULL; - int origin_state = 0; - int destination_state = 0; int states_array[100]; int symbols_array[100]; int path_index = 0; @@ -60,12 +56,12 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_lexer_check_options(ctx); - + /* if(ctx->form_column != -1) form_column = ctx->form_column; else form_column = ctx->mcd_struct->wf2col[MCD_WF_FORM]; - + */ if(ctx->input_filename == NULL) f = stdin; else diff --git a/maca_tokenizer/CMakeLists.txt b/maca_tokenizer/CMakeLists.txt index f100c0b072b876f9f2c202d8b4dce74b88dd0865..b2e874ca1fb06951ff72b2a18986de00673e3992 100644 --- a/maca_tokenizer/CMakeLists.txt +++ b/maca_tokenizer/CMakeLists.txt @@ -4,13 +4,13 @@ FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex set(SOURCES ./src/context.c ${FLEX_fr_tok_rules_OUTPUTS} ${FLEX_en_tok_rules_OUTPUTS}) + ##compiling library include_directories(./src) add_library(maca_tokenizer_lib STATIC ${SOURCES}) - - +#compiling, linking and installing executables include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_executable(maca_tokenizer ./src/maca_tokenizer.c) diff --git a/maca_tokenizer/src/context.c b/maca_tokenizer/src/context.c index 25a3414280e35f3ea9c704466468cf4678ea60db..9c9236bab08ac75200c457759ded3a78611046e3 100644 --- a/maca_tokenizer/src/context.c +++ b/maca_tokenizer/src/context.c @@ -7,7 +7,6 @@ #include "util.h" -void context_set_linguistic_resources_filenames(context *ctx); void context_free(context *ctx) { @@ -109,41 +108,13 @@ context *context_read_options(int argc, char *argv[]) } } - context_set_linguistic_resources_filenames(ctx); - - if(ctx->mcd_filename) ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); - + /* if(ctx->mcd_filename == NULL) - /* ctx->mcd_struct = mcd_build_conll07(); */ ctx->mcd_struct = mcd_build_wplgf(); - + */ return ctx; } -void 
context_set_linguistic_resources_filenames(context *ctx) -{ - char absolute_path[500]; - char absolute_filename[500]; - - absolute_path[0] = '\0'; - - if(ctx->maca_data_path) - strcat(absolute_path, ctx->maca_data_path); - else { - char *e = getenv("MACAON_DIR"); - if (e != NULL) { - strcat(absolute_path, e); - } else { - fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n"); - } - } - - - strcat(absolute_path, "/"); - strcat(absolute_path, ctx->language); - strcat(absolute_path, "/bin/"); - -} diff --git a/maca_tokenizer/src/context.h b/maca_tokenizer/src/context.h index f9c3ce0ff164bce21d166069ecf10398fdd416dc..ce7e8f1d68cfd1f1137a59906142e631e0343f95 100644 --- a/maca_tokenizer/src/context.h +++ b/maca_tokenizer/src/context.h @@ -29,5 +29,6 @@ void context_conll_help_message(context *ctx); void context_language_help_message(context *ctx); void context_maca_data_path_help_message(context *ctx); void context_mcd_help_message(context *ctx); +void context_input_help_message(context *ctx); #endif diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index dd6055bf731fa8edc997dc4e91a535b0a0c7ecb3..7e7fba8a832c30fd94c7eef96f23fa68212023b6 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -8,19 +8,32 @@ extern int defait_amalgames; %option noyywrap %s state_defait_amalgames -%s state_num %% if(defait_amalgames){ BEGIN(state_defait_amalgames); } -<state_num>[0-9]*,[0-9]* printf("%s", yytext); -[ \t]+ printf("\n"); -\. printf("\n."); -\, printf("\n,"); -' printf("'\n"); -ā printf("'\n"); -\n+ printf("\n"); +[0-9]+,[0-9]* printf("%s", yytext); +[ \t]+ printf("\n"); +\. 
/*
 * Count the examples stored in a file.
 *
 * The file is read with fgets() into a 10000-byte buffer and every
 * successful read counts as one example, so the result is the number of
 * lines as long as no line exceeds the buffer (a longer line is counted
 * once per chunk read).
 *
 * filename: path of the file to scan.
 *
 * Returns the number of reads performed, or 0 when filename is NULL or
 * the file cannot be opened; in both failure cases a message is written
 * to stderr.
 */
int look_for_number_of_examples(char *filename)
{
  char buffer[10000];
  FILE *f;
  int number = 0;

  if(filename == NULL){
    fprintf(stderr, "look_for_number_of_examples: no filename given\n");
    return 0;
  }

  f = fopen(filename, "r");
  if(f == NULL){
    /* calling fgets() or fclose() on a NULL stream is undefined behaviour,
       so report the problem and give up here */
    fprintf(stderr, "look_for_number_of_examples: cannot open file %s\n", filename);
    return 0;
  }

  while(fgets(buffer, sizeof buffer, f))
    number++;

  fclose(f);
  return number;
}