From 8c017b3650466465a8299b565a71718d7063fe79 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Thu, 1 Dec 2016 14:32:53 -0500 Subject: [PATCH] code refactoring of maca_lexer and maca_tokenizer --- maca_common/src/trie.c | 1 - maca_lexer/src/context.h | 5 ++++ maca_lexer/src/extract_mwe_from_fplm.c | 1 - maca_lexer/src/maca_lexer.c | 10 +++----- maca_tokenizer/CMakeLists.txt | 4 ++-- maca_tokenizer/src/context.c | 33 ++------------------------ maca_tokenizer/src/context.h | 1 + maca_tokenizer/src/fr_tok_rules.l | 29 +++++++++++++++------- maca_tokenizer/src/maca_tokenizer.c | 3 +++ perceptron/lib/include/cf_file.h | 1 + perceptron/lib/src/cf_file.c | 12 ++++++++++ perceptron/lib/src/perceptron.c | 12 ++++++---- 12 files changed, 57 insertions(+), 55 deletions(-) diff --git a/maca_common/src/trie.c b/maca_common/src/trie.c index 6494c22..150bdae 100644 --- a/maca_common/src/trie.c +++ b/maca_common/src/trie.c @@ -93,7 +93,6 @@ void trie_add_word(trie *t, int *word, int length) trie_trans *current_trans = NULL; int transition_exists = 1; int destination; - int i; while((current_index < length) && transition_exists){ transition_exists = 0; diff --git a/maca_lexer/src/context.h b/maca_lexer/src/context.h index 1ad410d..697b67e 100644 --- a/maca_lexer/src/context.h +++ b/maca_lexer/src/context.h @@ -37,4 +37,9 @@ void context_mcd_help_message(context *ctx); void context_form_column_help_message(context *ctx); void context_pos_column_help_message(context *ctx); +void context_input_help_message(context *ctx); +void context_mwe_token_separator_help_message(context *ctx); +void context_mwe_filename_help_message(context *ctx); +void context_vocab_help_message(context *ctx); + #endif diff --git a/maca_lexer/src/extract_mwe_from_fplm.c b/maca_lexer/src/extract_mwe_from_fplm.c index 6dec0cb..800bed0 100644 --- a/maca_lexer/src/extract_mwe_from_fplm.c +++ b/maca_lexer/src/extract_mwe_from_fplm.c @@ -23,7 +23,6 @@ dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int deb char pos[1000]; char lemma[1000]; char morpho[1000]; - int num = 0; char buffer[10000]; FILE *f= myfopen(fplm_filename, "r"); int fields_nb; diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c index 04e56bf..b096645 100644 --- a/maca_lexer/src/maca_lexer.c +++ b/maca_lexer/src/maca_lexer.c @@ -42,16 +42,12 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in int main(int argc, char *argv[]) { char buffer[10000]; - char *buffer_copy; - char *form; int form_code; context *ctx; - int form_column; + /* int form_column; */ FILE *f = NULL; trie *mwe_trie; dico *d_mwe_tokens = NULL; - int origin_state = 0; - int destination_state = 0; int states_array[100]; int symbols_array[100]; int path_index = 0; @@ -60,12 +56,12 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_lexer_check_options(ctx); - + /* if(ctx->form_column != -1) form_column = ctx->form_column; else form_column = ctx->mcd_struct->wf2col[MCD_WF_FORM]; - + */ if(ctx->input_filename == NULL) f = stdin; else diff --git a/maca_tokenizer/CMakeLists.txt b/maca_tokenizer/CMakeLists.txt index f100c0b..b2e874c 100644 --- a/maca_tokenizer/CMakeLists.txt +++ b/maca_tokenizer/CMakeLists.txt @@ -4,13 +4,13 @@ FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex set(SOURCES ./src/context.c ${FLEX_fr_tok_rules_OUTPUTS} ${FLEX_en_tok_rules_OUTPUTS}) + ##compiling library include_directories(./src) add_library(maca_tokenizer_lib STATIC ${SOURCES}) - - +#compiling, linking and installing executables include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_executable(maca_tokenizer ./src/maca_tokenizer.c) diff --git a/maca_tokenizer/src/context.c b/maca_tokenizer/src/context.c index 25a3414..9c9236b 100644 --- a/maca_tokenizer/src/context.c +++ b/maca_tokenizer/src/context.c @@ -7,7 +7,6 @@ #include "util.h" -void context_set_linguistic_resources_filenames(context *ctx); void context_free(context *ctx) { @@ -109,41 +108,13 @@ context *context_read_options(int argc, char *argv[]) } } - context_set_linguistic_resources_filenames(ctx); - - if(ctx->mcd_filename) ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); - + /* if(ctx->mcd_filename == NULL) - /* ctx->mcd_struct = mcd_build_conll07(); */ ctx->mcd_struct = mcd_build_wplgf(); - + */ return ctx; } -void context_set_linguistic_resources_filenames(context *ctx) -{ - char absolute_path[500]; - char absolute_filename[500]; - - absolute_path[0] = '\0'; - - if(ctx->maca_data_path) - strcat(absolute_path, ctx->maca_data_path); - else { - char *e = getenv("MACAON_DIR"); - if (e != NULL) { - strcat(absolute_path, e); - } else { - fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n"); - } - } - - - strcat(absolute_path, "/"); - strcat(absolute_path, ctx->language); - strcat(absolute_path, "/bin/"); - -} diff --git a/maca_tokenizer/src/context.h b/maca_tokenizer/src/context.h index f9c3ce0..ce7e8f1 100644 --- a/maca_tokenizer/src/context.h +++ b/maca_tokenizer/src/context.h @@ -29,5 +29,6 @@ void context_conll_help_message(context *ctx); void context_language_help_message(context *ctx); void context_maca_data_path_help_message(context *ctx); void context_mcd_help_message(context *ctx); +void context_input_help_message(context *ctx); #endif diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index dd6055b..7e7fba8 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -8,19 +8,32 @@ extern int defait_amalgames; %option noyywrap %s state_defait_amalgames -%s state_num %% if(defait_amalgames){ BEGIN(state_defait_amalgames); } -<state_num>[0-9]*,[0-9]* printf("%s", yytext); -[ \t]+ printf("\n"); -\. printf("\n."); -\, printf("\n,"); -' printf("'\n"); -ā printf("'\n"); -\n+ printf("\n"); +[0-9]+,[0-9]* printf("%s", yytext); +[ \t]+ printf("\n"); +\. printf("\n."); +\, printf("\n,"); +⦠printf("\nā¦"); +' printf("'\n"); +ā printf("'\n"); +-je printf("\n-je"); +-tu printf("\n-tu"); +-on printf("\n-on"); +-ce printf("\n-ce"); +-t-il printf("\n-t-il"); +-il printf("\n-il"); +-t-ils printf("\n-t-ils"); +-ils printf("\n-ils"); +-t-elle printf("\n-t-elle"); +-elle printf("\n-elle"); +-t-elles printf("\n-t-elles"); +-elles printf("\n-elles"); +\n+ printf("\n"); + <state_defait_amalgames>{ " du " printf("\nde\nle\n"); " des " printf("\nde\nles\n"); diff --git a/maca_tokenizer/src/maca_tokenizer.c b/maca_tokenizer/src/maca_tokenizer.c index d9e9a18..6768e5d 100644 --- a/maca_tokenizer/src/maca_tokenizer.c +++ b/maca_tokenizer/src/maca_tokenizer.c @@ -3,6 +3,9 @@ #include<string.h> #include"context.h" +int enlex(void); +int frlex(void); + int defait_amalgames = 0; void maca_tokenizer_help_message(context *ctx) diff --git a/perceptron/lib/include/cf_file.h b/perceptron/lib/include/cf_file.h index 177b5e6..d76f58f 100644 --- a/perceptron/lib/include/cf_file.h +++ b/perceptron/lib/include/cf_file.h @@ -3,6 +3,7 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int *max_class); int look_for_number_of_features(char *filename); +int look_for_number_of_examples(char *filename); int *count_occ_of_features(char *filename, int *n_feat); int cff_look_for_number_of_columns(char *cff_filename); int *cff_max_value_per_column(char *cff_filename, int n); diff --git a/perceptron/lib/src/cf_file.c b/perceptron/lib/src/cf_file.c index 8a463f4..9afcf34 100644 --- a/perceptron/lib/src/cf_file.c +++ b/perceptron/lib/src/cf_file.c @@ -75,6 +75,18 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int fclose(f); } +int look_for_number_of_examples(char *filename) +{ + char buffer[10000]; + FILE *f = fopen(filename, "r"); + int number = 0; + + while(fgets(buffer, 10000, f)) + number ++; + fclose(f); + return number; +} + int look_for_number_of_features(char *filename) { char buffer[10000]; diff --git a/perceptron/lib/src/perceptron.c b/perceptron/lib/src/perceptron.c index 3425a3c..284c295 100644 --- a/perceptron/lib/src/perceptron.c +++ b/perceptron/lib/src/perceptron.c @@ -2,6 +2,7 @@ #include<stdlib.h> #include<string.h> #include"feature_table.h" +#include"cf_file.h" #include"util.h" void perceptron_avg(char *filename, feature_table *ft, int n_iter) @@ -21,8 +22,8 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter) feat_vec *fv = feat_vec_new(1); char *token; feature_table *ft_sum = feature_table_new(ft->features_nb, ft->classes_nb); - int counter = 1; - + int counter = 1; + for(epoch = 0; epoch < n_iter; epoch++){ fprintf(stderr, "[%d]", epoch + 1); f = fopen(filename, "r"); @@ -35,10 +36,11 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter) /* printf("token = %s\n", token); */ feat_vec_add(fv, atoi(token)); } - + + for(cla=0; cla < classes_nb; cla++) classes_score[cla] = 0; - + for(feat=0; feat < fv->nb; feat++) for(cla=0; cla < classes_nb; cla++) if(fv->t[feat] != -1) @@ -80,7 +82,7 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter) for(i=0; i< ft->features_nb; i++) for(j=0; j< ft->classes_nb; j++) - ft->table[i][j] -= 1/(float)counter * ft_sum->table[i][j]; + ft->table[i][j] -= 1/(float)counter * ft_sum->table[i][j]; free(classes_score); feat_vec_free(fv); -- GitLab