Skip to content
Snippets Groups Projects
Commit 8c017b36 authored by Alexis Nasr
Browse files

code refactoring of maca_lexer and maca_tokenizer

parent 1d026907
Branches
No related tags found
No related merge requests found
...@@ -93,7 +93,6 @@ void trie_add_word(trie *t, int *word, int length) ...@@ -93,7 +93,6 @@ void trie_add_word(trie *t, int *word, int length)
trie_trans *current_trans = NULL; trie_trans *current_trans = NULL;
int transition_exists = 1; int transition_exists = 1;
int destination; int destination;
int i;
while((current_index < length) && transition_exists){ while((current_index < length) && transition_exists){
transition_exists = 0; transition_exists = 0;
......
...@@ -37,4 +37,9 @@ void context_mcd_help_message(context *ctx); ...@@ -37,4 +37,9 @@ void context_mcd_help_message(context *ctx);
void context_form_column_help_message(context *ctx); void context_form_column_help_message(context *ctx);
void context_pos_column_help_message(context *ctx); void context_pos_column_help_message(context *ctx);
void context_input_help_message(context *ctx);
void context_mwe_token_separator_help_message(context *ctx);
void context_mwe_filename_help_message(context *ctx);
void context_vocab_help_message(context *ctx);
#endif #endif
...@@ -23,7 +23,6 @@ dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int deb ...@@ -23,7 +23,6 @@ dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int deb
char pos[1000]; char pos[1000];
char lemma[1000]; char lemma[1000];
char morpho[1000]; char morpho[1000];
int num = 0;
char buffer[10000]; char buffer[10000];
FILE *f= myfopen(fplm_filename, "r"); FILE *f= myfopen(fplm_filename, "r");
int fields_nb; int fields_nb;
......
...@@ -42,16 +42,12 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in ...@@ -42,16 +42,12 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
char buffer[10000]; char buffer[10000];
char *buffer_copy;
char *form;
int form_code; int form_code;
context *ctx; context *ctx;
int form_column; /* int form_column; */
FILE *f = NULL; FILE *f = NULL;
trie *mwe_trie; trie *mwe_trie;
dico *d_mwe_tokens = NULL; dico *d_mwe_tokens = NULL;
int origin_state = 0;
int destination_state = 0;
int states_array[100]; int states_array[100];
int symbols_array[100]; int symbols_array[100];
int path_index = 0; int path_index = 0;
...@@ -60,12 +56,12 @@ int main(int argc, char *argv[]) ...@@ -60,12 +56,12 @@ int main(int argc, char *argv[])
ctx = context_read_options(argc, argv); ctx = context_read_options(argc, argv);
maca_lexer_check_options(ctx); maca_lexer_check_options(ctx);
/*
if(ctx->form_column != -1) if(ctx->form_column != -1)
form_column = ctx->form_column; form_column = ctx->form_column;
else else
form_column = ctx->mcd_struct->wf2col[MCD_WF_FORM]; form_column = ctx->mcd_struct->wf2col[MCD_WF_FORM];
*/
if(ctx->input_filename == NULL) if(ctx->input_filename == NULL)
f = stdin; f = stdin;
else else
......
...@@ -4,13 +4,13 @@ FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex ...@@ -4,13 +4,13 @@ FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex
set(SOURCES ./src/context.c set(SOURCES ./src/context.c
${FLEX_fr_tok_rules_OUTPUTS} ${FLEX_fr_tok_rules_OUTPUTS}
${FLEX_en_tok_rules_OUTPUTS}) ${FLEX_en_tok_rules_OUTPUTS})
##compiling library ##compiling library
include_directories(./src) include_directories(./src)
add_library(maca_tokenizer_lib STATIC ${SOURCES}) add_library(maca_tokenizer_lib STATIC ${SOURCES})
#compiling, linking and installing executables
include_directories(${CMAKE_CURRENT_BINARY_DIR}) include_directories(${CMAKE_CURRENT_BINARY_DIR})
add_executable(maca_tokenizer ./src/maca_tokenizer.c) add_executable(maca_tokenizer ./src/maca_tokenizer.c)
......
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
#include "util.h" #include "util.h"
void context_set_linguistic_resources_filenames(context *ctx);
void context_free(context *ctx) void context_free(context *ctx)
{ {
...@@ -109,41 +108,13 @@ context *context_read_options(int argc, char *argv[]) ...@@ -109,41 +108,13 @@ context *context_read_options(int argc, char *argv[])
} }
} }
context_set_linguistic_resources_filenames(ctx);
if(ctx->mcd_filename) if(ctx->mcd_filename)
ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
/*
if(ctx->mcd_filename == NULL) if(ctx->mcd_filename == NULL)
/* ctx->mcd_struct = mcd_build_conll07(); */
ctx->mcd_struct = mcd_build_wplgf(); ctx->mcd_struct = mcd_build_wplgf();
*/
return ctx; return ctx;
} }
/*
 * Builds the base directory of the linguistic resources for ctx, in the form
 * "<root>/<language>/bin/".
 *
 * <root> is ctx->maca_data_path when it is set, otherwise the value of the
 * MACAON_DIR environment variable. When neither is available a warning is
 * printed on stderr and an empty root is used.
 *
 * The path is assembled with a single bounded snprintf() so that an overlong
 * data path or environment value can no longer overflow the local buffer.
 * A missing ctx->language is treated as an empty string.
 *
 * NOTE(review): the computed path is not stored anywhere in the visible code,
 * so the only observable effects of this function are the warnings below.
 */
void context_set_linguistic_resources_filenames(context *ctx)
{
  char absolute_path[500];
  const char *root = ctx->maca_data_path;
  const char *language = ctx->language ? ctx->language : "";
  int written;

  if (root == NULL) {
    root = getenv("MACAON_DIR");
    if (root == NULL) {
      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
      root = "";
    }
  }

  written = snprintf(absolute_path, sizeof absolute_path, "%s/%s/bin/", root, language);
  if (written < 0 || (size_t)written >= sizeof absolute_path)
    fprintf(stderr, "ATTENTION: the linguistic resources path is too long and was truncated\n");
}
...@@ -29,5 +29,6 @@ void context_conll_help_message(context *ctx); ...@@ -29,5 +29,6 @@ void context_conll_help_message(context *ctx);
void context_language_help_message(context *ctx); void context_language_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx); void context_maca_data_path_help_message(context *ctx);
void context_mcd_help_message(context *ctx); void context_mcd_help_message(context *ctx);
void context_input_help_message(context *ctx);
#endif #endif
...@@ -8,19 +8,32 @@ extern int defait_amalgames; ...@@ -8,19 +8,32 @@ extern int defait_amalgames;
%option noyywrap %option noyywrap
%s state_defait_amalgames %s state_defait_amalgames
%s state_num
%% %%
if(defait_amalgames){ if(defait_amalgames){
BEGIN(state_defait_amalgames); BEGIN(state_defait_amalgames);
} }
<state_num>[0-9]*,[0-9]* printf("%s", yytext); [0-9]+,[0-9]* printf("%s", yytext);
[ \t]+ printf("\n"); [ \t]+ printf("\n");
\. printf("\n."); \. printf("\n.");
\, printf("\n,"); \, printf("\n,");
… printf("\n…");
' printf("'\n"); ' printf("'\n");
’ printf("'\n"); ’ printf("'\n");
-je printf("\n-je");
-tu printf("\n-tu");
-on printf("\n-on");
-ce printf("\n-ce");
-t-il printf("\n-t-il");
-il printf("\n-il");
-t-ils printf("\n-t-ils");
-ils printf("\n-ils");
-t-elle printf("\n-t-elle");
-elle printf("\n-elle");
-t-elles printf("\n-t-elles");
-elles printf("\n-elles");
\n+ printf("\n"); \n+ printf("\n");
<state_defait_amalgames>{ <state_defait_amalgames>{
" du " printf("\nde\nle\n"); " du " printf("\nde\nle\n");
" des " printf("\nde\nles\n"); " des " printf("\nde\nles\n");
......
...@@ -3,6 +3,9 @@ ...@@ -3,6 +3,9 @@
#include<string.h> #include<string.h>
#include"context.h" #include"context.h"
int enlex(void);
int frlex(void);
int defait_amalgames = 0; int defait_amalgames = 0;
void maca_tokenizer_help_message(context *ctx) void maca_tokenizer_help_message(context *ctx)
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
void look_for_number_of_features_and_classes(char *filename, int *max_feat, int *max_class); void look_for_number_of_features_and_classes(char *filename, int *max_feat, int *max_class);
int look_for_number_of_features(char *filename); int look_for_number_of_features(char *filename);
int look_for_number_of_examples(char *filename);
int *count_occ_of_features(char *filename, int *n_feat); int *count_occ_of_features(char *filename, int *n_feat);
int cff_look_for_number_of_columns(char *cff_filename); int cff_look_for_number_of_columns(char *cff_filename);
int *cff_max_value_per_column(char *cff_filename, int n); int *cff_max_value_per_column(char *cff_filename, int n);
......
...@@ -75,6 +75,18 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int ...@@ -75,6 +75,18 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int
fclose(f); fclose(f);
} }
/*
 * Counts the examples stored in the file called filename.
 *
 * The file is read with fgets() into a fixed-size buffer and every
 * successful read is counted as one example, so the result is the number
 * of lines as long as each line fits in the buffer.
 *
 * Returns the number of examples read, or 0 (after printing a message on
 * stderr) when filename is NULL or the file cannot be opened. Previously an
 * unopenable file led to fgets() being called on a NULL stream.
 */
int look_for_number_of_examples(char *filename)
{
  char buffer[10000];
  FILE *f;
  int number = 0;

  if (filename == NULL) {
    fprintf(stderr, "look_for_number_of_examples: no filename given\n");
    return 0;
  }

  f = fopen(filename, "r");
  if (f == NULL) {
    fprintf(stderr, "look_for_number_of_examples: cannot open file %s\n", filename);
    return 0;
  }

  while (fgets(buffer, sizeof buffer, f) != NULL) {
    number++;
  }

  fclose(f);
  return number;
}
int look_for_number_of_features(char *filename) int look_for_number_of_features(char *filename)
{ {
char buffer[10000]; char buffer[10000];
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include<stdlib.h> #include<stdlib.h>
#include<string.h> #include<string.h>
#include"feature_table.h" #include"feature_table.h"
#include"cf_file.h"
#include"util.h" #include"util.h"
void perceptron_avg(char *filename, feature_table *ft, int n_iter) void perceptron_avg(char *filename, feature_table *ft, int n_iter)
...@@ -36,6 +37,7 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter) ...@@ -36,6 +37,7 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter)
feat_vec_add(fv, atoi(token)); feat_vec_add(fv, atoi(token));
} }
for(cla=0; cla < classes_nb; cla++) for(cla=0; cla < classes_nb; cla++)
classes_score[cla] = 0; classes_score[cla] = 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment