Skip to content
Snippets Groups Projects
Commit 8c017b36 authored by Alexis Nasr
Browse files

code refactoring of maca_lexer and maca_tokenizer

parent 1d026907
No related branches found
No related tags found
No related merge requests found
......@@ -93,7 +93,6 @@ void trie_add_word(trie *t, int *word, int length)
trie_trans *current_trans = NULL;
int transition_exists = 1;
int destination;
int i;
while((current_index < length) && transition_exists){
transition_exists = 0;
......
......@@ -37,4 +37,9 @@ void context_mcd_help_message(context *ctx);
/* Per-option help-message routines; each one receives the program context.
 * NOTE(review): the bodies are not visible here -- presumably each prints the
 * usage text for the command-line option named in the function; confirm
 * against context.c. */
void context_form_column_help_message(context *ctx);
void context_pos_column_help_message(context *ctx);
void context_input_help_message(context *ctx);
void context_mwe_token_separator_help_message(context *ctx);
void context_mwe_filename_help_message(context *ctx);
void context_vocab_help_message(context *ctx);
#endif
......@@ -23,7 +23,6 @@ dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int deb
char pos[1000];
char lemma[1000];
char morpho[1000];
int num = 0;
char buffer[10000];
FILE *f= myfopen(fplm_filename, "r");
int fields_nb;
......
......@@ -42,16 +42,12 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in
int main(int argc, char *argv[])
{
char buffer[10000];
char *buffer_copy;
char *form;
int form_code;
context *ctx;
int form_column;
/* int form_column; */
FILE *f = NULL;
trie *mwe_trie;
dico *d_mwe_tokens = NULL;
int origin_state = 0;
int destination_state = 0;
int states_array[100];
int symbols_array[100];
int path_index = 0;
......@@ -60,12 +56,12 @@ int main(int argc, char *argv[])
ctx = context_read_options(argc, argv);
maca_lexer_check_options(ctx);
/*
if(ctx->form_column != -1)
form_column = ctx->form_column;
else
form_column = ctx->mcd_struct->wf2col[MCD_WF_FORM];
*/
if(ctx->input_filename == NULL)
f = stdin;
else
......
......@@ -4,13 +4,13 @@ FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex
set(SOURCES ./src/context.c
${FLEX_fr_tok_rules_OUTPUTS}
${FLEX_en_tok_rules_OUTPUTS})
##compiling library
include_directories(./src)
add_library(maca_tokenizer_lib STATIC ${SOURCES})
#compiling, linking and installing executables
include_directories(${CMAKE_CURRENT_BINARY_DIR})
add_executable(maca_tokenizer ./src/maca_tokenizer.c)
......
......@@ -7,7 +7,6 @@
#include "util.h"
void context_set_linguistic_resources_filenames(context *ctx);
void context_free(context *ctx)
{
......@@ -109,41 +108,13 @@ context *context_read_options(int argc, char *argv[])
}
}
context_set_linguistic_resources_filenames(ctx);
if(ctx->mcd_filename)
ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
/*
if(ctx->mcd_filename == NULL)
/* ctx->mcd_struct = mcd_build_conll07(); */
ctx->mcd_struct = mcd_build_wplgf();
*/
return ctx;
}
/*
 * Builds the base directory of the linguistic resources for ctx->language:
 *
 *     <root>/<language>/bin/
 *
 * where <root> is ctx->maca_data_path when it is set, and otherwise the value
 * of the MACAON_DIR environment variable.  A warning is written to stderr when
 * neither is available (the root is then the empty string).
 *
 * The path is assembled with a single bounded snprintf() instead of a chain
 * of strcat() calls, so an over-long root or language name can no longer
 * overflow the 500-byte stack buffer; a truncated path is reported on stderr.
 * A NULL ctx->language is treated as an empty string rather than being passed
 * to a string function.
 *
 * NOTE(review): the path is only held in a local buffer -- nothing visible
 * here stores it in ctx or returns it; confirm whether a consumer is missing.
 */
void context_set_linguistic_resources_filenames(context *ctx)
{
  char absolute_path[500];
  const char *root = ctx->maca_data_path;
  const char *language = ctx->language ? ctx->language : "";
  int written;

  if (!root) {
    root = getenv("MACAON_DIR");
    if (root == NULL) {
      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
      root = "";
    }
  }

  written = snprintf(absolute_path, sizeof absolute_path, "%s/%s/bin/", root, language);
  if (written < 0 || (size_t)written >= sizeof absolute_path) {
    fprintf(stderr, "ATTENTION: the linguistic resources path is too long and was truncated\n");
  }
}
......@@ -29,5 +29,6 @@ void context_conll_help_message(context *ctx);
/* Per-option help-message routines; each one receives the program context.
 * NOTE(review): the bodies are not visible here -- presumably each prints the
 * usage text for the command-line option named in the function; confirm
 * against context.c. */
void context_language_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx);
void context_mcd_help_message(context *ctx);
void context_input_help_message(context *ctx);
#endif
......@@ -8,19 +8,32 @@ extern int defait_amalgames;
%option noyywrap
%s state_defait_amalgames
%s state_num
%%
if(defait_amalgames){
BEGIN(state_defait_amalgames);
}
<state_num>[0-9]*,[0-9]* printf("%s", yytext);
[0-9]+,[0-9]* printf("%s", yytext);
[ \t]+ printf("\n");
\. printf("\n.");
\, printf("\n,");
… printf("\n…");
' printf("'\n");
’ printf("'\n");
-je printf("\n-je");
-tu printf("\n-tu");
-on printf("\n-on");
-ce printf("\n-ce");
-t-il printf("\n-t-il");
-il printf("\n-il");
-t-ils printf("\n-t-ils");
-ils printf("\n-ils");
-t-elle printf("\n-t-elle");
-elle printf("\n-elle");
-t-elles printf("\n-t-elles");
-elles printf("\n-elles");
\n+ printf("\n");
<state_defait_amalgames>{
" du " printf("\nde\nle\n");
" des " printf("\nde\nles\n");
......
......@@ -3,6 +3,9 @@
#include<string.h>
#include"context.h"
/* Scanner entry points.
 * NOTE(review): presumably generated by flex from the English and French
 * tokenizer rule files with "en"/"fr" prefixes -- confirm in the .l files. */
int enlex(void);
int frlex(void);
/* Flag declared extern in the tokenizer rules; when it is non-zero the rules
 * switch the scanner into its state_defait_amalgames start condition. */
int defait_amalgames = 0;
void maca_tokenizer_help_message(context *ctx)
......
......@@ -3,6 +3,7 @@
/* Scans `filename` and reports results through *max_feat and *max_class.
 * NOTE(review): body not visible here -- presumably the largest feature and
 * class identifiers found; confirm in cf_file.c. */
void look_for_number_of_features_and_classes(char *filename, int *max_feat, int *max_class);
/* NOTE(review): presumably returns the number of distinct features in
 * `filename`; body not fully visible -- confirm. */
int look_for_number_of_features(char *filename);
/* Returns the number of lines read from `filename` (one example per line). */
int look_for_number_of_examples(char *filename);
/* NOTE(review): presumably returns an array of per-feature occurrence counts
 * and writes its length to *n_feat; ownership of the array is not visible
 * here -- confirm. */
int *count_occ_of_features(char *filename, int *n_feat);
/* NOTE(review): presumably returns the number of columns of a cff file --
 * confirm. */
int cff_look_for_number_of_columns(char *cff_filename);
/* NOTE(review): presumably returns an array of `n` per-column maxima for a
 * cff file; ownership not visible here -- confirm. */
int *cff_max_value_per_column(char *cff_filename, int n);
......
......@@ -75,6 +75,18 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int
fclose(f);
}
/*
 * Counts the examples stored in `filename`, one example per line.
 *
 * A final line that is not terminated by '\n' is still counted.  Lines are
 * counted by their starting character rather than by fixed-size fgets()
 * chunks, so a line longer than the former 10000-byte buffer is counted once
 * instead of several times.
 *
 * Returns the number of lines, or 0 (after a message on stderr) when
 * `filename` is NULL or the file cannot be opened.
 */
int look_for_number_of_examples(char *filename)
{
  FILE *f;
  int number = 0;
  int previous = '\n'; /* makes the very first character start a line */
  int c;

  if (filename == NULL) {
    fprintf(stderr, "look_for_number_of_examples: no filename given\n");
    return 0;
  }

  f = fopen(filename, "r");
  if (f == NULL) {
    fprintf(stderr, "look_for_number_of_examples: cannot open file %s\n", filename);
    return 0;
  }

  while ((c = fgetc(f)) != EOF) {
    if (previous == '\n') {
      number++;
    }
    previous = c;
  }

  fclose(f);
  return number;
}
int look_for_number_of_features(char *filename)
{
char buffer[10000];
......
......@@ -2,6 +2,7 @@
#include<stdlib.h>
#include<string.h>
#include"feature_table.h"
#include"cf_file.h"
#include"util.h"
void perceptron_avg(char *filename, feature_table *ft, int n_iter)
......@@ -36,6 +37,7 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter)
feat_vec_add(fv, atoi(token));
}
for(cla=0; cla < classes_nb; cla++)
classes_score[cla] = 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment