Skip to content
Snippets Groups Projects
Commit df494a92 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

modified several little details in maca_lexer

parent ad8f50f8
No related branches found
No related tags found
No related merge requests found
...@@ -14,11 +14,11 @@ void context_free(context *ctx) ...@@ -14,11 +14,11 @@ void context_free(context *ctx)
if(ctx->program_name) free(ctx->program_name); if(ctx->program_name) free(ctx->program_name);
if(ctx->input_filename) free(ctx->input_filename); if(ctx->input_filename) free(ctx->input_filename);
if(ctx->output_filename) free(ctx->output_filename); if(ctx->output_filename) free(ctx->output_filename);
if(ctx->fplm_filename) free(ctx->fplm_filename);
if(ctx->language) free(ctx->language); if(ctx->language) free(ctx->language);
if(ctx->maca_data_path) free(ctx->maca_data_path); if(ctx->maca_data_path) free(ctx->maca_data_path);
if(ctx->mwe_filename) free(ctx->mwe_filename); if(ctx->mwe_filename) free(ctx->mwe_filename);
if(ctx->mwe_tokens_dico_filename) free(ctx->mwe_tokens_dico_filename); if(ctx->mwe_tokens_dico_filename) free(ctx->mwe_tokens_dico_filename);
if(ctx->mwe_tokens_separator) free(ctx->mwe_tokens_separator);
free(ctx); free(ctx);
} }
...@@ -30,7 +30,6 @@ context *context_new(void) ...@@ -30,7 +30,6 @@ context *context_new(void)
ctx->verbose = 0; ctx->verbose = 0;
ctx->debug_mode = 0; ctx->debug_mode = 0;
ctx->program_name = NULL; ctx->program_name = NULL;
ctx->fplm_filename = NULL;
ctx->mcd_filename = NULL; ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL; ctx->mcd_struct = NULL;
ctx->language = strdup("fr"); ctx->language = strdup("fr");
...@@ -40,6 +39,7 @@ context *context_new(void) ...@@ -40,6 +39,7 @@ context *context_new(void)
ctx->output_filename = NULL; ctx->output_filename = NULL;
ctx->mwe_filename = NULL; ctx->mwe_filename = NULL;
ctx->mwe_tokens_dico_filename = NULL; ctx->mwe_tokens_dico_filename = NULL;
ctx->mwe_tokens_separator = strdup(" ");
return ctx; return ctx;
} }
...@@ -60,10 +60,6 @@ void context_form_column_help_message(context *ctx){ ...@@ -60,10 +60,6 @@ void context_form_column_help_message(context *ctx){
fprintf(stderr, "\t-F --form_column <int> : column containing form\n"); fprintf(stderr, "\t-F --form_column <int> : column containing form\n");
} }
/* Write the usage line describing the -f / --fplm option to stderr.
 * ctx is part of the signature but is not read by this function. */
void context_fplm_help_message(context *ctx)
{
    (void)ctx;
    fputs("\t-f --fplm <file> : fplm (form pos lemma morpho) file\n", stderr);
}
void context_mcd_help_message(context *ctx){ void context_mcd_help_message(context *ctx){
fprintf(stderr, "\t-C --mcd <file> : multi column description file name\n"); fprintf(stderr, "\t-C --mcd <file> : multi column description file name\n");
} }
...@@ -76,6 +72,18 @@ void context_maca_data_path_help_message(context *ctx){ ...@@ -76,6 +72,18 @@ void context_maca_data_path_help_message(context *ctx){
fprintf(stderr, "\t-M --maca_data_path : path to maca_data directory\n"); fprintf(stderr, "\t-M --maca_data_path : path to maca_data directory\n");
} }
/* Write the usage line describing the -s / --mwe_sep option to stderr.
 * ctx is part of the signature but is not read by this function. */
void context_mwe_token_separator_help_message(context *ctx)
{
    (void)ctx;
    fputs("\t-s --mwe_sep <string> : multi word expression tokens separator (default is space character)\n",
          stderr);
}
/* Write the usage line describing the -M / --mwe option to stderr.
 * ctx is part of the signature but is not read by this function. */
void context_mwe_filename_help_message(context *ctx)
{
    (void)ctx;
    fputs("\t-M --mwe <filename> : multi word expression file\n", stderr);
}
/* Write the usage line describing the -V / --vocab option to stderr.
 * ctx is part of the signature but is not read by this function. */
void context_vocab_help_message(context *ctx)
{
    (void)ctx;
    fputs("\t-V --vocab <filename> : multi word expression tokens vocabulary file\n", stderr);
}
context *context_read_options(int argc, char *argv[]) context *context_read_options(int argc, char *argv[])
{ {
int c; int c;
...@@ -84,7 +92,7 @@ context *context_read_options(int argc, char *argv[]) ...@@ -84,7 +92,7 @@ context *context_read_options(int argc, char *argv[])
ctx->program_name = strdup(argv[0]); ctx->program_name = strdup(argv[0]);
static struct option long_options[12] = static struct option long_options[13] =
{ {
{"help", no_argument, 0, 'h'}, {"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'}, {"verbose", no_argument, 0, 'v'},
...@@ -93,16 +101,16 @@ context *context_read_options(int argc, char *argv[]) ...@@ -93,16 +101,16 @@ context *context_read_options(int argc, char *argv[])
{"output", required_argument, 0, 'o'}, {"output", required_argument, 0, 'o'},
{"mcd", required_argument, 0, 'C'}, {"mcd", required_argument, 0, 'C'},
{"language", required_argument, 0, 'L'}, {"language", required_argument, 0, 'L'},
{"fplm", required_argument, 0, 'f'},
{"form_column", required_argument, 0, 'F'}, {"form_column", required_argument, 0, 'F'},
{"maca_data_path", required_argument, 0, 'D'}, {"maca_data_path", required_argument, 0, 'D'},
{"mwe", required_argument, 0, 'M'}, {"mwe", required_argument, 0, 'M'},
{"vocab", required_argument, 0, 'V'} {"vocab", required_argument, 0, 'V'},
{"mwe_sep", required_argument, 0, 's'}
}; };
optind = 0; optind = 0;
opterr = 0; opterr = 0;
while ((c = getopt_long (argc, argv, "hvdi:o:f:C:L:M:F:D:V:", long_options, &option_index)) != -1){ while ((c = getopt_long (argc, argv, "hvdi:o:C:L:M:F:D:V:s:", long_options, &option_index)) != -1){
switch (c) switch (c)
{ {
case 'd': case 'd':
...@@ -117,9 +125,6 @@ context *context_read_options(int argc, char *argv[]) ...@@ -117,9 +125,6 @@ context *context_read_options(int argc, char *argv[])
case 'F': case 'F':
ctx->form_column = atoi(optarg) - 1; ctx->form_column = atoi(optarg) - 1;
break; break;
case 'f':
ctx->fplm_filename = strdup(optarg);
break;
case 'i': case 'i':
ctx->input_filename = strdup(optarg); ctx->input_filename = strdup(optarg);
break; break;
...@@ -141,6 +146,9 @@ context *context_read_options(int argc, char *argv[]) ...@@ -141,6 +146,9 @@ context *context_read_options(int argc, char *argv[])
case 'M': case 'M':
ctx->mwe_filename = strdup(optarg); ctx->mwe_filename = strdup(optarg);
break; break;
case 's':
ctx->mwe_tokens_separator = strdup(optarg);
break;
} }
} }
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include "mcd.h" #include "mcd.h"
#include <stdlib.h> #include <stdlib.h>
#define DEFAULT_MWE_TOKENS_DICO_FILENAME "mwe_tokens" #define DEFAULT_MWE_TOKENS_DICO_FILENAME "d_tokens.dico"
#define DEFAULT_MWE_FILENAME "mwe" #define DEFAULT_MWE_FILENAME "mwe"
typedef struct { typedef struct {
...@@ -12,7 +12,6 @@ typedef struct { ...@@ -12,7 +12,6 @@ typedef struct {
int verbose; int verbose;
int debug_mode; int debug_mode;
char *program_name; char *program_name;
char *fplm_filename;
char *language; char *language;
char *maca_data_path; char *maca_data_path;
char *mcd_filename; char *mcd_filename;
...@@ -22,6 +21,7 @@ typedef struct { ...@@ -22,6 +21,7 @@ typedef struct {
char *output_filename; char *output_filename;
char *mwe_filename; char *mwe_filename;
char *mwe_tokens_dico_filename; char *mwe_tokens_dico_filename;
char *mwe_tokens_separator;
} context; } context;
context *context_new(void); context *context_new(void);
...@@ -31,7 +31,6 @@ context *context_read_options(int argc, char *argv[]); ...@@ -31,7 +31,6 @@ context *context_read_options(int argc, char *argv[]);
void context_general_help_message(context *ctx); void context_general_help_message(context *ctx);
void context_conll_help_message(context *ctx); void context_conll_help_message(context *ctx);
void context_language_help_message(context *ctx); void context_language_help_message(context *ctx);
void context_fplm_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx); void context_maca_data_path_help_message(context *ctx);
void context_mcd_help_message(context *ctx); void context_mcd_help_message(context *ctx);
void context_form_column_help_message(context *ctx); void context_form_column_help_message(context *ctx);
......
...@@ -7,6 +7,39 @@ ...@@ -7,6 +7,39 @@
#include"util.h" #include"util.h"
#include"context.h" #include"context.h"
/* Print the full maca_lexer usage text.
 * The general help comes first, then an "INPUT" heading on stderr,
 * then one help line per option, in the order of the calls below. */
void maca_lexer_help_message(context *ctx)
{
    context_general_help_message(ctx);

    /* Heading for the option list that follows. */
    fputs("INPUT\n", stderr);

    /* Input file, column description, language and data location. */
    context_input_help_message(ctx);
    context_mcd_help_message(ctx);
    context_language_help_message(ctx);
    context_maca_data_path_help_message(ctx);
    context_form_column_help_message(ctx);

    /* Multi word expression related options. */
    context_mwe_token_separator_help_message(ctx);
    context_mwe_filename_help_message(ctx);
    context_vocab_help_message(ctx);
}
/* Act on the parsed options: if ctx->help is set, print the usage text and
 * terminate the process with exit status 1; otherwise return without effect. */
void maca_lexer_check_options(context *ctx)
{
    if (!ctx->help)
        return;

    maca_lexer_help_message(ctx);
    exit(1);
}
/* Walk states_array backwards, from position path_index - 1 down to 0, and
 * return the first position whose trie state has is_accept set.
 * Returns -1 when no such position exists (in particular when path_index
 * is 0 or less, since no position is examined then). */
int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_index)
{
    int pos = path_index - 1;

    while (pos >= 0) {
        if (mwe_trie->states[states_array[pos]]->is_accept)
            return pos;
        pos--;
    }
    return -1;
}
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
char buffer[10000]; char buffer[10000];
...@@ -26,7 +59,7 @@ int main(int argc, char *argv[]) ...@@ -26,7 +59,7 @@ int main(int argc, char *argv[])
int i; int i;
ctx = context_read_options(argc, argv); ctx = context_read_options(argc, argv);
/* maca_lexer_check_options(ctx); */ maca_lexer_check_options(ctx);
if(ctx->form_column != -1) if(ctx->form_column != -1)
...@@ -39,7 +72,10 @@ int main(int argc, char *argv[]) ...@@ -39,7 +72,10 @@ int main(int argc, char *argv[])
else else
f = myfopen(ctx->input_filename, "r"); f = myfopen(ctx->input_filename, "r");
if(ctx->verbose) fprintf(stderr, "reading mwe list from file : %s\n", ctx->mwe_filename);
mwe_trie = trie_build_from_collection(ctx->mwe_filename); mwe_trie = trie_build_from_collection(ctx->mwe_filename);
if(ctx->verbose) fprintf(stderr, "reading mwe tokens vocabulary from file : %s\n", ctx->mwe_tokens_dico_filename);
d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5); d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5);
/* trie_print(stdout, mwe_trie); */ /* trie_print(stdout, mwe_trie); */
...@@ -58,7 +94,8 @@ int main(int argc, char *argv[]) ...@@ -58,7 +94,8 @@ int main(int argc, char *argv[])
symbols_array[path_index] = form_code; symbols_array[path_index] = form_code;
states_array[path_index] = (form_code == -1)? 0 states_array[path_index] = (form_code == -1)? 0
: trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code);
/* printf("buffer = %s ", buffer); /*
printf("buffer = %s ", buffer);
printf("code = %d\n", form_code); printf("code = %d\n", form_code);
...@@ -72,26 +109,27 @@ int main(int argc, char *argv[]) ...@@ -72,26 +109,27 @@ int main(int argc, char *argv[])
printf("\n"); printf("\n");
*/ */
if(states_array[path_index] == 0){ /* in initial state of trie */ if(states_array[path_index] == 0){ /* in initial state of trie */
if(path_index == 0){ /* nothing has been recognized */ /* nothing has been recognized */
if(path_index == 0)
printf("%s\n", buffer); printf("%s\n", buffer);
} else{ /* there is something in the path */
else{ int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index);
if(mwe_trie->states[states_array[path_index - 1]]->is_accept){ /* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */
for(i=0; i < path_index; i++){ for(i=0; i <= accept_state_index; i++){
if(i > 0) printf("#"); if(i > 0) printf("%s", ctx->mwe_tokens_separator);
printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
} }
printf("\n"); if(accept_state_index != -1) printf("\n");
} /* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
else{ for(i = accept_state_index + 1; i < path_index; i++){
for(i=0; i < path_index; i++){
printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
} }
} /* do not forget to print the current token */
printf("%s\n", buffer); printf("%s\n", buffer);
}
path_index = 0; path_index = 0;
} }
}
/* not in state 0 of trie */
else{ else{
path_index++; path_index++;
} }
...@@ -99,4 +137,3 @@ int main(int argc, char *argv[]) ...@@ -99,4 +137,3 @@ int main(int argc, char *argv[])
} }
return 0; return 0;
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment