Skip to content
Snippets Groups Projects
Commit 0442bfb6 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

Defined separate tokenization rules for French and English. Fixed some bugs in maca_lexer.

parent df494a92
No related branches found
No related tags found
No related merge requests found
......@@ -21,6 +21,19 @@ typedef struct {
int states_nb;
} trie;
/* One (state, symbol) pair, both stored as plain integers.
 * NOTE(review): presumably a trie state index together with the code of the
 * symbol associated with it -- confirm against the trie implementation. */
typedef struct {
int state;
int symbol;
} state_symbol;
/* A sequence of state_symbol entries held through a pointer, with two
 * integer counters.
 * NOTE(review): size looks like the allocated capacity of array and nbelem
 * like the number of entries in use -- confirm against the code that fills
 * this structure. */
typedef struct {
int size;
state_symbol *array;
int nbelem;
} trie_path;
trie_state *trie_state_new(trie_trans *transitions, int is_accept);
void trie_state_free(trie_state *state);
......
......@@ -40,6 +40,7 @@ context *context_new(void)
ctx->mwe_filename = NULL;
ctx->mwe_tokens_dico_filename = NULL;
ctx->mwe_tokens_separator = strdup(" ");
ctx->paste = 1;
return ctx;
}
......
......@@ -22,6 +22,7 @@ typedef struct {
char *mwe_filename;
char *mwe_tokens_dico_filename;
char *mwe_tokens_separator;
int paste;
} context;
context *context_new(void);
......
......@@ -22,7 +22,6 @@ void maca_lexer_help_message(context *ctx)
context_vocab_help_message(ctx);
}
void maca_lexer_check_options(context *ctx){
if(ctx->help){
maca_lexer_help_message(ctx);
......@@ -111,25 +110,41 @@ int main(int argc, char *argv[])
if(states_array[path_index] == 0){ /* in initial state of trie */
/* nothing has been recognized */
if(path_index == 0)
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
else{ /* there is something in the path */
int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index);
/* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */
for(i=0; i <= accept_state_index; i++){
if(ctx->paste){
if(i > 0) printf("%s", ctx->mwe_tokens_separator);
printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
else{
if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
}
if(ctx->paste)
if(accept_state_index != -1) printf("\n");
/* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
for(i = accept_state_index + 1; i < path_index; i++){
if(ctx->paste)
printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
else
printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
/* do not forget to print the current token */
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
path_index = 0;
}
}
/* not in state 0 of trie */
/* not in state 0 of trie we are processing tokens of a potential mwe */
else{
path_index++;
}
......
FLEX_TARGET(tokenizer tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/maca_tokenizer.c)
FLEX_TARGET(fr_tok_rules ./src/fr_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/fr_lex.c)
FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex.c)
set(SOURCES ./src/context.c
${FLEX_fr_tok_rules_OUTPUTS}
${FLEX_en_tok_rules_OUTPUTS})
##compiling library
include_directories(./src)
add_library(maca_tokenizer_lib STATIC ${SOURCES})
include_directories(${CMAKE_CURRENT_BINARY_DIR})
add_executable(maca_tokenizer main.c ${FLEX_tokenizer_OUTPUTS})
add_executable(maca_tokenizer ./src/maca_tokenizer.c)
target_link_libraries(maca_tokenizer maca_tokenizer_lib maca_common)
install (TARGETS maca_tokenizer DESTINATION bin)
/* Flag read by the flex rules (they declare it extern). */
int defait_amalgames = 0;

/* Scanner entry point generated by flex; it must be declared before use,
 * implicit function declarations are not valid C since C99. */
int yylex(void);

/*
 * Run the tokenizer.
 * Passing any command-line argument switches defait_amalgames on; the
 * argument values themselves are not inspected.
 * Always returns 0.
 */
int main(int argc, char *argv[])
{
    (void)argv; /* only the argument count matters here */

    if (argc > 1)
        defait_amalgames = 1;

    yylex();
    return 0;
}
#include<stdlib.h>
#include<stdio.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include "context.h"
#include "util.h"
void context_set_linguistic_resources_filenames(context *ctx);
/*
 * Release a context together with every string it owns.
 *
 * ctx may be NULL, in which case nothing happens.
 * The mcd_struct member is NOT released here.
 * NOTE(review): it is built by mcd_read()/mcd_build_wplgf(); free it with the
 * matching routine of the mcd module once that API is confirmed.
 */
void context_free(context *ctx)
{
    if (ctx == NULL)
        return;

    /* free(NULL) is a no-op, so the members need no individual guards */
    free(ctx->program_name);
    free(ctx->input_filename);
    free(ctx->output_filename);
    free(ctx->language);
    free(ctx->maca_data_path);
    free(ctx->mcd_filename); /* strdup'ed when -C is given; used to leak */
    free(ctx);
}
context *context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL;
ctx->language = strdup("fr");
ctx->maca_data_path = NULL;
ctx->input_filename = NULL;
ctx->output_filename = NULL;
return ctx;
}
/*
 * Print the usage line and the general options on stderr.
 * ctx->program_name is used as the program name in the usage line.
 *
 * The list matches the flag options parsed by context_read_options():
 * the former "-r --hratio" entry was removed because that option is not
 * accepted by the parser, and "-d --debug" was added because it is.
 */
void context_general_help_message(context *ctx)
{
    fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
    fprintf(stderr, "Options:\n");
    fprintf(stderr, "\t-h --help : print this message\n");
    fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
    fprintf(stderr, "\t-d --debug : activate debug mode\n");
}
/* Write the help line of the -i/--input option on stderr. */
void context_input_help_message(context *ctx)
{
    (void)ctx; /* the text does not depend on the context */
    fputs("\t-i --input <file> : input mcf file name\n", stderr);
}
/* Write the help line of the -C/--mcd option on stderr. */
void context_mcd_help_message(context *ctx)
{
    (void)ctx; /* the text does not depend on the context */
    fputs("\t-C --mcd <file> : multi column description file name\n", stderr);
}
/* Write the help line of the -L/--language option on stderr. */
void context_language_help_message(context *ctx)
{
    (void)ctx; /* the text does not depend on the context */
    fputs("\t-L --language : identifier of the language to use\n", stderr);
}
/* Store a heap copy of value in *field, releasing whatever it held before
 * (a default value or the value of an earlier occurrence of the option). */
static void context_replace_string(char **field, const char *value)
{
    free(*field);
    *field = strdup(value);
}

/*
 * Build a context from the command line.
 *
 * Recognised options: -h/--help, -v/--verbose, -d/--debug, -i/--input <file>,
 * -o/--output <file>, -C/--mcd <file>, -L/--language <id>,
 * -D/--maca_data_path <dir>. Unknown options are silently ignored
 * (opterr is set to 0).
 *
 * After parsing, the multi column description is read from the -C file when
 * one was given, otherwise it is built with mcd_build_wplgf().
 *
 * Returns a newly allocated context; the caller releases it with
 * context_free().
 */
context *context_read_options(int argc, char *argv[])
{
    /* getopt_long() requires the table to end with an all-zero element */
    static const struct option long_options[] = {
        {"help",           no_argument,       0, 'h'},
        {"verbose",        no_argument,       0, 'v'},
        {"debug",          no_argument,       0, 'd'},
        {"input",          required_argument, 0, 'i'},
        {"output",         required_argument, 0, 'o'},
        {"mcd",            required_argument, 0, 'C'},
        {"language",       required_argument, 0, 'L'},
        {"maca_data_path", required_argument, 0, 'D'},
        {0, 0, 0, 0}
    };
    int c;
    int option_index = 0;
    context *ctx = context_new();

    context_replace_string(&ctx->program_name, argv[0]);

    optind = 0; /* restart the scan from the beginning (see getopt(3)) */
    opterr = 0; /* no diagnostics from getopt itself */
    while ((c = getopt_long(argc, argv, "hvdi:o:C:L:D:", long_options, &option_index)) != -1) {
        switch (c) {
        case 'd':
            ctx->debug_mode = 1;
            break;
        case 'h':
            ctx->help = 1;
            break;
        case 'v':
            ctx->verbose = 1;
            break;
        case 'i':
            context_replace_string(&ctx->input_filename, optarg);
            break;
        case 'o':
            context_replace_string(&ctx->output_filename, optarg);
            break;
        case 'C':
            context_replace_string(&ctx->mcd_filename, optarg);
            break;
        case 'L':
            /* replaces the default "fr" set by context_new() without leaking it */
            context_replace_string(&ctx->language, optarg);
            break;
        case 'D':
            context_replace_string(&ctx->maca_data_path, optarg);
            break;
        default:
            /* unknown option or missing argument: ignored, as before */
            break;
        }
    }

    context_set_linguistic_resources_filenames(ctx);

    if (ctx->mcd_filename)
        ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
    else
        ctx->mcd_struct = mcd_build_wplgf();

    return ctx;
}
/*
 * Compute the directory holding the linguistic resources of ctx->language:
 *     <root>/<language>/bin/
 * where <root> is ctx->maca_data_path when set, otherwise the value of the
 * MACAON_DIR environment variable. A warning is printed on stderr when
 * neither is available (the root is then empty) and when the resulting path
 * does not fit in the local buffer.
 *
 * The path is built with a bounded snprintf(): the root comes from the
 * command line or the environment and can be arbitrarily long.
 *
 * NOTE(review): the computed path is not stored anywhere yet; no resource
 * filename of the context depends on it in this version.
 */
void context_set_linguistic_resources_filenames(context *ctx)
{
    char absolute_path[500];
    const char *root = ctx->maca_data_path;
    int written;

    if (root == NULL) {
        root = getenv("MACAON_DIR");
        if (root == NULL) {
            fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
            root = "";
        }
    }

    written = snprintf(absolute_path, sizeof absolute_path, "%s/%s/bin/", root, ctx->language);
    if (written < 0 || (size_t)written >= sizeof absolute_path)
        fprintf(stderr, "ATTENTION: the path of the linguistic resources is too long\n");
}
#ifndef __MACA_LEXER_CONTEXT__
#define __MACA_LEXER_CONTEXT__
#include "mcd.h"
#include <stdlib.h>
#define DEFAULT_MWE_TOKENS_DICO_FILENAME "d_tokens.dico"
#define DEFAULT_MWE_FILENAME "mwe"
/* Run-time configuration of the tokenizer, filled by context_read_options().
 * The char* members are heap strings (strdup) or NULL. */
typedef struct {
/* set to 1 by -h/--help */
int help;
/* set to 1 by -v/--verbose; also passed to mcd_read() */
int verbose;
/* set to 1 by -d/--debug */
int debug_mode;
/* copy of argv[0], shown in the usage line */
char *program_name;
/* language identifier, "fr" by default, replaced by -L/--language */
char *language;
/* value of -D/--maca_data_path, NULL when the option is absent */
char *maca_data_path;
/* value of -C/--mcd, NULL when the option is absent */
char *mcd_filename;
/* result of mcd_read(mcd_filename, verbose), or of mcd_build_wplgf() when no -C was given */
mcd *mcd_struct;
/* value of -i/--input, NULL when the option is absent */
char *input_filename;
/* value of -o/--output, NULL when the option is absent */
char *output_filename;
} context;
/* Life cycle */
context *context_new(void);
void context_free(context *ctx);
context *context_read_options(int argc, char *argv[]);

/* Help text fragments */
void context_general_help_message(context *ctx);
/* Defined in context.c and called by maca_tokenizer.c; the declaration was
 * missing, which made the call an implicit function declaration. */
void context_input_help_message(context *ctx);
void context_conll_help_message(context *ctx);
void context_language_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx);
void context_mcd_help_message(context *ctx);
#endif
%{
/*
 * Tokenization rules for English (flex input).
 *
 * With the "en" prefix option below, the generated scanner entry point is
 * enlex(). Every rule writes on stdout and uses a newline as token
 * separator:
 *   - a decimal number (digits, dot, digits) is printed unchanged, so its
 *     dot is not split off by the punctuation rule;
 *   - a run of spaces/tabs, or a run of newlines, becomes one newline;
 *   - a dot or a comma is printed on a new line;
 *   - the listed contractions are expanded into two tokens (do / not,
 *     does / not, will / not, can / not, want / to), with either the ASCII
 *     or the typographic apostrophe;
 *   - a trailing 's is split off and always printed with the ASCII
 *     apostrophe.
 * Input matched by no rule falls through to flex's default rule, which
 * copies it to the output unchanged.
 */
#include <stdio.h>
/* not referenced by any rule of this file */
extern int defait_amalgames;
%}
%option prefix="en"
%option noyywrap
%%
[0-9]+\.[0-9]+ printf("%s", yytext);
[ \t]+ printf("\n");
\. printf("\n.");
\, printf("\n,");
don't printf("do\nnot");
don’t printf("do\nnot");
doesn't printf("does\nnot");
doesn’t printf("does\nnot");
won't printf("will\nnot");
won’t printf("will\nnot");
cannot printf("can\nnot");
wanna printf("want\nto");
's printf("\n's");
’s printf("\n's");
\n+ printf("\n");
%%
......@@ -2,6 +2,10 @@
#include <stdio.h>
extern int defait_amalgames;
%}
%option prefix="fr"
/*%option outfile="fr_lex.c"*/
%option noyywrap
%s state_defait_amalgames
%s state_num
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"context.h"
/* Flag declared extern by the flex rule files. This program leaves it at 0:
 * the only assignment in main() is commented out. */
int defait_amalgames = 0;
/*
 * Print the complete help of maca_tokenizer on stderr: the general usage and
 * options first, then an "INPUT" section listing the input, mcd and language
 * options.
 */
void maca_tokenizer_help_message(context *ctx)
{
    context_general_help_message(ctx);

    fputs("INPUT\n", stderr);
    context_input_help_message(ctx);
    context_mcd_help_message(ctx);
    context_language_help_message(ctx);
}
/*
 * Act on the parsed options: when help was requested, print the help
 * message and terminate the program with exit status 1. Otherwise return
 * without doing anything.
 */
void maca_tokenizer_check_options(context *ctx)
{
    if (!ctx->help)
        return;

    maca_tokenizer_help_message(ctx);
    exit(1);
}
/* Scanner entry points generated by flex from the English and French rule
 * files (prefix options "en" and "fr"). They must be declared before use:
 * implicit function declarations are not valid C since C99. */
int enlex(void);
int frlex(void);

/*
 * maca_tokenizer entry point.
 *
 * Parses the command line, prints the help and exits when -h was given,
 * then runs the English scanner when the language is "en" and the French
 * scanner for any other language. The context is released before returning.
 *
 * NOTE(review): the -i/--input and -o/--output values are parsed but not
 * used here; the scanners run on their default streams.
 *
 * Always returns 0.
 */
int main(int argc, char *argv[])
{
    context *ctx = context_read_options(argc, argv);

    maca_tokenizer_check_options(ctx);

    if (strcmp(ctx->language, "en") == 0)
        enlex();
    else
        frlex();

    context_free(ctx);
    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment