Skip to content
Snippets Groups Projects
Commit f0babb73 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

modified maca_tokenizer so that it can output offset and token length

parent 568cf30a
No related branches found
No related tags found
No related merge requests found
cmake -DCMAKE_BUILD_TYPE=Debug ..
make
sudo make install
cmake ..
make
sudo make install
......@@ -2,6 +2,7 @@ FLEX_TARGET(fr_tok_rules ./src/fr_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/fr_lex
FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex.c)
set(SOURCES ./src/context.c
./src/maca_tokenizer_functions_for_lex.c
${FLEX_fr_tok_rules_OUTPUTS}
${FLEX_en_tok_rules_OUTPUTS})
......
......@@ -32,18 +32,11 @@ context *context_new(void)
ctx->maca_data_path = NULL;
ctx->input_filename = NULL;
ctx->output_filename = NULL;
ctx->print_offset = 0;
ctx->print_token_length = 0;
return ctx;
}
/*
 * Write the usage banner followed by the general option list to stderr.
 * ctx->program_name is interpolated into the banner.
 */
void context_general_help_message(context *ctx)
{
    const char *prog = ctx->program_name;

    fprintf(stderr, "usage: %s [options]\n", prog);
    /* The remaining lines are constant, so they go out as one string. */
    fputs("Options:\n"
          "\t-h --help : print this message\n"
          "\t-v --verbose : activate verbose mode\n"
          "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n",
          stderr);
}
/* Write the help line for the -i/--input option to stderr; ctx is not read. */
void context_input_help_message(context *ctx)
{
    (void)ctx;
    fputs("\t-i --input <file> : input mcf file name\n", stderr);
}
......@@ -56,6 +49,22 @@ void context_language_help_message(context *ctx){
fprintf(stderr, "\t-L --language : identifier of the language to use\n");
}
/* Write the help line for the -p/--print_offset option to stderr; ctx is not read. */
void context_print_offset_message(context *ctx)
{
    (void)ctx;
    fputs("\t-p --print_offset : print offset and token length\n", stderr);
}
/*
 * Write the usage banner and the general option list to stderr, then append
 * the -p/--print_offset help line via context_print_offset_message().
 * ctx->program_name is interpolated into the banner.
 */
void context_general_help_message(context *ctx)
{
    const char *prog = ctx->program_name;

    fprintf(stderr, "usage: %s [options]\n", prog);
    /* The remaining lines are constant, so they go out as one string. */
    fputs("Options:\n"
          "\t-h --help : print this message\n"
          "\t-v --verbose : activate verbose mode\n"
          "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n",
          stderr);
    context_print_offset_message(ctx);
}
context *context_read_options(int argc, char *argv[])
{
int c;
......@@ -64,11 +73,12 @@ context *context_read_options(int argc, char *argv[])
ctx->program_name = strdup(argv[0]);
static struct option long_options[8] =
static struct option long_options[9] =
{
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'},
{"debug", no_argument, 0, 'd'},
{"print_offset", no_argument, 0, 'p'},
{"input", required_argument, 0, 'i'},
{"output", required_argument, 0, 'o'},
{"mcd", required_argument, 0, 'C'},
......@@ -78,7 +88,7 @@ context *context_read_options(int argc, char *argv[])
optind = 0;
opterr = 0;
while ((c = getopt_long (argc, argv, "hvdi:o:C:L:D:", long_options, &option_index)) != -1){
while ((c = getopt_long (argc, argv, "hvdpi:o:C:L:D:", long_options, &option_index)) != -1){
switch (c)
{
case 'd':
......@@ -105,6 +115,10 @@ context *context_read_options(int argc, char *argv[])
case 'D':
ctx->maca_data_path = strdup(optarg);
break;
case 'p':
ctx->print_offset = 1;
ctx->print_token_length = 1;
break;
}
}
......
......@@ -18,6 +18,8 @@ typedef struct {
mcd *mcd_struct;
char *input_filename;
char *output_filename;
int print_offset;
int print_token_length;
} context;
context *context_new(void);
......
%{
#include<stdio.h>
#include"maca_tokenizer_functions_for_lex.h"
extern int defait_amalgames;
/*extern int print_offset;
extern int print_token_length;*/
int offset = 0;
int token_length = 0;
char token[10000];
%}
%option prefix="fr"
......@@ -12,36 +19,39 @@ extern int defait_amalgames;
if(defait_amalgames){
BEGIN(state_defait_amalgames);
}
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
[ \t]+ {maca_tokenizer_segment((char *)"", yytext);}
[ ]*\. {maca_tokenizer_segment((char *)".", yytext);}
[ ]*\? {maca_tokenizer_segment((char *)"?", yytext);}
[ ]*\! {maca_tokenizer_segment((char *)"!", yytext);}
[ ]*, {maca_tokenizer_segment((char *)",", yytext);}
[ ]*: {maca_tokenizer_segment((char *)":", yytext);}
[ ]*; {maca_tokenizer_segment((char *)";", yytext);}
[ ]*… {maca_tokenizer_segment((char *)"…", yytext);}
[ ]*\) {maca_tokenizer_segment((char *)")", yytext);}
[ ]*» {maca_tokenizer_segment((char *)"»", yytext);}
\( {maca_tokenizer_segment((char *)"((", yytext);}
' {maca_tokenizer_segment((char *)"'", yytext);}
’ {maca_tokenizer_segment((char *)"'", yytext);}
\" {maca_tokenizer_segment((char *)"\"", yytext);}
« {maca_tokenizer_segment((char *)"«", yytext);}
[0-9]+,[0-9]+ {maca_tokenizer_segment(yytext, yytext);}
[0-9]+,[0-9]+ printf("%s", yytext);
[ \t]+ printf("\n");
[ ]*\. printf("\n.");
[ ]*\? printf("\n?");
[ ]*\! printf("\n!");
[ ]*, printf("\n,");
[ ]*: printf("\n:");
[ ]*; printf("\n;");
[ ]*… printf("\n…");
[ ]*\) printf("\n)");
[ ]*» printf("\n»");
\( printf("(\n");
' printf("'\n");
’ printf("'\n");
\" printf("\"\n");
« printf("»\n");
-je printf("\n-je");
-tu printf("\n-tu");
-on printf("\n-on");
-ce printf("\n-ce");
-t-il printf("\n-t-il");
-il printf("\n-il");
-t-ils printf("\n-t-ils");
-ils printf("\n-ils");
-t-elle printf("\n-t-elle");
-elle printf("\n-elle");
-t-elles printf("\n-t-elles");
-elles printf("\n-elles");
\n+ printf("\n");
-je {maca_tokenizer_segment((char *)"-je", yytext);}
-tu {maca_tokenizer_segment((char *)"-tu", yytext);}
-on {maca_tokenizer_segment((char *)"-on", yytext);}
-ce {maca_tokenizer_segment((char *)"-ce", yytext);}
-t-il {maca_tokenizer_segment((char *)"-t-il", yytext);}
-il {maca_tokenizer_segment((char *)"-il", yytext);}
-t-ils {maca_tokenizer_segment((char *)"-t-ils", yytext);}
-ils {maca_tokenizer_segment((char *)"-ils", yytext);}
-t-elle {maca_tokenizer_segment((char *)"-t-elle", yytext);}
-elle {maca_tokenizer_segment((char *)"-elle", yytext);}
-t-elles {maca_tokenizer_segment((char *)"-t-elles", yytext);}
-elles {maca_tokenizer_segment((char *)"-elles", yytext);}
\n+ {maca_tokenizer_segment((char *)"", yytext);}
. {maca_tokenizer_add_char_to_token(yytext[0]);}
<state_defait_amalgames>{
" du " printf("\nde\nle\n");
......
......@@ -7,6 +7,8 @@ int enlex(void);
int frlex(void);
int defait_amalgames = 0;
int print_offset = 0;
int print_token_length = 0;
void maca_tokenizer_help_message(context *ctx)
{
......@@ -33,6 +35,9 @@ int main(int argc, char* argv[])
ctx = context_read_options(argc, argv);
maca_tokenizer_check_options(ctx);
print_offset = ctx->print_offset;
print_token_length = ctx->print_token_length;
if(!strcmp(ctx->language, "en"))
enlex() ;
else
......
#include <stdio.h>
#include <string.h>
#include "char16.h"
/* Running position in the input; advanced by maca_tokenizer_segment(). */
extern int offset;
/* Number of bytes currently stored in `token`; 0 means no token is pending. */
extern int token_length;
/* Buffer accumulating the bytes of the token being read (defined elsewhere,
 * so its size is not visible through this declaration). */
extern char token[];
/*extern char *yytext;*/
/* Output switches: when non-zero, maca_tokenizer_segment() appends the
 * offset column / the length column to each printed line. */
extern int print_offset;
extern int print_token_length;
/*
 * Flush the pending token and emit a separator, one item per output line.
 *
 * If a token is pending (token_length != 0) it is printed, optionally
 * followed by a tab and its offset (print_offset) and a tab and its length
 * (print_token_length), and the token buffer is then emptied.  If
 * `separator` is a non-empty string it is printed the same way.  Finally the
 * running offset is advanced past `xx`, the input text matched by the lexer
 * rule (it may carry leading blanks that are not part of `separator`).
 *
 * Offsets and lengths are all measured with utf8_strlen() so that tokens,
 * separators and skipped text use the same unit.
 * NOTE(review): utf8_strlen() is declared outside this file; it is assumed
 * to count UTF-8 characters and to return a value that fits in an int.
 *
 * separator: text to print for this boundary, "" for none; must not be NULL.
 * xx:        full text matched by the lexer rule; must not be NULL.
 */
void maca_tokenizer_segment(char *separator, char *xx){
    if(token_length != 0){
        int token_chars = utf8_strlen(token);

        printf("%s", token);
        if(print_offset){
            printf("\t%d", offset);
        }
        if(print_token_length){
            printf("\t%d", token_chars);
        }
        printf("\n");

        /* Advance only when a token was really flushed, and empty the
         * buffer: otherwise a second separator in a row would add the
         * previous token's length to the offset again. */
        offset += token_chars;
        token_length = 0;
        token[0] = '\0';
    }

    if(separator[0] != '\0'){
        int separator_chars = utf8_strlen(separator);

        printf("%s", separator);
        if(print_offset){
            printf("\t%d", offset);
        }
        if(print_token_length){
            printf("\t%d", separator_chars);
        }
        printf("\n");
    }

    /* Skip the whole matched text, counted in the same unit as tokens. */
    offset += utf8_strlen(xx);
}
/*
 * Append one byte to the pending token and keep the buffer NUL-terminated.
 *
 * The token buffer has a fixed size, so bytes that would not fit (leaving
 * room for the terminator) are dropped instead of being written past the
 * end of the array; a warning is printed to stderr the first time this
 * happens.
 *
 * c: byte to append (the lexer passes yytext[0]).
 */
void maca_tokenizer_add_char_to_token(char c)
{
    /* Must stay equal to the size of the `token` array, which is defined
     * with 10000 elements next to the lexer rules. */
    enum { TOKEN_CAPACITY = 10000 };
    static int overflow_reported = 0;

    if(token_length < 0 || token_length >= TOKEN_CAPACITY - 1){
        if(!overflow_reported){
            fprintf(stderr, "maca_tokenizer: token longer than %d bytes, extra characters dropped\n",
                    TOKEN_CAPACITY - 1);
            overflow_reported = 1;
        }
        return;
    }

    token[token_length] = c;
    token_length++;
    token[token_length] = '\0';
}
/* Prints the pending token (if any) and then `separator` (if non-empty), one
 * per line, and advances the running offset; `xx` is the text matched by the
 * lexer rule, whose length is added to the offset. */
void maca_tokenizer_segment(char *separator, char *xx);
/* Appends the byte `c` to the pending token buffer and NUL-terminates it. */
void maca_tokenizer_add_char_to_token(char c);
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment