Skip to content
Snippets Groups Projects
Commit 021c77a2 authored by Johannes Heinecke's avatar Johannes Heinecke
Browse files

Merge branch 'master' of https://gitlab.lif.univ-mrs.fr/alexis.nasr/macaon2 into johannes

parents f1a77980 1d026907
Branches
No related tags found
1 merge request!7Johannes
This commit is part of merge request !7. Comments created here will be created in the context of that merge request.
Showing
with 1035 additions and 25 deletions
......@@ -29,6 +29,7 @@ add_subdirectory(maca_tools)
add_subdirectory(perceptron)
#add_subdirectory(maca_lemmatizer)
add_subdirectory(maca_tokenizer)
add_subdirectory(maca_lexer)
add_subdirectory(maca_trans_parser)
add_subdirectory(maca_crf_tagger)
add_subdirectory(maca_graph_parser)
......
......@@ -8,7 +8,15 @@ set(SOURCES src/util.c
src/word.c
src/sentence.c
src/word_buffer.c
src/trie.c
)
#compiling library
add_library(maca_common STATIC ${SOURCES})
#compiling, linking and installing executables
#add_executable(test_trie ./test/test_trie.c)
#target_link_libraries(test_trie maca_common)
#install (TARGETS test_trie DESTINATION bin)
#ifndef __TRIE__
#define __TRIE__
#include<stdio.h>
/* One outgoing transition of a trie state; the transitions of a state form a
   singly linked list. */
typedef struct trans{
int destination;    /* index, in the states array of the trie, of the state reached */
int symbol;         /* integer symbol labelling the transition */
struct trans *next; /* next transition leaving the same state, NULL at the end of the list */
} trie_trans;
/* A state of the trie. */
typedef struct {
trie_trans *transitions; /* head of the list of outgoing transitions (NULL if none) */
int is_accept;           /* non zero when a stored word ends in this state */
int fail;                /* initialised to 0 by trie_state_new and printed by trie_print;
                            NOTE(review): presumably a failure link, nothing in trie.c computes it */
} trie_state;
/* A trie over integer symbols; state 0 is the initial state. */
typedef struct {
trie_state **states; /* growable array of pointers to states */
int size;            /* number of slots allocated in states */
int states_nb;       /* number of slots of states actually in use */
} trie;
/* A (state, symbol) pair.
   NOTE(review): not used by any function declared in this header; presumably
   one step of a trie_path -- confirm against callers. */
typedef struct {
int state;
int symbol;
} state_symbol;
/* An array of state_symbol pairs.
   NOTE(review): no function declared in this header manipulates it; the
   meaning of the fields below is inferred from their names -- confirm. */
typedef struct {
int size;            /* presumably the allocated capacity of array */
state_symbol *array;
int nbelem;          /* presumably the number of elements in use */
} trie_path;
/* allocation and release of states */
trie_state *trie_state_new(trie_trans *transitions, int is_accept);
void trie_state_free(trie_state *state);
/* allocation (with an initial state 0) and release of a trie */
trie *trie_new(void);
void trie_free(trie *t);
/* allocation and release of transition lists */
trie_trans *trie_trans_new(int destination, int symbol, trie_trans *next);
void trie_trans_free_rec(trie_trans *trans);
/* appends a fresh state and returns its index */
int trie_add_state(trie *t);
/* adds a transition, creating the missing states on the way */
void trie_add_trans(trie *t, int origin, int symbol, int destination);
/* stores a word (array of length symbols) in the trie */
void trie_add_word(trie *t, int *word, int length);
void trie_print(FILE *f, trie *t);
/* returns non zero iff word is stored in the trie */
int trie_lookup(trie *t, int *word, int length);
/* builds a trie from a text file holding one word per line, as space separated integers */
trie *trie_build_from_collection(char *filename);
/* returns the state reached from origin by symbol, 0 when there is no such transition */
int trie_destination_state(trie *t, int origin, int symbol);
#endif
......@@ -2,6 +2,8 @@
#define __UTIL__
#include<stdlib.h>
#include<stdio.h>
void myfree(void *ptr);
void *memalloc(size_t s);
FILE *myfopen(const char *path, const char *mode);
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include "trie.h"
#include "util.h"
/* Allocate a state owning the given transition list.
   transitions : head of the list of outgoing transitions (may be NULL)
   is_accept   : value stored in the is_accept flag
   The fail field starts at 0. */
trie_state *trie_state_new(trie_trans *transitions, int is_accept)
{
  trie_state *fresh = memalloc(sizeof(*fresh));

  fresh->fail = 0;
  fresh->is_accept = is_accept;
  fresh->transitions = transitions;
  return fresh;
}
/* Release a state together with its transition list; NULL is accepted. */
void trie_state_free(trie_state *state)
{
  if(state == NULL)
    return;

  trie_trans_free_rec(state->transitions);
  free(state);
}
/* Allocate a trie that contains only its initial state (index 0). */
trie *trie_new(void)
{
  trie *fresh = memalloc(sizeof(*fresh));

  fresh->states_nb = 0;
  fresh->size = 0;
  fresh->states = NULL;

  /* the first state created gets index 0 and serves as initial state */
  (void) trie_add_state(fresh);
  return fresh;
}
/* Release a trie, all its states and their transitions; NULL is accepted.
   Only the first states_nb slots of the states array hold valid pointers:
   the slots between states_nb and size come straight from realloc and are
   uninitialised, so they must not be passed to trie_state_free. */
void trie_free(trie *t)
{
  int i;

  if(t == NULL)
    return;

  for(i = 0; i < t->states_nb; i++)
    trie_state_free(t->states[i]);
  free(t->states);
  free(t);
}
/* Allocate a transition labelled symbol leading to state destination and
   link it in front of the list next. Returns the new head of the list. */
trie_trans *trie_trans_new(int destination, int symbol, trie_trans *next)
{
  trie_trans *head = memalloc(sizeof(*head));

  head->next = next;
  head->symbol = symbol;
  head->destination = destination;
  return head;
}
/* Release every transition of the list starting at trans; NULL is accepted.
   The list is walked iteratively: the successor is saved before the current
   node is freed, so that every node (and not only the last one) is released
   and no stack depth proportional to the list length is needed. */
void trie_trans_free_rec(trie_trans *trans)
{
  while(trans != NULL){
    trie_trans *following = trans->next;
    free(trans);
    trans = following;
  }
}
/* Append a new non accepting state without transitions to the trie and
   return its index. The states array is grown (capacity 2 * (size + 1))
   when it is full. The program exits with an error message if the array
   cannot be grown: the index returned leaves no room for an error code. */
int trie_add_state(trie *t)
{
  if(t->states_nb == t->size){
    int new_size = 2 * (t->size + 1);
    /* keep the old pointer until realloc is known to have succeeded */
    trie_state **grown = realloc(t->states, (size_t)new_size * sizeof(*grown));

    if(grown == NULL){
      fprintf(stderr, "trie_add_state: cannot allocate room for %d states\n", new_size);
      exit(1);
    }
    t->states = grown;
    t->size = new_size;
  }
  t->states[t->states_nb] = trie_state_new(NULL, 0);
  t->states_nb++;
  return t->states_nb - 1;
}
/* Add a transition origin --symbol--> destination.
   States are created as needed so that both indices exist; the transition
   is inserted at the head of the transition list of origin. */
void trie_add_trans(trie *t, int origin, int symbol, int destination)
{
  int highest = (origin > destination) ? origin : destination;
  trie_state *source;

  /* grow the trie until both the origin and the destination exist */
  while(t->states_nb <= highest)
    trie_add_state(t);

  source = t->states[origin];
  source->transitions = trie_trans_new(destination, symbol, source->transitions);
}
/* Store word (an array of length symbols) in the trie.
   The longest prefix already present is followed from state 0, the missing
   suffix is added as a chain of new states, and the state reached at the
   end is marked as accepting. */
void trie_add_word(trie *t, int *word, int length)
{
  int position = 0;
  int state = 0;
  trie_trans *edge;

  /* phase 1: follow the transitions that already exist */
  while(position < length){
    for(edge = t->states[state]->transitions; edge != NULL; edge = edge->next)
      if(edge->symbol == word[position])
        break;
    if(edge == NULL)
      break; /* no transition for this symbol: the known prefix ends here */
    state = edge->destination;
    position++;
  }

  /* phase 2: create one state and one transition per remaining symbol */
  for(; position < length; position++){
    int created = trie_add_state(t);
    trie_add_trans(t, state, word[position], created);
    state = created;
  }

  t->states[state]->is_accept = 1;
}
/* Write a textual description of the trie to f: for every state its index
   (followed by ACCEPT for accepting states), its fail value, and one line
   "origin symbol destination" per outgoing transition. */
void trie_print(FILE *f, trie *t)
{
  int index = 0;

  while(index < t->states_nb){
    const trie_state *current = t->states[index];
    const trie_trans *edge = current->transitions;

    fprintf(f, "state %d%s\n", index, current->is_accept ? " ACCEPT" : "");
    fprintf(f, "FAIL = %d\n", current->fail);
    while(edge != NULL){
      fprintf(f, "%d %d %d\n", index, edge->symbol, edge->destination);
      edge = edge->next;
    }
    fprintf(f, "\n");
    index++;
  }
}
/* Tell whether word (an array of length symbols) is stored in the trie.
   Returns 0 as soon as a symbol has no transition from the current state,
   otherwise the is_accept flag of the state reached after the last symbol. */
int trie_lookup(trie *t, int *word, int length)
{
  int state = 0;
  int position = 0;

  while(position < length){
    trie_trans *edge = t->states[state]->transitions;

    while((edge != NULL) && (edge->symbol != word[position]))
      edge = edge->next;
    if(edge == NULL)
      return 0;
    state = edge->destination;
    position++;
  }
  return t->states[state]->is_accept;
}
/* Build a trie from the text file filename, which holds one word per line,
   each word being a sequence of integer symbols separated by blanks.
   - the line buffer is as large as the size given to fgets (the buffer used
     to be ten times smaller than the size announced, allowing an overflow);
   - blanks, tabulations and end of line characters are all separators, so
     that a trailing newline never produces a spurious symbol 0;
   - lines without any symbol are ignored;
   - words longer than the symbol buffer are skipped with a warning instead
     of being written past the end of the buffer.
   Returns the trie, which the caller releases with trie_free. */
trie *trie_build_from_collection(char *filename)
{
  enum { LINE_MAX_LENGTH = 10000, WORD_MAX_LENGTH = 100 };
  static const char separators[] = " \t\r\n";
  trie *t = trie_new();
  FILE *f = myfopen(filename, "r");
  char buffer[LINE_MAX_LENGTH];
  int word[WORD_MAX_LENGTH];
  int length;
  int too_long;
  char *token;

  while(fgets(buffer, sizeof(buffer), f)){
    length = 0;
    too_long = 0;
    for(token = strtok(buffer, separators); token; token = strtok(NULL, separators)){
      if(length == WORD_MAX_LENGTH){
        too_long = 1;
        break;
      }
      word[length++] = atoi(token);
    }
    if(too_long){
      fprintf(stderr, "trie_build_from_collection: entry longer than %d symbols in %s, skipping it\n", WORD_MAX_LENGTH, filename);
      continue;
    }
    if(length > 0)
      trie_add_word(t, word, length);
  }
  fclose(f);
  return t;
}
/* Return the state reached from state origin by the transition labelled
   symbol. Returns 0 when origin has no such transition, which callers
   cannot tell apart from a transition leading to state 0. */
int trie_destination_state(trie *t, int origin, int symbol)
{
  trie_trans *edge = t->states[origin]->transitions;

  while(edge != NULL){
    if(edge->symbol == symbol)
      return edge->destination;
    edge = edge->next;
  }
  return 0;
}
set(SOURCES src/context.c)
##compiling library
include_directories(src)
add_library(maca_lexer_lib STATIC ${SOURCES})
#compiling, linking and installing executables
add_executable(extract_mwe_from_fplm ./src/extract_mwe_from_fplm.c)
target_link_libraries(extract_mwe_from_fplm maca_common)
install (TARGETS extract_mwe_from_fplm DESTINATION bin)
add_executable(maca_lexer ./src/maca_lexer.c)
target_link_libraries(maca_lexer maca_lexer_lib maca_common)
install (TARGETS maca_lexer DESTINATION bin)
#include<stdlib.h>
#include<stdio.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include "context.h"
#include "util.h"
void context_set_linguistic_resources_filenames(context *ctx);
/* Release a context and every string it owns; NULL is accepted.
   mcd_filename is duplicated with strdup by context_read_options and is
   therefore released here too.
   NOTE(review): mcd_struct is not released, no destructor for it is visible
   from this file. */
void context_free(context *ctx)
{
  if(ctx == NULL)
    return;

  /* free(NULL) is a no-op, unset fields need no test */
  free(ctx->program_name);
  free(ctx->input_filename);
  free(ctx->output_filename);
  free(ctx->language);
  free(ctx->maca_data_path);
  free(ctx->mcd_filename);
  free(ctx->mwe_filename);
  free(ctx->mwe_tokens_dico_filename);
  free(ctx->mwe_tokens_separator);
  free(ctx);
}
context *context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL;
ctx->language = strdup("fr");
ctx->maca_data_path = NULL;
ctx->form_column = -1;
ctx->input_filename = NULL;
ctx->output_filename = NULL;
ctx->mwe_filename = NULL;
ctx->mwe_tokens_dico_filename = NULL;
ctx->mwe_tokens_separator = strdup(" ");
ctx->paste = 1;
return ctx;
}
/* Print on stderr the usage line (built from ctx->program_name) and the
   description of the general options. */
void context_general_help_message(context *ctx)
{
  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
  fputs("Options:\n"
        "\t-h --help : print this message\n"
        "\t-v --verbose : activate verbose mode\n"
        "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n",
        stderr);
}
/* Print on stderr the description of the input file option. */
void context_input_help_message(context *ctx)
{
  fputs("\t-i --input <file> : input mcf file name\n", stderr);
}
/* Print on stderr the description of the form column option. */
void context_form_column_help_message(context *ctx)
{
  fputs("\t-F --form_column <int> : column containing form\n", stderr);
}
/* Print on stderr the description of the multi column description option. */
void context_mcd_help_message(context *ctx)
{
  fputs("\t-C --mcd <file> : multi column description file name\n", stderr);
}
/* Print on stderr the description of the language option. */
void context_language_help_message(context *ctx)
{
  fputs("\t-L --language : identifier of the language to use\n", stderr);
}
/* Print on stderr the description of the maca_data path option.
   The short option is -D: this is the letter bound to --maca_data_path in
   context_read_options, while -M selects the multi word expression file. */
void context_maca_data_path_help_message(context *ctx){
  fprintf(stderr, "\t-D --maca_data_path : path to maca_data directory\n");
}
/* Print on stderr the description of the mwe tokens separator option. */
void context_mwe_token_separator_help_message(context *ctx)
{
  fputs("\t-s --mwe_sep <string> : multi word expression tokens separator (default is space character)\n", stderr);
}
/* Print on stderr the description of the multi word expression file option. */
void context_mwe_filename_help_message(context *ctx)
{
  fputs("\t-M --mwe <filename> : multi word expression file\n", stderr);
}
/* Print on stderr the description of the mwe tokens vocabulary option. */
void context_vocab_help_message(context *ctx)
{
  fputs("\t-V --vocab <filename> : multi word expression tokens vocabulary file\n", stderr);
}
/* Store in *field a private copy of value, releasing the string previously
   held. Needed because some fields hold a default allocated by context_new
   and because an option may be given several times. */
static void context_set_string(char **field, const char *value)
{
  free(*field);
  *field = strdup(value);
}

/* Build a context from the command line.
   argc, argv : arguments received by main
   Returns a new context (to be released with context_free) in which:
   - every option found on the command line has been recorded,
   - the default names of the linguistic resources have been computed for
     the resources not given explicitly,
   - mcd_struct has been read from the mcd file when one is given, or set to
     the default description built by mcd_build_wplgf when neither an mcd
     file nor a form column is given.
   Unknown options are silently ignored (opterr is 0). */
context *context_read_options(int argc, char *argv[])
{
  int c;
  int option_index = 0;
  context *ctx = context_new();
  static struct option long_options[] =
    {
      {"help",           no_argument,       0, 'h'},
      {"verbose",        no_argument,       0, 'v'},
      {"debug",          no_argument,       0, 'd'},
      {"input",          required_argument, 0, 'i'},
      {"output",         required_argument, 0, 'o'},
      {"mcd",            required_argument, 0, 'C'},
      {"language",       required_argument, 0, 'L'},
      {"form_column",    required_argument, 0, 'F'},
      {"maca_data_path", required_argument, 0, 'D'},
      {"mwe",            required_argument, 0, 'M'},
      {"vocab",          required_argument, 0, 'V'},
      {"mwe_sep",        required_argument, 0, 's'},
      {0, 0, 0, 0} /* terminator required by getopt_long */
    };

  ctx->program_name = strdup(argv[0]);

  optind = 0;
  opterr = 0;

  while ((c = getopt_long (argc, argv, "hvdi:o:C:L:M:F:D:V:s:", long_options, &option_index)) != -1){
    switch (c)
      {
      case 'd':
        ctx->debug_mode = 1;
        break;
      case 'h':
        ctx->help = 1;
        break;
      case 'v':
        ctx->verbose = 1;
        break;
      case 'F':
        /* columns are numbered from 1 on the command line, from 0 internally */
        ctx->form_column = atoi(optarg) - 1;
        break;
      case 'i':
        context_set_string(&ctx->input_filename, optarg);
        break;
      case 'o':
        context_set_string(&ctx->output_filename, optarg);
        break;
      case 'C':
        context_set_string(&ctx->mcd_filename, optarg);
        break;
      case 'L':
        context_set_string(&ctx->language, optarg);
        break;
      case 'D':
        context_set_string(&ctx->maca_data_path, optarg);
        break;
      case 'V':
        context_set_string(&ctx->mwe_tokens_dico_filename, optarg);
        break;
      case 'M':
        context_set_string(&ctx->mwe_filename, optarg);
        break;
      case 's':
        context_set_string(&ctx->mwe_tokens_separator, optarg);
        break;
      default:
        /* unknown option or missing argument: ignored, as before */
        break;
      }
  }

  context_set_linguistic_resources_filenames(ctx);

  if(ctx->mcd_filename)
    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);

  if((ctx->mcd_filename == NULL) && (ctx->form_column == -1))
    /* ctx->mcd_struct = mcd_build_conll07(); */
    ctx->mcd_struct = mcd_build_wplgf();

  return ctx;
}
/* Give a default value to the names of the linguistic resources that were
   not set on the command line.
   The defaults live in <root>/<language>/bin/ where <root> is
   ctx->maca_data_path or, failing that, the value of the environment
   variable MACAON_DIR (a warning is printed and an empty root is used when
   neither is available).
   The work buffer is allocated with the exact size needed, so that a long
   root directory cannot overflow it. */
void context_set_linguistic_resources_filenames(context *ctx)
{
  const char *root = ctx->maca_data_path;
  char *absolute_filename;
  size_t capacity;

  if(root == NULL){
    root = getenv("MACAON_DIR");
    if(root == NULL){
      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
      root = "";
    }
  }

  /* every sizeof counts a final '\0': the capacity is slightly over estimated */
  capacity = strlen(root) + strlen(ctx->language) + sizeof("//bin/")
    + sizeof(DEFAULT_MWE_FILENAME) + sizeof(DEFAULT_MWE_TOKENS_DICO_FILENAME);
  absolute_filename = malloc(capacity);
  if(absolute_filename == NULL){
    fprintf(stderr, "cannot allocate memory for the names of the linguistic resources\n");
    exit(1);
  }

  if(!ctx->mwe_filename){
    snprintf(absolute_filename, capacity, "%s/%s/bin/%s", root, ctx->language, DEFAULT_MWE_FILENAME);
    ctx->mwe_filename = strdup(absolute_filename);
  }

  if(!ctx->mwe_tokens_dico_filename){
    snprintf(absolute_filename, capacity, "%s/%s/bin/%s", root, ctx->language, DEFAULT_MWE_TOKENS_DICO_FILENAME);
    ctx->mwe_tokens_dico_filename = strdup(absolute_filename);
  }

  free(absolute_filename);
}
#ifndef __MACA_LEXER_CONTEXT__
#define __MACA_LEXER_CONTEXT__
#include "mcd.h"
#include <stdlib.h>
#define DEFAULT_MWE_TOKENS_DICO_FILENAME "d_tokens.dico"
#define DEFAULT_MWE_FILENAME "mwe"
/* Settings of the maca_lexer program, filled by context_read_options. */
typedef struct {
int help;                       /* set by -h */
int verbose;                    /* set by -v */
int debug_mode;                 /* set by -d */
char *program_name;             /* copy of argv[0] */
char *language;                 /* -L, default "fr" */
char *maca_data_path;           /* -D, NULL when not given */
char *mcd_filename;             /* -C, NULL when not given */
mcd *mcd_struct;                /* read from mcd_filename or built by default */
int form_column;                /* -F minus one, -1 when not given */
char *input_filename;           /* -i, NULL when not given */
char *output_filename;          /* -o, NULL when not given */
char *mwe_filename;             /* -M, or default name under the data path */
char *mwe_tokens_dico_filename; /* -V, or default name under the data path */
char *mwe_tokens_separator;     /* -s, default " " */
int paste;                      /* set to 1 by context_new; no option changes it */
} context;
/* creation, release and initialisation from the command line */
context *context_new(void);
void context_free(context *ctx);
context *context_read_options(int argc, char *argv[]);
/* help messages, one function per group of options */
void context_general_help_message(context *ctx);
void context_conll_help_message(context *ctx);
void context_language_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx);
void context_mcd_help_message(context *ctx);
void context_form_column_help_message(context *ctx);
void context_pos_column_help_message(context *ctx);
/* help messages defined in context.c and called by maca_lexer.c: they need
   a prototype here, otherwise their callers rely on implicit declarations */
void context_input_help_message(context *ctx);
void context_mwe_token_separator_help_message(context *ctx);
void context_mwe_filename_help_message(context *ctx);
void context_vocab_help_message(context *ctx);
#endif
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"dico.h"
#include"util.h"
/* return 1 if form contains at least one space character */
/* Tell whether form is made of several tokens.
   Returns 1 if form contains at least one space character, 0 otherwise. */
int form_is_complex(char *form)
{
  return (strchr(form, ' ') != NULL) ? 1 : 0;
}
/* Extract the multi word expressions of an fplm file.
   fplm_filename : file holding one entry per line, made of four tabulation
                   separated fields (form, pos, lemma, morpho)
   output_file   : receives one line per form containing a space: the codes
                   of its tokens, separated by spaces
   debug_mode    : when non zero, malformed entries are reported on stderr
   Returns the dictionary mapping tokens to the codes written on
   output_file; the caller releases it.
   Every sscanf conversion is limited to the size of its buffer, the buffers
   are emptied before each entry so that the debug message never shows
   uninitialised data, and the input file is closed before returning. */
dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int debug_mode)
{
  char form[1000];
  char pos[1000];
  char lemma[1000];
  char morpho[1000];
  char buffer[10000];
  FILE *f= myfopen(fplm_filename, "r");
  int fields_nb;
  char token[1000];
  int l;
  int i, j;
  dico *d_tokens = dico_new("TOKENS", 100000);
  int token_code;

  while(fgets(buffer, sizeof(buffer), f)){
    form[0] = pos[0] = lemma[0] = morpho[0] = '\0';
    fields_nb = sscanf(buffer, "%999[^\t]\t%999s\t%999[^\t]\t%999s\n", form, pos, lemma, morpho);
    if(fields_nb != 4){
      if(debug_mode){
        fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma);
        fprintf(stderr, "incorrect fplm entry, skipping it\n");
      }
      continue;
    }
    if(form_is_complex(form)){
      l = strlen(form);
      j = 0;
      /* i goes up to l included so that the last token is emitted too */
      for(i=0; i <= l; i++){
        if((form[i] != ' ') && (i < l)){
          token[j++] = form[i];
        }
        else{
          /* end of a token: code it and write the code */
          token[j] = '\0';
          token_code = dico_add(d_tokens, token);
          fprintf(output_file, "%d", token_code);
          if(i != l)
            fprintf(output_file, " ");
          j = 0;
        }
      }
      fprintf(output_file, "\n");
    }
  }
  fclose(f);
  return d_tokens;
}
/* Usage: extract_mwe_from_fplm <fplm file>
   Writes the coded multi word expressions of the fplm file on stdout and
   the tokens dictionary in the file d_tokens.dico of the current directory.
   Returns 1 when the fplm file name is missing, 0 otherwise. */
int main(int argc, char *argv[])
{
  dico *d_tokens;

  if(argc < 2){
    fprintf(stderr, "usage: %s <fplm file>\n", (argc > 0) ? argv[0] : "extract_mwe_from_fplm");
    return 1;
  }

  d_tokens = decompose_mwe_in_fplm_file(argv[1], stdout, 1);
  dico_print("d_tokens.dico", d_tokens);
  dico_free(d_tokens);
  return 0;
}
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"trie.h"
#include"dico.h"
#include"util.h"
#include"context.h"
/* Print on stderr the complete help of maca_lexer: the general options
   followed by the options of the INPUT section. */
void maca_lexer_help_message(context *ctx)
{
  context_general_help_message(ctx);

  fputs("INPUT\n", stderr);
  context_input_help_message(ctx);
  context_mcd_help_message(ctx);
  context_language_help_message(ctx);
  context_maca_data_path_help_message(ctx);
  context_form_column_help_message(ctx);
  context_mwe_token_separator_help_message(ctx);
  context_mwe_filename_help_message(ctx);
  context_vocab_help_message(ctx);
}
/* When help was requested, print the help message and exit with status 1;
   otherwise do nothing. */
void maca_lexer_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  maca_lexer_help_message(ctx);
  exit(1);
}
/* Look, among the first path_index entries of states_array, for the last
   one that is an accepting state of mwe_trie.
   Returns its position in states_array, or -1 if there is none. */
int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_index)
{
  int position = path_index;

  /* scan backwards: the rightmost accepting state wins */
  while(position-- > 0){
    if(mwe_trie->states[states_array[position]]->is_accept)
      return position;
  }
  return -1;
}
/* maximal number of pending tokens and maximal length of an input line */
enum { MACA_LEXER_PATH_MAX = 100, MACA_LEXER_LINE_MAX = 10000 };

/* Print a token that does not belong to a recognised mwe: alone on its line
   in paste mode, followed by a tabulation and 1 otherwise. */
static void maca_lexer_print_single_token(context *ctx, const char *form)
{
  if(ctx->paste)
    printf("%s\n", form);
  else
    printf("%s\t1\n", form);
}

/* Print the path_index pending tokens described by states_array and
   symbols_array. The tokens up to the last accepting state form an mwe:
   in paste mode they are printed on one line, joined by the mwe tokens
   separator; otherwise one per line, the first tagged 1 and the others 0.
   The remaining tokens are printed as single tokens. */
static void maca_lexer_flush_path(context *ctx, trie *mwe_trie, dico *d_mwe_tokens,
                                  int *states_array, int *symbols_array, int path_index)
{
  int i;
  int accept_state_index;

  if(path_index <= 0)
    return;

  accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index);

  /* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */
  for(i=0; i <= accept_state_index; i++){
    if(ctx->paste){
      if(i > 0) printf("%s", ctx->mwe_tokens_separator);
      printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
    }
    else
      printf("%s\t%d\n", dico_int2string(d_mwe_tokens, symbols_array[i]), (i == 0) ? 1 : 0);
  }
  if(ctx->paste && (accept_state_index != -1))
    printf("\n");

  /* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
  for(i = accept_state_index + 1; i < path_index; i++)
    maca_lexer_print_single_token(ctx, dico_int2string(d_mwe_tokens, symbols_array[i]));
}

/* maca_lexer: reads one token per line and groups the tokens that form a
   multi word expression (mwe) of the mwe trie.
   Differences with the previous version of this function:
   - the last line is processed even when it does not end with a newline;
   - only a final newline is removed from a line (the last character used to
     be removed unconditionally);
   - the tokens still pending are printed when an empty line or the end of
     the input is met (they used to be delayed or lost);
   - the pending tokens are printed before the path arrays overflow;
   - the input file is closed.
   NOTE(review): the whole line is looked up in the tokens vocabulary; the
   form column option is read by context_read_options but not used here. */
int main(int argc, char *argv[])
{
  char buffer[MACA_LEXER_LINE_MAX];
  size_t length;
  int form_code;
  int previous_state;
  context *ctx;
  FILE *f = NULL;
  trie *mwe_trie;
  dico *d_mwe_tokens = NULL;
  int states_array[MACA_LEXER_PATH_MAX];
  int symbols_array[MACA_LEXER_PATH_MAX];
  int path_index = 0;

  ctx = context_read_options(argc, argv);
  maca_lexer_check_options(ctx);

  if(ctx->input_filename == NULL)
    f = stdin;
  else
    f = myfopen(ctx->input_filename, "r");

  if(ctx->verbose) fprintf(stderr, "reading mwe list from file : %s\n", ctx->mwe_filename);
  mwe_trie = trie_build_from_collection(ctx->mwe_filename);

  if(ctx->verbose) fprintf(stderr, "reading mwe tokens vocabulary from file : %s\n", ctx->mwe_tokens_dico_filename);
  d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5);

  while(fgets(buffer, sizeof(buffer), f)){
    length = strlen(buffer);
    if((length > 0) && (buffer[length - 1] == '\n'))
      buffer[--length] = '\0';

    /* empty line, or line starting with a blank: an mwe does not cross it */
    if((buffer[0] == '\0') || (buffer[0] == ' ') || (buffer[0] == '\t')){
      maca_lexer_flush_path(ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
      path_index = 0;
      printf("\n");
      continue;
    }

    /* no room left for the current token: give up the pending path */
    if(path_index == MACA_LEXER_PATH_MAX){
      maca_lexer_flush_path(ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
      path_index = 0;
    }

    form_code = dico_string2int(d_mwe_tokens, buffer);
    previous_state = (path_index == 0) ? 0 : states_array[path_index - 1];
    symbols_array[path_index] = form_code;
    /* a token unknown to the vocabulary (code -1) cannot be part of an mwe */
    states_array[path_index] = (form_code == -1) ? 0
      : trie_destination_state(mwe_trie, previous_state, form_code);

    if(states_array[path_index] == 0){ /* in initial state of trie */
      /* print what was pending, if anything, then the current token */
      maca_lexer_flush_path(ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
      maca_lexer_print_single_token(ctx, buffer);
      path_index = 0;
    }
    /* not in state 0 of trie we are processing tokens of a potential mwe */
    else{
      path_index++;
    }
  }

  /* end of input: do not lose the tokens of an unfinished path */
  maca_lexer_flush_path(ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);

  if(f != stdin)
    fclose(f);
  return 0;
}
FLEX_TARGET(tokenizer tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/maca_tokenizer.c)
FLEX_TARGET(fr_tok_rules ./src/fr_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/fr_lex.c)
FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex.c)
set(SOURCES ./src/context.c
${FLEX_fr_tok_rules_OUTPUTS}
${FLEX_en_tok_rules_OUTPUTS})
##compiling library
include_directories(./src)
add_library(maca_tokenizer_lib STATIC ${SOURCES})
include_directories(${CMAKE_CURRENT_BINARY_DIR})
add_executable(maca_tokenizer main.c ${FLEX_tokenizer_OUTPUTS})
add_executable(maca_tokenizer ./src/maca_tokenizer.c)
target_link_libraries(maca_tokenizer maca_tokenizer_lib maca_common)
install (TARGETS maca_tokenizer DESTINATION bin)
/* scanner generated by flex; declared here because it is called below and
   no header provides a prototype for it */
int yylex(void);

/* Run the scanner over the standard input and return 0. */
int main(int argc, char* argv[]) {
  (void) argc;
  (void) argv;
  yylex() ;
  return 0;
}
#include<stdlib.h>
#include<stdio.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include "context.h"
#include "util.h"
void context_set_linguistic_resources_filenames(context *ctx);
/* Release a context and every string it owns; NULL is accepted.
   mcd_filename is duplicated with strdup by context_read_options and is
   therefore released here too.
   NOTE(review): mcd_struct is not released, no destructor for it is visible
   from this file. */
void context_free(context *ctx)
{
  if(ctx == NULL)
    return;

  /* free(NULL) is a no-op, unset fields need no test */
  free(ctx->program_name);
  free(ctx->input_filename);
  free(ctx->output_filename);
  free(ctx->language);
  free(ctx->maca_data_path);
  free(ctx->mcd_filename);
  free(ctx);
}
context *context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL;
ctx->language = strdup("fr");
ctx->maca_data_path = NULL;
ctx->input_filename = NULL;
ctx->output_filename = NULL;
return ctx;
}
/* Print on stderr the usage line (built from ctx->program_name) and the
   description of the general options. */
void context_general_help_message(context *ctx)
{
  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
  fputs("Options:\n"
        "\t-h --help : print this message\n"
        "\t-v --verbose : activate verbose mode\n"
        "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n",
        stderr);
}
/* Print on stderr the description of the input file option. */
void context_input_help_message(context *ctx)
{
  fputs("\t-i --input <file> : input mcf file name\n", stderr);
}
/* Print on stderr the description of the multi column description option. */
void context_mcd_help_message(context *ctx)
{
  fputs("\t-C --mcd <file> : multi column description file name\n", stderr);
}
/* Print on stderr the description of the language option. */
void context_language_help_message(context *ctx)
{
  fputs("\t-L --language : identifier of the language to use\n", stderr);
}
/* Store in *field a private copy of value, releasing the string previously
   held. Needed because language holds a default allocated by context_new
   and because an option may be given several times. */
static void context_assign_string(char **field, const char *value)
{
  free(*field);
  *field = strdup(value);
}

/* Build a context from the command line.
   argc, argv : arguments received by main
   Returns a new context (to be released with context_free) in which every
   option found on the command line has been recorded and mcd_struct has
   been read from the mcd file when one is given, or set to the default
   description built by mcd_build_wplgf otherwise.
   Unknown options are silently ignored (opterr is 0). */
context *context_read_options(int argc, char *argv[])
{
  int c;
  int option_index = 0;
  context *ctx = context_new();
  /* the array is terminated by an all zero entry, as getopt_long requires:
     without it an unknown long option makes getopt_long read past the end */
  static struct option long_options[] =
    {
      {"help",           no_argument,       0, 'h'},
      {"verbose",        no_argument,       0, 'v'},
      {"debug",          no_argument,       0, 'd'},
      {"input",          required_argument, 0, 'i'},
      {"output",         required_argument, 0, 'o'},
      {"mcd",            required_argument, 0, 'C'},
      {"language",       required_argument, 0, 'L'},
      {"maca_data_path", required_argument, 0, 'D'},
      {0, 0, 0, 0}
    };

  ctx->program_name = strdup(argv[0]);

  optind = 0;
  opterr = 0;

  while ((c = getopt_long (argc, argv, "hvdi:o:C:L:D:", long_options, &option_index)) != -1){
    switch (c)
      {
      case 'd':
        ctx->debug_mode = 1;
        break;
      case 'h':
        ctx->help = 1;
        break;
      case 'v':
        ctx->verbose = 1;
        break;
      case 'i':
        context_assign_string(&ctx->input_filename, optarg);
        break;
      case 'o':
        context_assign_string(&ctx->output_filename, optarg);
        break;
      case 'C':
        context_assign_string(&ctx->mcd_filename, optarg);
        break;
      case 'L':
        context_assign_string(&ctx->language, optarg);
        break;
      case 'D':
        context_assign_string(&ctx->maca_data_path, optarg);
        break;
      default:
        /* unknown option or missing argument: ignored, as before */
        break;
      }
  }

  context_set_linguistic_resources_filenames(ctx);

  if(ctx->mcd_filename)
    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);

  if(ctx->mcd_filename == NULL)
    /* ctx->mcd_struct = mcd_build_conll07(); */
    ctx->mcd_struct = mcd_build_wplgf();

  return ctx;
}
/* Check that the directory of the linguistic resources can be located.
   The tokenizer has no resource file of its own: the only visible effect of
   this function is the warning printed when neither ctx->maca_data_path nor
   the environment variable MACAON_DIR is available. The path that used to
   be built here with unbounded strcat calls was never used and could
   overflow its buffer, it is not built any more. */
void context_set_linguistic_resources_filenames(context *ctx)
{
  if((ctx->maca_data_path == NULL) && (getenv("MACAON_DIR") == NULL))
    fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
}
#ifndef __MACA_LEXER_CONTEXT__
#define __MACA_LEXER_CONTEXT__
#include "mcd.h"
#include <stdlib.h>
#define DEFAULT_MWE_TOKENS_DICO_FILENAME "d_tokens.dico"
#define DEFAULT_MWE_FILENAME "mwe"
/* Settings of the maca_tokenizer program, filled by context_read_options. */
typedef struct {
int help;              /* set by -h */
int verbose;           /* set by -v */
int debug_mode;        /* set by -d */
char *program_name;    /* copy of argv[0] */
char *language;        /* -L, default "fr" */
char *maca_data_path;  /* -D, NULL when not given */
char *mcd_filename;    /* -C, NULL when not given */
mcd *mcd_struct;       /* read from mcd_filename or built by default */
char *input_filename;  /* -i, NULL when not given */
char *output_filename; /* -o, NULL when not given */
} context;
/* creation, release and initialisation from the command line */
context *context_new(void);
void context_free(context *ctx);
context *context_read_options(int argc, char *argv[]);
/* help messages, one function per group of options */
void context_general_help_message(context *ctx);
void context_conll_help_message(context *ctx);
void context_language_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx);
void context_mcd_help_message(context *ctx);
/* defined in context.c and called by maca_tokenizer.c: it needs a prototype
   here, otherwise its caller relies on an implicit declaration */
void context_input_help_message(context *ctx);
#endif
%{
/* Tokenization rules for English (scanner prefix "en").
   The scanner writes on the standard output with printf; newlines are
   printed where the rules below cut the input (runs of blanks, before a
   full stop or a comma, before a genitive 's) and the listed contractions
   (don't, doesn't, won't, cannot, wanna) are expanded into two words.
   A number containing a decimal point is copied unchanged. */
#include <stdio.h>
extern int defait_amalgames;
%}
%option prefix="en"
%option noyywrap
%%
[0-9]+\.[0-9]+ printf("%s", yytext);
[ \t]+ printf("\n");
\. printf("\n.");
\, printf("\n,");
don't printf("do\nnot");
don’t printf("do\nnot");
doesn't printf("does\nnot");
doesn’t printf("does\nnot");
won't printf("will\nnot");
won’t printf("will\nnot");
cannot printf("can\nnot");
wanna printf("want\nto");
's printf("\n's");
’s printf("\n's");
\n+ printf("\n");
%%
%{
/* Tokenization rules for French (scanner prefix "fr").
   The scanner writes on the standard output with printf; newlines are
   printed where the rules below cut the input (runs of blanks, before a
   full stop or a comma, after an apostrophe).
   The rules that split the contracted forms du, des, au and aux are only
   active in the start condition state_defait_amalgames, which is entered
   when the external variable defait_amalgames is non zero.
   NOTE(review): nothing in this file enters the start condition state_num,
   so the rule attached to it looks unreachable -- confirm. */
#include <stdio.h>
extern int defait_amalgames;
%}
%option prefix="fr"
/*%option outfile="fr_lex.c"*/
%option noyywrap
%s state_defait_amalgames
%s state_num
%%
if(defait_amalgames){
BEGIN(state_defait_amalgames);
}
<state_num>[0-9]*,[0-9]* printf("%s", yytext);
[ \t]+ printf("\n");
\. printf("\n.");
\, printf("\n,");
' printf("'\n");
’ printf("'\n");
\n+ printf("\n");
<state_defait_amalgames>{
" du " printf("\nde\nle\n");
" des " printf("\nde\nles\n");
" au " printf("\nà\nle\n");
" aux " printf("\nà\nles\n");
}
%%
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"context.h"
int defait_amalgames = 0;
/* Print on stderr the complete help of maca_tokenizer: the general options
   followed by the options of the INPUT section. */
void maca_tokenizer_help_message(context *ctx)
{
  context_general_help_message(ctx);

  fputs("INPUT\n", stderr);
  context_input_help_message(ctx);
  context_mcd_help_message(ctx);
  context_language_help_message(ctx);
}
/* When help was requested, print the help message and exit with status 1;
   otherwise do nothing. */
void maca_tokenizer_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  maca_tokenizer_help_message(ctx);
  exit(1);
}
/* scanners generated by flex from the English and French rule files (their
   names come from the prefix option of these files); they are declared here
   because no header provides a prototype for them */
int enlex(void);
int frlex(void);

/* maca_tokenizer: runs the English scanner when the language of the context
   is "en" and the French scanner otherwise.
   NOTE(review): the scanners are started without any input stream being
   set up here, so the -i option read by context_read_options has no
   visible effect in this function. */
int main(int argc, char* argv[])
{
  context *ctx;

  ctx = context_read_options(argc, argv);
  maca_tokenizer_check_options(ctx);

  if(!strcmp(ctx->language, "en"))
    enlex() ;
  else
    frlex() ;

  /* if(argc > 1) defait_amalgames = 1; */
  return 0;
}
%{
/* Tokenization rules with the default scanner name (yylex).
   The scanner writes on the standard output with printf; newlines are
   printed where the rules below cut the input (runs of spaces, before a
   full stop or a comma, after an apostrophe) and the contracted forms du,
   des, au and aux are replaced by their two components.
   NOTE(review): the contraction rules are not anchored on word boundaries,
   so they presumably also fire inside longer words -- confirm. */
#include <stdio.h>
%}
%option noyywrap
%%
" "+ printf("\n");
\. printf("\n.");
\, printf("\n,");
' printf("'\n");
’ printf("'\n");
\n+ printf("\n");
du printf("de\nle");
des printf("de\nles");
au printf("à\nle");
aux printf("à\nles");
%%
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment