Review notes (text placed before the first "diff --git" header is ignored by
git apply and by patch):
  - trie.c: trie_trans_free_rec frees every transition, trie_free only visits
    the allocated states, realloc is checked, trie_build_from_collection does
    not overflow its buffers any more.
  - context.c: long_options ends with the all-zero element required by
    getopt_long, option strings are not leaked, paths are built with snprintf,
    the help message of maca_data_path shows the right option letter.
  - extract_mwe_from_fplm.c: bounded sscanf, input file closed, argc checked.
  - maca_lexer.c: last line and pending tokens are not lost any more, arrays
    are bounded, resources are released.
  - maca_tokenizer/main.c: yylex is declared before being called.
  - the "index" lines still carry the blob ids of the submitted version,
    regenerate the patch with git once the changes are applied.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b4f6ff988b1692329822f3bc7cbaf1490f1c0c2b..552dabbfe8fce081f5646a284851b115cc04afef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,7 @@ add_subdirectory(maca_tools)
 add_subdirectory(perceptron)
 #add_subdirectory(maca_lemmatizer)
 add_subdirectory(maca_tokenizer)
+add_subdirectory(maca_lexer)
 add_subdirectory(maca_trans_parser)
 add_subdirectory(maca_crf_tagger)
 add_subdirectory(maca_graph_parser)
diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt
index 58e4c1f2f79afc41d48a0ad738c0dc2149a11582..75b9fad8808f8e5a646425313efde9c554b62ae0 100644
--- a/maca_common/CMakeLists.txt
+++ b/maca_common/CMakeLists.txt
@@ -8,7 +8,15 @@ set(SOURCES src/util.c
             src/word.c
             src/sentence.c
             src/word_buffer.c
+            src/trie.c
  )
 
 #compiling library
 add_library(maca_common STATIC ${SOURCES})
+
+
+#compiling, linking and installing executables
+#add_executable(test_trie ./test/test_trie.c)
+#target_link_libraries(test_trie maca_common)
+#install (TARGETS test_trie DESTINATION bin)
+
diff --git a/maca_common/include/trie.h b/maca_common/include/trie.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ec45ad4334bcafdfa7b3a74a32f53877045a87d
--- /dev/null
+++ b/maca_common/include/trie.h
@@ -0,0 +1,40 @@
+#ifndef __TRIE__
+#define __TRIE__
+
+#include<stdio.h>
+
+/* a transition, the transitions leaving a state form a linked list */
+typedef struct trans{
+  int destination;
+  int symbol;
+  struct trans *next;
+} trie_trans;
+
+typedef struct {
+  trie_trans *transitions;
+  int is_accept;
+  int fail;
+} trie_state;
+
+/* states_nb states are stored in array states, which has room for size states */
+typedef struct {
+  trie_state **states;
+  int size;
+  int states_nb;
+} trie;
+
+
+trie_state *trie_state_new(trie_trans *transitions, int is_accept);
+void trie_state_free(trie_state *state);
+trie *trie_new(void);
+void trie_free(trie *t);
+trie_trans *trie_trans_new(int destination, int symbol, trie_trans *next);
+void trie_trans_free_rec(trie_trans *trans);
+int trie_add_state(trie *t);
+void trie_add_trans(trie *t, int origin, int symbol, int destination);
+void trie_add_word(trie *t, int *word, int length);
+void trie_print(FILE *f, trie *t);
+int trie_lookup(trie *t, int *word, int length);
+trie *trie_build_from_collection(char *filename);
+int trie_destination_state(trie *t, int origin, int symbol);
+#endif
diff --git a/maca_common/include/util.h b/maca_common/include/util.h
index 7046269758ef894325a2209bbcd8c89ed4c3755b..971bf1a1f0f54f4bd509346866d9285bd628584c 100644
--- a/maca_common/include/util.h
+++ b/maca_common/include/util.h
@@ -2,6 +2,8 @@
 #define __UTIL__
 
 #include<stdlib.h>
+#include<stdio.h>
+
 void myfree(void *ptr);
 void *memalloc(size_t s);
 FILE *myfopen(const char *path, const char *mode);
diff --git a/maca_common/src/trie.c b/maca_common/src/trie.c
new file mode 100644
index 0000000000000000000000000000000000000000..6494c2286da458ad2cdda966ac9db6a5a7165bee
--- /dev/null
+++ b/maca_common/src/trie.c
@@ -0,0 +1,225 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+
+#include "trie.h"
+#include "util.h"
+
+/* maximal size (in bytes) of a line of a collection file */
+#define TRIE_LINE_MAX_SIZE 10000
+/* maximal number of symbols of a word of a collection file */
+#define TRIE_WORD_MAX_LENGTH 100
+
+/* allocates a state which owns the list of transitions given as argument */
+trie_state *trie_state_new(trie_trans *transitions, int is_accept)
+{
+  trie_state *state = memalloc(sizeof(trie_state));
+  state->transitions = transitions;
+  state->is_accept = is_accept;
+  state->fail = 0;
+  return state;
+}
+
+/* frees a state and its transitions, NULL is accepted */
+void trie_state_free(trie_state *state)
+{
+  if(state){
+    trie_trans_free_rec(state->transitions);
+    free(state);
+  }
+}
+
+/* allocates a trie containing only the initial state (state 0) */
+trie *trie_new(void)
+{
+  trie *t = memalloc(sizeof(trie));
+  t->states = NULL;
+  t->size = 0;
+  t->states_nb = 0;
+  trie_add_state(t); /* initial state */
+  return t;
+}
+
+/* frees a trie and all its states, NULL is accepted */
+void trie_free(trie *t)
+{
+  int i;
+  if(t){
+    /* only the states_nb first cells hold a state, the cells between
+       states_nb and size are uninitialized and must not be freed */
+    for(i=0; i < t->states_nb; i++)
+      trie_state_free(t->states[i]);
+    free(t->states);
+    free(t);
+  }
+}
+
+/* allocates a transition and links it in front of the list next */
+trie_trans *trie_trans_new(int destination, int symbol, trie_trans *next)
+{
+  trie_trans *trans = memalloc(sizeof(trie_trans));
+  trans->destination = destination;
+  trans->symbol = symbol;
+  trans->next = next;
+  return trans;
+}
+
+/* frees every transition of the list starting at trans, NULL is accepted */
+void trie_trans_free_rec(trie_trans *trans)
+{
+  trie_trans *next;
+  while(trans){
+    next = trans->next;
+    free(trans);
+    trans = next;
+  }
+}
+
+/* appends a new state to the trie and returns its index */
+int trie_add_state(trie *t)
+{
+  if(t->states_nb == t->size){
+    int new_size = 2 * (t->size + 1);
+    /* the old array stays reachable if realloc fails */
+    trie_state **new_states = realloc(t->states, new_size * sizeof(trie_state *));
+    if(new_states == NULL){
+      fprintf(stderr, "trie_add_state: cannot allocate %d states\n", new_size);
+      exit(1);
+    }
+    t->states = new_states;
+    t->size = new_size;
+  }
+  t->states[t->states_nb] = trie_state_new(NULL, 0);
+  t->states_nb++;
+  return t->states_nb - 1;
+}
+
+/* adds the transition (origin, symbol, destination), creating missing states */
+void trie_add_trans(trie *t, int origin, int symbol, int destination)
+{
+  /* make sure origin state exists */
+  while(origin >= t->states_nb)
+    trie_add_state(t);
+
+  /* make sure destination state exists */
+  while(destination >= t->states_nb)
+    trie_add_state(t);
+
+  t->states[origin]->transitions = trie_trans_new(destination, symbol, t->states[origin]->transitions);
+}
+
+/* adds to the trie the word made of the length first symbols of array word */
+void trie_add_word(trie *t, int *word, int length)
+{
+  int current_index = 0;
+  int current_state = 0;
+  trie_trans *current_trans = NULL;
+  int transition_exists = 1;
+  int destination;
+
+  /* follow the longest prefix of word already present in the trie */
+  while((current_index < length) && transition_exists){
+    transition_exists = 0;
+    for(current_trans = t->states[current_state]->transitions; current_trans; current_trans = current_trans->next){
+      if(current_trans->symbol == word[current_index]){
+        current_state = current_trans->destination;
+        current_index++;
+        transition_exists = 1;
+        break;
+      }
+    }
+  }
+  /* create one new state for every remaining symbol */
+  while(current_index < length){
+    destination = trie_add_state(t);
+    trie_add_trans(t, current_state, word[current_index], destination);
+    current_index++;
+    current_state = destination;
+  }
+  t->states[current_state]->is_accept = 1;
+}
+
+/* prints on f every state of the trie followed by its transitions */
+void trie_print(FILE *f, trie *t)
+{
+  int i;
+  trie_trans *trans;
+  for(i=0; i < t->states_nb; i++){
+    fprintf(f, "state %d", i);
+    if(t->states[i]->is_accept) fprintf(f, " ACCEPT\n");
+    else fprintf(f, "\n");
+    fprintf(f, "FAIL = %d\n", t->states[i]->fail);
+    for(trans = t->states[i]->transitions; trans; trans = trans->next){
+      fprintf(f, "%d %d %d\n", i, trans->symbol, trans->destination);
+    }
+    fprintf(f, "\n");
+  }
+}
+
+/* returns non zero if word (length symbols) leads to an accept state, 0 otherwise */
+int trie_lookup(trie *t, int *word, int length)
+{
+  int i;
+  int current_state = 0;
+  trie_trans *trans;
+  for(i=0; i < length; i++){
+    for(trans = t->states[current_state]->transitions; trans; trans = trans->next){
+      if(trans->symbol == word[i]){
+        current_state = trans->destination;
+        break;
+      }
+    }
+    if(trans == NULL)
+      return 0;
+  }
+  return t->states[current_state]->is_accept;
+}
+
+
+/* builds a trie from a file containing one word per line, every word
+   being a sequence of integer codes separated by blanks */
+trie *trie_build_from_collection(char *filename)
+{
+  trie *t = trie_new();
+  FILE *f = myfopen(filename, "r");
+  char buffer[TRIE_LINE_MAX_SIZE];
+  int word[TRIE_WORD_MAX_LENGTH];
+  int length;
+  int too_long;
+  char *token;
+
+  while(fgets(buffer, sizeof(buffer), f)){
+    length = 0;
+    too_long = 0;
+    /* the end of line is a separator, so that it never yields a token */
+    token = strtok(buffer, " \t\r\n");
+    while(token){
+      if(length == TRIE_WORD_MAX_LENGTH){
+        too_long = 1;
+        break;
+      }
+      word[length++] = atoi(token);
+      token = strtok(NULL, " \t\r\n");
+    }
+    if(too_long){
+      fprintf(stderr, "trie_build_from_collection: word longer than %d symbols, skipping it\n", TRIE_WORD_MAX_LENGTH);
+      continue;
+    }
+    if(length > 0)
+      trie_add_word(t, word, length);
+  }
+  fclose(f);
+  return t;
+}
+
+/* returns the state reached from origin with symbol, 0 (the initial
+   state) if there is no such transition */
+int trie_destination_state(trie *t, int origin, int symbol)
+{
+  trie_trans *trans;
+  for(trans = t->states[origin]->transitions; trans; trans = trans->next){
+    if(trans->symbol == symbol)
+      return trans->destination;
+  }
+  return 0;
+}
diff --git a/maca_lexer/CMakeLists.txt b/maca_lexer/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8c42bcff7ba6efe6b2aa6f55411ce1cee5dd230d
--- /dev/null
+++ b/maca_lexer/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(SOURCES src/context.c)
+
+##compiling library
+include_directories(src)
+add_library(maca_lexer_lib STATIC ${SOURCES})
+
+#compiling, linking and installing executables
+
+add_executable(extract_mwe_from_fplm ./src/extract_mwe_from_fplm.c)
+target_link_libraries(extract_mwe_from_fplm maca_common)
+install (TARGETS extract_mwe_from_fplm DESTINATION bin)
+
+add_executable(maca_lexer ./src/maca_lexer.c)
+target_link_libraries(maca_lexer maca_lexer_lib maca_common)
+install (TARGETS maca_lexer DESTINATION bin)
+
diff --git a/maca_lexer/src/context.c b/maca_lexer/src/context.c
new file mode 100644
index 0000000000000000000000000000000000000000..6a281826a707db0dab8c1375d4ca6c8a962595c9
--- /dev/null
+++ b/maca_lexer/src/context.c
@@ -0,0 +1,211 @@
+#include<stdlib.h>
+#include<stdio.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include "context.h"
+#include "util.h"
+
+
+void context_set_linguistic_resources_filenames(context *ctx);
+
+/* replaces the string stored in *field by a copy of value,
+   the previous string (if any) is freed */
+static void context_set_string(char **field, const char *value)
+{
+  free(*field);
+  *field = strdup(value);
+}
+
+/* frees a context and the strings it owns
+   NOTE(review): mcd_struct is not released, the way to free an mcd is
+   not visible from this file */
+void context_free(context *ctx)
+{
+  if(ctx->program_name) free(ctx->program_name);
+  if(ctx->input_filename) free(ctx->input_filename);
+  if(ctx->output_filename) free(ctx->output_filename);
+  if(ctx->fplm_filename) free(ctx->fplm_filename);
+  if(ctx->mcd_filename) free(ctx->mcd_filename);
+  if(ctx->language) free(ctx->language);
+  if(ctx->maca_data_path) free(ctx->maca_data_path);
+  if(ctx->mwe_filename) free(ctx->mwe_filename);
+  if(ctx->mwe_tokens_dico_filename) free(ctx->mwe_tokens_dico_filename);
+  free(ctx);
+}
+
+/* allocates a context initialized with default values */
+context *context_new(void)
+{
+  context *ctx = (context *)memalloc(sizeof(context));
+
+  ctx->help = 0;
+  ctx->verbose = 0;
+  ctx->debug_mode = 0;
+  ctx->program_name = NULL;
+  ctx->fplm_filename = NULL;
+  ctx->mcd_filename = NULL;
+  ctx->mcd_struct = NULL;
+  ctx->language = strdup("fr");
+  ctx->maca_data_path = NULL;
+  ctx->form_column = -1;
+  ctx->input_filename = NULL;
+  ctx->output_filename = NULL;
+  ctx->mwe_filename = NULL;
+  ctx->mwe_tokens_dico_filename = NULL;
+  return ctx;
+}
+
+void context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n");
+}
+
+void context_input_help_message(context *ctx){
+  fprintf(stderr, "\t-i --input <file> : input mcf file name\n");
+}
+
+void context_form_column_help_message(context *ctx){
+  fprintf(stderr, "\t-F --form_column <int> : column containing form\n");
+}
+
+void context_fplm_help_message(context *ctx){
+  fprintf(stderr, "\t-f --fplm <file> : fplm (form pos lemma morpho) file\n");
+}
+
+void context_mcd_help_message(context *ctx){
+  fprintf(stderr, "\t-C --mcd <file> : multi column description file name\n");
+}
+
+void context_language_help_message(context *ctx){
+  fprintf(stderr, "\t-L --language : identifier of the language to use\n");
+}
+
+void context_maca_data_path_help_message(context *ctx){
+  fprintf(stderr, "\t-D --maca_data_path : path to maca_data directory\n");
+}
+
+/* builds a context from the command line options, the names of the
+   resources which are not given are set to their default value */
+context *context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = context_new();
+
+  ctx->program_name = strdup(argv[0]);
+
+  /* getopt_long needs the all-zero element which ends the array */
+  static struct option long_options[] =
+    {
+      {"help", no_argument, 0, 'h'},
+      {"verbose", no_argument, 0, 'v'},
+      {"debug", no_argument, 0, 'd'},
+      {"input", required_argument, 0, 'i'},
+      {"output", required_argument, 0, 'o'},
+      {"mcd", required_argument, 0, 'C'},
+      {"language", required_argument, 0, 'L'},
+      {"fplm", required_argument, 0, 'f'},
+      {"form_column", required_argument, 0, 'F'},
+      {"maca_data_path", required_argument, 0, 'D'},
+      {"mwe", required_argument, 0, 'M'},
+      {"vocab", required_argument, 0, 'V'},
+      {0, 0, 0, 0}
+    };
+  optind = 0;
+  opterr = 0;
+
+  while ((c = getopt_long (argc, argv, "hvdi:o:f:C:L:M:F:D:V:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'd':
+        ctx->debug_mode = 1;
+        break;
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'F':
+        ctx->form_column = atoi(optarg) - 1;
+        break;
+      case 'f':
+        context_set_string(&ctx->fplm_filename, optarg);
+        break;
+      case 'i':
+        context_set_string(&ctx->input_filename, optarg);
+        break;
+      case 'o':
+        context_set_string(&ctx->output_filename, optarg);
+        break;
+      case 'C':
+        context_set_string(&ctx->mcd_filename, optarg);
+        break;
+      case 'L':
+        context_set_string(&ctx->language, optarg);
+        break;
+      case 'D':
+        context_set_string(&ctx->maca_data_path, optarg);
+        break;
+      case 'V':
+        context_set_string(&ctx->mwe_tokens_dico_filename, optarg);
+        break;
+      case 'M':
+        context_set_string(&ctx->mwe_filename, optarg);
+        break;
+      }
+  }
+
+  context_set_linguistic_resources_filenames(ctx);
+
+
+  if(ctx->mcd_filename)
+    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
+
+
+  if((ctx->mcd_filename == NULL) && (ctx->form_column == -1))
+    /* ctx->mcd_struct = mcd_build_conll07(); */
+    ctx->mcd_struct = mcd_build_wplgf();
+
+  return ctx;
+}
+
+/* gives the value <data path>/<language>/bin/<default name> to the names
+   of the resources which have not been set, the data path being
+   maca_data_path or, by default, the environment variable MACAON_DIR */
+void context_set_linguistic_resources_filenames(context *ctx)
+{
+  char absolute_path[500];
+  char absolute_filename[500];
+  const char *data_path = ctx->maca_data_path;
+  int length;
+
+  if(data_path == NULL){
+    data_path = getenv("MACAON_DIR");
+    if(data_path == NULL){
+      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
+      data_path = "";
+    }
+  }
+
+  /* snprintf never writes past the end of the buffers */
+  absolute_path[0] = '\0';
+  length = snprintf(absolute_path, sizeof(absolute_path), "%s/%s/bin/", data_path, ctx->language);
+  if((length < 0) || (length >= (int)sizeof(absolute_path)))
+    fprintf(stderr, "ATTENTION: the path to the linguistic resources is too long, it has been truncated\n");
+
+  if(!ctx->mwe_filename){
+    snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", absolute_path, DEFAULT_MWE_FILENAME);
+    ctx->mwe_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->mwe_tokens_dico_filename){
+    snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", absolute_path, DEFAULT_MWE_TOKENS_DICO_FILENAME);
+    ctx->mwe_tokens_dico_filename = strdup(absolute_filename);
+  }
+}
diff --git a/maca_lexer/src/context.h b/maca_lexer/src/context.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1898faf3588e159738c450d898f69e6e8999596
--- /dev/null
+++ b/maca_lexer/src/context.h
@@ -0,0 +1,40 @@
+#ifndef __MACA_LEXER_CONTEXT__
+#define __MACA_LEXER_CONTEXT__
+
+#include "mcd.h"
+#include <stdlib.h>
+
+#define DEFAULT_MWE_TOKENS_DICO_FILENAME "mwe_tokens"
+#define DEFAULT_MWE_FILENAME "mwe"
+
+typedef struct {
+  int help;
+  int verbose;
+  int debug_mode;
+  char *program_name;
+  char *fplm_filename;
+  char *language;
+  char *maca_data_path;
+  char *mcd_filename;
+  mcd *mcd_struct;
+  int form_column;
+  char *input_filename;
+  char *output_filename;
+  char *mwe_filename;
+  char *mwe_tokens_dico_filename;
+} context;
+
+context *context_new(void);
+void context_free(context *ctx);
+
+context *context_read_options(int argc, char *argv[]);
+void context_general_help_message(context *ctx);
+void context_conll_help_message(context *ctx);
+void context_language_help_message(context *ctx);
+void context_fplm_help_message(context *ctx);
+void context_maca_data_path_help_message(context *ctx);
+void context_mcd_help_message(context *ctx);
+void context_form_column_help_message(context *ctx);
+void context_pos_column_help_message(context *ctx);
+
+#endif
diff --git a/maca_lexer/src/extract_mwe_from_fplm.c b/maca_lexer/src/extract_mwe_from_fplm.c
new file mode 100644
index 0000000000000000000000000000000000000000..6dec0cbea129e1ad1f421d9f491bf138126bda1e
--- /dev/null
+++ b/maca_lexer/src/extract_mwe_from_fplm.c
@@ -0,0 +1,84 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+
+#include"dico.h"
+#include"util.h"
+
+
+/* return 1 if form contains at least one space character */
+int form_is_complex(char *form)
+{
+  return strchr(form, ' ') != NULL;
+}
+
+/* reads an fplm file and, for every form made of several tokens, prints
+   on output_file the codes of its tokens separated by spaces, one form
+   per line. returns the dictionary of the tokens, which the caller has
+   to free with dico_free */
+dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int debug_mode)
+{
+  char form[1000];
+  char pos[1000];
+  char lemma[1000];
+  char morpho[1000];
+  char buffer[10000];
+  FILE *f= myfopen(fplm_filename, "r");
+  int fields_nb;
+  char token[1000];
+  int l;
+  int i, j;
+  dico *d_tokens = dico_new("TOKENS", 100000);
+  int token_code;
+  while(fgets(buffer, sizeof(buffer), f)){
+    /* the fields which sscanf does not read are shown empty in debug mode */
+    form[0] = pos[0] = lemma[0] = morpho[0] = '\0';
+    /* the field widths keep sscanf inside the arrays of 1000 bytes */
+    fields_nb = sscanf(buffer, "%999[^\t]\t%999s\t%999[^\t]\t%999s\n", form, pos, lemma, morpho);
+    if(fields_nb != 4){
+      if(debug_mode){
+        fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma);
+        fprintf(stderr, "incorrect fplm entry, skipping it\n");
+      }
+      continue;
+    }
+    if(form_is_complex(form)){
+      /* fprintf(stdout, "form = %s pos = %s lemma = %s\n", form, pos, lemma); */
+      /* fprintf(stdout, "%s\n", form); */
+      l = strlen(form);
+      j = 0;
+      for(i=0; i <= l; i++){
+        if((form[i] != ' ') && (i < l)){
+          token[j++] = form[i];
+        }
+        else{
+          token[j] = '\0';
+          token_code = dico_add(d_tokens, token);
+          /* fprintf(output_file, "token = %s code = %d\n", token, token_code); */
+          fprintf(output_file, "%d", token_code);
+          if(i != l)
+            fprintf(output_file, " ");
+          j = 0;
+        }
+      }
+      fprintf(output_file, "\n");
+    }
+  }
+  fclose(f);
+  return d_tokens;
+}
+
+int main(int argc, char *argv[])
+{
+  dico *d_tokens;
+
+  if(argc < 2){
+    fprintf(stderr, "usage: %s <fplm file>\n", argv[0]);
+    return 1;
+  }
+
+  d_tokens = decompose_mwe_in_fplm_file(argv[1], stdout, 1);
+  dico_print("d_tokens.dico", d_tokens);
+  dico_free(d_tokens);
+  return 0;
+}
diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c
new file mode 100644
index 0000000000000000000000000000000000000000..5308fac5d036510738c80a77457d555225ae456f
--- /dev/null
+++ b/maca_lexer/src/maca_lexer.c
@@ -0,0 +1,126 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+
+#include"trie.h"
+#include"dico.h"
+#include"util.h"
+#include"context.h"
+
+/* maximal number of tokens of a multi word expression being recognized */
+#define MWE_MAX_LENGTH 100
+
+/* prints the path_length pending tokens: on a single line and joined by
+   '#' when the state reached by the last one is an accept state, one
+   token per line otherwise */
+static void print_pending_tokens(trie *mwe_trie, dico *d_mwe_tokens, int *states_array, int *symbols_array, int path_length)
+{
+  int i;
+
+  if(path_length == 0) return;
+
+  if(mwe_trie->states[states_array[path_length - 1]]->is_accept){
+    for(i=0; i < path_length; i++){
+      if(i > 0) printf("#");
+      printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
+    }
+    printf("\n");
+  }
+  else{
+    for(i=0; i < path_length; i++){
+      printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+    }
+  }
+}
+
+/* reads one token per line and prints the tokens back, the tokens of a
+   recognized multi word expression being printed on a single line */
+int main(int argc, char *argv[])
+{
+  char buffer[10000];
+  int form_code;
+  context *ctx;
+  int form_column;
+  FILE *f = NULL;
+  trie *mwe_trie;
+  dico *d_mwe_tokens = NULL;
+  int state;
+  int states_array[MWE_MAX_LENGTH];
+  int symbols_array[MWE_MAX_LENGTH];
+  int path_index = 0;
+  size_t l;
+
+  ctx = context_read_options(argc, argv);
+  /* maca_lexer_check_options(ctx); */
+
+  /* NOTE(review): form_column is computed but not used yet, the whole
+     input line is taken as the form */
+  if(ctx->form_column != -1)
+    form_column = ctx->form_column;
+  else
+    form_column = ctx->mcd_struct->wf2col[MCD_WF_FORM];
+  (void)form_column;
+
+  if(ctx->input_filename == NULL)
+    f = stdin;
+  else
+    f = myfopen(ctx->input_filename, "r");
+
+  mwe_trie = trie_build_from_collection(ctx->mwe_filename);
+  d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5);
+
+  /* trie_print(stdout, mwe_trie); */
+
+  while(fgets(buffer, sizeof(buffer), f)){
+    if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){
+      /* the pending tokens come before the separator */
+      print_pending_tokens(mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
+      path_index = 0;
+      printf("\n");
+      continue;
+    }
+
+    /* remove the end of line, the last line of the input may have none */
+    l = strlen(buffer);
+    if((l > 0) && (buffer[l - 1] == '\n'))
+      buffer[l - 1] = '\0';
+
+    form_code = dico_string2int(d_mwe_tokens, buffer);
+
+    /* the arrays are full, a longer expression cannot be stored */
+    if(path_index == MWE_MAX_LENGTH){
+      print_pending_tokens(mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
+      path_index = 0;
+    }
+
+    state = (form_code == -1)? 0
+      : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code);
+
+    if((state == 0) && (path_index > 0)){
+      /* the token does not extend the pending tokens: print them and
+         try the token as the first token of another expression */
+      print_pending_tokens(mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
+      path_index = 0;
+      if(form_code != -1)
+        state = trie_destination_state(mwe_trie, 0, form_code);
+    }
+
+    if(state == 0){ /* in initial state of trie: nothing has been recognized */
+      printf("%s\n", buffer);
+    }
+    else{
+      symbols_array[path_index] = form_code;
+      states_array[path_index] = state;
+      path_index++;
+    }
+  }
+
+  /* tokens which are still pending when the input ends */
+  print_pending_tokens(mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
+
+  if(f != stdin) fclose(f);
+  trie_free(mwe_trie);
+  dico_free(d_mwe_tokens);
+  context_free(ctx);
+  return 0;
+}
diff --git a/maca_tokenizer/main.c b/maca_tokenizer/main.c
index f0bef4ca78b9ae992a5cd6fb7a24a17600e157ac..f3586e084fd9aee749368fc0d07e1223ef9e0f30 100644
--- a/maca_tokenizer/main.c
+++ b/maca_tokenizer/main.c
@@ -1,5 +1,11 @@
+/* defined by the scanner generated from tok_rules.l */
+int yylex(void);
+
+int defait_amalgames = 0;
+
 int main(int argc, char* argv[])
 {
+  if(argc > 1) defait_amalgames = 1;
   yylex() ;
 
   return 0;
diff --git a/maca_tokenizer/tok_rules.l b/maca_tokenizer/tok_rules.l
index d60af0ee7f05e26fa5bb0718370c3581dbfc5fee..4cd823d15c0535b7bcf9fcb1b34385d5789355f4 100644
--- a/maca_tokenizer/tok_rules.l
+++ b/maca_tokenizer/tok_rules.l
@@ -1,16 +1,26 @@
 %{
 #include <stdio.h>
+extern int defait_amalgames;
 %}
 %option noyywrap
+%s state_defait_amalgames
+%s state_num
 %%
-" "+ printf("\n");
+	if(defait_amalgames){
+	BEGIN(state_defait_amalgames);
+	}
+
+<state_num>[0-9]*,[0-9]* printf("%s", yytext);
+[ \t]+ printf("\n");
 \. printf("\n.");
 \, printf("\n,");
 ' printf("'\n");
 ’ printf("'\n");
 \n+ printf("\n");
-du printf("de\nle");
-des printf("de\nles");
-au printf("à\nle");
-aux printf("à\nles");
+<state_defait_amalgames>{
+" du " printf("\nde\nle\n");
+" des " printf("\nde\nles\n");
+" au " printf("\nà\nle\n");
+" aux " printf("\nà\nles\n");
+}
 %%