Skip to content
Snippets Groups Projects
Commit ad8f50f8 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

added maca_lexer, a program that recognizes non-ambiguous MWEs (multi-word expressions)

parent 8c12e031
No related branches found
No related tags found
No related merge requests found
......@@ -22,6 +22,7 @@ add_subdirectory(maca_tools)
add_subdirectory(perceptron)
#add_subdirectory(maca_lemmatizer)
add_subdirectory(maca_tokenizer)
add_subdirectory(maca_lexer)
add_subdirectory(maca_trans_parser)
add_subdirectory(maca_crf_tagger)
add_subdirectory(maca_graph_parser)
......
......@@ -8,7 +8,15 @@ set(SOURCES src/util.c
src/word.c
src/sentence.c
src/word_buffer.c
src/trie.c
)
#compiling library
add_library(maca_common STATIC ${SOURCES})
#compiling, linking and installing executables
#add_executable(test_trie ./test/test_trie.c)
#target_link_libraries(test_trie maca_common)
#install (TARGETS test_trie DESTINATION bin)
#ifndef __TRIE__
#define __TRIE__
#include<stdio.h>
/* One outgoing transition of a trie state; the transitions leaving a state
   are chained in a singly linked list. */
typedef struct trans{
int destination; /* index, in the states array of the trie, of the state reached */
int symbol; /* integer symbol labelling the transition */
struct trans *next; /* next transition leaving the same state, NULL at the end of the list */
} trie_trans;
/* A state of the trie. */
typedef struct {
trie_trans *transitions; /* head of the list of outgoing transitions, NULL when there is none */
int is_accept; /* non zero when a word added with trie_add_word ends in this state */
int fail; /* set to 0 by trie_state_new and displayed by trie_print; not read by the other functions declared here */
} trie_state;
/* A trie over integer symbols; state 0, created by trie_new, is the initial state. */
typedef struct {
trie_state **states; /* array of pointers to the states, grown by trie_add_state */
int size; /* number of slots allocated in states */
int states_nb; /* number of slots of states actually in use */
} trie;
trie_state *trie_state_new(trie_trans *transitions, int is_accept);
void trie_state_free(trie_state *state);
trie *trie_new(void);
void trie_free(trie *t);
trie_trans *trie_trans_new(int destination, int symbol, trie_trans *next);
void trie_trans_free_rec(trie_trans *trans);
int trie_add_state(trie *t);
void trie_add_trans(trie *t, int origin, int symbol, int destination);
void trie_add_word(trie *t, int *word, int length);
void trie_print(FILE *f, trie *t);
int trie_lookup(trie *t, int *word, int length);
trie *trie_build_from_collection(char *filename);
int trie_destination_state(trie *t, int origin, int symbol);
#endif
......@@ -2,6 +2,8 @@
#define __UTIL__
#include<stdlib.h>
#include<stdio.h>
void myfree(void *ptr);
void *memalloc(size_t s);
FILE *myfopen(const char *path, const char *mode);
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include "trie.h"
#include "util.h"
/* Allocate a state holding the given transition list and acceptance flag.
   The fail field of the new state is 0. */
trie_state *trie_state_new(trie_trans *transitions, int is_accept)
{
  trie_state *fresh = memalloc(sizeof *fresh);

  fresh->fail = 0;
  fresh->is_accept = is_accept;
  fresh->transitions = transitions;
  return fresh;
}
/* Release a state and its list of transitions; a NULL state is ignored. */
void trie_state_free(trie_state *state)
{
  if(state == NULL)
    return;

  trie_trans_free_rec(state->transitions);
  free(state);
}
/* Create a trie that contains only its initial state (state 0). */
trie *trie_new(void)
{
  trie *fresh = memalloc(sizeof *fresh);

  fresh->states_nb = 0;
  fresh->size = 0;
  fresh->states = NULL;

  /* the first state added gets index 0: it is the initial state */
  trie_add_state(fresh);
  return fresh;
}
/* Release a trie: every state in use, the array of states and the trie
   itself. A NULL trie is ignored. */
void trie_free(trie *t)
{
  int i;

  if(t == NULL)
    return;

  /* Only the first states_nb slots hold valid pointers: the slots between
     states_nb and size come from realloc and are not initialized, so they
     must not be handed to trie_state_free. */
  for(i = 0; i < t->states_nb; i++)
    trie_state_free(t->states[i]);
  free(t->states);
  free(t);
}
/* Allocate a transition labelled symbol that leads to state destination and
   link it in front of the list next. */
trie_trans *trie_trans_new(int destination, int symbol, trie_trans *next)
{
  trie_trans *fresh = memalloc(sizeof *fresh);

  fresh->next = next;
  fresh->symbol = symbol;
  fresh->destination = destination;
  return fresh;
}
/* Release every transition of the list starting at trans; a NULL list is
   ignored. The list is walked iteratively: the link to the following node is
   saved before the current node is freed, so that all the nodes are released
   (and not only the last one of the list). */
void trie_trans_free_rec(trie_trans *trans)
{
  trie_trans *following;

  while(trans != NULL){
    following = trans->next;
    free(trans);
    trans = following;
  }
}
/* Append a new state, non accepting and without transitions, to the trie and
   return its index. The array of states is enlarged when it is full; when the
   enlargement fails, an error message is printed on stderr and the program
   exits with status 1. */
int trie_add_state(trie *t)
{
  if(t->states_nb == t->size){
    int new_size = 2 * (t->size + 1);
    /* keep the result of realloc apart: on failure realloc returns NULL and
       leaves the original array untouched */
    trie_state **new_states = realloc(t->states, (size_t)new_size * sizeof *new_states);

    if(new_states == NULL){
      fprintf(stderr, "trie_add_state: cannot allocate memory for %d states\n", new_size);
      exit(1);
    }
    t->states = new_states;
    t->size = new_size;
  }
  t->states[t->states_nb] = trie_state_new(NULL, 0);
  t->states_nb++;
  return t->states_nb - 1;
}
/* Add a transition labelled symbol from state origin to state destination.
   States are created as needed so that both indices exist. The new
   transition is inserted at the head of the list of origin. */
void trie_add_trans(trie *t, int origin, int symbol, int destination)
{
  int highest = (origin > destination) ? origin : destination;
  trie_state *from;

  /* create states until the largest of the two indices is valid */
  while(t->states_nb <= highest)
    trie_add_state(t);

  from = t->states[origin];
  from->transitions = trie_trans_new(destination, symbol, from->transitions);
}
/* Add the word made of the length symbols word[0..length-1] to the trie and
   mark the state reached at its end as accepting. */
void trie_add_word(trie *t, int *word, int length)
{
  int pos = 0;
  int state = 0;
  trie_trans *tr;

  /* follow the longest prefix of the word that is already in the trie */
  while(pos < length){
    for(tr = t->states[state]->transitions; tr != NULL; tr = tr->next)
      if(tr->symbol == word[pos])
        break;
    if(tr == NULL)
      break;
    state = tr->destination;
    pos++;
  }

  /* create one new state per remaining symbol */
  for(; pos < length; pos++){
    int fresh = trie_add_state(t);

    trie_add_trans(t, state, word[pos], fresh);
    state = fresh;
  }

  t->states[state]->is_accept = 1;
}
/* Write a textual description of the trie on f. For every state: its index
   (followed by ACCEPT when it is accepting), its fail value, then one line
   "origin symbol destination" per outgoing transition and an empty line. */
void trie_print(FILE *f, trie *t)
{
  int index;

  for(index = 0; index < t->states_nb; index++){
    trie_state *st = t->states[index];
    trie_trans *tr = st->transitions;

    fprintf(f, "state %d%s\n", index, st->is_accept ? " ACCEPT" : "");
    fprintf(f, "FAIL = %d\n", st->fail);
    while(tr != NULL){
      fprintf(f, "%d %d %d\n", index, tr->symbol, tr->destination);
      tr = tr->next;
    }
    fprintf(f, "\n");
  }
}
/* Read the length symbols of word from the initial state. Return 0 as soon as
   a symbol has no matching transition, otherwise the is_accept flag of the
   state reached after the last symbol. */
int trie_lookup(trie *t, int *word, int length)
{
  int state = 0;
  int pos = 0;

  while(pos < length){
    trie_trans *tr = t->states[state]->transitions;

    /* look for the transition labelled with the current symbol */
    while((tr != NULL) && (tr->symbol != word[pos]))
      tr = tr->next;
    if(tr == NULL)
      return 0;

    state = tr->destination;
    pos++;
  }
  return t->states[state]->is_accept;
}
/* Build a trie from the text file filename. Every line of the file describes
   one word as integer codes separated by spaces; each word is added to the
   trie with trie_add_word. Returns the new trie.

   Lines are read through a buffer of TRIE_LINE_SIZE bytes and a word holds at
   most TRIE_WORD_SIZE symbols; an entry with more symbols is reported on
   stderr and skipped instead of overflowing the word array. */
trie *trie_build_from_collection(char *filename)
{
  enum { TRIE_LINE_SIZE = 10000, TRIE_WORD_SIZE = 100 };
  trie *t = trie_new();
  /* NOTE(review): assumes myfopen does not return NULL on failure - confirm in util.c */
  FILE *f = myfopen(filename, "r");
  char buffer[TRIE_LINE_SIZE];
  int word[TRIE_WORD_SIZE];
  int length;
  int too_long;
  char *token;

  /* the size given to fgets must be the real size of the buffer */
  while(fgets(buffer, sizeof buffer, f)){
    length = 0;
    too_long = 0;
    for(token = strtok(buffer, " "); token; token = strtok(NULL, " ")){
      if(length == TRIE_WORD_SIZE){
        too_long = 1;
        break;
      }
      word[length++] = atoi(token);
    }
    if(too_long){
      fprintf(stderr, "trie_build_from_collection: entry with more than %d symbols in %s, skipping it\n", TRIE_WORD_SIZE, filename);
      continue;
    }
    trie_add_word(t, word, length);
  }
  fclose(f);
  return t;
}
/* Return the state reached from origin by the transition labelled symbol, or
   0 when origin has no such transition. */
int trie_destination_state(trie *t, int origin, int symbol)
{
  trie_trans *tr = t->states[origin]->transitions;

  while((tr != NULL) && (tr->symbol != symbol))
    tr = tr->next;

  return (tr != NULL) ? tr->destination : 0;
}
set(SOURCES src/context.c)
##compiling library
include_directories(src)
add_library(maca_lexer_lib STATIC ${SOURCES})
#compiling, linking and installing executables
add_executable(extract_mwe_from_fplm ./src/extract_mwe_from_fplm.c)
target_link_libraries(extract_mwe_from_fplm maca_common)
install (TARGETS extract_mwe_from_fplm DESTINATION bin)
add_executable(maca_lexer ./src/maca_lexer.c)
target_link_libraries(maca_lexer maca_lexer_lib maca_common)
install (TARGETS maca_lexer DESTINATION bin)
#include<stdlib.h>
#include<stdio.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include "context.h"
#include "util.h"
void context_set_linguistic_resources_filenames(context *ctx);
/* Release a context and all the strings it owns. A NULL context is ignored.
   free(NULL) is a no-op, so the fields are released without being tested. */
void context_free(context *ctx)
{
  if(ctx == NULL)
    return;

  free(ctx->program_name);
  free(ctx->input_filename);
  free(ctx->output_filename);
  free(ctx->fplm_filename);
  free(ctx->language);
  free(ctx->maca_data_path);
  free(ctx->mcd_filename); /* allocated with strdup by option -C */
  free(ctx->mwe_filename);
  free(ctx->mwe_tokens_dico_filename);
  /* NOTE(review): ctx->mcd_struct is still not released here; the function
     that frees an mcd is not visible from this file - confirm and add the call */
  free(ctx);
}
context *context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->fplm_filename = NULL;
ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL;
ctx->language = strdup("fr");
ctx->maca_data_path = NULL;
ctx->form_column = -1;
ctx->input_filename = NULL;
ctx->output_filename = NULL;
ctx->mwe_filename = NULL;
ctx->mwe_tokens_dico_filename = NULL;
return ctx;
}
/* Print on stderr the usage line (with the program name stored in ctx) and
   the description of the general options.
   NOTE(review): the -r / --hratio option listed here is not among the options
   parsed by context_read_options in this file. */
void context_general_help_message(context *ctx)
{
  fprintf(stderr,
          "usage: %s [options]\n"
          "Options:\n"
          "\t-h --help : print this message\n"
          "\t-v --verbose : activate verbose mode\n"
          "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n",
          ctx->program_name);
}
/* Print on stderr the description of the -i / --input option; ctx is not used. */
void context_input_help_message(context *ctx)
{
  static const char line[] = "\t-i --input <file> : input mcf file name\n";

  (void)ctx;
  fprintf(stderr, "%s", line);
}
/* Print on stderr the description of the -F / --form_column option; ctx is not used. */
void context_form_column_help_message(context *ctx)
{
  static const char line[] = "\t-F --form_column <int> : column containing form\n";

  (void)ctx;
  fprintf(stderr, "%s", line);
}
/* Print on stderr the description of the -f / --fplm option; ctx is not used. */
void context_fplm_help_message(context *ctx)
{
  static const char line[] = "\t-f --fplm <file> : fplm (form pos lemma morpho) file\n";

  (void)ctx;
  fprintf(stderr, "%s", line);
}
/* Print on stderr the description of the -C / --mcd option; ctx is not used. */
void context_mcd_help_message(context *ctx)
{
  static const char line[] = "\t-C --mcd <file> : multi column description file name\n";

  (void)ctx;
  fprintf(stderr, "%s", line);
}
/* Print on stderr the description of the -L / --language option; ctx is not used. */
void context_language_help_message(context *ctx)
{
  static const char line[] = "\t-L --language : identifier of the language to use\n";

  (void)ctx;
  fprintf(stderr, "%s", line);
}
/* Print on stderr the description of the --maca_data_path option; ctx is not
   used. The short option is -D: in context_read_options, -M selects the mwe
   file, so the help text must not advertise -M for the data path. */
void context_maca_data_path_help_message(context *ctx){
  (void)ctx;
  fprintf(stderr, "\t-D --maca_data_path : path to maca_data directory\n");
}
/* Store a copy of value in *slot, releasing the string previously held
   (a default value, or the value of an earlier occurrence of the option). */
static void context_set_string(char **slot, const char *value)
{
  free(*slot);
  *slot = strdup(value);
}

/* Build a context from the command line.
   argc, argv: the arguments received by main.
   Returns a new context (see context_new for the defaults) in which:
   - the options found on the command line have been recorded; unknown
     options are silently ignored (opterr is set to 0);
   - the names of the mwe resources that were not given are set by
     context_set_linguistic_resources_filenames;
   - mcd_struct is read from the mcd file when one is given, and built with
     mcd_build_wplgf when neither an mcd file nor a form column is given. */
context *context_read_options(int argc, char *argv[])
{
  int c;
  int option_index = 0;
  context *ctx = context_new();
  /* getopt_long requires the last element of the array to be filled with
     zeros: it is the only way it can find the end of the table */
  static struct option long_options[] =
    {
      {"help", no_argument, 0, 'h'},
      {"verbose", no_argument, 0, 'v'},
      {"debug", no_argument, 0, 'd'},
      {"input", required_argument, 0, 'i'},
      {"output", required_argument, 0, 'o'},
      {"mcd", required_argument, 0, 'C'},
      {"language", required_argument, 0, 'L'},
      {"fplm", required_argument, 0, 'f'},
      {"form_column", required_argument, 0, 'F'},
      {"maca_data_path", required_argument, 0, 'D'},
      {"mwe", required_argument, 0, 'M'},
      {"vocab", required_argument, 0, 'V'},
      {0, 0, 0, 0}
    };

  ctx->program_name = strdup(argv[0]);
  optind = 0;
  opterr = 0;
  while ((c = getopt_long (argc, argv, "hvdi:o:f:C:L:M:F:D:V:", long_options, &option_index)) != -1){
    switch (c)
      {
      case 'd':
        ctx->debug_mode = 1;
        break;
      case 'h':
        ctx->help = 1;
        break;
      case 'v':
        ctx->verbose = 1;
        break;
      case 'F':
        /* columns are numbered from 1 on the command line, from 0 internally */
        ctx->form_column = atoi(optarg) - 1;
        break;
      case 'f':
        context_set_string(&ctx->fplm_filename, optarg);
        break;
      case 'i':
        context_set_string(&ctx->input_filename, optarg);
        break;
      case 'o':
        context_set_string(&ctx->output_filename, optarg);
        break;
      case 'C':
        context_set_string(&ctx->mcd_filename, optarg);
        break;
      case 'L':
        /* replaces the default language allocated by context_new */
        context_set_string(&ctx->language, optarg);
        break;
      case 'D':
        context_set_string(&ctx->maca_data_path, optarg);
        break;
      case 'V':
        context_set_string(&ctx->mwe_tokens_dico_filename, optarg);
        break;
      case 'M':
        context_set_string(&ctx->mwe_filename, optarg);
        break;
      default:
        /* unknown option or missing argument: ignored, as before */
        break;
      }
  }
  context_set_linguistic_resources_filenames(ctx);
  if(ctx->mcd_filename)
    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
  if((ctx->mcd_filename == NULL) && (ctx->form_column == -1))
    ctx->mcd_struct = mcd_build_wplgf();
  return ctx;
}
/* Return a newly allocated string made of dir followed by filename. The
   caller owns the result. On allocation failure an error message is printed
   on stderr and the program exits with status 1. */
static char *context_concat_path(const char *dir, const char *filename)
{
  size_t size = strlen(dir) + strlen(filename) + 1;
  char *path = malloc(size);

  if(path == NULL){
    fprintf(stderr, "context: cannot allocate memory for a file name\n");
    exit(1);
  }
  snprintf(path, size, "%s%s", dir, filename);
  return path;
}

/* Give a default value to the mwe file name and to the mwe tokens dictionary
   file name when they have not been set. The defaults are located in
   <data path>/<language>/bin/, where <data path> is ctx->maca_data_path or,
   when it is not set, the value of the environment variable MACAON_DIR (a
   warning is printed on stderr when that variable is not defined either).

   The directory name is built in a buffer allocated to the exact size needed,
   so that long paths given by the user or the environment cannot overflow a
   fixed size array. */
void context_set_linguistic_resources_filenames(context *ctx)
{
  const char *base = ctx->maca_data_path;
  const char *language = ctx->language ? ctx->language : "";
  char *directory;

  if(base == NULL){
    base = getenv("MACAON_DIR");
    if(base == NULL){
      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
      base = "";
    }
  }

  /* directory = base + "/" + language + "/bin/" */
  directory = context_concat_path(base, "/");
  {
    char *with_language = context_concat_path(directory, language);

    free(directory);
    directory = context_concat_path(with_language, "/bin/");
    free(with_language);
  }

  if(!ctx->mwe_filename)
    ctx->mwe_filename = context_concat_path(directory, DEFAULT_MWE_FILENAME);

  if(!ctx->mwe_tokens_dico_filename)
    ctx->mwe_tokens_dico_filename = context_concat_path(directory, DEFAULT_MWE_TOKENS_DICO_FILENAME);

  free(directory);
}
#ifndef __MACA_LEXER_CONTEXT__
#define __MACA_LEXER_CONTEXT__
#include "mcd.h"
#include <stdlib.h>
#define DEFAULT_MWE_TOKENS_DICO_FILENAME "mwe_tokens"
#define DEFAULT_MWE_FILENAME "mwe"
/* Options and resources of the maca_lexer program, filled by
   context_read_options. The strings are owned by the context. */
typedef struct {
int help; /* set to 1 by option -h */
int verbose; /* set to 1 by option -v */
int debug_mode; /* set to 1 by option -d */
char *program_name; /* copy of argv[0] */
char *fplm_filename; /* value of option -f, NULL when not given */
char *language; /* value of option -L, "fr" by default */
char *maca_data_path; /* value of option -D, NULL when not given */
char *mcd_filename; /* value of option -C, NULL when not given */
mcd *mcd_struct; /* result of mcd_read on mcd_filename, or of mcd_build_wplgf when neither -C nor -F is given */
int form_column; /* value of option -F minus one, -1 when not given */
char *input_filename; /* value of option -i, NULL when not given */
char *output_filename; /* value of option -o, NULL when not given */
char *mwe_filename; /* value of option -M, or a default built from DEFAULT_MWE_FILENAME */
char *mwe_tokens_dico_filename; /* value of option -V, or a default built from DEFAULT_MWE_TOKENS_DICO_FILENAME */
} context;
context *context_new(void);
void context_free(context *ctx);
context *context_read_options(int argc, char *argv[]);
void context_general_help_message(context *ctx);
void context_conll_help_message(context *ctx);
void context_language_help_message(context *ctx);
void context_fplm_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx);
void context_mcd_help_message(context *ctx);
void context_form_column_help_message(context *ctx);
void context_pos_column_help_message(context *ctx);
#endif
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"dico.h"
#include"util.h"
/* return 1 if form contains at least one space character */
/* Tell whether form is made of several tokens: the result is 1 when the
   string holds at least one space character and 0 otherwise. */
int form_is_complex(char *form)
{
  const char *cursor;

  for(cursor = form; *cursor != '\0'; cursor++){
    if(*cursor == ' ')
      return 1;
  }
  return 0;
}
/* Read the fplm file fplm_filename (one entry per line: form, pos, lemma and
   morpho separated by tabulations). For every entry whose form contains a
   space, the form is split on spaces, every token is added to a dictionary
   and the codes of the tokens are written on one line of output_file,
   separated by spaces.

   fplm_filename: name of the fplm file to read.
   output_file: stream on which the sequences of codes are written.
   debug_mode: when non zero, the entries that cannot be parsed are reported
   on stderr; they are skipped in any case.
   Returns the dictionary of tokens; the caller is responsible for it. */
dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int debug_mode)
{
  char form[1000];
  char pos[1000];
  char lemma[1000];
  char morpho[1000];
  char buffer[10000];
  /* NOTE(review): assumes myfopen does not return NULL on failure - confirm in util.c */
  FILE *f= myfopen(fplm_filename, "r");
  int fields_nb;
  char token[1000];
  int l;
  int i, j;
  dico *d_tokens = dico_new("TOKENS", 100000);
  int token_code;

  while(fgets(buffer, sizeof buffer, f)){
    /* the fields are emptied first: when sscanf does not fill all of them,
       the debug message below must not print uninitialized memory */
    form[0] = pos[0] = lemma[0] = morpho[0] = '\0';
    /* every conversion is limited to 999 characters, the capacity of the
       fields, since a line can hold up to 9999 characters */
    fields_nb = sscanf(buffer, "%999[^\t]\t%999s\t%999[^\t]\t%999s\n", form, pos, lemma, morpho);
    if(fields_nb != 4){
      if(debug_mode){
        fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma);
        fprintf(stderr, "incorrect fplm entry, skipping it\n");
      }
      continue;
    }
    if(form_is_complex(form)){
      l = strlen(form);
      j = 0;
      /* i goes up to l included so that the last token is emitted too */
      for(i=0; i <= l; i++){
        if((form[i] != ' ') && (i < l)){
          token[j++] = form[i];
        }
        else{
          token[j] = '\0';
          token_code = dico_add(d_tokens, token);
          fprintf(output_file, "%d", token_code);
          if(i != l)
            fprintf(output_file, " ");
          j = 0;
        }
      }
      fprintf(output_file, "\n");
    }
  }
  fclose(f);
  return d_tokens;
}
/* Entry point of extract_mwe_from_fplm: decompose the multi word forms of the
   fplm file given as first argument, write the sequences of token codes on
   stdout and save the dictionary of tokens in the file d_tokens.dico.
   Returns 1 when the fplm file name is missing, 0 otherwise. */
int main(int argc, char *argv[])
{
  dico *d_tokens;

  if(argc < 2){
    fprintf(stderr, "usage: %s <fplm file>\n", (argc > 0) ? argv[0] : "extract_mwe_from_fplm");
    return 1;
  }
  d_tokens = decompose_mwe_in_fplm_file(argv[1], stdout, 1);
  dico_print("d_tokens.dico", d_tokens);
  dico_free(d_tokens);
  return 0;
}
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"trie.h"
#include"dico.h"
#include"util.h"
#include"context.h"
/* Print the path_length tokens waiting in symbols_array: on a single line,
   joined with '#', when the state reached after the last one is accepting
   (the tokens form a multi word expression), one token per line otherwise.
   Nothing is printed for an empty path. */
static void maca_lexer_flush_path(trie *mwe_trie, dico *d_mwe_tokens, int *states_array, int *symbols_array, int path_length)
{
  int i;
  int is_mwe;

  if(path_length <= 0)
    return;

  is_mwe = mwe_trie->states[states_array[path_length - 1]]->is_accept;
  for(i = 0; i < path_length; i++){
    if(is_mwe && (i > 0)) printf("#");
    printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
    if(!is_mwe) printf("\n");
  }
  if(is_mwe) printf("\n");
}

/* Entry point of maca_lexer. The input (file given with -i, stdin otherwise)
   is read one token per line; the whole line is looked up in the dictionary
   of mwe tokens. The sequences of tokens recognized by the trie of multi word
   expressions are printed on one line, joined with '#'; the other tokens are
   printed unchanged, one per line. Lines starting with a newline, a space or
   a tabulation are printed as empty lines.

   Differences with the previous version:
   - the tokens still waiting at the end of the input, or when an empty line
     is met, are printed instead of being lost or carried over;
   - a last line without a newline is processed, and only a real newline is
     removed from the end of a line;
   - a token that breaks a partial match is looked up again from the initial
     state, so that an expression starting with this token is not missed;
   - the path arrays cannot be written past their end. */
int main(int argc, char *argv[])
{
  enum { MAX_PATH_LENGTH = 100 };
  char buffer[10000];
  size_t length;
  int form_code;
  int state;
  context *ctx;
  FILE *f = NULL;
  trie *mwe_trie;
  dico *d_mwe_tokens = NULL;
  int states_array[MAX_PATH_LENGTH];  /* state reached after each waiting token */
  int symbols_array[MAX_PATH_LENGTH]; /* code of each waiting token */
  int path_index = 0;                 /* number of waiting tokens */

  ctx = context_read_options(argc, argv);

  if(ctx->input_filename == NULL)
    f = stdin;
  else
    f = myfopen(ctx->input_filename, "r");

  mwe_trie = trie_build_from_collection(ctx->mwe_filename);
  d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5);

  while(fgets(buffer, sizeof buffer, f)){
    if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){
      /* an expression does not span an empty line */
      maca_lexer_flush_path(mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
      path_index = 0;
      printf("\n");
      continue;
    }

    length = strlen(buffer);
    if((length > 0) && (buffer[length - 1] == '\n'))
      buffer[length - 1] = '\0';

    /* a code of -1 is treated as a token unknown to the dictionary: it cannot be part of an expression */
    form_code = dico_string2int(d_mwe_tokens, buffer);
    state = (form_code == -1) ? 0
      : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code);

    /* the token does not extend the waiting path (state 0 means that there is
       no transition), or the path is full: print what is waiting and try the
       token again from the initial state */
    if((path_index > 0) && ((state == 0) || (path_index == MAX_PATH_LENGTH))){
      maca_lexer_flush_path(mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
      path_index = 0;
      state = (form_code == -1) ? 0 : trie_destination_state(mwe_trie, 0, form_code);
    }

    if(state == 0){
      /* the token cannot start an expression */
      printf("%s\n", buffer);
    }
    else{
      symbols_array[path_index] = form_code;
      states_array[path_index] = state;
      path_index++;
    }
  }

  /* end of input: print the tokens that are still waiting */
  maca_lexer_flush_path(mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);

  if(f != stdin)
    fclose(f);
  return 0;
}
int defait_amalgames = 0;
int main(int argc, char* argv[]) {
if(argc > 1) defait_amalgames = 1;
yylex() ;
return 0;
......
%{
#include <stdio.h>
extern int defait_amalgames;
%}
%option noyywrap
%s state_defait_amalgames
%s state_num
%%
" "+ printf("\n");
if(defait_amalgames){
BEGIN(state_defait_amalgames);
}
<state_num>[0-9]*,[0-9]* printf("%s", yytext);
[ \t]+ printf("\n");
\. printf("\n.");
\, printf("\n,");
' printf("'\n");
’ printf("'\n");
\n+ printf("\n");
du printf("de\nle");
des printf("de\nles");
au printf("à\nle");
aux printf("à\nles");
<state_defait_amalgames>{
" du " printf("\nde\nle\n");
" des " printf("\nde\nles\n");
" au " printf("\nà\nle\n");
" aux " printf("\nà\nles\n");
}
%%
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment