diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h
index ba4b6fb14d0fe34b564b908a148fd92c4593d554..c9134fa4883c3edf798648658c52eef3c7754582 100644
--- a/maca_common/include/mcd.h
+++ b/maca_common/include/mcd.h
@@ -118,6 +118,7 @@ typedef struct {
 
 mcd *mcd_build_conll07(void);
 mcd *mcd_build_ifpls(void);
+mcd *mcd_build_wplgf(void);
 mcd *mcd_read(char *mcd_filename, int verbose);
 void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose);
 
diff --git a/maca_common/include/word.h b/maca_common/include/word.h
index ba567bef034a027f458a821837d2744befec905d..2e5ae2c72df48d5f8c1772dbd26653a091d104ee 100644
--- a/maca_common/include/word.h
+++ b/maca_common/include/word.h
@@ -101,7 +101,7 @@ word *word_create_dummy(mcd *mcd_struct);
 word *word_copy(word *w);
 void word_free(word *w);
 void word_print2(FILE *f, word *w);
-void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels);
+void word_print(FILE *f, word *w);
 word *word_read(FILE *f, mcd *mcd_struct);
 
 
diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c
index 115e8ce3e71892540e71815a3a17530043946fb5..1bb939cf75a01482d61960b2635c354508df24d5 100644
--- a/maca_common/src/mcd.c
+++ b/maca_common/src/mcd.c
@@ -250,6 +250,51 @@ mcd *mcd_build_conll07(void)
 
-/* builds an mcd corresponding to the ifpls (index, form, pos, lemma, syntax) format */
+/* builds an mcd corresponding to the wplgf (form, pos, lemma, governor, label) format */
 
+mcd *mcd_build_wplgf(void)
+{
+  mcd *m = mcd_new(5);
+  int col;
+
+  col = 0;
+  m->wf[col]=MCD_WF_FORM;
+  m->wf_str[col]=strdup("FORM");
+  m->representation[col]= MCD_REPRESENTATION_VOCAB;
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_FORM] = col;
+
+  col = 1;
+  m->wf[col]=MCD_WF_POS;
+  m->wf_str[col]=strdup("POS");
+  m->representation[col]= MCD_REPRESENTATION_VOCAB;
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_POS] = col;
+
+  col = 2;
+  m->wf[col]=MCD_WF_LEMMA;
+  m->wf_str[col]=strdup("LEMMA");
+  m->representation[col]= MCD_REPRESENTATION_VOCAB;
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_LEMMA] = col;
+
+  col = 3;
+  m->wf[col]=MCD_WF_GOV;
+  m->wf_str[col]=strdup("GOV");
+  m->representation[col]= MCD_REPRESENTATION_INT;
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_GOV] = col;
+
+  col = 4;
+  m->wf[col]=MCD_WF_LABEL;
+  m->wf_str[col]=strdup("LABEL");
+  m->representation[col]= MCD_REPRESENTATION_VOCAB;
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_LABEL] = col;
+
+  return m;
+}
+
+/* builds an mcd corresponding to the ifpls (index, form, pos, lemma, syntax) format */
+
 mcd *mcd_build_ifpls(void)
 {
   mcd *m = mcd_new(6);
diff --git a/maca_common/src/sentence.c b/maca_common/src/sentence.c
index 431fa52c05d851fcc7f1aec30967aa5ec932d861..30a23b947c529a3c3455d36f81e016d6554f832f 100644
--- a/maca_common/src/sentence.c
+++ b/maca_common/src/sentence.c
@@ -37,7 +37,7 @@ void sentence_print(FILE *f, sentence *s, dico *dico_labels)
 
   for(i=1; i < s->length; i++){
     fprintf(f, "%d\t", i);
-    word_print(f, s->words[i], s->mcd_struct, dico_labels);
+    word_print(f, s->words[i]);
     fprintf(f, "\n");
   }
   fprintf(f, "\n");
diff --git a/maca_common/src/word.c b/maca_common/src/word.c
index b2c4aa67fc657209a925b23e2a6f67ff513cef0e..ee703d075017a98e4af07693deaa5227c9f1bfdc 100644
--- a/maca_common/src/word.c
+++ b/maca_common/src/word.c
@@ -120,7 +120,7 @@ void word_print2(FILE *f, word *w)
   printf("rel index = %d\n", word_get_relative_index(w));
 }
 
-void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels)
+void word_print(FILE *f, word *w)
 {
   if(w == NULL) return;
 
diff --git a/maca_lemmatizer/src/context.c b/maca_lemmatizer/src/context.c
index 81daf46f5a6215a04283ea641851d8d253282343..262a08cbc2233cd0f935b0648bf75d48f68a2470 100644
--- a/maca_lemmatizer/src/context.c
+++ b/maca_lemmatizer/src/context.c
@@ -141,7 +141,7 @@ context *context_read_options(int argc, char *argv[])
 
 
   if((ctx->mcd_filename == NULL) && ((ctx->form_column == -1) || (ctx->pos_column == -1)))
-    ctx->mcd_struct = mcd_build_conll07();
+    ctx->mcd_struct = mcd_build_wplgf();
 
   return ctx;
 }
diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt
index f0b58d06b9071b3dcbe9d2516056a8d9212c79c3..cf264aafcfea1f97ce6ddadb656b9d1218b9ce05 100644
--- a/maca_trans_parser/CMakeLists.txt
+++ b/maca_trans_parser/CMakeLists.txt
@@ -18,6 +18,7 @@ set(SOURCES src/context.c
   src/queue.c
   src/beam.c
   src/feat_types.c
+  src/word_buffer.c
 )
 
 #compiling library
@@ -70,6 +71,12 @@ target_link_libraries(maca_trans_parser_train transparse)
 target_link_libraries(maca_trans_parser_train maca_common)
 install (TARGETS maca_trans_parser_train DESTINATION bin)
 
+add_executable(test_word_buffer ./src/test_word_buffer.c)
+target_compile_options(test_word_buffer PRIVATE -Wall)
+target_link_libraries(test_word_buffer transparse)
+target_link_libraries(test_word_buffer maca_common)
+install (TARGETS test_word_buffer DESTINATION bin)
+
 #add_executable(test_w2v ./src/test_w2v.c)
 #target_link_libraries(test_w2v transparse)
 
diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c
index 54465328a86010387629033e527a7b003f219872..fe4f6205fa1ae91fc29347156aacb48e32d72663 100644
--- a/maca_trans_parser/src/context.c
+++ b/maca_trans_parser/src/context.c
@@ -265,7 +265,7 @@ context *context_read_options(int argc, char *argv[])
   if(ctx->conll)
     ctx->mcd_struct = mcd_build_conll07();
   else
-    ctx->mcd_struct = mcd_build_ifpls();
+    ctx->mcd_struct = mcd_build_wplgf();
 
   return ctx;
 }
diff --git a/maca_trans_parser/src/test_word_buffer.c b/maca_trans_parser/src/test_word_buffer.c
new file mode 100644
index 0000000000000000000000000000000000000000..0ab392fbf9ad78c47a2e0842384114371dd2ede7
--- /dev/null
+++ b/maca_trans_parser/src/test_word_buffer.c
@@ -0,0 +1,43 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include"word_buffer.h"
+#include"util.h"
+
+/* reads the mcf file argv[1], described by the mcd file argv[2], through a word buffer: */
+/* moves right until no word is left, then back left, printing the buffer at every step */
+
+int main(int argc, char *argv[])
+{
+  mcd *mcd_struct;
+  FILE *mcf;
+  word_buffer *wb;
+
+  /* argv[2] is read below, so two arguments are needed */
+  if(argc < 3){
+    fprintf(stderr, "usage %s mcf mcd\n", argv[0]);
+    exit(1);
+  }
+  mcd_struct = mcd_read(argv[2], 1);
+  mcf = myfopen(argv[1], "r");
+
+  wb = word_buffer_new(mcf, mcd_struct, 0);
+  word_buffer_print(stdout, wb);
+  printf("\n");
+  while(word_buffer_move_right(wb)){
+    word_buffer_print(stdout, wb);
+    printf("\n");
+  }
+
+  printf("=================== CHANGE DIRECTION =====================\n");
+
+  while(word_buffer_move_left(wb)){
+    word_buffer_print(stdout, wb);
+    printf("\n");
+  }
+
+  word_buffer_free(wb);
+  /* word_buffer_free leaves the input file open */
+  fclose(mcf);
+
+  return 0;
+}
diff --git a/maca_trans_parser/src/word_buffer.c b/maca_trans_parser/src/word_buffer.c
new file mode 100644
index 0000000000000000000000000000000000000000..fe35fc5d7179268c10439c71648a76da19fdbf89
--- /dev/null
+++ b/maca_trans_parser/src/word_buffer.c
@@ -0,0 +1,180 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include"word_buffer.h"
+#include"util.h"
+
+/* creates a word buffer fed by input_file, whose format is described by mcd_struct */
+/* up to lookahead + 1 words are read right away, the first of them being the current word */
+
+word_buffer *word_buffer_new(FILE *input_file, mcd *mcd_struct, int lookahead)
+{
+  int i;
+  word_buffer *wb = (word_buffer *)memalloc(sizeof(word_buffer));
+  wb->input_file = input_file;
+  wb->mcd_struct = mcd_struct;
+  wb->size = 10;
+  wb->nbelem = 0;
+  wb->array = (word **)memalloc(wb->size * sizeof(word *));
+  wb->current_index = 0;
+  wb->lookahead = lookahead;
+  for(i=0; i <= lookahead; i++)
+    word_buffer_read_next_word(wb);
+  return wb;
+}
+
+/* prints on f the words found at offsets -3 to 3 from the current word, one per line */
+/* offsets for which the buffer holds no word are skipped */
+
+void word_buffer_print(FILE *f, word_buffer *wb)
+{
+  int offset;
+  word *w;
+
+  for(offset = -3; offset <= 3; offset++){
+    w = word_buffer_get_word(wb, offset);
+    if(w){
+      /* %2d yields the labels [-3] ... [ 0] ... [ 3] */
+      fprintf(f, "[%2d] ", offset);
+      word_print(f, w);
+      fprintf(f, "\n");
+    }
+  }
+}
+
+/* frees the buffer and all the words it holds */
+/* input_file and mcd_struct are neither closed nor freed */
+
+void word_buffer_free(word_buffer *wb)
+{
+  int i;
+
+  if(wb == NULL) return;
+  /* the words go first: wb and wb->array must still be valid to reach them */
+  for(i=0; i < wb->nbelem; i++){
+    if(wb->array[i])
+      word_free(wb->array[i]);
+  }
+  free(wb->array);
+  free(wb);
+}
+
+/* appends w to the buffer, enlarging the array when it is nearly full */
+/* returns the index of w in the buffer, or -1 (w is not stored) if the array cannot be enlarged */
+
+int word_buffer_add(word_buffer *wb, word *w)
+{
+  if(wb->nbelem >= wb->size - 1){
+    int new_size = 2 * (wb->size + 1);
+    /* realloc result goes to a temporary so that the old array is not lost on failure */
+    word **new_array = (word **)realloc(wb->array, new_size * sizeof(word *));
+    if(new_array == NULL) return -1;
+    wb->array = new_array;
+    wb->size = new_size;
+  }
+  wb->array[wb->nbelem] = w;
+  wb->nbelem++;
+  return wb->nbelem - 1;
+}
+
+/* returns the word found offset positions away from the current word */
+/* (negative offsets are to its left), or NULL if the buffer holds no word there */
+
+word *word_buffer_get_word(word_buffer *wb, int offset)
+{
+  int index = wb->current_index + offset;
+
+  /* valid indices are 0 .. nbelem - 1 */
+  return ((index >= 0) && (index < wb->nbelem))? wb->array[index] : NULL;
+}
+
+/* same as word_buffer_get_word, kept because word_buffer.h declares this name */
+
+word *word_buffer_get_word_relative(word_buffer *wb, int dist)
+{
+  return word_buffer_get_word(wb, dist);
+}
+
+/* b0 is the current word, b1 b2 b3 the words to its right, bm1 bm2 bm3 the words to its left */
+/* each of them returns NULL when the buffer holds no word at that position */
+
+word *word_buffer_b0(word_buffer *wb)
+{
+  return word_buffer_get_word(wb, 0);
+}
+
+word *word_buffer_b1(word_buffer *wb)
+{
+  return word_buffer_get_word(wb, 1);
+}
+
+word *word_buffer_b2(word_buffer *wb)
+{
+  return word_buffer_get_word(wb, 2);
+}
+
+word *word_buffer_b3(word_buffer *wb)
+{
+  return word_buffer_get_word(wb, 3);
+}
+
+word *word_buffer_bm1(word_buffer *wb)
+{
+  return word_buffer_get_word(wb, -1);
+}
+
+word *word_buffer_bm2(word_buffer *wb)
+{
+  return word_buffer_get_word(wb, -2);
+}
+
+word *word_buffer_bm3(word_buffer *wb)
+{
+  return word_buffer_get_word(wb, -3);
+}
+
+/* reads the next word of input_file and appends it to the buffer */
+/* returns its index in the buffer, or -1 if no word could be read or stored */
+
+int word_buffer_read_next_word(word_buffer *wb)
+{
+  word *w = NULL;
+  int index;
+
+  w = word_read(wb->input_file, wb->mcd_struct);
+  if(w == NULL) return -1;
+
+  index = word_buffer_add(wb, w);
+  if(index == -1){
+    /* the buffer did not take the word, free it here */
+    word_free(w);
+    return -1;
+  }
+  word_set_relative_index(w, index);
+  return index;
+}
+
+/* makes the word to the right of the current word the new current word */
+/* a new word is read first when at most lookahead words follow the current one */
+/* returns 1 if the current word changed, 0 if there is no word to the right */
+
+int word_buffer_move_right(word_buffer *wb)
+{
+  if((wb->nbelem - 1 - wb->current_index) <= wb->lookahead)
+    word_buffer_read_next_word(wb);
+  /* >= and not == : in an empty buffer nbelem - 1 is -1 and the index must stay at 0 */
+  if(wb->current_index >= wb->nbelem - 1) return 0;
+  wb->current_index++;
+  return 1;
+}
+
+/* makes the word to the left of the current word the new current word */
+/* returns 1 if the current word changed, 0 if it already was the first word */
+
+int word_buffer_move_left(word_buffer *wb)
+{
+  if(wb->current_index > 0){
+    wb->current_index--;
+    return 1;
+  }
+  return 0;
+}
diff --git a/maca_trans_parser/src/word_buffer.h b/maca_trans_parser/src/word_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd41d1729e6d59abb91e5b8d92b1d024c80e4b53
--- /dev/null
+++ b/maca_trans_parser/src/word_buffer.h
@@ -0,0 +1,36 @@
+#ifndef __WORD_BUFFER__
+#define __WORD_BUFFER__
+
+#include<stdio.h>
+#include"word.h"
+#include"mcd.h"
+
+typedef struct {
+  int size;           /* size of the array used to store words */
+  int nbelem;         /* number of words in the buffer */
+  int lookahead;      /* number of words between the current word and the last word of the buffer */
+  int current_index;  /* position of the current word */
+  word **array;       /* array to store words */
+  FILE *input_file;   /* file to read the words from */
+  mcd *mcd_struct;    /* mcd describing the format of input_file */
+} word_buffer;
+
+
+word_buffer *word_buffer_new(FILE *input_file, mcd *mcd_struct, int lookahead);
+void word_buffer_free(word_buffer *wb);
+int word_buffer_add(word_buffer *wb, word *w);
+word *word_buffer_get_word(word_buffer *wb, int offset);
+word *word_buffer_get_word_relative(word_buffer *wb, int dist);
+word *word_buffer_b0(word_buffer *wb);
+word *word_buffer_b1(word_buffer *wb);
+word *word_buffer_b2(word_buffer *wb);
+word *word_buffer_b3(word_buffer *wb);
+word *word_buffer_bm1(word_buffer *wb);
+word *word_buffer_bm2(word_buffer *wb);
+word *word_buffer_bm3(word_buffer *wb);
+int word_buffer_read_next_word(word_buffer *wb);
+int word_buffer_move_right(word_buffer *wb);
+int word_buffer_move_left(word_buffer *wb);
+void word_buffer_print(FILE *f, word_buffer *wb);
+
+#endif