From 4fc0c025150028007ab5cd04e6a16721dd2bc6cb Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Wed, 5 Oct 2016 22:32:48 -0400 Subject: [PATCH] new structure called word buffer to store words --- maca_common/include/mcd.h | 1 + maca_common/include/word.h | 2 +- maca_common/src/mcd.c | 44 ++++++++ maca_common/src/sentence.c | 2 +- maca_common/src/word.c | 3 +- maca_lemmatizer/src/context.c | 3 +- maca_trans_parser/CMakeLists.txt | 7 ++ maca_trans_parser/src/context.c | 3 +- maca_trans_parser/src/test_word_buffer.c | 35 ++++++ maca_trans_parser/src/word_buffer.c | 134 +++++++++++++++++++++++ maca_trans_parser/src/word_buffer.h | 35 ++++++ 11 files changed, 264 insertions(+), 5 deletions(-) create mode 100644 maca_trans_parser/src/test_word_buffer.c create mode 100644 maca_trans_parser/src/word_buffer.c create mode 100644 maca_trans_parser/src/word_buffer.h diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index ba4b6fb..c9134fa 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -118,6 +118,7 @@ typedef struct { mcd *mcd_build_conll07(void); mcd *mcd_build_ifpls(void); +mcd *mcd_build_wplgf(void); mcd *mcd_read(char *mcd_filename, int verbose); void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose); diff --git a/maca_common/include/word.h b/maca_common/include/word.h index ba567be..2e5ae2c 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -101,7 +101,7 @@ word *word_create_dummy(mcd *mcd_struct); word *word_copy(word *w); void word_free(word *w); void word_print2(FILE *f, word *w); -void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels); +void word_print(FILE *f, word *w); word *word_read(FILE *f, mcd *mcd_struct); diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index 115e8ce..1bb939c 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -250,6 +250,50 @@ mcd *mcd_build_conll07(void) /* builds an mcd corresponding to the ifpls 
(index, form, pos, lemma, syntax) format */ +mcd *mcd_build_wplgf(void) /* builds a 5-column mcd: form, pos, lemma, gov, label */ +{ + mcd *m = mcd_new(5); + int col; + + col = 0; + m->wf[col]=MCD_WF_FORM; + m->wf_str[col]=strdup("FORM"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_FORM] = col; + + col = 1; + m->wf[col]=MCD_WF_POS; + m->wf_str[col]=strdup("POS"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_POS] = 1; + + col = 2; + m->wf[col]=MCD_WF_LEMMA; + m->wf_str[col]=strdup("LEMMA"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_LEMMA] = 2; + + col = 3; + m->wf[col]=MCD_WF_GOV; + m->wf_str[col]=strdup("GOV"); + m->representation[col]= MCD_REPRESENTATION_INT; /* GOV is the only column represented as an int; the others use a vocabulary */ + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_GOV] = 3; + + col = 4; + m->wf[col]=MCD_WF_LABEL; + m->wf_str[col]=strdup("LABEL"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_LABEL] = 4; + + return m; +} +/* builds an mcd corresponding to the ifpls (index, form, pos, lemma, syntax) format */ + mcd *mcd_build_ifpls(void) { mcd *m = mcd_new(6); diff --git a/maca_common/src/sentence.c b/maca_common/src/sentence.c index 431fa52..30a23b9 100644 --- a/maca_common/src/sentence.c +++ b/maca_common/src/sentence.c @@ -37,7 +37,7 @@ void sentence_print(FILE *f, sentence *s, dico *dico_labels) for(i=1; i < s->length; i++){ fprintf(f, "%d\t", i); - word_print(f, s->words[i], s->mcd_struct, dico_labels); + word_print(f, s->words[i]); fprintf(f, "\n"); } fprintf(f, "\n"); diff --git a/maca_common/src/word.c b/maca_common/src/word.c index b2c4aa6..ee703d0 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -30,6 +30,7 @@ word *word_read(FILE *f, mcd *mcd_struct) /* look for a valid word */ while(fgets(buffer, 10000, f)){ + /* printf("buffer = %s\n", buffer); */ /* ignore empty lines */ if((buffer[0] == '\n')) continue;
/* lines beginning with ## are comments */ @@ -120,7 +121,7 @@ void word_print2(FILE *f, word *w) printf("rel index = %d\n", word_get_relative_index(w)); } -void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels) +void word_print(FILE *f, word *w) { if(w == NULL) return; diff --git a/maca_lemmatizer/src/context.c b/maca_lemmatizer/src/context.c index 81daf46..262a08c 100644 --- a/maca_lemmatizer/src/context.c +++ b/maca_lemmatizer/src/context.c @@ -141,7 +141,8 @@ context *context_read_options(int argc, char *argv[]) if((ctx->mcd_filename == NULL) && ((ctx->form_column == -1) || (ctx->pos_column == -1))) - ctx->mcd_struct = mcd_build_conll07(); + /* ctx->mcd_struct = mcd_build_conll07(); */ + ctx->mcd_struct = mcd_build_wplgf(); return ctx; } diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index f0b58d0..cf264aa 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -18,6 +18,7 @@ set(SOURCES src/context.c src/queue.c src/beam.c src/feat_types.c + src/word_buffer.c ) #compiling library @@ -70,6 +71,12 @@ target_link_libraries(maca_trans_parser_train transparse) target_link_libraries(maca_trans_parser_train maca_common) install (TARGETS maca_trans_parser_train DESTINATION bin) +add_executable(test_word_buffer ./src/test_word_buffer.c) +target_compile_options(test_word_buffer PRIVATE -Wall) # PRIVATE: apply -Wall when compiling this target itself (INTERFACE only affects dependents) +target_link_libraries(test_word_buffer transparse) +target_link_libraries(test_word_buffer maca_common) +install (TARGETS test_word_buffer DESTINATION bin) + #add_executable(test_w2v ./src/test_w2v.c) #target_link_libraries(test_w2v transparse) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 5446532..fe4f620 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -265,7 +265,8 @@ context *context_read_options(int argc, char *argv[]) if(ctx->conll) ctx->mcd_struct = mcd_build_conll07(); else - ctx->mcd_struct = mcd_build_ifpls(); +
ctx->mcd_struct = mcd_build_wplgf(); + /* ctx->mcd_struct = mcd_build_ifpls(); */ return ctx; } diff --git a/maca_trans_parser/src/test_word_buffer.c b/maca_trans_parser/src/test_word_buffer.c new file mode 100644 index 0000000..0ab392f --- /dev/null +++ b/maca_trans_parser/src/test_word_buffer.c @@ -0,0 +1,35 @@ +#include<stdio.h> +#include<stdlib.h> +#include"word_buffer.h" +#include"util.h" + +int main(int argc, char *argv[]) +{ + mcd *mcd_struct; + FILE *mcf; + word_buffer *wb; + if(argc < 3){ /* argv[1] (mcf) and argv[2] (mcd) are both read below */ + fprintf(stderr, "usage %s mcf mcd\n", argv[0]); + exit(1); + } + mcd_struct = mcd_read(argv[2], 1); + mcf = myfopen(argv[1], "r"); + + wb = word_buffer_new(mcf, mcd_struct, 0); + word_buffer_print(stdout, wb); + printf("\n"); + while(word_buffer_move_right(wb)){ /* walk forward until the buffer reports no further word */ + word_buffer_print(stdout, wb); + printf("\n"); + } + + printf("=================== CHANGE DIRECTION =====================\n"); + + while(word_buffer_move_left(wb)){ /* walk back to index 0 */ + word_buffer_print(stdout, wb); + printf("\n"); + } + + word_buffer_free(wb); + fclose(mcf); /* word_buffer_free does not close the input file */ +} diff --git a/maca_trans_parser/src/word_buffer.c b/maca_trans_parser/src/word_buffer.c new file mode 100644 index 0000000..fe35fc5 --- /dev/null +++ b/maca_trans_parser/src/word_buffer.c @@ -0,0 +1,134 @@ +#include<stdio.h> +#include<stdlib.h> +#include"word_buffer.h" +#include"util.h" +word_buffer *word_buffer_new(FILE *input_file, mcd *mcd_struct, int lookahead) +{ + int i; + word_buffer *wb = (word_buffer *)memalloc(sizeof(word_buffer)); + wb->input_file = input_file; + wb->mcd_struct = mcd_struct; + wb->size = 10; + wb->nbelem = 0; + wb->array = (word **)memalloc(wb->size * sizeof(word *)); + wb->current_index = 0; + wb->lookahead = lookahead; + for(i=0; i <= lookahead; i++) /* attempt to preload lookahead + 1 words */ + word_buffer_read_next_word(wb); + return wb; +} + +void word_buffer_print(FILE *f, word_buffer *wb) +{ + word *w; + w = word_buffer_bm3(wb); + if(w){ fprintf(f, "[-3] "); word_print(f, w); fprintf(f, "\n");} + w = word_buffer_bm2(wb); + if(w){ fprintf(f, "[-2] "); word_print(f, w); fprintf(f, "\n");} + w =
word_buffer_bm1(wb); + if(w){ fprintf(f, "[-1] "); word_print(f, w); fprintf(f, "\n");} + w = word_buffer_b0(wb); + if(w){ fprintf(f, "[ 0] "); word_print(f, w); fprintf(f, "\n");} + w = word_buffer_b1(wb); + if(w){ fprintf(f, "[ 1] "); word_print(f, w); fprintf(f, "\n");} + w = word_buffer_b2(wb); + if(w){ fprintf(f, "[ 2] "); word_print(f, w); fprintf(f, "\n");} + w = word_buffer_b3(wb); + if(w){ fprintf(f, "[ 3] "); word_print(f, w); fprintf(f, "\n");} +} + +void word_buffer_free(word_buffer *wb) +{ + int i; + for(i=0; i < wb->nbelem; i++){ /* release the stored words first, while wb and wb->array are still valid */ + if(wb->array[i]) + word_free(wb->array[i]); + } + free(wb->array); + free(wb); +} + +int word_buffer_add(word_buffer *wb, word *w) +{ + if(wb->nbelem == wb->size -1){ + wb->size = 2 * (wb->size + 1); + wb->array = (word **)realloc(wb->array, wb->size * sizeof(word *)); /* NOTE(review): realloc result is not checked */ + } + wb->array[wb->nbelem] = w; + wb->nbelem++; + return wb->nbelem - 1; /* index at which w was stored */ +} + +word *word_buffer_get_word(word_buffer *wb, int offset) +{ + return ((wb->current_index + offset >=0) && (wb->current_index + offset < wb->nbelem))? wb->array[wb->current_index + offset] : NULL; /* valid slots are 0 .. nbelem - 1 */ +} + +word *word_buffer_b0(word_buffer *wb) +{ + return(wb->current_index >= wb->nbelem)? NULL : wb->array[wb->current_index]; +} + +word *word_buffer_b1(word_buffer *wb) +{ + return(wb->current_index + 1 >= wb->nbelem)? NULL : wb->array[wb->current_index + 1]; +} + +word *word_buffer_b2(word_buffer *wb) +{ + return(wb->current_index + 2 >= wb->nbelem)? NULL : wb->array[wb->current_index + 2]; +} + +word *word_buffer_b3(word_buffer *wb) +{ + return(wb->current_index + 3 >= wb->nbelem)? NULL : wb->array[wb->current_index + 3]; +} + +word *word_buffer_bm1(word_buffer *wb) +{ + return(wb->current_index - 1 < 0)? NULL : wb->array[wb->current_index - 1]; +} + +word *word_buffer_bm2(word_buffer *wb) +{ + return(wb->current_index - 2 < 0)? NULL : wb->array[wb->current_index - 2]; +} + +word *word_buffer_bm3(word_buffer *wb) +{ + return(wb->current_index - 3 < 0)?
NULL : wb->array[wb->current_index - 3]; +} + +int word_buffer_read_next_word(word_buffer *wb) +{ + word *w = NULL; + int index; + + w = word_read(wb->input_file, wb->mcd_struct); + if(w == NULL) return -1; /* nothing more could be read */ + + + index = word_buffer_add(wb, w); + word_set_relative_index(w, index); + return index; +} + +int word_buffer_move_right(word_buffer *wb) +{ + if((wb->nbelem - 1 - wb->current_index) <= wb->lookahead) + word_buffer_read_next_word(wb); + if(wb->current_index >= wb->nbelem - 1) return 0; /* >= also stops on an empty buffer, where nbelem - 1 is -1 */ + wb->current_index++; + return 1; +} + +int word_buffer_move_left(word_buffer *wb) +{ + if(wb->current_index > 0){ + wb->current_index--; + return 1; + } + return 0; +} + + diff --git a/maca_trans_parser/src/word_buffer.h b/maca_trans_parser/src/word_buffer.h new file mode 100644 index 0000000..cd41d17 --- /dev/null +++ b/maca_trans_parser/src/word_buffer.h @@ -0,0 +1,35 @@ +#ifndef __WORD_BUFFER__ +#define __WORD_BUFFER__ + +#include<stdio.h> +#include"word.h" +#include"mcd.h" + +typedef struct { + int size; /* size of the array used to store words */ + int nbelem; /* number of words in the buffer */ + int lookahead; /* number of words between the current word and the last word of the buffer */ + int current_index; /* position of the current word */ + word **array; /* array to store words */ + FILE *input_file; /* file to read the words from */ + mcd *mcd_struct; /* mcd describing the format of input_file */ +} word_buffer; + + +word_buffer *word_buffer_new(FILE *input_file, mcd *mcd_struct, int lookahead); +void word_buffer_free(word_buffer *wb); +int word_buffer_add(word_buffer *wb, word *w); +word *word_buffer_get_word(word_buffer *wb, int offset); /* name and parameter match the definition in word_buffer.c */ +word *word_buffer_b0(word_buffer *wb); +word *word_buffer_b1(word_buffer *wb); +word *word_buffer_b2(word_buffer *wb); +word *word_buffer_b3(word_buffer *wb); +word *word_buffer_bm1(word_buffer *wb); +word *word_buffer_bm2(word_buffer *wb); +word *word_buffer_bm3(word_buffer *wb); +int
word_buffer_read_next_word(word_buffer *wb); +int word_buffer_move_right(word_buffer *wb); +int word_buffer_move_left(word_buffer *wb); +void word_buffer_print(FILE *f, word_buffer *wb); + +#endif -- GitLab