From 4fc0c025150028007ab5cd04e6a16721dd2bc6cb Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Wed, 5 Oct 2016 22:32:48 -0400
Subject: [PATCH] new structure called word buffer to store words

---
 maca_common/include/mcd.h                |   1 +
 maca_common/include/word.h               |   2 +-
 maca_common/src/mcd.c                    |  44 ++++++++
 maca_common/src/sentence.c               |   2 +-
 maca_common/src/word.c                   |   3 +-
 maca_lemmatizer/src/context.c            |   3 +-
 maca_trans_parser/CMakeLists.txt         |   7 ++
 maca_trans_parser/src/context.c          |   3 +-
 maca_trans_parser/src/test_word_buffer.c |  35 ++++++
 maca_trans_parser/src/word_buffer.c      | 134 +++++++++++++++++++++++
 maca_trans_parser/src/word_buffer.h      |  35 ++++++
 11 files changed, 264 insertions(+), 5 deletions(-)
 create mode 100644 maca_trans_parser/src/test_word_buffer.c
 create mode 100644 maca_trans_parser/src/word_buffer.c
 create mode 100644 maca_trans_parser/src/word_buffer.h

diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h
index ba4b6fb..c9134fa 100644
--- a/maca_common/include/mcd.h
+++ b/maca_common/include/mcd.h
@@ -118,6 +118,7 @@ typedef struct {
 
 mcd *mcd_build_conll07(void);
 mcd *mcd_build_ifpls(void);
+mcd *mcd_build_wplgf(void);
 
 mcd *mcd_read(char *mcd_filename, int verbose);
 void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose);
diff --git a/maca_common/include/word.h b/maca_common/include/word.h
index ba567be..2e5ae2c 100644
--- a/maca_common/include/word.h
+++ b/maca_common/include/word.h
@@ -101,7 +101,7 @@ word *word_create_dummy(mcd *mcd_struct);
 word *word_copy(word *w);
 void word_free(word *w);
 void word_print2(FILE *f, word *w);
-void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels);
+void word_print(FILE *f, word *w);
 
 word *word_read(FILE *f, mcd *mcd_struct);
 
diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c
index 115e8ce..1bb939c 100644
--- a/maca_common/src/mcd.c
+++ b/maca_common/src/mcd.c
@@ -250,6 +250,50 @@ mcd *mcd_build_conll07(void)
 
 /* builds an mcd corresponding to the ifpls (index, form, pos, lemma, syntax) format */
 
+/* builds an mcd with five columns, in this order: FORM, POS, LEMMA, GOV, LABEL */
+mcd *mcd_build_wplgf(void)
+{
+  mcd *m = mcd_new(5);
+  int col = 0;
+
+  m->wf[col] = MCD_WF_FORM;
+  m->wf_str[col] = strdup("FORM");
+  m->representation[col] = MCD_REPRESENTATION_VOCAB;
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_FORM] = col;
+
+  col = 1;
+  m->wf[col] = MCD_WF_POS;
+  m->wf_str[col] = strdup("POS");
+  m->representation[col] = MCD_REPRESENTATION_VOCAB;
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_POS] = col;
+
+  col = 2;
+  m->wf[col] = MCD_WF_LEMMA;
+  m->wf_str[col] = strdup("LEMMA");
+  m->representation[col] = MCD_REPRESENTATION_VOCAB;
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_LEMMA] = col;
+
+  col = 3;
+  m->wf[col] = MCD_WF_GOV;
+  m->wf_str[col] = strdup("GOV");
+  m->representation[col] = MCD_REPRESENTATION_INT;   /* GOV is the only column stored as an int */
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_GOV] = col;
+
+  col = 4;
+  m->wf[col] = MCD_WF_LABEL;
+  m->wf_str[col] = strdup("LABEL");
+  m->representation[col] = MCD_REPRESENTATION_VOCAB;
+  m->filename[col] = strdup("_");
+  m->wf2col[MCD_WF_LABEL] = col;
+
+  return m;
+}
+/* builds an mcd corresponding to the ifpls (index, form, pos, lemma, syntax) format */
+
 mcd *mcd_build_ifpls(void)
 {
   mcd *m = mcd_new(6);
diff --git a/maca_common/src/sentence.c b/maca_common/src/sentence.c
index 431fa52..30a23b9 100644
--- a/maca_common/src/sentence.c
+++ b/maca_common/src/sentence.c
@@ -37,7 +37,7 @@ void sentence_print(FILE *f, sentence *s, dico *dico_labels)
 
   for(i=1; i < s->length; i++){
     fprintf(f, "%d\t", i);
-    word_print(f, s->words[i], s->mcd_struct, dico_labels);
+    word_print(f, s->words[i]);
     fprintf(f, "\n");
   }    
   fprintf(f, "\n");
diff --git a/maca_common/src/word.c b/maca_common/src/word.c
index b2c4aa6..ee703d0 100644
--- a/maca_common/src/word.c
+++ b/maca_common/src/word.c
@@ -30,6 +30,7 @@ word *word_read(FILE *f, mcd *mcd_struct)
 
   /* look for a valid word */
   while(fgets(buffer, 10000, f)){
+    /* printf("buffer = %s\n", buffer); */
     /* ignore empty lines */
     if((buffer[0] == '\n')) continue;
     /* lines beginning with ## are comments */ 
@@ -120,7 +121,7 @@ void word_print2(FILE *f, word *w)
   printf("rel index = %d\n", word_get_relative_index(w));
 }
 
-void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels)
+void word_print(FILE *f, word *w)
 {
   if(w == NULL) return;
   
diff --git a/maca_lemmatizer/src/context.c b/maca_lemmatizer/src/context.c
index 81daf46..262a08c 100644
--- a/maca_lemmatizer/src/context.c
+++ b/maca_lemmatizer/src/context.c
@@ -141,7 +141,8 @@ context *context_read_options(int argc, char *argv[])
 
 
   if((ctx->mcd_filename == NULL) && ((ctx->form_column == -1) || (ctx->pos_column == -1)))
-    ctx->mcd_struct = mcd_build_conll07();
+    /* ctx->mcd_struct = mcd_build_conll07(); */
+    ctx->mcd_struct = mcd_build_wplgf();
 
   return ctx;
 }
diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt
index f0b58d0..cf264aa 100644
--- a/maca_trans_parser/CMakeLists.txt
+++ b/maca_trans_parser/CMakeLists.txt
@@ -18,6 +18,7 @@ set(SOURCES src/context.c
  src/queue.c
  src/beam.c
  src/feat_types.c
+ src/word_buffer.c
 )
 
 #compiling library
@@ -70,6 +71,12 @@ target_link_libraries(maca_trans_parser_train transparse)
 target_link_libraries(maca_trans_parser_train maca_common)
 install (TARGETS maca_trans_parser_train DESTINATION bin)
 
+add_executable(test_word_buffer ./src/test_word_buffer.c)
+target_compile_options(test_word_buffer PRIVATE -Wall) # PRIVATE so -Wall is used when compiling this target's own sources
+target_link_libraries(test_word_buffer transparse)
+target_link_libraries(test_word_buffer maca_common)
+install (TARGETS test_word_buffer DESTINATION bin)
+
 
 #add_executable(test_w2v ./src/test_w2v.c)
 #target_link_libraries(test_w2v transparse)
diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c
index 5446532..fe4f620 100644
--- a/maca_trans_parser/src/context.c
+++ b/maca_trans_parser/src/context.c
@@ -265,7 +265,8 @@ context *context_read_options(int argc, char *argv[])
     if(ctx->conll)
      ctx->mcd_struct = mcd_build_conll07(); 
     else
-      ctx->mcd_struct = mcd_build_ifpls();
+      ctx->mcd_struct = mcd_build_wplgf();
+      /* ctx->mcd_struct = mcd_build_ifpls(); */
   
   return ctx;
 }
diff --git a/maca_trans_parser/src/test_word_buffer.c b/maca_trans_parser/src/test_word_buffer.c
new file mode 100644
index 0000000..0ab392f
--- /dev/null
+++ b/maca_trans_parser/src/test_word_buffer.c
@@ -0,0 +1,35 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include"word_buffer.h"
+#include"util.h"
+
+/* Test driver: walks a word buffer rightwards until no further word is available, then back leftwards, printing each state. */
+int main(int argc, char *argv[])
+{
+  mcd *mcd_struct;
+  FILE *mcf;
+  word_buffer *wb;
+  if(argc < 3){ /* argv[1] (mcf) and argv[2] (mcd) are both read below */
+    fprintf(stderr, "usage %s mcf mcd\n", argv[0]);
+    exit(1);
+  }
+  mcd_struct = mcd_read(argv[2], 1);
+  mcf = myfopen(argv[1], "r");
+
+  wb = word_buffer_new(mcf, mcd_struct, 0);
+  word_buffer_print(stdout, wb);
+  printf("\n");
+  while(word_buffer_move_right(wb)){
+    word_buffer_print(stdout, wb);
+    printf("\n");
+  }
+
+  printf("=================== CHANGE DIRECTION =====================\n");
+  while(word_buffer_move_left(wb)){
+    word_buffer_print(stdout, wb);
+    printf("\n");
+  }
+
+  word_buffer_free(wb);
+  return 0;
+}
diff --git a/maca_trans_parser/src/word_buffer.c b/maca_trans_parser/src/word_buffer.c
new file mode 100644
index 0000000..fe35fc5
--- /dev/null
+++ b/maca_trans_parser/src/word_buffer.c
@@ -0,0 +1,134 @@
+#include<stdio.h>
+#include"word_buffer.h"
+#include"util.h"
+/* Allocates a word buffer that reads from input_file (format given by mcd_struct) and preloads up to lookahead + 1 words. */
+word_buffer *word_buffer_new(FILE *input_file, mcd *mcd_struct, int lookahead)
+{
+  int n;
+  word_buffer *wb = (word_buffer *)memalloc(sizeof(word_buffer));
+  wb->size = 10;
+  wb->array = (word **)memalloc(wb->size * sizeof(word *));
+  wb->nbelem = 0;
+  wb->current_index = 0;
+  wb->lookahead = lookahead;
+  wb->input_file = input_file;
+  wb->mcd_struct = mcd_struct;
+  for(n = lookahead; n >= 0; n--)
+    word_buffer_read_next_word(wb);
+  return wb;
+}
+/* Prints the words at offsets -3..3 around the current position, skipping offsets
+   for which the accessor returns NULL. Each printed word sits on its own line,
+   prefixed with its offset in brackets. */
+void word_buffer_print(FILE *f, word_buffer *wb)
+{
+  static const char *const prefix[7] = {"[-3] ", "[-2] ", "[-1] ", "[ 0] ", "[ 1] ", "[ 2] ", "[ 3] "};
+  word *(*const accessor[7])(word_buffer *) = {
+    word_buffer_bm3, word_buffer_bm2, word_buffer_bm1, word_buffer_b0,
+    word_buffer_b1, word_buffer_b2, word_buffer_b3
+  };
+  int k;
+  for(k = 0; k < 7; k++){
+    word *w = accessor[k](wb);
+    if(w == NULL) continue;
+    fprintf(f, "%s", prefix[k]);
+    word_print(f, w);
+    fprintf(f, "\n");
+  }
+}
+/* Frees every word stored in the buffer, then the array and the buffer itself; the input file and the mcd are not touched. */
+void word_buffer_free(word_buffer *wb)
+{
+  int i;
+  if(wb == NULL) return;
+  for(i=0; i < wb->nbelem; i++){ /* words must be released before the array that holds them */
+    if(wb->array[i]) word_free(wb->array[i]);
+  }
+  free(wb->array);
+  free(wb);
+}
+/* Stores w in the first free slot and returns that slot's index; the array is regrown to 2 * (size + 1) slots once a single slot is left. */
+int word_buffer_add(word_buffer *wb, word *w)
+{
+  int slot = wb->nbelem++;
+  if(slot == wb->size - 1){
+    wb->size = 2 * (wb->size + 1);
+    wb->array = (word **)realloc(wb->array, wb->size * sizeof(word *)); /* NOTE(review): realloc result is not checked */
+  }
+  wb->array[slot] = w;
+  return slot;
+}
+/* Returns the word offset positions away from the current one (negative = to the left), or NULL when that index is outside [0, nbelem). */
+word *word_buffer_get_word(word_buffer *wb, int offset)
+{
+  return ((wb->current_index + offset >= 0) && (wb->current_index + offset < wb->nbelem))? wb->array[wb->current_index + offset] : NULL;
+}
+/* Returns the current word, or NULL while the buffer holds no word. */
+word *word_buffer_b0(word_buffer *wb)
+{
+  return (wb->nbelem != 0) ? wb->array[wb->current_index] : NULL;
+}
+/* Returns the word one position after the current one, or NULL if no word is stored there. */
+word *word_buffer_b1(word_buffer *wb)
+{
+  return (wb->current_index + 1 < wb->nbelem) ? wb->array[wb->current_index + 1] : NULL;
+}
+/* Returns the word two positions after the current one, or NULL if no word is stored there. */
+word *word_buffer_b2(word_buffer *wb)
+{
+  return (wb->current_index + 2 < wb->nbelem) ? wb->array[wb->current_index + 2] : NULL;
+}
+/* Returns the word three positions after the current one, or NULL if no word is stored there. */
+word *word_buffer_b3(word_buffer *wb)
+{
+  return (wb->current_index + 3 < wb->nbelem) ? wb->array[wb->current_index + 3] : NULL;
+}
+/* Returns the word one position before the current one, or NULL when the current index is 0. */
+word *word_buffer_bm1(word_buffer *wb)
+{
+  return (wb->current_index >= 1) ? wb->array[wb->current_index - 1] : NULL;
+}
+/* Returns the word two positions before the current one, or NULL when the current index is below 2. */
+word *word_buffer_bm2(word_buffer *wb)
+{
+  return (wb->current_index >= 2) ? wb->array[wb->current_index - 2] : NULL;
+}
+/* Returns the word three positions before the current one, or NULL when the current index is below 3. */
+word *word_buffer_bm3(word_buffer *wb)
+{
+  return (wb->current_index >= 3) ? wb->array[wb->current_index - 3] : NULL;
+}
+/* Reads one word from the input file, appends it to the buffer and records its buffer
+   position as the word's relative index. Returns that position, or -1 if word_read gave no word. */
+int word_buffer_read_next_word(word_buffer *wb)
+{
+  int pos;
+  word *next = word_read(wb->input_file, wb->mcd_struct);
+
+  if(next == NULL)
+    return -1;
+
+  pos = word_buffer_add(wb, next);
+  word_set_relative_index(next, pos);
+  return pos;
+}
+/* Moves the current position one word to the right, first reading a new word when at most lookahead words lie ahead; returns 1 after moving, 0 when there is no word to move to. */
+int word_buffer_move_right(word_buffer *wb)
+{
+  if((wb->nbelem - 1 - wb->current_index) <= wb->lookahead)
+    word_buffer_read_next_word(wb);
+  if(wb->current_index >= wb->nbelem - 1) return 0; /* >= also stops on an empty buffer, where nbelem - 1 is -1 */
+  wb->current_index++;
+  return 1;
+}
+/* Moves the current position one word to the left; returns 1 after moving, 0 when the current index is already 0. */
+int word_buffer_move_left(word_buffer *wb)
+{
+  if(wb->current_index <= 0)
+    return 0;
+
+  wb->current_index--;
+  return 1;
+}
+
+
diff --git a/maca_trans_parser/src/word_buffer.h b/maca_trans_parser/src/word_buffer.h
new file mode 100644
index 0000000..cd41d17
--- /dev/null
+++ b/maca_trans_parser/src/word_buffer.h
@@ -0,0 +1,35 @@
+#ifndef __WORD_BUFFER__
+#define __WORD_BUFFER__
+
+#include<stdio.h>
+#include"word.h"
+#include"mcd.h"
+
+typedef struct {
+  int size;           /* number of slots allocated in array */
+  int nbelem;         /* number of words currently stored in array */
+  int lookahead;      /* a new word is read when at most this many words lie to the right of the current one */
+  int current_index;  /* index in array of the current word */
+  word **array;       /* stored words, in the order they were added */
+  FILE *input_file;   /* file to read the words from */
+  mcd *mcd_struct;    /* mcd describing the format of input_file */
+} word_buffer;
+
+
+word_buffer *word_buffer_new(FILE *input_file, mcd *mcd_struct, int lookahead); /* allocate a buffer and preload up to lookahead + 1 words */
+void word_buffer_free(word_buffer *wb);                   /* release the buffer together with the words it stores */
+int word_buffer_add(word_buffer *wb, word *w);            /* append w, return the index it was stored at */
+word *word_buffer_get_word(word_buffer *wb, int offset);  /* word at current_index + offset (range-checked, NULL on failure) */
+word *word_buffer_b0(word_buffer *wb);                    /* current word, NULL if the buffer is empty */
+word *word_buffer_b1(word_buffer *wb);                    /* word 1 position after the current one, or NULL */
+word *word_buffer_b2(word_buffer *wb);                    /* word 2 positions after the current one, or NULL */
+word *word_buffer_b3(word_buffer *wb);                    /* word 3 positions after the current one, or NULL */
+word *word_buffer_bm1(word_buffer *wb);                   /* word 1 position before the current one, or NULL */
+word *word_buffer_bm2(word_buffer *wb);                   /* word 2 positions before the current one, or NULL */
+word *word_buffer_bm3(word_buffer *wb);                   /* word 3 positions before the current one, or NULL */
+int word_buffer_read_next_word(word_buffer *wb);          /* read one word from input_file; its index, or -1 if none was read */
+int word_buffer_move_right(word_buffer *wb);              /* advance the current position; 1 if it moved, 0 otherwise */
+int word_buffer_move_left(word_buffer *wb);               /* step the current position back; 1 if it moved, 0 otherwise */
+void word_buffer_print(FILE *f, word_buffer *wb);         /* print the words at offsets -3..3 around the current one */
+
+#endif
-- 
GitLab