From c1b334fd249325f49c31ed5a8db6bb2cd00df2f1 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Tue, 11 Apr 2017 21:07:16 +0200
Subject: [PATCH] implemented a simple morphological analyzer

---
 CMakeLists.txt                                |   2 +-
 maca_common/include/mcd.h                     |   1 +
 maca_common/src/mcd.c                         |  57 ++++++
 maca_lemmatizer/src/maca_lemmatizer.c         |  14 +-
 maca_trans_parser/CMakeLists.txt              |  12 ++
 maca_trans_parser/src/context.c               |   3 +-
 maca_trans_parser/src/context.h               |   5 +
 maca_trans_parser/src/maca_trans_lemmatizer.c |   1 +
 maca_trans_parser/src/maca_trans_morpho.c     | 177 ++++++++++++++++++
 .../src/maca_trans_morpho_mcf2cff.c           | 129 +++++++++++++
 10 files changed, 398 insertions(+), 3 deletions(-)
 create mode 100644 maca_trans_parser/src/maca_trans_morpho.c
 create mode 100644 maca_trans_parser/src/maca_trans_morpho_mcf2cff.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e155629..7ac8c72 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ add_subdirectory(maca_common)
 add_subdirectory(maca_tools)
 add_subdirectory(perceptron)
 #add_subdirectory(maca_lemmatizer)
-add_subdirectory(maca_morpho)
+#add_subdirectory(maca_morpho)
 add_subdirectory(maca_tokenizer)
 add_subdirectory(maca_lexer)
 add_subdirectory(maca_trans_parser)
diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h
index 392faf9..0d7761d 100644
--- a/maca_common/include/mcd.h
+++ b/maca_common/include/mcd.h
@@ -199,6 +199,7 @@ mcd *mcd_build_conll07(void);
 mcd *mcd_build_ifpls(void);
 mcd *mcd_build_wplgf(void);
 mcd *mcd_build_wplgfs(void);
+mcd *mcd_build_wpmlgfs(void);
 
 mcd      *mcd_read(char *mcd_filename, int verbose);
 void      mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose);
diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c
index ca924f2..6911740 100644
--- a/maca_common/src/mcd.c
+++ b/maca_common/src/mcd.c
@@ -422,6 +422,63 @@ mcd *mcd_build_wplgfs(void)
   return m;
 }
 
+/*
+ * Builds an mcd describing seven columns, in this order:
+ *
+ *   col  word feature  representation
+ *   0    FORM          vocabulary
+ *   1    POS           vocabulary
+ *   2    FEATS         vocabulary
+ *   3    LEMMA         vocabulary
+ *   4    GOV           integer
+ *   5    LABEL         vocabulary
+ *   6    SENT_SEG      integer
+ *
+ * For each column the function fills wf, wf_str, representation and
+ * filename, and records the column number in wf2col so that the column
+ * can be found back from its word feature.  Every column receives the
+ * filename "_"; wf_str and filename entries are strdup'ed copies.
+ * The layout is kept in one table inside the function, so adding or
+ * reordering columns only means editing that table.
+ *
+ * Returns the structure obtained from mcd_new, sized to the table.
+ * NOTE(review): the results of mcd_new and strdup are not checked here.
+ */
+mcd *mcd_build_wpmlgfs(void)
+{
+  /* column layout: the position in the array is the column number */
+  static const struct {
+    int wf;                  /* MCD_WF_* identifier */
+    const char *wf_str;      /* name copied into wf_str[] */
+    int representation;      /* MCD_REPRESENTATION_* value */
+  } layout[] = {
+    {MCD_WF_FORM,     "FORM",     MCD_REPRESENTATION_VOCAB},
+    {MCD_WF_POS,      "POS",      MCD_REPRESENTATION_VOCAB},
+    {MCD_WF_FEATS,    "FEATS",    MCD_REPRESENTATION_VOCAB},
+    {MCD_WF_LEMMA,    "LEMMA",    MCD_REPRESENTATION_VOCAB},
+    {MCD_WF_GOV,      "GOV",      MCD_REPRESENTATION_INT},
+    {MCD_WF_LABEL,    "LABEL",    MCD_REPRESENTATION_VOCAB},
+    {MCD_WF_SENT_SEG, "SENT_SEG", MCD_REPRESENTATION_INT},
+  };
+
+  const int layout_size = (int)(sizeof layout / sizeof layout[0]);
+  mcd *m = mcd_new(layout_size);
+  int col;
+
+  for(col = 0; col < layout_size; col++){
+    /* forward description of the column */
+    m->wf[col] = layout[col].wf;
+    m->wf_str[col] = strdup(layout[col].wf_str);
+    m->representation[col] = layout[col].representation;
+    m->filename[col] = strdup("_");
+
+    /* reverse map: word feature -> column */
+    m->wf2col[layout[col].wf] = col;
+  }
+
+  return m;
+}
+
 
 
 /* returns a dico_vec containing the different dictionnaries found in an mcd structure */
diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c
index 5d9cacd..e8aeecd 100644
--- a/maca_lemmatizer/src/maca_lemmatizer.c
+++ b/maca_lemmatizer/src/maca_lemmatizer.c
@@ -125,6 +125,7 @@ int main(int argc, char *argv[])
   char *buffer_copy;
   char *form;
   char *pos;
+  char *feats;
 
   char *token;
   int column_nb;
@@ -136,11 +137,16 @@ int main(int argc, char *argv[])
   int form_column;
   int pos_column;
   int lemma_column;
+  int feats_column;
   FILE *f = NULL;
 
   ctx = context_read_options(argc, argv);
   maca_lemmatizer_check_options(ctx);
 
+
+  feats_column = ctx->mcd_struct->wf2col[MCD_WF_FEATS];
+
+  
   if(ctx->pos_column != -1)
     pos_column = ctx->pos_column;
   else
@@ -177,6 +183,7 @@ int main(int argc, char *argv[])
     form = NULL;
     pos = NULL;
     lemma = NULL;
+    feats = NULL;
     do{
       if(column_nb == lemma_column) /* lemma is present in the input file */
 	if(strcmp(token, "_")) /* and it is not an underscore */
@@ -188,6 +195,9 @@ int main(int argc, char *argv[])
       if(column_nb == pos_column){
 	pos = strdup(token);
       }
+      if(column_nb == feats_column){
+	feats = strdup(token);
+      }
       column_nb++;
     } while((token = strtok(NULL , "\t")));
     
@@ -215,11 +225,13 @@ int main(int argc, char *argv[])
     
     /* print_word(buffer, ctx->mcd_struct, lemma); */
 
-  /* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma);  */
+    printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma); 
+    printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma); 
     printf("\t%s\n", lemma);
     
     if(pos)free(pos);
     if(form)free(form);
+    if(feats)free(feats);
   }
   free(buffer_copy);
   free(lemma_array);
diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt
index 04cb203..91bb573 100644
--- a/maca_trans_parser/CMakeLists.txt
+++ b/maca_trans_parser/CMakeLists.txt
@@ -40,6 +40,12 @@ target_link_libraries(maca_trans_tagger_mcf2cff transparse)
 target_link_libraries(maca_trans_tagger_mcf2cff maca_common)
 install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin)
 
+add_executable(maca_trans_morpho_mcf2cff ./src/maca_trans_morpho_mcf2cff.c)
+target_link_libraries(maca_trans_morpho_mcf2cff perceptron)
+target_link_libraries(maca_trans_morpho_mcf2cff transparse)
+target_link_libraries(maca_trans_morpho_mcf2cff maca_common)
+install (TARGETS maca_trans_morpho_mcf2cff DESTINATION bin)
+
 #add_executable(maca_trans_tagger_mcf2cff_bt ./src/maca_trans_tagger_mcf2cff_bt.c)
 #target_link_libraries(maca_trans_tagger_mcf2cff_bt perceptron)
 #target_link_libraries(maca_trans_tagger_mcf2cff_bt transparse)
@@ -100,6 +106,12 @@ target_link_libraries(maca_trans_tagger transparse)
 target_link_libraries(maca_trans_tagger maca_common)
 install (TARGETS maca_trans_tagger DESTINATION bin)
 
+add_executable(maca_trans_morpho ./src/maca_trans_morpho.c)
+target_link_libraries(maca_trans_morpho perceptron)
+target_link_libraries(maca_trans_morpho transparse)
+target_link_libraries(maca_trans_morpho maca_common)
+install (TARGETS maca_trans_morpho DESTINATION bin)
+
 #add_executable(maca_trans_tagger_bt ./src/maca_trans_tagger_bt.c)
 #target_link_libraries(maca_trans_tagger_bt perceptron)
 #target_link_libraries(maca_trans_tagger_bt transparse)
diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c
index 72c2d61..bc75f11 100644
--- a/maca_trans_parser/src/context.c
+++ b/maca_trans_parser/src/context.c
@@ -283,7 +283,8 @@ context *context_read_options(int argc, char *argv[])
   if(ctx->mcd_filename)
     ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
   else
-    ctx->mcd_struct = mcd_build_wplgfs();
+    ctx->mcd_struct = mcd_build_wpmlgfs();
+    /* ctx->mcd_struct = mcd_build_wplgfs(); */
 
 
   /* initialize maca_data_path field */
diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h
index 932e671..82db602 100644
--- a/maca_trans_parser/src/context.h
+++ b/maca_trans_parser/src/context.h
@@ -14,6 +14,11 @@
 #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" 
 #define DEFAULT_MODEL_TAGGER_FILENAME  "maca_trans_tagger.model" 
 
+#define DEFAULT_MULTI_COL_DESC_MORPHO_FILENAME "maca_trans_morpho.mcd" 
+#define DEFAULT_FEATURES_MODEL_MORPHO_FILENAME "maca_trans_morpho.fm" 
+#define DEFAULT_VOCABS_MORPHO_FILENAME "maca_trans_morpho.vocab" 
+#define DEFAULT_MODEL_MORPHO_FILENAME  "maca_trans_morpho.model" 
+
 #define DEFAULT_MULTI_COL_DESC_TAGPARSER_FILENAME "maca_trans_tagparser.mcd" 
 #define DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.fm" 
 #define DEFAULT_VOCABS_TAGPARSER_FILENAME "maca_trans_tagparser.vocab" 
diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c
index 351305c..ee38e7a 100644
--- a/maca_trans_parser/src/maca_trans_lemmatizer.c
+++ b/maca_trans_parser/src/maca_trans_lemmatizer.c
@@ -46,6 +46,7 @@ char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, in
 {
   char form[1000];
   char pos[1000];
+
   char lemma[1000];  
   char morpho[1000];
   int num = 0;
diff --git a/maca_trans_parser/src/maca_trans_morpho.c b/maca_trans_parser/src/maca_trans_morpho.c
new file mode 100644
index 0000000..be9db3a
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_morpho.c
@@ -0,0 +1,177 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"context.h"
+#include"feat_fct.h"
+#include"feature_table.h"
+#include"dico.h"
+#include"config2feat_vec.h"
+
+void decode_morpho_help_message(context *ctx); /* prototype of the function defined just below */
+void decode_morpho_help_message(context *ctx) /* help message of maca_trans_morpho; called by decode_morpho_check_options */
+{
+  context_general_help_message(ctx);
+  context_beam_help_message(ctx);
+  context_conll_help_message(ctx);
+  fprintf(stderr, "INPUT\n"); /* section heading, written on stderr */
+  context_input_help_message(ctx);
+  context_mcd_help_message(ctx);
+  context_model_help_message(ctx);
+  context_vocabs_help_message(ctx);
+  context_features_model_help_message(ctx);
+  context_f2p_filename_help_message(ctx);
+}
+
+/*
+ * Option check for maca_trans_morpho: the only condition tested is the help
+ * flag.  When it is set, the help message is printed and the program
+ * exits with status 1; otherwise the function returns normally.
+ */
+void decode_morpho_check_options(context *ctx)
+{
+  if(!ctx->help)
+    return;
+  decode_morpho_help_message(ctx);
+  exit(1);
+}
+
+/* Returns a malloc'ed "<dir><name>" (NULL when the allocation fails); the buffer is sized from its inputs, so it cannot overflow. */
+static char *morpho_default_filename(const char *dir, const char *name)
+{
+  size_t size = strlen(dir) + strlen(name) + 1;
+  char *path = malloc(size);
+  if(path) snprintf(path, size, "%s%s", dir, name);
+  return path;
+}
+
+/* Sets each of the perceptron model, vocabulary and feature-model filenames
+ * that is still NULL to ctx->maca_data_path immediately followed by the
+ * matching DEFAULT_*_MORPHO_FILENAME (no separator is inserted).  In verbose
+ * mode the filenames are reported on stderr; a NULL mcd filename is shown
+ * as "(null)" instead of being handed to %s. */
+void decode_morpho_set_linguistic_resources_filenames(context *ctx)
+{
+  if(!ctx->perc_model_filename)
+    ctx->perc_model_filename = morpho_default_filename(ctx->maca_data_path, DEFAULT_MODEL_MORPHO_FILENAME);
+  if(!ctx->vocabs_filename)
+    ctx->vocabs_filename = morpho_default_filename(ctx->maca_data_path, DEFAULT_VOCABS_MORPHO_FILENAME);
+  if(!ctx->features_model_filename)
+    ctx->features_model_filename = morpho_default_filename(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_MORPHO_FILENAME);
+  if(ctx->verbose){
+    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
+    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
+    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
+    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
+  }
+}
+/*
+ * Prints word w on stdout with its FEATS column set to the string of code
+ * postag in dico_morph.  Without a POS column in the mcd, prints the raw
+ * input line followed by a tab and that string.
+ * NOTE(review): the guard tests the POS column although the FEATS column
+ * is the one rewritten -- confirm this is intended.
+ */
+void print_word(word *w, mcd *mcd_struct, dico *dico_morph, int postag)
+{
+  char *line_copy;
+  int n = 0;
+  if(mcd_get_pos_col(mcd_struct) == -1){
+    printf("%s\t%s\n", w->input, dico_int2string(dico_morph, postag));
+    return;
+  }
+  /* the copy is only tokenized to count the tab-separated fields of the input */
+  line_copy = strdup(w->input);
+  for(char *field = strtok(line_copy, "\t"); field; field = strtok(NULL, "\t"), n++){
+    if(n > 0) printf("\t");
+    if(n == mcd_get_feats_col(mcd_struct)) printf("%s", dico_int2string(dico_morph, postag));
+    else word_print_col_n(stdout, w, n);
+  }
+  /* fewer input fields than the FEATS column number: append the value */
+  if(n <= mcd_get_feats_col(mcd_struct)) printf("\t%s", dico_int2string(dico_morph, postag));
+  printf("\n");
+  free(line_copy);
+}
+
+/* Stores feats on b0, the current word of c's buffer, then calls word_buffer_move_right on that buffer; always returns 1. */
+int movement_morpho(config *c, int feats){
+  word *b0 = word_buffer_b0(c->bf);
+  word_set_feats(b0, feats);
+  word_buffer_move_right(c->bf);
+  return 1;
+}
+
+/*
+ * Greedy decoder of maca_trans_morpho.  Words are read from
+ * ctx->input_filename, or from stdin when it is NULL.  For the current
+ * buffer word b0, a FEATS code present in the input is kept; when it is
+ * -1, the code is the argmax of the feature table on the feature vector
+ * built from the configuration.  The word is printed with print_word and
+ * the buffer advanced with movement_morpho.  In debug mode the
+ * configuration and the first three vcode entries go to stderr.
+ */
+void simple_decoder_morpho(context *ctx)
+{
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *input = stdin;
+  feature_table *model;
+  dico *dico_feats;
+  config *c;
+  float best_score;
+
+  if(ctx->input_filename)
+    input = myfopen(ctx->input_filename, "r");
+  model = feature_table_load(ctx->perc_model_filename, ctx->verbose);
+  dico_feats = dico_vec_get_dico(ctx->vocabs, (char *)"FEATS");
+  c = config_new(input, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(c)){
+    word *b0 = word_buffer_b0(c->bf);
+    int feats = word_get_feats(b0);
+
+    if(ctx->debug_mode){
+      fprintf(stderr, "***********************************\n");
+      config_print(stderr, c);
+    }
+    if(feats == -1){ /* not given in the input: predict it */
+      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+      feats = feature_table_argmax(fv, model, &best_score);
+      if(ctx->debug_mode){
+        vcode *scores = feature_table_get_vcode_array(fv, model);
+        for(int rank = 0; rank < 3; rank++){
+          fprintf(stderr, "%d\t", rank);
+          fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_feats, scores[rank].class_code), scores[rank].score);
+        }
+        free(scores);
+      }
+    }
+    print_word(b0, ctx->mcd_struct, dico_feats, feats);
+    movement_morpho(c, feats);
+  }
+  feat_vec_free(fv);
+  feature_table_free(model);
+  config_free(c);
+  if(ctx->input_filename) fclose(input);
+}
+
+
+/* Entry point of maca_trans_morpho: reads the options, loads the feature model and the
+ * vocabularies, then decodes.  Only beam width 1 is implemented; any other width is reported. */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  decode_morpho_check_options(ctx);
+  decode_morpho_set_linguistic_resources_filenames(ctx);
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+  if(ctx->beam_width == 1)
+    simple_decoder_morpho(ctx);
+  else /* previously a silent no-op that still exited with status 0 */
+    fprintf(stderr, "maca_trans_morpho: only beam width 1 is supported, nothing was decoded\n");
+  context_free(ctx);
+  return 0;
+}
+
diff --git a/maca_trans_parser/src/maca_trans_morpho_mcf2cff.c b/maca_trans_parser/src/maca_trans_morpho_mcf2cff.c
new file mode 100644
index 0000000..a821863
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_morpho_mcf2cff.c
@@ -0,0 +1,129 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"feat_fct.h"
+#include"context.h"
+#include"feat_vec.h"
+#include"dico_vec.h"
+#include"config2feat_vec.h"
+
+
+/* Oracle: returns the FEATS code read with word_get_feats from b0, the current word of c's buffer. */
+int oracle_morpho(config *c){
+  word *b0 = word_buffer_b0(config_get_buffer(c));
+  return word_get_feats(b0);
+}
+
+/* Stores feats on b0, the current word of c's buffer, then calls word_buffer_move_right on that buffer; always returns 1. */
+int movement_morpho(config *c, int feats){
+  word *b0 = word_buffer_b0(c->bf);
+  word_set_feats(b0, feats);
+  word_buffer_move_right(c->bf);
+  return 1;
+}
+
+/* Help message of maca_trans_morpho_mcf2cff: general options first, then an INPUT and an
+ * OUTPUT section whose headings are written on stderr; the vocabs help appears in both. */
+void maca_trans_morpho_mcf2cff_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_mode_help_message(ctx);
+  context_sent_nb_help_message(ctx);
+  context_mcd_help_message(ctx);
+
+  fprintf(stderr, "INPUT\n");
+  context_conll_help_message(ctx);
+  fprintf(stderr, "IN TEST MODE\n");
+  context_vocabs_help_message(ctx);
+
+  fprintf(stderr, "OUTPUT\n");
+  context_cff_help_message(ctx);
+  fprintf(stderr, "IN TRAIN MODE\n");
+  context_vocabs_help_message(ctx);
+}
+
+/* Prints the help message and exits with status 1 when help is requested,
+ * when input_filename is not set, or when neither cff_filename nor
+ * fann_filename is set; returns normally otherwise. */
+void maca_trans_morpho_mcf2cff_check_options(context *ctx)
+{
+  int has_output_name = ctx->cff_filename || ctx->fann_filename;
+  if(ctx->input_filename && !ctx->help && has_output_name)
+    return;
+  maca_trans_morpho_mcf2cff_help_message(ctx);
+  exit(1);
+}
+
+/* Runs the oracle over ctx->input_filename and, at each step, writes the oracle FEATS
+ * code followed by the feature vector of the configuration to output_file.  The feature
+ * vector, the configuration and the input file are released before returning. */
+void morpho_generate_training_file(FILE *output_file, context *ctx)
+{
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *conll_file = myfopen(ctx->input_filename, "r");
+  config *c = config_new(conll_file, ctx->mcd_struct, 5);
+  while(!config_is_terminal(c)){
+    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
+    int feats = oracle_morpho(c);
+    fprintf(output_file, "%d", feats);
+    feat_vec_print(output_file, fv);
+    movement_morpho(c, feats);
+  }
+  feat_vec_free(fv);   /* this and the two releases below were missing: all three leaked */
+  config_free(c);
+  fclose(conll_file);
+}
+
+/*
+ * maca_trans_morpho_mcf2cff: writes the output of
+ * morpho_generate_training_file for the input corpus.
+ *
+ * TRAIN mode: the dictionaries are extracted from the corpus, a new
+ * perceptron feature dictionary is created, and ctx->vocabs is printed to
+ * ctx->vocabs_filename once the output has been generated.
+ * TEST mode: the dictionaries, including the perceptron feature
+ * dictionary, are read from ctx->vocabs_filename.
+ *
+ * Output goes to ctx->cff_filename when it is set, to stdout otherwise.
+ */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  FILE *output_file = stdout;
+
+  maca_trans_morpho_mcf2cff_check_options(ctx);
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+
+  if(ctx->mode == TRAIN_MODE){
+    /* dictionaries come from the corpus; a new feature dictionary is created */
+    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
+    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
+    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
+  }
+  else if(ctx->mode == TEST_MODE){
+    /* dictionaries and the feature dictionary are read from the vocabs file */
+    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+  }
+
+  /* NOTE(review): done in both modes, so in TEST mode the dictionary just
+   * fetched from ctx->vocabs is added to it again -- confirm intended */
+  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);
+
+  if(ctx->cff_filename)
+    output_file = myfopen(ctx->cff_filename, "w");
+
+  morpho_generate_training_file(output_file, ctx);
+
+  if(ctx->mode == TRAIN_MODE)
+    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
+
+  if(ctx->cff_filename)
+    fclose(output_file);
+  context_free(ctx);
+  return 0;
+}
+
-- 
GitLab