From 0442bfb6e491fcf250ecc480b450aff7feea2175 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Mon, 21 Nov 2016 22:26:32 -0500
Subject: [PATCH] Defined separate tokenization rules for French and English.
 Fixed some bugs in maca_lexer.

---
 maca_common/include/trie.h                    |  13 ++
 maca_lexer/src/context.c                      |   1 +
 maca_lexer/src/context.h                      |   1 +
 maca_lexer/src/maca_lexer.c                   |  31 +++-
 maca_tokenizer/CMakeLists.txt                 |  18 ++-
 maca_tokenizer/main.c                         |  10 --
 maca_tokenizer/src/context.c                  | 149 ++++++++++++++++++
 maca_tokenizer/src/context.h                  |  33 ++++
 maca_tokenizer/src/en_tok_rules.l             |  27 ++++
 .../{tok_rules.l => src/fr_tok_rules.l}       |   4 +
 maca_tokenizer/src/maca_tokenizer.c           |  43 +++++
 11 files changed, 310 insertions(+), 20 deletions(-)
 delete mode 100644 maca_tokenizer/main.c
 create mode 100644 maca_tokenizer/src/context.c
 create mode 100644 maca_tokenizer/src/context.h
 create mode 100644 maca_tokenizer/src/en_tok_rules.l
 rename maca_tokenizer/{tok_rules.l => src/fr_tok_rules.l} (90%)
 create mode 100644 maca_tokenizer/src/maca_tokenizer.c

diff --git a/maca_common/include/trie.h b/maca_common/include/trie.h
index 0ec45ad..697c6e9 100644
--- a/maca_common/include/trie.h
+++ b/maca_common/include/trie.h
@@ -21,6 +21,19 @@ typedef struct {
   int states_nb;
 } trie;
 
+/* One (state, symbol) pair. NOTE(review): presumably one step of a path followed in the trie - confirm. */
+typedef struct {
+  int state;
+  int symbol;
+} state_symbol;
+
+/* An array of state_symbol entries. NOTE(review): size looks like the allocated
+   capacity and nbelem the number of entries in use - confirm against the users. */
+typedef struct {
+  int size;
+  state_symbol *array;
+  int nbelem;
+} trie_path;
 
 trie_state *trie_state_new(trie_trans *transitions, int is_accept);
 void trie_state_free(trie_state *state);
diff --git a/maca_lexer/src/context.c b/maca_lexer/src/context.c
index b6d2b5e..bce96ef 100644
--- a/maca_lexer/src/context.c
+++ b/maca_lexer/src/context.c
@@ -40,6 +40,7 @@ context *context_new(void)
   ctx->mwe_filename = NULL;
   ctx->mwe_tokens_dico_filename = NULL;
   ctx->mwe_tokens_separator = strdup(" ");
+  ctx->paste = 1;
   return ctx;
 }
 
diff --git a/maca_lexer/src/context.h b/maca_lexer/src/context.h
index 376f1b6..1ad410d 100644
--- a/maca_lexer/src/context.h
+++ b/maca_lexer/src/context.h
@@ -22,6 +22,7 @@ typedef struct {
   char *mwe_filename;
   char *mwe_tokens_dico_filename;
   char *mwe_tokens_separator;
+  int paste;
 } context;
 
 context *context_new(void);
diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c
index da78d6f..04e56bf 100644
--- a/maca_lexer/src/maca_lexer.c
+++ b/maca_lexer/src/maca_lexer.c
@@ -22,7 +22,6 @@ void maca_lexer_help_message(context *ctx)
   context_vocab_help_message(ctx);
 }
 
-
 void maca_lexer_check_options(context *ctx){
   if(ctx->help){
     maca_lexer_help_message(ctx);
@@ -111,25 +110,41 @@ int main(int argc, char *argv[])
     if(states_array[path_index] == 0){ /* in initial state of trie */
       /* nothing has been recognized */
       if(path_index == 0)
-	printf("%s\n", buffer);
+	if(ctx->paste)
+	 printf("%s\n", buffer);
+	else
+	  printf("%s\t1\n", buffer);
       else{ /* there is something in the path */
 	int accept_state_index =  look_for_accept_state_in_path(mwe_trie, states_array, path_index);
 	/* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */
 	for(i=0; i <= accept_state_index; i++){ 
-	  if(i > 0) printf("%s", ctx->mwe_tokens_separator);
-	  printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
+	  if(ctx->paste){
+	  if(i > 0) printf("%s", ctx->mwe_tokens_separator); 
+	  printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); 
+	  }
+	  else{
+	    if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+	    else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+	  }
 	}
-	if(accept_state_index != -1) printf("\n");
+	if(ctx->paste)
+	  if(accept_state_index != -1) printf("\n"); 
 	/* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
 	for(i = accept_state_index + 1; i < path_index; i++){
-	  printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+	  if(ctx->paste)
+	    printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); 
+	  else
+	    printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
 	}
 	/* do not forget to print the current token */
-	printf("%s\n", buffer);
+	if(ctx->paste)
+	  printf("%s\n", buffer);
+	else
+	  printf("%s\t1\n", buffer);
 	path_index = 0;
       }
     }
-    /* not in state 0 of trie */
+    /* not in state 0 of trie we are processing tokens of a potential mwe */
     else{ 
       path_index++;
     }
diff --git a/maca_tokenizer/CMakeLists.txt b/maca_tokenizer/CMakeLists.txt
index d524f50..f100c0b 100644
--- a/maca_tokenizer/CMakeLists.txt
+++ b/maca_tokenizer/CMakeLists.txt
@@ -1,4 +1,18 @@
-FLEX_TARGET(tokenizer tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/maca_tokenizer.c)
+FLEX_TARGET(fr_tok_rules ./src/fr_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/fr_lex.c)
+FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex.c)
+
+set(SOURCES ./src/context.c
+  ${FLEX_fr_tok_rules_OUTPUTS}
+  ${FLEX_en_tok_rules_OUTPUTS})
+##compiling library
+include_directories(./src)
+
+add_library(maca_tokenizer_lib STATIC ${SOURCES})
+
+
+
+
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
-add_executable(maca_tokenizer main.c ${FLEX_tokenizer_OUTPUTS})
+add_executable(maca_tokenizer ./src/maca_tokenizer.c)
+target_link_libraries(maca_tokenizer maca_tokenizer_lib maca_common)
 install (TARGETS maca_tokenizer DESTINATION bin)
diff --git a/maca_tokenizer/main.c b/maca_tokenizer/main.c
deleted file mode 100644
index f3586e0..0000000
--- a/maca_tokenizer/main.c
+++ /dev/null
@@ -1,10 +0,0 @@
-int defait_amalgames = 0;
-
-int main(int argc, char* argv[]) {
-
-  if(argc > 1) defait_amalgames = 1;
-  yylex() ; 
-
-  return 0;
-}
-
diff --git a/maca_tokenizer/src/context.c b/maca_tokenizer/src/context.c
new file mode 100644
index 0000000..25a3414
--- /dev/null
+++ b/maca_tokenizer/src/context.c
@@ -0,0 +1,149 @@
+#include<stdlib.h>
+#include<stdio.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include "context.h"
+#include "util.h"
+
+void context_set_linguistic_resources_filenames(context *ctx);
+
+/* Free ctx and every string it owns. NOTE(review): mcd_struct is not released here. */
+void context_free(context *ctx){
+  free(ctx->program_name); /* free(NULL) is a no-op, so unset fields need no test */
+  free(ctx->input_filename);
+  free(ctx->output_filename);
+  free(ctx->language);
+  free(ctx->maca_data_path);
+  free(ctx->mcd_filename); /* strdup'ed by the -C option */
+  free(ctx);
+}
+
+/* Allocate a context holding the default settings: flags off, no file names, language "fr". */
+context *context_new(void)
+{
+  context *fresh = (context *)memalloc(sizeof(context));
+  fresh->program_name = NULL;
+  fresh->input_filename = NULL;
+  fresh->output_filename = NULL;
+  fresh->maca_data_path = NULL;
+  fresh->mcd_filename = NULL;
+  fresh->mcd_struct = NULL;
+  fresh->help = 0;
+  fresh->verbose = 0;
+  fresh->debug_mode = 0;
+  fresh->language = strdup("fr");
+  return fresh;
+}
+
+/* Print the usage line and the options common to every mode on stderr. */
+void context_general_help_message(context *ctx){
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help             : print this message\n");
+  fprintf(stderr, "\t-v --verbose          : activate verbose mode\n");
+  fprintf(stderr, "\t-d --debug            : activate debug mode\n");
+}
+
+void context_input_help_message(context *ctx){ /* writes the -i help line to stderr; ctx is not read */
+  fputs("\t-i --input  <file>  : input mcf file name\n", stderr);
+}
+
+void context_mcd_help_message(context *ctx){ /* writes the -C help line to stderr; ctx is not read */
+  fputs("\t-C --mcd   <file> : multi column description file name\n", stderr);
+}
+
+void context_language_help_message(context *ctx){ /* writes the -L help line to stderr; ctx is not read */
+  fputs("\t-L --language  : identifier of the language to use\n", stderr);
+}
+
+/* Parse the command line into a new context. Unknown options are skipped (opterr is 0);
+   the mcd is read from the -C file when given, otherwise built by mcd_build_wplgf(). */
+context *context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = context_new();
+  static struct option long_options[] = {
+    {"help",                no_argument,       0, 'h'},
+    {"verbose",             no_argument,       0, 'v'},
+    {"debug",               no_argument,       0, 'd'},
+    {"input",               required_argument, 0, 'i'},
+    {"output",              required_argument, 0, 'o'},
+    {"mcd",                 required_argument, 0, 'C'},
+    {"language",            required_argument, 0, 'L'},
+    {"maca_data_path",      required_argument, 0, 'D'},
+    {0, 0, 0, 0} /* getopt_long() stops at this all-zero entry */
+  };
+
+  ctx->program_name = strdup(argv[0]);
+  optind = 0;
+  opterr = 0;
+
+  while ((c = getopt_long (argc, argv, "hvdi:o:C:L:D:", long_options, &option_index)) != -1){
+    switch (c) {
+    case 'd':
+      ctx->debug_mode = 1;
+      break;
+    case 'h':
+      ctx->help = 1;
+      break;
+    case 'v':
+      ctx->verbose = 1;
+      break;
+    case 'i':
+      free(ctx->input_filename); /* an option given twice must not leak the first value */
+      ctx->input_filename = strdup(optarg);
+      break;
+    case 'o':
+      free(ctx->output_filename);
+      ctx->output_filename = strdup(optarg);
+      break;
+    case 'C':
+      free(ctx->mcd_filename);
+      ctx->mcd_filename = strdup(optarg);
+      break;
+    case 'L':
+      free(ctx->language); /* also drops the default set by context_new() */
+      ctx->language = strdup(optarg);
+      break;
+    case 'D':
+      free(ctx->maca_data_path);
+      ctx->maca_data_path = strdup(optarg);
+      break;
+    }
+  }
+
+  context_set_linguistic_resources_filenames(ctx);
+  if(ctx->mcd_filename)
+    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
+  else
+    ctx->mcd_struct = mcd_build_wplgf();
+
+  return ctx;
+}
+
+/* Build "<data dir>/<language>/bin/", presumably the directory of the language
+   resources. The data dir is ctx->maca_data_path when set, otherwise $MACAON_DIR; a
+   warning is printed when neither is available. NOTE(review): the path is only
+   computed, it is not stored in ctx - this context has no resource file name to fill in. */
+void context_set_linguistic_resources_filenames(context *ctx)
+{
+  char absolute_path[500];
+  const char *data_dir = ctx->maca_data_path;
+  int written;
+
+  if(data_dir == NULL){
+    data_dir = getenv("MACAON_DIR");
+    if(data_dir == NULL){
+      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
+      /* keep going with an empty prefix: the path then starts with "/" */
+      data_dir = "";
+    }
+  }
+
+  /* snprintf bounds the write to the buffer, whatever the length of the inputs */
+  written = snprintf(absolute_path, sizeof absolute_path, "%s/%s/bin/", data_dir, ctx->language);
+  if(written < 0 || (size_t)written >= sizeof absolute_path)
+    fprintf(stderr, "ATTENTION: the linguistic resources path is too long and was truncated\n");
+}
diff --git a/maca_tokenizer/src/context.h b/maca_tokenizer/src/context.h
new file mode 100644
index 0000000..f9c3ce0
--- /dev/null
+++ b/maca_tokenizer/src/context.h
@@ -0,0 +1,33 @@
+#ifndef MACA_TOKENIZER_CONTEXT_H
+#define MACA_TOKENIZER_CONTEXT_H
+#include "mcd.h"
+#include <stdlib.h>
+
+#define DEFAULT_MWE_TOKENS_DICO_FILENAME "d_tokens.dico"
+#define DEFAULT_MWE_FILENAME "mwe"
+
+/* Settings of maca_tokenizer; context_read_options() fills them from the command line. */
+typedef struct {
+  int help;               /* set to 1 by -h / --help */
+  int verbose;            /* set to 1 by -v / --verbose */
+  int debug_mode;         /* set to 1 by -d / --debug */
+  char *program_name;     /* copy of argv[0] */
+  char *language;         /* -L / --language, "fr" when the option is absent */
+  char *maca_data_path;   /* -D / --maca_data_path, NULL when absent */
+  char *mcd_filename;     /* -C / --mcd, NULL when absent */
+  mcd *mcd_struct;        /* read from mcd_filename, or built by mcd_build_wplgf() */
+  char *input_filename;   /* -i / --input, NULL when absent */
+  char *output_filename;  /* -o / --output, NULL when absent */
+} context;
+
+context *context_new(void);
+void context_free(context *ctx);
+context *context_read_options(int argc, char *argv[]);
+void context_general_help_message(context *ctx);
+void context_conll_help_message(context *ctx); /* NOTE(review): no definition in context.c */
+void context_input_help_message(context *ctx);
+void context_language_help_message(context *ctx);
+void context_maca_data_path_help_message(context *ctx); /* NOTE(review): no definition in context.c */
+void context_mcd_help_message(context *ctx);
+
+#endif
diff --git a/maca_tokenizer/src/en_tok_rules.l b/maca_tokenizer/src/en_tok_rules.l
new file mode 100644
index 0000000..0e8ea5b
--- /dev/null
+++ b/maca_tokenizer/src/en_tok_rules.l
@@ -0,0 +1,27 @@
+%{
+/* English tokenization rules. Whitespace runs become a newline, "." and "," are
+   printed after a newline, and the contractions listed below are split in two. */
+#include <stdio.h>
+extern int defait_amalgames;
+%}
+
+%option prefix="en"
+
+%option noyywrap
+%%
+[0-9]+\.[0-9]+ printf("%s", yytext);
+[ \t]+ printf("\n");
+\.  printf("\n.");
+\,  printf("\n,");
+don't printf("do\nnot");
+don’t printf("do\nnot");
+doesn't printf("does\nnot");
+doesn’t printf("does\nnot");
+won't printf("will\nnot");
+won’t printf("will\nnot");
+cannot printf("can\nnot");
+wanna printf("want\nto");
+'s printf("\n's");
+’s printf("\n's");
+\n+ printf("\n");
+%%
diff --git a/maca_tokenizer/tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l
similarity index 90%
rename from maca_tokenizer/tok_rules.l
rename to maca_tokenizer/src/fr_tok_rules.l
index 4cd823d..dd6055b 100644
--- a/maca_tokenizer/tok_rules.l
+++ b/maca_tokenizer/src/fr_tok_rules.l
@@ -2,6 +2,10 @@
 #include <stdio.h>
 extern int defait_amalgames;
 %}
+
+%option prefix="fr"
+/*%option outfile="fr_lex.c"*/
+
 %option noyywrap
 %s state_defait_amalgames
 %s state_num
diff --git a/maca_tokenizer/src/maca_tokenizer.c b/maca_tokenizer/src/maca_tokenizer.c
new file mode 100644
index 0000000..d9e9a18
--- /dev/null
+++ b/maca_tokenizer/src/maca_tokenizer.c
@@ -0,0 +1,43 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"context.h"
+
+int defait_amalgames = 0;
+/* Print the usage text: the general options, an "INPUT" heading, then the input, mcd and language lines. */
+void maca_tokenizer_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  fprintf(stderr, "INPUT\n");
+  context_input_help_message(ctx);
+  context_mcd_help_message(ctx);
+  context_language_help_message(ctx);
+}
+
+/* When help was requested, print the usage text and terminate with status 1. */
+void maca_tokenizer_check_options(context *ctx){
+  if(!ctx->help) return;
+  maca_tokenizer_help_message(ctx);
+  exit(1);
+}
+
+/* Scanners generated by flex from en_tok_rules.l and fr_tok_rules.l (%option prefix). */
+int enlex(void);
+int frlex(void);
+
+/* Read the options, then run the English scanner when the language is "en" and the
+   French one otherwise. NOTE(review): defait_amalgames is never set to 1 in this file. */
+int main(int argc, char* argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+
+  maca_tokenizer_check_options(ctx);
+  if(!strcmp(ctx->language, "en"))
+    enlex();
+  else
+    frlex();
+
+  context_free(ctx);
+  return 0;
+}
+
-- 
GitLab