fixed a bug in word.h

68ee1829 · Alexis Nasr · 1d536e6c · ee98b081 · 68ee1829 · 68ee1829
Commit 68ee1829 authored 7 years ago by Alexis Nasr
--- a/maca_common/include/mcd.h
+++ b/maca_common/include/mcd.h
@@ -8,7 +8,7 @@
 #define MCD_INVALID_VALUE -1
-#define MCD_WF_NB 48
+#define MCD_WF_NB 51
 #define MCD_WF_ID 0
 #define MCD_WF_OFFSET 0 /* ID and OFFSET are synonymous */
@@ -61,6 +61,12 @@
 #define MCD_WF_Person 45
 #define MCD_WF_Tense 46
+#define MCD_WF_FILE 48
+#define MCD_WF_DIRECTORY 49
+#define MCD_WF_SPEAKER 50
 /*Abbr
 AdpType
 AdvType

--- a/maca_common/include/word.h
+++ b/maca_common/include/word.h
@@ -63,6 +63,11 @@ typedef struct _word {
 #define word_get_label(w)          (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LABEL])
 #define word_get_stag(w)           (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_STAG])
 #define word_get_sent_seg(w)       (((w) == NULL) ?  0 : (w)->wf_array[MCD_WF_SENT_SEG])
+#define word_get_file(w)           (((w) == NULL) ?  0 : (w)->wf_array[MCD_WF_FILE])
+#define word_get_directory(w)      (((w) == NULL) ?  0 : (w)->wf_array[MCD_WF_DIRECTORY])
+#define word_get_speaker(w)        (((w) == NULL) ?  0 : (w)->wf_array[MCD_WF_SPEAKER])
 #define word_get_A(w)              (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_A])
 #define word_get_B(w)              (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_B])
 #define word_get_C(w)              (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_C])

--- a/maca_common/src/mcd.c
+++ b/maca_common/src/mcd.c
@@ -512,6 +512,12 @@ int mcd_wf_code(char *wf)
  /* if(!strcmp(wf, "INT")) return MCD_WF_INT; */
  if(!strcmp(wf, "GOV")) return MCD_WF_GOV;
  if(!strcmp(wf, "SENT_SEG")) return MCD_WF_SENT_SEG;
+  if(!strcmp(wf, "FILE")) return MCD_WF_FILE;
+  if(!strcmp(wf, "DIRECTORY")) return MCD_WF_DIRECTORY;
+  if(!strcmp(wf, "SPEAKER")) return MCD_WF_SPEAKER;
  if(!strcmp(wf, "A")) return MCD_WF_A;
  if(!strcmp(wf, "B")) return MCD_WF_B;
  if(!strcmp(wf, "C")) return MCD_WF_C;

--- a/maca_common/src/word.c
+++ b/maca_common/src/word.c
@@ -22,7 +22,6 @@ word *word_new(char *input)
  w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV;
  w->form = NULL;
  w->form_char16 = NULL;
  w->index = -1;
  w->signature = -1;
  w->is_root = 0;

--- a/maca_tokenizer/src/en_tok_rules.l
+++ b/maca_tokenizer/src/en_tok_rules.l
@@ -12,6 +12,7 @@ extern char *token;
 /*%option noyywrap*/
 %%
+#.*    ECHO;
 \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
 [ \t]+   {maca_tokenizer_segment((char *)"", yytext);}
 [ ]*\.   {maca_tokenizer_segment((char *)".", yytext);}

--- a/maca_tokenizer/src/fr_tok_rules.l
+++ b/maca_tokenizer/src/fr_tok_rules.l
@@ -26,7 +26,7 @@ nosepar [^ \t\n]
 	if(defait_amalgames){
 	BEGIN(state_defait_amalgames);
        }
+#.*    ECHO;
 \<[^\>]*\>   {maca_tokenizer_segment((char *)"", yytext);}
 {separ}+     {maca_tokenizer_segment((char *)"", yytext);}
 \.   {maca_tokenizer_segment((char *)".", yytext);}

--- a/maca_tools/src/mcf2json.c
+++ b/maca_tools/src/mcf2json.c
@@ -2,6 +2,9 @@
 #include<stdlib.h>
 #include<string.h>
 #include<getopt.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
 #include"mcd.h"
 #include"util.h"
@@ -16,6 +19,7 @@ typedef struct {
  char *mcf_filename;
  char *mcd_filename;
  mcd *mcd_struct;
+  char *root_dir;
 } context;
 void mcf2json_context_free(context *ctx)
@@ -31,6 +35,8 @@ void mcf2json_context_free(context *ctx)
      free(ctx->mcd_filename);
    if(ctx->mcd_struct)
      mcd_free(ctx->mcd_struct);
+    if(ctx->root_dir)
+      free(ctx->root_dir);
    free(ctx);
  }
 }
@@ -47,6 +53,7 @@ context *mcf2json_context_new(void)
  ctx->mcf_filename = NULL;
  ctx->mcd_filename = NULL;
  ctx->mcd_struct = NULL;
+  ctx->root_dir = NULL;
  return ctx;
 }
@@ -58,7 +65,7 @@ void mcf2json_context_general_help_message(context *ctx)
  fprintf(stderr, "\t-v --verbose          : activate verbose mode\n");
  fprintf(stderr, "\t-C --mcd              : mcd filename\n");
  fprintf(stderr, "\t-i --mcf              : mcf filename (read from stdin if absent)\n");
-  fprintf(stderr, "\t-o --conll            : conll filename (write to stdout if absent)\n");
+  fprintf(stderr, "\t-r --root             : root directory of the json files\n");
 }
 void mcf2json_check_options(context *ctx){
@@ -81,14 +88,14 @@ context *mcf2json_context_read_options(int argc, char *argv[])
      {"help",                no_argument,       0, 'h'},
      {"verbose",             no_argument,       0, 'v'},
      {"debug",               no_argument,       0, 'd'},
-      {"conll",               required_argument, 0, 'o'},
      {"mcd",                 required_argument, 0, 'C'}, 
      {"mcf",                 required_argument, 0, 'i'},
+      {"root",                required_argument, 0, 'r'},
+      /* getopt_long() finds the end of the table through an all-zero entry;
+	 without it an unrecognised long option makes it read past the array */
+      {0,                     0,                 0, 0}
    };
  optind = 0;
  opterr = 0;
-  while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){ 
+  while ((c = getopt_long (argc, argv, "hvdC:i:r:", long_options, &option_index)) != -1){ 
    switch (c)
      {
      case 'd':
@@ -100,15 +107,15 @@ context *mcf2json_context_read_options(int argc, char *argv[])
      case 'v':
 	ctx->verbose = 1;
 	break;
-      case 'o':
-	ctx->conll_filename = strdup(optarg);
-	break;
      case 'i':
 	ctx->mcf_filename = strdup(optarg);
 	break;
      case 'C':
 	ctx->mcd_filename = strdup(optarg);
 	break;
+      case 'r':
+	ctx->root_dir = strdup(optarg);
+	break;
      }
  }
@@ -118,7 +125,6 @@ context *mcf2json_context_read_options(int argc, char *argv[])
  else{
    ctx->mcd_struct = mcd_build_wpmlgfs();
  }
  return ctx;
 }
@@ -129,7 +135,7 @@ void print_footer(FILE *output_file)
 }
-void print_header(FILE *output_file, mcd *mcd_struct)
+void print_header(FILE *output_file, mcd *mcd_struct, char *filename)
 {
  int pos_col =  mcd_get_pos_col(mcd_struct);
  int label_col =  mcd_get_label_col(mcd_struct);
@@ -143,6 +149,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
  fprintf(output_file, "\"header\":{\n");
  fprintf(output_file, "\"id\": \"\",\n");
  fprintf(output_file, "\"timestamp\": \"\",\n");
+  fprintf(output_file, "\"filename\": \"%s\",\n", filename);
  fprintf(output_file, "\"labels_segment\": [");
  for(i=0; i < dico_pos->nbelem; i++){
@@ -160,7 +167,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
  fprintf(output_file, "},\n");
-  fprintf(output_file, "\"annotations\":{\n");
+  fprintf(output_file, "\"annotation\":{\n");
  fprintf(output_file, "\"name\": \"\",\n");
  fprintf(output_file, "\"time_start\": \"\",\n");
  fprintf(output_file, "\"time_end\": \"\"\n");
@@ -227,16 +234,16 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i
 }
-void print_segment(FILE *output_file, word_buffer *wb, int index)
+void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int index)
 {
  int pos_col =  mcd_get_pos_col(word_buffer_get_mcd(wb));
  word *w = word_buffer_get_word_n(wb, index);
  fprintf(output_file, "{ ");
  /* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */
-  fprintf(output_file, "\"start\": %d, ", index);
+  fprintf(output_file, "\"start\": %d, ", index - index_first_word);
  /* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */
-  fprintf(output_file, "\"end\": %d, ", index);
+  fprintf(output_file, "\"end\": %d, ", index - index_first_word);
  fprintf(output_file, "\"label\": \"");
  if(pos_col != -1)
@@ -263,7 +270,7 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in
  for(index = index_first_word; index <= index_last_word; index++){
    if(first_segment == 1) first_segment = 0; else fprintf(output_file, ",");
    fprintf(output_file, "\n");
-    print_segment(output_file, wb, index);
+    print_segment(output_file, wb, index_first_word, index);
  }
  fprintf(output_file," ],\n");
 }
@@ -317,7 +324,7 @@ void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int ind
 int main(int argc, char *argv[])
 {
-  FILE *output_file;
+  FILE *output_file = NULL;
  context *ctx = mcf2json_context_read_options(argc, argv);
  word_buffer *wb = NULL;
  word *w = NULL;
@@ -326,17 +333,59 @@ int main(int argc, char *argv[])
  int index_first_word;
  int index_last_word;
  int sentence_nb = 0;
+  char current_directory[1000];
+  char current_file[1000];
+  char previous_directory[1000];
+  char previous_file[1000];
+  char filename_for_header[1000];
+  char *root_directory = NULL;
+  char destination_file[1000];
+  char destination_dir[1000];
+  struct stat st = {0};
  mcf2json_check_options(ctx);
  mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename);
-  output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
  wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
-  print_header(output_file, ctx->mcd_struct);
+  if(ctx->root_dir){
+    if(stat(ctx->root_dir, &st) == -1) {
+      mkdir(ctx->root_dir, 0700);
+      fprintf(stderr, "creating directory %s\n", ctx->root_dir);
+    }
    do{
      w = word_buffer_b0(wb);
+      if(w == NULL) break;
+      word_sprint_col_n(current_directory, w, ctx->mcd_struct->wf2col[MCD_WF_DIRECTORY]);
+      word_sprint_col_n(current_file, w, ctx->mcd_struct->wf2col[MCD_WF_FILE]);
+      /* output_file is still NULL while the first word is processed: both
+	 branches are then forced, so previous_directory / previous_file are
+	 never read before the end of an iteration has filled them in, and
+	 destination_dir is always built before it is used */
+      if(output_file == NULL || strcmp(current_directory, previous_directory)){
+	/* bounded formatting: destination_dir is a fixed-size array; a negative
+	   snprintf result becomes a huge size_t and is rejected as well */
+	if((size_t)snprintf(destination_dir, sizeof destination_dir, "%s/%s", ctx->root_dir, current_directory) >= sizeof destination_dir){
+	  fprintf(stderr, "path too long: %s/%s\n", ctx->root_dir, current_directory);
+	  exit(1);
+	}
+	if (stat(destination_dir, &st) == -1) {
+	  if(mkdir(destination_dir, 0700) == -1)
+	    perror(destination_dir);
+	  fprintf(stderr, "creating directory %s\n", destination_dir);
+	}
+      }
+      /* a new output file is also needed when only the directory changed
+	 (same file name in two different directories) */
+      if(output_file == NULL || strcmp(current_file, previous_file) || strcmp(current_directory, previous_directory)){
+	if((size_t)snprintf(destination_file, sizeof destination_file, "%s/%s.json", destination_dir, current_file) >= sizeof destination_file){
+	  fprintf(stderr, "path too long: %s/%s.json\n", destination_dir, current_file);
+	  exit(1);
+	}
+	fprintf(stderr, "creating file %s\n", destination_file);
+	/* close the json document of the previous file before switching */
+	if(output_file){
+	  print_footer(output_file);
+	  fclose(output_file);
+	}
+	output_file = myfopen_no_exit(destination_file, "w");
+	/* stop here rather than hand a NULL stream to print_header(), which
+	   writes to it with fprintf */
+	if(output_file == NULL){
+	  fprintf(stderr, "cannot open file %s\n", destination_file);
+	  exit(1);
+	}
+	/* only a label written in the json header: truncated if too long */
+	snprintf(filename_for_header, sizeof filename_for_header, "%s/%s.json", current_directory, current_file);
+	print_header(output_file, ctx->mcd_struct, filename_for_header);
+	first_sentence = 1;
+      }
      if(new_sentence){
 	new_sentence = 0;
 	sentence_nb++;
@@ -353,14 +402,38 @@ int main(int argc, char *argv[])
 	fprintf(output_file, "\n");
 	print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
      }
+      strcpy(previous_file, current_file);
+      strcpy(previous_directory, current_directory);
    } while(word_buffer_move_right(wb));
    print_footer(output_file);
-  if(ctx->conll_filename)
    fclose(output_file);
-  mcf2json_context_free(ctx);
+  }
+  else{ //ctx->root_dir is NULL dump everything to stdout
+      output_file = stdout;
+      print_header(output_file, ctx->mcd_struct, "");
+      do{
+	w = word_buffer_b0(wb);
+	if(new_sentence){
+	  new_sentence = 0;
+	  sentence_nb++;
+	  index_first_word = word_buffer_get_current_index(wb);
+	}
+	if(word_get_sent_seg(w)){
+	  index_last_word = word_buffer_get_current_index(wb);
+	  new_sentence = 1;
+	  if(first_sentence == 1)
+	    first_sentence = 0;
+	  else
+	    fprintf(output_file, ",");
+	  fprintf(output_file, "\n");
+	  print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
+	}
+      } while(word_buffer_move_right(wb));
+      print_footer(output_file);
+  }
+  mcf2json_context_free(ctx);
  return 0;
 }
--- a/maca_trans_parser/src/oracle_parser_arc_eager.c
+++ b/maca_trans_parser/src/oracle_parser_arc_eager.c
@@ -75,6 +75,7 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref, int root_label)
       ){
      return MVT_PARSER_EOS;
    }
    /* LEFT ARC  b0 is the governor and s0 the dependent */
    if(s0_gov_index == b0_index){
      return movement_parser_left_code(word_get_label(word_buffer_get_word_n(ref, s0_index)));