Added mcf2json, which extracts JSON files for the annotation tool from mcf files

71c198d9 · Alexis Nasr · 55796a1c · 71c198d9 · 71c198d9
Commit 71c198d9 authored 7 years ago by Alexis Nasr
--- a/maca_tools/CMakeLists.txt
+++ b/maca_tools/CMakeLists.txt
@@ -12,6 +12,10 @@ target_link_libraries(mcf2orfeo transparse)
 target_link_libraries(mcf2orfeo maca_common)
 install (TARGETS mcf2orfeo DESTINATION bin)

+add_executable(mcf2json ./src/mcf2json.c)  # mcf -> json converter for the annotation tool
+target_link_libraries(mcf2json maca_common)
+install (TARGETS mcf2json DESTINATION bin)
+
 add_executable(maca_compute_l_rules ./src/maca_compute_l_rules.c)
 target_link_libraries(maca_compute_l_rules maca_common)
 install (TARGETS maca_compute_l_rules DESTINATION bin)

--- a/maca_tools/src/mcf2json.c
+++ b/maca_tools/src/mcf2json.c
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<getopt.h>
+
+#include"mcd.h"
+#include"util.h"
+#include"word_buffer.h"
+
+typedef struct {  /* run-time settings of mcf2json, filled from the command line */
+  int help;  /* set to 1 by -h / --help */
+  int verbose;  /* set to 1 by -v / --verbose; forwarded to mcd_read() */
+  int debug_mode;  /* set to 1 by -d / --debug */
+  char *program_name;  /* strdup'ed copy of argv[0] */
+  char *conll_filename;  /* argument of -o / --conll: output path, stdout is used when NULL */
+  char *mcf_filename;  /* argument of -i / --mcf: handed to word_buffer_load_mcf() */
+  char *mcd_filename;  /* argument of -C / --mcd: handed to mcd_read() when set */
+  mcd *mcd_struct;  /* built by mcd_read(), or by mcd_build_wpmlgfs() when -C is absent */
+} context;
+
+/* Release a context together with everything it owns: the four strings,
+   the mcd structure (through mcd_free) and finally the context itself.
+   A NULL ctx is accepted and ignored. */
+void mcf2json_context_free(context *ctx)
+{
+  if(ctx == NULL)
+    return;
+
+  /* free(NULL) is a no-op, so the string members need no individual guard */
+  free(ctx->program_name);
+  free(ctx->conll_filename);
+  free(ctx->mcf_filename);
+  free(ctx->mcd_filename);
+
+  if(ctx->mcd_struct != NULL)
+    mcd_free(ctx->mcd_struct);
+
+  free(ctx);
+}
+
+/* Allocate a context through memalloc() and reset every member:
+   the three flags to 0, the strings and the mcd structure to NULL.
+   The caller releases the result with mcf2json_context_free(). */
+context *mcf2json_context_new(void)
+{
+  context *fresh = (context *)memalloc(sizeof *fresh);
+
+  /* flags */
+  fresh->help = fresh->verbose = fresh->debug_mode = 0;
+
+  /* owned strings */
+  fresh->program_name = fresh->conll_filename = NULL;
+  fresh->mcf_filename = fresh->mcd_filename = NULL;
+
+  /* column description, loaded later */
+  fresh->mcd_struct = NULL;
+
+  return fresh;
+}
+
+/* Print the usage message on stderr.
+   ctx supplies the program name shown on the first line.
+   Every option accepted by mcf2json_context_read_options() is listed,
+   including -d / --debug; the -o option keeps its historical long name
+   --conll although the file written is JSON. */
+void mcf2json_context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help             : print this message\n");
+  fprintf(stderr, "\t-v --verbose          : activate verbose mode\n");
+  fprintf(stderr, "\t-d --debug            : activate debug mode\n");
+  fprintf(stderr, "\t-C --mcd              : mcd filename\n");
+  fprintf(stderr, "\t-i --mcf              : mcf filename (read from stdin if absent)\n");
+  fprintf(stderr, "\t-o --conll            : output json filename (write to stdout if absent)\n");
+}
+
+/* Act on the parsed options: when help was requested, print the usage
+   message and terminate the process with exit status 1. Otherwise do
+   nothing. */
+void mcf2json_check_options(context *ctx){
+  if(!ctx->help)
+    return;
+
+  mcf2json_context_general_help_message(ctx);
+  exit(1);
+}
+
+/* Build a context from the command line.
+   Recognised options: -h/--help, -v/--verbose, -d/--debug,
+   -o/--conll <file>, -C/--mcd <file>, -i/--mcf <file>.
+   Unrecognised options are silently ignored (opterr is cleared).
+   When an mcd file is given it is loaded with mcd_read(); otherwise the
+   description comes from mcd_build_wpmlgfs().
+   Returns a newly allocated context, to be released with
+   mcf2json_context_free(). */
+context *mcf2json_context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = mcf2json_context_new();
+
+  /* getopt_long() scans this table until it reaches an all-zero element,
+     so the array must end with one; without it an unknown long option
+     makes getopt_long() read past the end of the table. */
+  static struct option long_options[] =
+    {
+      {"help",                no_argument,       0, 'h'},
+      {"verbose",             no_argument,       0, 'v'},
+      {"debug",               no_argument,       0, 'd'},
+      {"conll",               required_argument, 0, 'o'},
+      {"mcd",                 required_argument, 0, 'C'},
+      {"mcf",                 required_argument, 0, 'i'},
+      {0,                     0,                 0, 0}
+    };
+
+  ctx->program_name = strdup(argv[0]);
+
+  optind = 0;  /* restart the scan of argv from its beginning */
+  opterr = 0;  /* no diagnostics printed by getopt_long() itself */
+
+  while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'd':
+        ctx->debug_mode = 1;
+        break;
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'o':
+        /* drop an earlier value so a repeated option does not leak it */
+        free(ctx->conll_filename);
+        ctx->conll_filename = strdup(optarg);
+        break;
+      case 'i':
+        free(ctx->mcf_filename);
+        ctx->mcf_filename = strdup(optarg);
+        break;
+      case 'C':
+        free(ctx->mcd_filename);
+        ctx->mcd_filename = strdup(optarg);
+        break;
+      default:
+        /* unknown option or missing argument: ignored, as before */
+        break;
+      }
+  }
+
+  if(ctx->mcd_filename){
+    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
+  }
+  else{
+    ctx->mcd_struct = mcd_build_wpmlgfs();
+  }
+
+  return ctx;
+}
+
+/* Close the JSON document on output_file: end the "documents" array
+   opened by main() and then the top-level object opened by
+   print_header(). Each sentence object is already closed by
+   print_sentence(), so no extra '}' is written before the ']'. */
+void print_footer(FILE *output_file)
+{
+  fprintf(output_file, "]\n");
+  fprintf(output_file, "}\n");
+}
+
+
+/* Open the top-level JSON object on output_file and write its "header"
+   and "annotations" members, all of whose fields are empty strings.
+   The last field of each object carries no trailing comma, which JSON
+   does not allow. The object itself is left open: main() adds the
+   "documents" member and print_footer() closes it. */
+void print_header(FILE *output_file)
+{
+  fprintf(output_file, "{\n");
+  fprintf(output_file, "\"header\":{\n");
+  fprintf(output_file, "\"id\": \"\",\n");
+  fprintf(output_file, "\"timestamp\": \"\",\n");
+  fprintf(output_file, "\"labels_segment\": \"\",\n");
+  fprintf(output_file, "\"labels_link\": \"\"\n");
+  fprintf(output_file, "},\n");
+
+  fprintf(output_file, "\"annotations\":{\n");
+  fprintf(output_file, "\"name\": \"\",\n");
+  fprintf(output_file, "\"time_start\": \"\",\n");
+  fprintf(output_file, "\"time_end\": \"\"\n");
+  fprintf(output_file, "},\n");
+}
+
+/* Write one element of the "links" array for word w.
+   output_file : destination stream
+   w           : word whose governor and label are printed
+   index       : position of the word inside its sentence, as passed by
+                 print_links() (starts at 1)
+   gov_col     : governor column of the mcd, -1 when there is none
+   label_col   : label column of the mcd, -1 when there is none
+   "dest" is index + word_get_gov(w), or 0 when the governor value is 0
+   or the sum is negative. The remaining fields are empty strings. */
+void print_link(FILE *output_file, word *w, int index, int gov_col, int label_col)
+{
+  fprintf(output_file, "{");
+
+  fprintf(output_file, "\"orig\": %d, ", index);
+  fprintf(output_file, "\"dest\":");
+  /* NOTE(review): an absent column is assumed to be reported as -1, as it
+     is for the label, pos and form columns tested elsewhere in this file
+     - confirm against mcd_get_gov_col() */
+  if(gov_col != -1){
+    if((word_get_gov(w) == 0) || ((word_get_gov(w) + index) < 0))
+      fprintf(output_file, "0");
+    else
+      fprintf(output_file, "%d", word_get_gov(w) + index);
+  }
+  else
+    /* a bare _ is not a JSON value: quote the placeholder like the label */
+    fprintf(output_file, "\"_\"");
+  fprintf(output_file, ", ");
+
+  fprintf(output_file, "\"label\": \"");
+  if(label_col != -1)
+    word_print_col_n(output_file, w, label_col);
+  else
+    fprintf(output_file, "_");
+  fprintf(output_file, "\", ");
+
+  fprintf(output_file, "\"status_link\": \"\", ");
+  fprintf(output_file, "\"status_lab\": \"\", ");
+  fprintf(output_file, "\"timestamp\": \"\", ");
+  fprintf(output_file, "\"author\": \"\", ");
+  fprintf(output_file, "\"target\": \"\"");
+  fprintf(output_file, "}");
+}
+
+
+/* Write the "links" array of one sentence: one print_link() element per
+   word of wb between index_first_word and index_last_word (inclusive).
+   Elements are separated by a comma and each starts on its own line.
+   Positions handed to print_link() are relative to the sentence and
+   start at 1. */
+void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
+{
+  int gov_col =   mcd_get_gov_col(word_buffer_get_mcd(wb));
+  int label_col = mcd_get_label_col(word_buffer_get_mcd(wb));
+  int pos;
+
+  fputs("\"links\": [", output_file);
+  for(pos = index_first_word; pos <= index_last_word; pos++){
+    word *current = word_buffer_get_word_n(wb, pos);
+
+    /* no comma in front of the first element */
+    fputs((pos == index_first_word) ? "\n" : ",\n", output_file);
+    print_link(output_file, current, pos - index_first_word + 1, gov_col, label_col);
+  }
+  fputs(" ]", output_file);
+}
+
+
+/* Write one element of the "segments" array for word w.
+   The segment covers the single position index ("start" and "end" are
+   equal). Its label is column pos_col of the word, or "_" when pos_col
+   is -1. All other fields are empty strings. */
+void print_segment(FILE *output_file, word *w, int index, int pos_col)
+{
+  fprintf(output_file, "{ \"start\": %d, \"end\": %d, \"label\": \"", index, index);
+
+  if(pos_col == -1)
+    fputs("_", output_file);
+  else
+    word_print_col_n(output_file, w, pos_col);
+
+  fputs("\", "
+        "\"status_seg\": \"\", "
+        "\"status_lab\": \"\", "
+        "\"timestamp\": \"\", "
+        "\"author\": \"\", "
+        "\"target\": \"\", "
+        "\"priority\": \"\""
+        " }", output_file);
+}
+
+/* Write the "segments" array of one sentence, followed by a comma and a
+   newline: one print_segment() element per word of wb between
+   index_first_word and index_last_word (inclusive). Positions handed to
+   print_segment() are relative to the sentence and start at 1. */
+void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
+{
+  int pos_col =  mcd_get_pos_col(word_buffer_get_mcd(wb));
+  int pos;
+
+  fputs("\"segments\": [", output_file);
+  for(pos = index_first_word; pos <= index_last_word; pos++){
+    word *current = word_buffer_get_word_n(wb, pos);
+
+    /* no comma in front of the first element */
+    fputs((pos == index_first_word) ? "\n" : ",\n", output_file);
+    print_segment(output_file, current, pos - index_first_word + 1, pos_col);
+  }
+  fputs(" ],\n", output_file);
+}
+
+/* Write one element of the "tokens" array for word w.
+   "id" is index; "word" is column form_col of the word, or "_" when
+   form_col is -1; "bold" and "newline" are always 0.
+   NOTE(review): the form is written by word_print_col_n(); whether it
+   escapes characters such as '"' or '\' for JSON is not visible here. */
+void print_token(FILE *output_file, word *w, int index, int form_col)
+{
+  fprintf(output_file, "{ \"id\": %d, \"word\": \"", index);
+
+  if(form_col == -1)
+    fputs("_", output_file);
+  else
+    word_print_col_n(output_file, w, form_col);
+
+  fputs("\", \"bold\": 0, \"newline\": 0 }", output_file);
+}
+
+
+/* Write the "tokens" array of one sentence, followed by a comma and a
+   newline: one print_token() element per word of wb between
+   index_first_word and index_last_word (inclusive). Positions handed to
+   print_token() are relative to the sentence and start at 1. */
+void print_tokens(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
+{
+  int form_col =  mcd_get_form_col(word_buffer_get_mcd(wb));
+  int pos;
+
+  fputs("\"tokens\": [", output_file);
+  for(pos = index_first_word; pos <= index_last_word; pos++){
+    word *current = word_buffer_get_word_n(wb, pos);
+
+    /* no comma in front of the first element */
+    fputs((pos == index_first_word) ? "\n" : ",\n", output_file);
+    print_token(output_file, current, pos - index_first_word + 1, form_col);
+  }
+  fputs(" ],\n", output_file);
+}
+
+
+/* Write one sentence as a JSON object: its identifier "s_<sentence_nb>",
+   then the tokens, segments and links of the words of wb located between
+   index_first_word and index_last_word (inclusive). */
+void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int index_first_word, int index_last_word)
+{
+  /* opening brace and identifier */
+  fprintf(output_file, "{\n\"id\": \"s_%d\",\n", sentence_nb);
+
+  /* the three arrays, in the order in which they appear in the object */
+  print_tokens(output_file, wb, index_first_word, index_last_word);
+  print_segments(output_file, wb, index_first_word, index_last_word);
+  print_links(output_file, wb, index_first_word, index_last_word);
+
+  fputs("}\n", output_file);
+}
+
+/* Entry point: read the options, load the mcf file into a word buffer and
+   write it as JSON (header, "documents" array with one object per
+   sentence, footer) on the -o file, or on stdout when -o is absent.
+   Returns 0 on success, 1 when the output file cannot be opened or
+   closed. */
+int main(int argc, char *argv[])
+{
+  FILE *output_file;
+  context *ctx = mcf2json_context_read_options(argc, argv);
+  word_buffer *wb = NULL;
+  word *w = NULL;
+  int first_sentence = 1;
+  int new_sentence = 1;
+  int index_first_word = 0;
+  int index_last_word = 0;
+  int sentence_nb = 0;
+  int status = 0;
+
+  mcf2json_check_options(ctx);
+  output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
+  /* NOTE(review): myfopen_no_exit() presumably returns NULL on failure
+     instead of exiting - the check is harmless either way */
+  if(output_file == NULL){
+    fprintf(stderr, "%s: cannot open %s for writing\n", ctx->program_name, ctx->conll_filename);
+    mcf2json_context_free(ctx);
+    return 1;
+  }
+  wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
+
+  print_header(output_file);
+  fprintf(output_file, "\"documents\": [");
+  do{
+    w = word_buffer_b0(wb);
+    if(new_sentence){
+      /* the current word opens a sentence: remember where it starts */
+      new_sentence = 0;
+      sentence_nb++;
+      index_first_word = word_buffer_get_current_index(wb);
+    }
+    /* guard against a buffer with no current word (e.g. empty input) */
+    if((w != NULL) && word_get_sent_seg(w)){
+      /* the current word closes the sentence: print it */
+      index_last_word = word_buffer_get_current_index(wb);
+      new_sentence = 1;
+
+      if(first_sentence == 1)
+        first_sentence = 0;
+      else
+        fprintf(output_file, ",");
+      fprintf(output_file, "\n");
+      print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
+    }
+  } while(word_buffer_move_right(wb));
+  /* NOTE(review): words following the last sentence-final word are never
+     printed - confirm that the input always ends on a sentence boundary */
+
+  print_footer(output_file);
+  if(ctx->conll_filename){
+    /* fclose() flushes: a failure here means the file is incomplete */
+    if(fclose(output_file) != 0){
+      fprintf(stderr, "%s: error while closing %s\n", ctx->program_name, ctx->conll_filename);
+      status = 1;
+    }
+  }
+  mcf2json_context_free(ctx);
+
+  return status;
+}