Skip to content
Snippets Groups Projects
Select Git revision
  • 71c198d99f636e67abda9a3377e4ac5a30dec34d
  • master default protected
  • johannes
  • partial_parser
  • Aloui_Dary
  • ignore_punct
  • AC
  • classifier
  • fixhelp
  • libmacaon2
  • error_predictor
  • morpho
  • ssrnn
  • tfparsing
  • silvio
  • tagger_options
  • maca_trans_frame_parser
  • alexis
  • new_config
  • tagparse
  • maca_graph_parser
21 results

mcf2json.c

Blame
  • mcf2json.c 9.00 KiB
    #include<stdio.h>
    #include<stdlib.h>
    #include<string.h>
    #include<getopt.h>
    
    #include"mcd.h"
    #include"util.h"
    #include"word_buffer.h"
    
    /* Run-time configuration of the mcf2json tool, filled from the command line. */
    typedef struct {
      int help;              /* set to 1 by -h / --help */
      int verbose;           /* set to 1 by -v / --verbose; forwarded to mcd_read() */
      int debug_mode;        /* set to 1 by -d / --debug */
      char *program_name;    /* heap copy of argv[0], used in the usage line */
      char *conll_filename;  /* -o / --conll: output path; NULL means stdout */
      char *mcf_filename;    /* -i / --mcf: input path handed to word_buffer_load_mcf() */
      char *mcd_filename;    /* -C / --mcd: column description file, may be NULL */
      mcd *mcd_struct;       /* result of mcd_read() or mcd_build_wpmlgfs() */
    } context;
    
    /*
     * Release a context together with everything it owns: the four
     * heap-allocated strings and the mcd structure.
     *
     * ctx: context to destroy; NULL is accepted and ignored.
     */
    void mcf2json_context_free(context *ctx)
    {
      if(ctx == NULL)
        return;

      /* free(NULL) is a no-op, so the string fields need no guard. */
      free(ctx->program_name);
      free(ctx->conll_filename);
      free(ctx->mcf_filename);
      free(ctx->mcd_filename);

      /* mcd_free() is defined outside this file; keep the NULL guard
         because its behaviour on NULL is not visible here. */
      if(ctx->mcd_struct)
        mcd_free(ctx->mcd_struct);

      free(ctx);
    }
    
    /*
     * Allocate a context through memalloc() and put every field in its
     * "unset" state: flags at 0, all pointers at NULL.
     *
     * Returns the new context; the caller releases it with
     * mcf2json_context_free().
     */
    context *mcf2json_context_new(void)
    {
      context *fresh = (context *)memalloc(sizeof(context));

      /* option flags */
      fresh->help = fresh->verbose = fresh->debug_mode = 0;

      /* file names, filled in later by the option parser */
      fresh->program_name = fresh->conll_filename = NULL;
      fresh->mcf_filename = fresh->mcd_filename = NULL;

      /* column description, built once the options are known */
      fresh->mcd_struct = NULL;

      return fresh;
    }
    
    /*
     * Print the usage text on stderr.
     *
     * The list mirrors the options accepted by
     * mcf2json_context_read_options(), including -d / --debug.
     *
     * ctx: context whose program_name is shown in the usage line.
     */
    void mcf2json_context_general_help_message(context *ctx)
    {
      fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
      fprintf(stderr, "Options:\n");
      fprintf(stderr, "\t-h --help             : print this message\n");
      fprintf(stderr, "\t-v --verbose          : activate verbose mode\n");
      fprintf(stderr, "\t-d --debug            : activate debug mode\n");
      fprintf(stderr, "\t-C --mcd              : mcd filename\n");
      fprintf(stderr, "\t-i --mcf              : mcf filename (read from stdin if absent)\n");
      /* The long option keeps its historical name, but the file receives JSON. */
      fprintf(stderr, "\t-o --conll            : json output filename (write to stdout if absent)\n");
    }
    
    /*
     * React to the parsed options: when help was requested, show the
     * usage text and terminate the process with status 1. Otherwise
     * return without doing anything.
     */
    void mcf2json_check_options(context *ctx){
      if(!ctx->help)
        return;

      mcf2json_context_general_help_message(ctx);
      exit(1);
    }
    
    /*
     * Build a context from the command line.
     *
     * Recognised options: -h/--help, -v/--verbose, -d/--debug,
     * -o/--conll FILE, -C/--mcd FILE, -i/--mcf FILE. Anything else is
     * silently ignored (opterr is cleared and '?' is not acted upon).
     *
     * After parsing, the mcd structure is read from the -C file when one
     * was given, otherwise it is built with mcd_build_wpmlgfs().
     *
     * Returns a new context owned by the caller
     * (release with mcf2json_context_free()).
     */
    context *mcf2json_context_read_options(int argc, char *argv[])
    {
      /* getopt_long() requires the table to end with an all-zero entry;
         without it an unknown long option makes it read past the array. */
      static struct option long_options[] =
        {
          {"help",                no_argument,       0, 'h'},
          {"verbose",             no_argument,       0, 'v'},
          {"debug",               no_argument,       0, 'd'},
          {"conll",               required_argument, 0, 'o'},
          {"mcd",                 required_argument, 0, 'C'},
          {"mcf",                 required_argument, 0, 'i'},
          {0,                     0,                 0, 0}
        };
      int c;
      int option_index = 0;
      context *ctx = mcf2json_context_new();

      ctx->program_name = strdup(argv[0]);

      /* Restart the scan from the beginning and silence getopt's own
         diagnostics. */
      optind = 0;
      opterr = 0;

      while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){
        switch (c)
          {
          case 'd':
            ctx->debug_mode = 1;
            break;
          case 'h':
            ctx->help = 1;
            break;
          case 'v':
            ctx->verbose = 1;
            break;
          case 'o':
            /* an option may be repeated: drop the earlier copy first */
            free(ctx->conll_filename);
            ctx->conll_filename = strdup(optarg);
            break;
          case 'i':
            free(ctx->mcf_filename);
            ctx->mcf_filename = strdup(optarg);
            break;
          case 'C':
            free(ctx->mcd_filename);
            ctx->mcd_filename = strdup(optarg);
            break;
          default:
            /* unknown option or missing argument: ignored, as before */
            break;
          }
      }

      if(ctx->mcd_filename){
        ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
      }
      else{
        ctx->mcd_struct = mcd_build_wpmlgfs();
      }

      return ctx;
    }
    
    /*
     * Close the JSON document: end the "documents" array that main()
     * opens with '[' and then the top-level object that print_header()
     * opens with '{'.
     *
     * The sentences inside the array are each a balanced { ... } object,
     * so no extra '}' is emitted before the ']'.
     *
     * output_file: stream receiving the JSON text.
     */
    void print_footer(FILE *output_file)
    {
      fprintf(output_file, "]\n");
      fprintf(output_file, "}\n");
    }
    
    
    /*
     * Open the JSON document and write its two fixed leading members,
     * "header" and "annotations", each an object of empty-string fields.
     *
     * The text ends with "}," because the caller appends a further
     * member ("documents") to the top-level object.
     *
     * No comma is written after the last field of either object:
     * a trailing comma before '}' is not valid JSON.
     *
     * output_file: stream receiving the JSON text.
     */
    void print_header(FILE *output_file)
    {
      fputs("{\n", output_file);

      fputs("\"header\":{\n", output_file);
      fputs("\"id\": \"\",\n", output_file);
      fputs("\"timestamp\": \"\",\n", output_file);
      fputs("\"labels_segment\": \"\",\n", output_file);
      fputs("\"labels_link\": \"\"\n", output_file);
      fputs("},\n", output_file);

      fputs("\"annotations\":{\n", output_file);
      fputs("\"name\": \"\",\n", output_file);
      fputs("\"time_start\": \"\",\n", output_file);
      fputs("\"time_end\": \"\"\n", output_file);
      fputs("},\n", output_file);
    }
    
    /*
     * Write one element of a sentence's "links" array as a JSON object.
     *
     * output_file: stream receiving the JSON text.
     * w:           word the link starts from.
     * index:       position of the word inside its sentence, as computed
     *              by the caller (first word is 1).
     * gov_col:     governor column of the mcd, -1 when there is none.
     * label_col:   label column of the mcd, -1 when there is none.
     *
     * "dest" is word_get_gov(w) + index, i.e. the governor value is added
     * to the word's own position. It is written as 0 when the governor
     * value is 0 or when the sum would be negative.
     *
     * Absence of the governor column is tested with "!= -1", like every
     * other column in this file; the former truthiness test treated -1 as
     * "present" and column 0 as "absent". When the column is absent the
     * placeholder is written as the string "_" (same convention as the
     * label), because a bare underscore is not a JSON value.
     *
     * NOTE(review): the label text is written verbatim by
     * word_print_col_n(); whether it may contain characters that need JSON
     * escaping cannot be told from this file.
     */
    void print_link(FILE *output_file, word *w, int index, int gov_col, int label_col)
    {
      fprintf(output_file, "{");

      fprintf(output_file, "\"orig\": %d, ", index);
      fprintf(output_file, "\"dest\":");
      if(gov_col != -1){
        int gov = word_get_gov(w);
        int dest = gov + index;

        if((gov == 0) || (dest < 0))
          dest = 0;
        fprintf(output_file, "%d", dest);
      }
      else
        fprintf(output_file, "\"_\"");
      fprintf(output_file, ", ");

      fprintf(output_file, "\"label\": \"");
      if(label_col != -1)
        word_print_col_n(output_file, w, label_col);
      else
        fprintf(output_file, "_");
      fprintf(output_file, "\", ");

      fprintf(output_file, "\"status_link\": \"\", ");
      fprintf(output_file, "\"status_lab\": \"\", ");
      fprintf(output_file, "\"timestamp\": \"\", ");
      fprintf(output_file, "\"author\": \"\", ");
      fprintf(output_file, "\"target\": \"\"");
      fprintf(output_file, "}");
    }
    
    
    /*
     * Write the "links" member of a sentence: one print_link() object per
     * word between index_first_word and index_last_word (both included),
     * each on its own line and separated by commas.
     *
     * Words are numbered from 1 inside the sentence.
     * No trailing comma or newline follows the closing " ]".
     */
    void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
    {
      int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb));
      int label_col = mcd_get_label_col(word_buffer_get_mcd(wb));
      const char *separator = "\n";   /* becomes ",\n" after the first element */
      int pos;

      fputs("\"links\": [", output_file);
      for(pos = index_first_word; pos <= index_last_word; pos++){
        word *current = word_buffer_get_word_n(wb, pos);

        fputs(separator, output_file);
        separator = ",\n";
        print_link(output_file, current, pos - index_first_word + 1, gov_col, label_col);
      }
      fputs(" ]", output_file);
    }
    
    
    /*
     * Write one element of a sentence's "segments" array.
     *
     * The segment covers a single word: "start" and "end" both carry
     * index. Its label is the word's pos_col column, or "_" when pos_col
     * is -1. All remaining fields are empty strings.
     *
     * NOTE(review): the column text is written verbatim by
     * word_print_col_n(); JSON escaping, if needed, is not done here.
     */
    void print_segment(FILE *output_file, word *w, int index, int pos_col)
    {
      fprintf(output_file, "{ \"start\": %d, \"end\": %d, \"label\": \"", index, index);

      if(pos_col == -1)
        fputs("_", output_file);
      else
        word_print_col_n(output_file, w, pos_col);

      fputs("\", "
            "\"status_seg\": \"\", "
            "\"status_lab\": \"\", "
            "\"timestamp\": \"\", "
            "\"author\": \"\", "
            "\"target\": \"\", "
            "\"priority\": \"\""
            " }",
            output_file);
    }
    
    /*
     * Write the "segments" member of a sentence: one print_segment()
     * object per word between index_first_word and index_last_word (both
     * included), one per line, comma separated.
     *
     * The closing " ]," is followed by a newline because another member
     * comes next.
     */
    void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
    {
      int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
      const char *separator = "\n";   /* becomes ",\n" after the first element */
      int pos;

      fputs("\"segments\": [", output_file);
      for(pos = index_first_word; pos <= index_last_word; pos++){
        word *current = word_buffer_get_word_n(wb, pos);

        fputs(separator, output_file);
        separator = ",\n";
        print_segment(output_file, current, pos - index_first_word + 1, pos_col);
      }
      fputs(" ],\n", output_file);
    }
    
    /*
     * Write one element of a sentence's "tokens" array.
     *
     * "id" is index. "word" is the word's form_col column, or "_" when
     * form_col is -1. "bold" and "newline" are always 0.
     *
     * NOTE(review): the form is written verbatim by word_print_col_n();
     * JSON escaping, if needed, is not done here.
     */
    void print_token(FILE *output_file, word *w, int index, int form_col)
    {
      fprintf(output_file, "{ \"id\": %d, \"word\": \"", index);

      if(form_col == -1)
        fputs("_", output_file);
      else
        word_print_col_n(output_file, w, form_col);

      fputs("\", "
            "\"bold\": 0, "
            "\"newline\": 0 "
            "}",
            output_file);
    }
    
    
    /*
     * Write the "tokens" member of a sentence: one print_token() object
     * per word between index_first_word and index_last_word (both
     * included), one per line, comma separated.
     *
     * The closing " ]," is followed by a newline because another member
     * comes next.
     */
    void print_tokens(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
    {
      int form_col = mcd_get_form_col(word_buffer_get_mcd(wb));
      const char *separator = "\n";   /* becomes ",\n" after the first element */
      int pos;

      fputs("\"tokens\": [", output_file);
      for(pos = index_first_word; pos <= index_last_word; pos++){
        word *current = word_buffer_get_word_n(wb, pos);

        fputs(separator, output_file);
        separator = ",\n";
        print_token(output_file, current, pos - index_first_word + 1, form_col);
      }
      fputs(" ],\n", output_file);
    }
    
    
    /*
     * Write one sentence as a JSON object: an "id" of the form
     * "s_<sentence_nb>", then the "tokens", "segments" and "links" arrays
     * built from the words index_first_word..index_last_word of wb.
     */
    void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int index_first_word, int index_last_word)
    {
      /* opening brace and identifier */
      fprintf(output_file, "{\n\"id\": \"s_%d\",\n", sentence_nb);

      /* the three per-word arrays, in output order */
      print_tokens(output_file, wb, index_first_word, index_last_word);
      print_segments(output_file, wb, index_first_word, index_last_word);
      print_links(output_file, wb, index_first_word, index_last_word);

      fputs("}\n", output_file);
    }
    
    int main(int argc, char *argv[])
    {
      FILE *output_file;
      context *ctx = mcf2json_context_read_options(argc, argv);
      word_buffer *wb = NULL;
      word *w = NULL;
      int first_sentence = 1;
      int new_sentence = 1;
      int index_first_word;
      int index_last_word;
      int sentence_nb = 0;
    
      mcf2json_check_options(ctx);
      output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
      wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
    
      print_header(output_file);
      fprintf(output_file, "\"documents\": [");
      do{
        w = word_buffer_b0(wb);
        if(new_sentence){
          new_sentence = 0;
          sentence_nb++;
          index_first_word = word_buffer_get_current_index(wb);
        }
        if(word_get_sent_seg(w)){
          index_last_word = word_buffer_get_current_index(wb);
          new_sentence = 1;
    
          if(first_sentence == 1)
    	first_sentence = 0;
          else
    	fprintf(output_file, ",");
          fprintf(output_file, "\n");
          print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
        }
      } while(word_buffer_move_right(wb));
      
      print_footer(output_file);
      if(ctx->conll_filename)
        fclose(output_file);
      mcf2json_context_free(ctx);
    
    
    
      return 0;
    }