diff --git a/maca_tools/CMakeLists.txt b/maca_tools/CMakeLists.txt
index 242c3314caa90360abdd04edcea1e4ce410eaadf..c0b21b12795166a800df4972a9328012ada94e72 100644
--- a/maca_tools/CMakeLists.txt
+++ b/maca_tools/CMakeLists.txt
@@ -12,6 +12,10 @@ target_link_libraries(mcf2orfeo transparse)
 target_link_libraries(mcf2orfeo maca_common)
 install (TARGETS mcf2orfeo DESTINATION bin)
 
+add_executable(mcf2json ./src/mcf2json.c)
+target_link_libraries(mcf2json maca_common)
+install (TARGETS mcf2json DESTINATION bin)
+
 add_executable(maca_compute_l_rules ./src/maca_compute_l_rules.c)
 target_link_libraries(maca_compute_l_rules maca_common)
 install (TARGETS maca_compute_l_rules DESTINATION bin)
diff --git a/maca_tools/src/mcf2json.c b/maca_tools/src/mcf2json.c
new file mode 100644
index 0000000000000000000000000000000000000000..df06d1c6a62dd2a661f23386e23760e9de00ef2b
--- /dev/null
+++ b/maca_tools/src/mcf2json.c
@@ -0,0 +1,388 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<getopt.h>
+
+#include"mcd.h"
+#include"util.h"
+#include"word_buffer.h"
+
+/* Command-line state of mcf2json; the strings are strdup'ed copies owned by the context. */
+typedef struct {
+  int help;
+  int verbose;
+  int debug_mode;
+  char *program_name;
+  char *conll_filename; /* output file name (-o), NULL: write to stdout */
+  char *mcf_filename;   /* input mcf file name (-i) */
+  char *mcd_filename;   /* mcd file name (-C) */
+  mcd *mcd_struct;
+} context;
+
+/* Releases a context and everything it owns; a NULL context is ignored. */
+void mcf2json_context_free(context *ctx)
+{
+  if(ctx == NULL)
+    return;
+  /* free(NULL) is a no-op, so the strings need no individual guard */
+  free(ctx->program_name);
+  free(ctx->conll_filename);
+  free(ctx->mcf_filename);
+  free(ctx->mcd_filename);
+  if(ctx->mcd_struct)
+    mcd_free(ctx->mcd_struct);
+  free(ctx);
+}
+
+/* Allocates a context with every flag cleared and every pointer NULL. */
+context *mcf2json_context_new(void)
+{
+  context *ctx = (context *)memalloc(sizeof(context));
+
+  ctx->help = 0;
+  ctx->verbose = 0;
+  ctx->debug_mode = 0;
+  ctx->program_name = NULL;
+  ctx->conll_filename = NULL;
+  ctx->mcf_filename = NULL;
+  ctx->mcd_filename = NULL;
+  ctx->mcd_struct = NULL;
+  return ctx;
+}
+
+/* Prints the usage summary on stderr. */
+void mcf2json_context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-d --debug : activate debug mode\n");
+  fprintf(stderr, "\t-C --mcd : mcd filename\n");
+  fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n");
+  fprintf(stderr, "\t-o --conll : json output filename (write to stdout if absent)\n");
+}
+
+/* Prints the help message and exits with status 1 when help was requested. */
+void mcf2json_check_options(context *ctx){
+  if(ctx->help){
+    mcf2json_context_general_help_message(ctx);
+    exit(1);
+  }
+}
+
+/* Stores a copy of value in *field, releasing the copy left by an earlier occurrence of the option. */
+static void mcf2json_set_string(char **field, const char *value)
+{
+  free(*field);
+  *field = strdup(value);
+}
+
+/* Parses the command line into a new context, then loads the mcd given with -C
+   or, without -C, the one returned by mcd_build_wpmlgfs().
+   Unrecognised options are skipped silently (opterr is cleared). */
+context *mcf2json_context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = mcf2json_context_new();
+
+  /* getopt_long() requires the last element of the array to be all zeros */
+  static struct option long_options[] =
+    {
+      {"help", no_argument, 0, 'h'},
+      {"verbose", no_argument, 0, 'v'},
+      {"debug", no_argument, 0, 'd'},
+      {"conll", required_argument, 0, 'o'},
+      {"mcd", required_argument, 0, 'C'},
+      {"mcf", required_argument, 0, 'i'},
+      {0, 0, 0, 0}
+    };
+
+  ctx->program_name = strdup(argv[0]);
+
+  optind = 0;
+  opterr = 0;
+
+  while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'd':
+        ctx->debug_mode = 1;
+        break;
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'o':
+        mcf2json_set_string(&ctx->conll_filename, optarg);
+        break;
+      case 'i':
+        mcf2json_set_string(&ctx->mcf_filename, optarg);
+        break;
+      case 'C':
+        mcf2json_set_string(&ctx->mcd_filename, optarg);
+        break;
+      default:
+        break;
+      }
+  }
+
+  if(ctx->mcd_filename){
+    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
+  }
+  else{
+    ctx->mcd_struct = mcd_build_wpmlgfs();
+  }
+
+  return ctx;
+}
+
+/* Closes the "documents" array and the top-level object opened by print_header(). */
+void print_footer(FILE *output_file)
+{
+  fprintf(output_file, "]\n");
+  fprintf(output_file, "}\n");
+}
+
+/* Opens the top-level object and writes the "header" and "annotations" objects with empty values. */
+void print_header(FILE *output_file)
+{
+  fprintf(output_file, "{\n");
+  fprintf(output_file, "\"header\":{\n");
+  fprintf(output_file, "\"id\": \"\",\n");
+  fprintf(output_file, "\"timestamp\": \"\",\n");
+  fprintf(output_file, "\"labels_segment\": \"\",\n");
+  /* last member: JSON allows no comma before the closing brace */
+  fprintf(output_file, "\"labels_link\": \"\"\n");
+  fprintf(output_file, "},\n");
+
+  fprintf(output_file, "\"annotations\":{\n");
+  fprintf(output_file, "\"name\": \"\",\n");
+  fprintf(output_file, "\"time_start\": \"\",\n");
+  fprintf(output_file, "\"time_end\": \"\"\n");
+  fprintf(output_file, "},\n");
+}
+
+/* Writes the link object of word w.
+   index is the 1-based position of w in its sentence, written as "orig".
+   gov_col and label_col are mcd column numbers; -1 is treated as "no such column".
+   "dest" is index + word_get_gov(w), or 0 when word_get_gov(w) is 0 or the sum is
+   negative, or null when there is no governor column.
+   NOTE(review): word_print_col_n() presumably writes the column verbatim, so a quote or backslash in it would break the JSON - confirm. */
+void print_link(FILE *output_file, word *w, int index, int gov_col, int label_col)
+{
+  fprintf(output_file, "{");
+
+  fprintf(output_file, "\"orig\": %d, ", index);
+  fprintf(output_file, "\"dest\":");
+  /* same "-1 means absent" test as the label, pos and form columns */
+  if(gov_col != -1){
+    if((word_get_gov(w) == 0) || ((word_get_gov(w) + index) < 0))
+      fprintf(output_file, "0");
+    else
+      fprintf(output_file, "%d", word_get_gov(w) + index);
+  }
+  else
+    fprintf(output_file, "null"); /* a bare _ is not a JSON value */
+  fprintf(output_file, ", ");
+
+  fprintf(output_file, "\"label\": \"");
+  if(label_col != -1)
+    word_print_col_n(output_file, w, label_col);
+  else
+    fprintf(output_file, "_");
+  fprintf(output_file, "\", ");
+
+  fprintf(output_file, "\"status_link\": \"\", ");
+  fprintf(output_file, "\"status_lab\": \"\", ");
+  fprintf(output_file, "\"timestamp\": \"\", ");
+  fprintf(output_file, "\"author\": \"\", ");
+  fprintf(output_file, "\"target\": \"\"");
+  fprintf(output_file, "}");
+}
+
+/* Writes the "links" array: one link object for each word of the buffer
+   between index_first_word and index_last_word, both included. */
+void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
+{
+  word *w;
+  int index;
+  int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb));
+  int label_col = mcd_get_label_col(word_buffer_get_mcd(wb));
+  int first_link = 1;
+
+  fprintf(output_file, "\"links\": [");
+  for(index = index_first_word; index <= index_last_word; index++){
+    w = word_buffer_get_word_n(wb, index);
+    /* the comma goes before every element but the first */
+    if(first_link == 1)
+      first_link = 0;
+    else
+      fprintf(output_file, ",");
+    fprintf(output_file, "\n");
+
+    print_link(output_file, w, index - index_first_word + 1, gov_col, label_col);
+  }
+  fprintf(output_file," ]");
+}
+
+/* Writes the segment object of word w: "start" and "end" are both index and
+   "label" is column pos_col of w, or _ when pos_col is -1.
+   NOTE(review): word_print_col_n() presumably writes the column verbatim, so a quote or backslash in it would break the JSON - confirm. */
+void print_segment(FILE *output_file, word *w, int index, int pos_col)
+{
+  fprintf(output_file, "{ ");
+  fprintf(output_file, "\"start\": %d, ", index);
+  fprintf(output_file, "\"end\": %d, ", index);
+  fprintf(output_file, "\"label\": \"");
+
+  if(pos_col != -1)
+    word_print_col_n(output_file, w, pos_col);
+  else
+    fprintf(output_file, "_");
+  fprintf(output_file, "\", ");
+
+  fprintf(output_file, "\"status_seg\": \"\", ");
+  fprintf(output_file, "\"status_lab\": \"\", ");
+  fprintf(output_file, "\"timestamp\": \"\", ");
+  fprintf(output_file, "\"author\": \"\", ");
+  fprintf(output_file, "\"target\": \"\", ");
+  fprintf(output_file, "\"priority\": \"\"");
+  fprintf(output_file, " }");
+}
+
+/* Writes the "segments" array followed by a comma: one segment object for each
+   word of the buffer between index_first_word and index_last_word, both included. */
+void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
+{
+  word *w;
+  int index;
+  int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
+  int first_segment = 1;
+
+  fprintf(output_file, "\"segments\": [");
+  for(index = index_first_word; index <= index_last_word; index++){
+    w = word_buffer_get_word_n(wb, index);
+    if(first_segment == 1)
+      first_segment = 0;
+    else
+      fprintf(output_file, ",");
+    fprintf(output_file, "\n");
+    print_segment(output_file, w, index - index_first_word + 1, pos_col);
+  }
+  fprintf(output_file," ],\n");
+}
+
+/* Writes the token object of word w: "id" is index and "word" is column
+   form_col of w, or _ when form_col is -1; "bold" and "newline" are always 0.
+   NOTE(review): word_print_col_n() presumably writes the column verbatim, so a quote or backslash in it would break the JSON - confirm. */
+void print_token(FILE *output_file, word *w, int index, int form_col)
+{
+  fprintf(output_file, "{ ");
+  fprintf(output_file, "\"id\": %d, ", index);
+  fprintf(output_file, "\"word\": \"");
+  if(form_col != -1)
+    word_print_col_n(output_file, w, form_col);
+  else
+    fprintf(output_file, "_");
+  fprintf(output_file, "\", ");
+
+  fprintf(output_file, "\"bold\": 0, ");
+  fprintf(output_file, "\"newline\": 0 ");
+  fprintf(output_file, "}");
+}
+
+/* Writes the "tokens" array followed by a comma: one token object for each
+   word of the buffer between index_first_word and index_last_word, both included. */
+void print_tokens(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
+{
+  word *w;
+  int index;
+  int form_col = mcd_get_form_col(word_buffer_get_mcd(wb));
+  int first_token = 1;
+
+  fprintf(output_file, "\"tokens\": [");
+  for(index = index_first_word; index <= index_last_word; index++){
+    w = word_buffer_get_word_n(wb, index);
+    if(first_token == 1)
+      first_token = 0;
+    else
+      fprintf(output_file, ",");
+    fprintf(output_file, "\n");
+    print_token(output_file, w, index - index_first_word + 1, form_col);
+  }
+  fprintf(output_file," ],\n");
+}
+
+/* Writes one sentence object: its id s_<sentence_nb>, then the tokens, segments
+   and links of the words index_first_word to index_last_word, both included. */
+void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int index_first_word, int index_last_word)
+{
+  fprintf(output_file, "{\n");
+  fprintf(output_file, "\"id\": \"s_%d\",\n", sentence_nb);
+  print_tokens(output_file, wb, index_first_word, index_last_word);
+  print_segments(output_file, wb, index_first_word, index_last_word);
+  print_links(output_file, wb, index_first_word, index_last_word);
+  fprintf(output_file, "}\n");
+}
+
+/* Reads the mcf input and writes it as JSON: the header, then one object in
+   "documents" per sentence, a sentence ending at a word for which word_get_sent_seg()
+   is set. Words after the last such word are not written. Returns 0, or 1 on an output error. */
+int main(int argc, char *argv[])
+{
+  FILE *output_file;
+  context *ctx = mcf2json_context_read_options(argc, argv);
+  word_buffer *wb = NULL;
+  word *w = NULL;
+  int first_sentence = 1;
+  int new_sentence = 1;
+  int index_first_word = 0;
+  int index_last_word = 0;
+  int sentence_nb = 0;
+  int status = 0;
+
+  mcf2json_check_options(ctx);
+  output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
+  if(output_file == NULL){
+    fprintf(stderr, "%s: cannot open %s for writing\n", ctx->program_name, ctx->conll_filename);
+    mcf2json_context_free(ctx);
+    return 1;
+  }
+  wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
+
+  print_header(output_file);
+  fprintf(output_file, "\"documents\": [");
+  do{
+    w = word_buffer_b0(wb);
+    if(new_sentence){
+      new_sentence = 0;
+      sentence_nb++;
+      index_first_word = word_buffer_get_current_index(wb);
+    }
+    /* NOTE(review): w is tested first on the assumption that word_buffer_b0() can return NULL on empty input - confirm */
+    if(w && word_get_sent_seg(w)){
+      index_last_word = word_buffer_get_current_index(wb);
+      new_sentence = 1;
+
+      if(first_sentence == 1)
+        first_sentence = 0;
+      else
+        fprintf(output_file, ",");
+      fprintf(output_file, "\n");
+      print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
+    }
+  } while(word_buffer_move_right(wb));
+
+  print_footer(output_file);
+  if(ctx->conll_filename && fclose(output_file) != 0){
+    fprintf(stderr, "%s: error while closing %s\n", ctx->program_name, ctx->conll_filename);
+    status = 1;
+  }
+  mcf2json_context_free(ctx);
+
+  return status;
+}