/* mcf2json.c -- print the contents of an MCF file as JSON. */
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<getopt.h>
#include"mcd.h"
#include"util.h"
#include"word_buffer.h"
/* Command-line state for mcf2json: option flags, file names and the loaded mcd. */
typedef struct {
int help; /* set to 1 by -h/--help; makes mcf2json_check_options() print usage and exit */
int verbose; /* set to 1 by -v/--verbose; forwarded to mcd_read() */
int debug_mode; /* set to 1 by -d/--debug; not read by any code visible in this file */
char *program_name; /* strdup() of argv[0], shown in the usage line */
char *conll_filename; /* -o/--conll: path main() writes its output to (stdout when NULL) */
char *mcf_filename; /* -i/--mcf: input path handed to word_buffer_load_mcf() */
char *mcd_filename; /* -C/--mcd: path handed to mcd_read() */
mcd *mcd_struct; /* result of mcd_read(), or of mcd_build_wpmlgfs() when no -C was given */
} context;
/*
 * Release a context together with every string it owns and its mcd.
 *
 * ctx: context to destroy; NULL is accepted and ignored.
 */
void mcf2json_context_free(context *ctx)
{
  if (ctx == NULL)
    return;

  /* free(NULL) is defined to do nothing, so the members need no guards. */
  free(ctx->program_name);
  free(ctx->conll_filename);
  free(ctx->mcf_filename);
  free(ctx->mcd_filename);

  /* mcd_free() is defined elsewhere; its behaviour on NULL is not visible
     from here, so the guard stays. */
  if (ctx->mcd_struct)
    mcd_free(ctx->mcd_struct);

  free(ctx);
}
context *mcf2json_context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->conll_filename = NULL;
ctx->mcf_filename = NULL;
ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL;
return ctx;
}
/*
 * Print the usage summary on stderr.
 *
 * ctx: supplies program_name for the usage line.
 *
 * The list covers every option accepted by
 * mcf2json_context_read_options(), including -d/--debug.  The -o entry
 * describes the file as JSON output because that is what main() writes to
 * it, even though the long option is still spelled --conll.
 */
void mcf2json_context_general_help_message(context *ctx)
{
  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
  fprintf(stderr, "Options:\n");
  fprintf(stderr, "\t-h --help : print this message\n");
  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
  fprintf(stderr, "\t-d --debug : activate debug mode\n");
  fprintf(stderr, "\t-C --mcd : mcd filename\n");
  fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n");
  fprintf(stderr, "\t-o --conll : output json filename (write to stdout if absent)\n");
}
/*
 * Act on the parsed options: when help was requested, print the usage text
 * and terminate the process with status 1.  Otherwise return normally.
 */
void mcf2json_check_options(context *ctx){
  if (!ctx->help)
    return;

  mcf2json_context_general_help_message(ctx);
  exit(1);
}
/*
 * Build a context from the command line.
 *
 * argc, argv: the arguments received by main().
 *
 * Returns a newly allocated context.  Its mcd_struct is loaded from the
 * -C file when one was given, and built by mcd_build_wpmlgfs() otherwise.
 * The caller releases the context with mcf2json_context_free().
 */
context *mcf2json_context_read_options(int argc, char *argv[])
{
  /* getopt_long() finds the end of this table by looking for an all-zero
     entry; without it an unknown long option makes it read past the
     array. */
  static struct option long_options[] =
    {
      {"help",    no_argument,       0, 'h'},
      {"verbose", no_argument,       0, 'v'},
      {"debug",   no_argument,       0, 'd'},
      {"conll",   required_argument, 0, 'o'},
      {"mcd",     required_argument, 0, 'C'},
      {"mcf",     required_argument, 0, 'i'},
      {0, 0, 0, 0}
    };
  int c;
  int option_index = 0;
  context *ctx = mcf2json_context_new();

  ctx->program_name = strdup(argv[0]);

  /* NOTE(review): optind = 0 is the glibc way of forcing getopt to
     re-initialise; POSIX only specifies 1 -- confirm for other libcs. */
  optind = 0;
  /* Silence getopt's own diagnostics; unrecognised options are skipped. */
  opterr = 0;

  while ((c = getopt_long(argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1) {
    switch (c) {
    case 'd':
      ctx->debug_mode = 1;
      break;
    case 'h':
      ctx->help = 1;
      break;
    case 'v':
      ctx->verbose = 1;
      break;
    case 'o':
      /* A repeated option replaces the earlier value instead of leaking it. */
      free(ctx->conll_filename);
      ctx->conll_filename = strdup(optarg);
      break;
    case 'i':
      free(ctx->mcf_filename);
      ctx->mcf_filename = strdup(optarg);
      break;
    case 'C':
      free(ctx->mcd_filename);
      ctx->mcd_filename = strdup(optarg);
      break;
    default:
      /* '?' and anything else: ignored. */
      break;
    }
  }

  if (ctx->mcd_filename)
    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
  else
    ctx->mcd_struct = mcd_build_wpmlgfs();

  return ctx;
}
/*
 * Write the end of the JSON output.
 *
 * output_file: stream to write to.
 *
 * At this point only two constructs are still open: the "documents" array
 * started in main() and the root object started by print_header().  Every
 * sentence object has already been closed by print_sentence(), so the
 * footer emits exactly one ']' and one '}'; an additional '}' before the
 * ']' would leave the output unbalanced.
 */
void print_footer(FILE *output_file)
{
  fprintf(output_file, "]\n");
  fprintf(output_file, "}\n");
}
/*
 * Write the beginning of the JSON output: the opening brace of the root
 * object followed by the "header" and "annotations" objects, all of whose
 * values are empty strings.
 *
 * output_file: stream to write to.
 *
 * The last member of each object is written without a trailing comma,
 * since JSON does not allow a comma before a closing brace.  The output
 * ends with "},\n" so that the caller can append the next member of the
 * root object.
 */
void print_header(FILE *output_file)
{
  fprintf(output_file, "{\n");

  fprintf(output_file, "\"header\":{\n");
  fprintf(output_file, "\"id\": \"\",\n");
  fprintf(output_file, "\"timestamp\": \"\",\n");
  fprintf(output_file, "\"labels_segment\": \"\",\n");
  fprintf(output_file, "\"labels_link\": \"\"\n");
  fprintf(output_file, "},\n");

  fprintf(output_file, "\"annotations\":{\n");
  fprintf(output_file, "\"name\": \"\",\n");
  fprintf(output_file, "\"time_start\": \"\",\n");
  fprintf(output_file, "\"time_end\": \"\"\n");
  fprintf(output_file, "},\n");
}
/*
 * Write one element of a sentence's "links" array.
 *
 * output_file: stream to write to.
 * w:           word the link originates from.
 * index:       1-based position of w inside its sentence; written as "orig".
 * gov_col:     governor column of the mcd, -1 when there is none.
 * label_col:   label column of the mcd, -1 when there is none.
 *
 * "dest" is word_get_gov(w) + index.  It is written as 0 when the governor
 * value is 0 or when the sum is negative.
 *
 * The governor column is tested against -1, the same "absent" test that
 * is applied to label_col here and to the other columns in this file.  A
 * plain truth test would treat -1 as present and column 0 as absent.
 */
void print_link(FILE *output_file, word *w, int index, int gov_col, int label_col)
{
  fprintf(output_file, "{");
  fprintf(output_file, "\"orig\": %d, ", index);

  fprintf(output_file, "\"dest\":");
  if (gov_col != -1) {
    int gov = word_get_gov(w);
    int dest = gov + index;

    if (gov == 0 || dest < 0)
      dest = 0;
    fprintf(output_file, "%d", dest);
  }
  else {
    /* NOTE(review): a bare '_' is not a valid JSON value; kept because the
       placeholder expected by consumers is not known from this file. */
    fprintf(output_file, "_");
  }
  fprintf(output_file, ", ");

  /* NOTE(review): the column text is written as-is between the quotes;
     whether word_print_col_n() escapes '"' or '\\' is not visible here. */
  fprintf(output_file, "\"label\": \"");
  if (label_col != -1)
    word_print_col_n(output_file, w, label_col);
  else
    fprintf(output_file, "_");
  fprintf(output_file, "\", ");

  fprintf(output_file, "\"status_link\": \"\", ");
  fprintf(output_file, "\"status_lab\": \"\", ");
  fprintf(output_file, "\"timestamp\": \"\", ");
  fprintf(output_file, "\"author\": \"\", ");
  fprintf(output_file, "\"target\": \"\"");
  fprintf(output_file, "}");
}
/*
 * Write the "links" array of one sentence: one print_link() element per
 * word from index_first_word to index_last_word (both inclusive), each on
 * its own line and separated by commas.  No newline follows the closing
 * bracket.
 */
void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
{
  const int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb));
  const int label_col = mcd_get_label_col(word_buffer_get_mcd(wb));
  const char *sep = "\n";   /* becomes ",\n" once the first element is out */
  int pos;

  fputs("\"links\": [", output_file);
  for (pos = index_first_word; pos <= index_last_word; pos++) {
    word *cur = word_buffer_get_word_n(wb, pos);

    fputs(sep, output_file);
    sep = ",\n";
    /* Positions inside the sentence are numbered from 1. */
    print_link(output_file, cur, pos - index_first_word + 1, gov_col, label_col);
  }
  fputs(" ]", output_file);
}
/*
 * Write one element of a sentence's "segments" array.
 *
 * output_file: stream to write to.
 * w:           word the segment covers.
 * index:       position written as both "start" and "end".
 * pos_col:     column printed as the label, or -1 to print "_" instead.
 */
void print_segment(FILE *output_file, word *w, int index, int pos_col)
{
  fprintf(output_file, "{ \"start\": %d, \"end\": %d, ", index, index);

  fputs("\"label\": \"", output_file);
  if (pos_col == -1)
    fputs("_", output_file);
  else
    word_print_col_n(output_file, w, pos_col);

  /* The remaining members are always written as empty strings. */
  fputs("\", "
        "\"status_seg\": \"\", "
        "\"status_lab\": \"\", "
        "\"timestamp\": \"\", "
        "\"author\": \"\", "
        "\"target\": \"\", "
        "\"priority\": \"\""
        " }", output_file);
}
/*
 * Write the "segments" array of one sentence: one print_segment() element
 * per word from index_first_word to index_last_word (both inclusive), each
 * on its own line and separated by commas.  The closing bracket is
 * followed by a comma and a newline.
 */
void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
{
  const int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
  const char *sep = "\n";   /* becomes ",\n" once the first element is out */
  int pos;

  fputs("\"segments\": [", output_file);
  for (pos = index_first_word; pos <= index_last_word; pos++) {
    word *cur = word_buffer_get_word_n(wb, pos);

    fputs(sep, output_file);
    sep = ",\n";
    /* Positions inside the sentence are numbered from 1. */
    print_segment(output_file, cur, pos - index_first_word + 1, pos_col);
  }
  fputs(" ],\n", output_file);
}
/*
 * Write one element of a sentence's "tokens" array.
 *
 * output_file: stream to write to.
 * w:           word to print.
 * index:       value written as "id".
 * form_col:    column printed as "word", or -1 to print "_" instead.
 *
 * "bold" and "newline" are always written as 0.
 */
void print_token(FILE *output_file, word *w, int index, int form_col)
{
  fprintf(output_file, "{ \"id\": %d, ", index);

  fputs("\"word\": \"", output_file);
  if (form_col == -1)
    fputs("_", output_file);
  else
    word_print_col_n(output_file, w, form_col);

  fputs("\", \"bold\": 0, \"newline\": 0 }", output_file);
}
/*
 * Write the "tokens" array of one sentence: one print_token() element per
 * word from index_first_word to index_last_word (both inclusive), each on
 * its own line and separated by commas.  The closing bracket is followed
 * by a comma and a newline.
 */
void print_tokens(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
{
  const int form_col = mcd_get_form_col(word_buffer_get_mcd(wb));
  const char *sep = "\n";   /* becomes ",\n" once the first element is out */
  int pos;

  fputs("\"tokens\": [", output_file);
  for (pos = index_first_word; pos <= index_last_word; pos++) {
    word *cur = word_buffer_get_word_n(wb, pos);

    fputs(sep, output_file);
    sep = ",\n";
    /* Positions inside the sentence are numbered from 1. */
    print_token(output_file, cur, pos - index_first_word + 1, form_col);
  }
  fputs(" ],\n", output_file);
}
/*
 * Write one sentence as a JSON object: its id ("s_" followed by
 * sentence_nb), then the "tokens", "segments" and "links" arrays built
 * from the words index_first_word..index_last_word of wb.
 */
void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int index_first_word, int index_last_word)
{
  fprintf(output_file, "{\n\"id\": \"s_%d\",\n", sentence_nb);

  print_tokens(output_file, wb, index_first_word, index_last_word);
  print_segments(output_file, wb, index_first_word, index_last_word);
  print_links(output_file, wb, index_first_word, index_last_word);

  fputs("}\n", output_file);
}
int main(int argc, char *argv[])
{
FILE *output_file;
context *ctx = mcf2json_context_read_options(argc, argv);
word_buffer *wb = NULL;
word *w = NULL;
int first_sentence = 1;
int new_sentence = 1;
int index_first_word;
int index_last_word;
int sentence_nb = 0;
mcf2json_check_options(ctx);
output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
print_header(output_file);
fprintf(output_file, "\"documents\": [");
do{
w = word_buffer_b0(wb);
if(new_sentence){
new_sentence = 0;
sentence_nb++;
index_first_word = word_buffer_get_current_index(wb);
}
if(word_get_sent_seg(w)){
index_last_word = word_buffer_get_current_index(wb);
new_sentence = 1;
if(first_sentence == 1)
first_sentence = 0;
else
fprintf(output_file, ",");
fprintf(output_file, "\n");
print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
}
} while(word_buffer_move_right(wb));
print_footer(output_file);
if(ctx->conll_filename)
fclose(output_file);
mcf2json_context_free(ctx);
return 0;
}