Skip to content
Snippets Groups Projects
Commit 71c198d9 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

Added mcf2json, which extracts JSON files from mcf files for the annotation tool

parent 55796a1c
No related branches found
No related tags found
No related merge requests found
......@@ -12,6 +12,10 @@ target_link_libraries(mcf2orfeo transparse)
target_link_libraries(mcf2orfeo maca_common)
install (TARGETS mcf2orfeo DESTINATION bin)
# mcf2json: built from src/mcf2json.c, linked against maca_common, installed into bin.
add_executable(mcf2json ./src/mcf2json.c)
target_link_libraries(mcf2json maca_common)
install (TARGETS mcf2json DESTINATION bin)
add_executable(maca_compute_l_rules ./src/maca_compute_l_rules.c)
target_link_libraries(maca_compute_l_rules maca_common)
install (TARGETS maca_compute_l_rules DESTINATION bin)
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<getopt.h>
#include"mcd.h"
#include"util.h"
#include"word_buffer.h"
/* Run-time settings of mcf2json, filled in by mcf2json_context_read_options(). */
typedef struct {
int help; /* 1 when -h/--help was given */
int verbose; /* 1 when -v/--verbose was given; forwarded to mcd_read() */
int debug_mode; /* 1 when -d/--debug was given */
char *program_name; /* heap copy of argv[0], used in the usage message */
char *conll_filename; /* heap copy of the -o/--conll argument (output path), or NULL */
char *mcf_filename; /* heap copy of the -i/--mcf argument (input path), or NULL */
char *mcd_filename; /* heap copy of the -C/--mcd argument, or NULL */
mcd *mcd_struct; /* result of mcd_read() or mcd_build_wpmlgfs() */
} context;
/*
 * Release a context and everything it owns: the four heap-allocated strings,
 * the mcd descriptor and the context itself.
 * Passing NULL is allowed and does nothing.
 */
void mcf2json_context_free(context *ctx)
{
  if(ctx == NULL)
    return;

  /* free(NULL) is a no-op, so unset string fields need no individual guard. */
  free(ctx->program_name);
  free(ctx->conll_filename);
  free(ctx->mcf_filename);
  free(ctx->mcd_filename);

  /* mcd_free() is only ever handed a non-NULL descriptor. */
  if(ctx->mcd_struct != NULL)
    mcd_free(ctx->mcd_struct);

  free(ctx);
}
context *mcf2json_context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->conll_filename = NULL;
ctx->mcf_filename = NULL;
ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL;
return ctx;
}
/*
 * Print the usage message on stderr.
 *
 * ctx: context whose program_name is shown in the usage line.
 *
 * Every option accepted by mcf2json_context_read_options() is listed,
 * including -d/--debug. The -o/--conll option keeps its historical name,
 * but the file it designates receives the JSON output.
 */
void mcf2json_context_general_help_message(context *ctx)
{
  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
  fprintf(stderr, "Options:\n");
  fprintf(stderr, "\t-h --help : print this message\n");
  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
  fprintf(stderr, "\t-d --debug : activate debug mode\n");
  fprintf(stderr, "\t-C --mcd : mcd filename\n");
  fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n");
  fprintf(stderr, "\t-o --conll : json filename (write to stdout if absent)\n");
}
/*
 * Act on the parsed options: when help was requested, print the usage
 * message and terminate the process with exit status 1.
 * Returns normally otherwise.
 */
void mcf2json_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  mcf2json_context_general_help_message(ctx);
  exit(1);
}
/*
 * Build a context from the command line.
 *
 * argc, argv: the arguments received by main().
 *
 * Returns a newly allocated context (release it with
 * mcf2json_context_free()). Its mcd descriptor is read from the file given
 * with -C/--mcd, or built with mcd_build_wpmlgfs() when no file was given.
 */
context *mcf2json_context_read_options(int argc, char *argv[])
{
  int c;
  int option_index = 0;
  context *ctx = mcf2json_context_new();

  /* getopt_long() scans the table until it meets an all-zero entry,
     so the terminator is mandatory. */
  static struct option long_options[] =
    {
      {"help",    no_argument,       0, 'h'},
      {"verbose", no_argument,       0, 'v'},
      {"debug",   no_argument,       0, 'd'},
      {"conll",   required_argument, 0, 'o'},
      {"mcd",     required_argument, 0, 'C'},
      {"mcf",     required_argument, 0, 'i'},
      {0, 0, 0, 0}
    };

  ctx->program_name = strdup(argv[0]);

  optind = 0; /* restart the scan from the beginning of argv */
  opterr = 0; /* getopt prints no diagnostic of its own */

  while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){
    switch (c)
      {
      case 'd':
        ctx->debug_mode = 1;
        break;
      case 'h':
        ctx->help = 1;
        break;
      case 'v':
        ctx->verbose = 1;
        break;
      case 'o':
        /* drop the value of an earlier occurrence instead of leaking it */
        free(ctx->conll_filename);
        ctx->conll_filename = strdup(optarg);
        break;
      case 'i':
        free(ctx->mcf_filename);
        ctx->mcf_filename = strdup(optarg);
        break;
      case 'C':
        free(ctx->mcd_filename);
        ctx->mcd_filename = strdup(optarg);
        break;
      default:
        /* unknown options and missing arguments are silently ignored */
        break;
      }
  }

  if(ctx->mcd_filename){
    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
  }
  else{
    ctx->mcd_struct = mcd_build_wpmlgfs();
  }

  return ctx;
}
/*
 * Write the end of the JSON document to output_file: the ']' that closes the
 * "documents" array opened in main() and the '}' that closes the top-level
 * object opened by print_header().
 *
 * Each sentence object is already closed by print_sentence(), so no further
 * '}' may be written before the ']' (it would unbalance the brackets).
 */
void print_footer(FILE *output_file)
{
  fputs("]\n", output_file);
  fputs("}\n", output_file);
}
/*
 * Write the beginning of the JSON document to output_file: the opening '{'
 * of the top-level object followed by the "header" and "annotations"
 * members, all of whose values are empty strings.
 *
 * The output ends with a comma because main() writes the "documents" member
 * next. Inside each object the last member carries no trailing comma, which
 * JSON does not allow.
 */
void print_header(FILE *output_file)
{
  fputs("{\n", output_file);

  fputs("\"header\":{\n"
        "\"id\": \"\",\n"
        "\"timestamp\": \"\",\n"
        "\"labels_segment\": \"\",\n"
        "\"labels_link\": \"\"\n"
        "},\n", output_file);

  fputs("\"annotations\":{\n"
        "\"name\": \"\",\n"
        "\"time_start\": \"\",\n"
        "\"time_end\": \"\"\n"
        "},\n", output_file);
}
/*
 * Write one element of the "links" array to output_file.
 *
 * w:         word the link starts from
 * index:     1-based position of w in its sentence, written as "orig"
 * gov_col:   governor column of the mcd; when it is 0 a bare '_' is written
 *            as "dest"
 * label_col: label column of the mcd, or -1 to write "_" as the label
 *
 * "dest" is index plus the value of word_get_gov(w); it is written as 0 when
 * that value is 0 or when the sum is negative. The remaining fields are
 * empty strings.
 *
 * NOTE(review): gov_col is tested for truth while label_col is compared
 * with -1 -- confirm which sentinel mcd_get_gov_col() uses for "absent".
 */
void print_link(FILE *output_file, word *w, int index, int gov_col, int label_col)
{
  fprintf(output_file, "{\"orig\": %d, \"dest\":", index);

  if(!gov_col){
    fputs("_", output_file);
  }
  else{
    int gov = word_get_gov(w);
    int dest = (gov == 0 || gov + index < 0) ? 0 : gov + index;
    fprintf(output_file, "%d", dest);
  }

  fputs(", \"label\": \"", output_file);
  if(label_col == -1)
    fputs("_", output_file);
  else
    word_print_col_n(output_file, w, label_col);

  fputs("\", \"status_link\": \"\", \"status_lab\": \"\", "
        "\"timestamp\": \"\", \"author\": \"\", \"target\": \"\"}",
        output_file);
}
/*
 * Write the "links" member of a sentence: one link per word of wb between
 * index_first_word and index_last_word (both included), separated by commas,
 * each on its own line. Links are numbered from 1 inside the sentence.
 * No comma or newline follows the closing bracket.
 */
void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
{
  int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb));
  int label_col = mcd_get_label_col(word_buffer_get_mcd(wb));
  int pos;

  fputs("\"links\": [", output_file);

  for(pos = index_first_word; pos <= index_last_word; pos++){
    word *current = word_buffer_get_word_n(wb, pos);

    /* every link but the first is preceded by a comma */
    fputs((pos == index_first_word) ? "\n" : ",\n", output_file);
    print_link(output_file, current, pos - index_first_word + 1, gov_col, label_col);
  }

  fputs(" ]", output_file);
}
/*
 * Write one element of the "segments" array to output_file.
 *
 * w:       word the segment covers
 * index:   1-based position of w in its sentence, written as both "start"
 *          and "end"
 * pos_col: part-of-speech column of the mcd, or -1 to write "_" as the label
 *
 * The remaining fields are empty strings.
 */
void print_segment(FILE *output_file, word *w, int index, int pos_col)
{
  fprintf(output_file, "{ \"start\": %d, \"end\": %d, \"label\": \"", index, index);

  if(pos_col == -1)
    fputs("_", output_file);
  else
    word_print_col_n(output_file, w, pos_col);

  fputs("\", \"status_seg\": \"\", \"status_lab\": \"\", "
        "\"timestamp\": \"\", \"author\": \"\", \"target\": \"\", "
        "\"priority\": \"\" }",
        output_file);
}
/*
 * Write the "segments" member of a sentence: one segment per word of wb
 * between index_first_word and index_last_word (both included), separated by
 * commas, each on its own line. Segments are numbered from 1 inside the
 * sentence. The closing bracket is followed by a comma and a newline.
 */
void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
{
  int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
  int pos;

  fputs("\"segments\": [", output_file);

  for(pos = index_first_word; pos <= index_last_word; pos++){
    word *current = word_buffer_get_word_n(wb, pos);

    /* every segment but the first is preceded by a comma */
    fputs((pos == index_first_word) ? "\n" : ",\n", output_file);
    print_segment(output_file, current, pos - index_first_word + 1, pos_col);
  }

  fputs(" ],\n", output_file);
}
/*
 * Write one element of the "tokens" array to output_file.
 *
 * w:        word to describe
 * index:    1-based position of w in its sentence, written as "id"
 * form_col: form column of the mcd, or -1 to write "_" as the word
 *
 * "bold" and "newline" are always written as 0.
 */
void print_token(FILE *output_file, word *w, int index, int form_col)
{
  fprintf(output_file, "{ \"id\": %d, \"word\": \"", index);

  if(form_col == -1)
    fputs("_", output_file);
  else
    word_print_col_n(output_file, w, form_col);

  fputs("\", \"bold\": 0, \"newline\": 0 }", output_file);
}
/*
 * Write the "tokens" member of a sentence: one token per word of wb between
 * index_first_word and index_last_word (both included), separated by commas,
 * each on its own line. Tokens are numbered from 1 inside the sentence.
 * The closing bracket is followed by a comma and a newline.
 */
void print_tokens(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
{
  int form_col = mcd_get_form_col(word_buffer_get_mcd(wb));
  int pos;

  fputs("\"tokens\": [", output_file);

  for(pos = index_first_word; pos <= index_last_word; pos++){
    word *current = word_buffer_get_word_n(wb, pos);

    /* every token but the first is preceded by a comma */
    fputs((pos == index_first_word) ? "\n" : ",\n", output_file);
    print_token(output_file, current, pos - index_first_word + 1, form_col);
  }

  fputs(" ],\n", output_file);
}
/*
 * Write one sentence as a JSON object to output_file: its identifier
 * "s_<sentence_nb>", then the tokens, segments and links of the words of wb
 * between index_first_word and index_last_word (both included).
 */
void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int index_first_word, int index_last_word)
{
  fprintf(output_file, "{\n\"id\": \"s_%d\",\n", sentence_nb);

  print_tokens(output_file, wb, index_first_word, index_last_word);
  print_segments(output_file, wb, index_first_word, index_last_word);
  print_links(output_file, wb, index_first_word, index_last_word);

  fputs("}\n", output_file);
}
int main(int argc, char *argv[])
{
FILE *output_file;
context *ctx = mcf2json_context_read_options(argc, argv);
word_buffer *wb = NULL;
word *w = NULL;
int first_sentence = 1;
int new_sentence = 1;
int index_first_word;
int index_last_word;
int sentence_nb = 0;
mcf2json_check_options(ctx);
output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
print_header(output_file);
fprintf(output_file, "\"documents\": [");
do{
w = word_buffer_b0(wb);
if(new_sentence){
new_sentence = 0;
sentence_nb++;
index_first_word = word_buffer_get_current_index(wb);
}
if(word_get_sent_seg(w)){
index_last_word = word_buffer_get_current_index(wb);
new_sentence = 1;
if(first_sentence == 1)
first_sentence = 0;
else
fprintf(output_file, ",");
fprintf(output_file, "\n");
print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
}
} while(word_buffer_move_right(wb));
print_footer(output_file);
if(ctx->conll_filename)
fclose(output_file);
mcf2json_context_free(ctx);
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment