diff --git a/maca_tools/CMakeLists.txt b/maca_tools/CMakeLists.txt
index e92ea9ab8d4d842952c6c7909b11c7d17c667e8a..242c3314caa90360abdd04edcea1e4ce410eaadf 100644
--- a/maca_tools/CMakeLists.txt
+++ b/maca_tools/CMakeLists.txt
@@ -6,6 +6,12 @@ target_link_libraries(mcf2conll transparse)
 target_link_libraries(mcf2conll maca_common)
 install (TARGETS mcf2conll DESTINATION bin)
 
+add_executable(mcf2orfeo ./src/mcf2orfeo.c)
+target_link_libraries(mcf2orfeo perceptron)
+target_link_libraries(mcf2orfeo transparse)
+target_link_libraries(mcf2orfeo maca_common)
+install (TARGETS mcf2orfeo DESTINATION bin)
+
 add_executable(maca_compute_l_rules ./src/maca_compute_l_rules.c)
 target_link_libraries(maca_compute_l_rules maca_common)
 install (TARGETS maca_compute_l_rules DESTINATION bin)
diff --git a/maca_tools/src/mcf2orfeo.c b/maca_tools/src/mcf2orfeo.c
new file mode 100644
--- /dev/null
+++ b/maca_tools/src/mcf2orfeo.c
@@ -0,0 +1,267 @@
+/*
+ * mcf2orfeo: read words from an mcf file and print them as a tab-separated
+ * table, one word per line, ending with the start, end and speaker columns.
+ */
+
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<getopt.h>
+
+#include"mcd.h"
+#include"util.h"
+#include"word_buffer.h"
+
+/* Command line state of the program. */
+typedef struct {
+  int help;             /* set by -h / --help */
+  int verbose;          /* set by -v / --verbose, forwarded to mcd_read() */
+  int debug_mode;       /* set by -d / --debug, not read anywhere in this file */
+  char *program_name;   /* copy of argv[0] */
+  char *conll_filename; /* -o argument; output goes to stdout when NULL */
+  char *mcf_filename;   /* -i argument, handed as is to word_buffer_load_mcf() */
+  char *mcd_filename;   /* -C argument; NULL selects mcd_build_wpmlgfs() */
+  mcd *mcd_struct;      /* column description set by context_read_options() */
+} context;
+
+/* Releases a context and everything it owns; a NULL context is accepted. */
+void context_free(context *ctx)
+{
+  if(ctx == NULL)
+    return;
+
+  /* free(NULL) is a no-op, so the strings need no individual guard */
+  free(ctx->program_name);
+  free(ctx->conll_filename);
+  free(ctx->mcf_filename);
+  free(ctx->mcd_filename);
+  if(ctx->mcd_struct)
+    mcd_free(ctx->mcd_struct);
+  free(ctx);
+}
+
+/* Allocates a context with every flag cleared and every pointer set to NULL. */
+context *context_new(void)
+{
+  context *ctx = (context *)memalloc(sizeof(context));
+
+  ctx->help = 0;
+  ctx->verbose = 0;
+  ctx->debug_mode = 0;
+  ctx->program_name = NULL;
+  ctx->conll_filename = NULL;
+  ctx->mcf_filename = NULL;
+  ctx->mcd_filename = NULL;
+  ctx->mcd_struct = NULL;
+  return ctx;
+}
+
+/* Prints the usage message on stderr. */
+void context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-C --mcd : mcd filename\n");
+  fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n");
+  fprintf(stderr, "\t-o --conll : conll filename (write to stdout if absent)\n");
+}
+
+/* Prints the usage message and exits with status 1 when help was requested. */
+void mcf2conll_check_options(context *ctx){
+  if(ctx->help){
+    context_general_help_message(ctx);
+    exit(1);
+  }
+}
+
+/*
+ * Parses the command line into a newly allocated context.
+ * The column description comes from the -C file when one is given and from
+ * mcd_build_wpmlgfs() otherwise. Release the result with context_free().
+ */
+context *context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = context_new();
+
+  /* getopt_long() requires the last element of the array to be all zeros */
+  static struct option long_options[] =
+    {
+      {"help",    no_argument,       0, 'h'},
+      {"verbose", no_argument,       0, 'v'},
+      {"debug",   no_argument,       0, 'd'},
+      {"conll",   required_argument, 0, 'o'},
+      {"mcd",     required_argument, 0, 'C'},
+      {"mcf",     required_argument, 0, 'i'},
+      {0, 0, 0, 0}
+    };
+
+  ctx->program_name = strdup(argv[0]);
+
+  optind = 0;
+  opterr = 0; /* getopt_long() prints no diagnostics of its own */
+
+  while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'd':
+        ctx->debug_mode = 1;
+        break;
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'o':
+        free(ctx->conll_filename); /* option repeated: keep the last value */
+        ctx->conll_filename = strdup(optarg);
+        break;
+      case 'i':
+        free(ctx->mcf_filename);
+        ctx->mcf_filename = strdup(optarg);
+        break;
+      case 'C':
+        free(ctx->mcd_filename);
+        ctx->mcd_filename = strdup(optarg);
+        break;
+      default:
+        /* unrecognised options are skipped silently */
+        break;
+      }
+  }
+
+  if(ctx->mcd_filename){
+    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
+  }
+  else{
+    ctx->mcd_struct = mcd_build_wpmlgfs();
+  }
+
+  return ctx;
+}
+
+/* Prints column col of w, or "_" when the column is absent (-1), then a tab. */
+static void print_col_or_blank(FILE *f, word *w, int col)
+{
+  if(col != -1)
+    word_print_col_n(f, w, col);
+  else
+    fprintf(f, "_");
+  fprintf(f, "\t");
+}
+
+/*
+ * Writes every word of wb on out, one line per word. Fields, each followed
+ * by a tab:
+ *   index, form, lemma, pos, pos, _, governor, label, _, _, start, end, speaker
+ * The index starts at 1 and restarts after a word flagged as sentence final,
+ * which is also followed by an empty line.
+ */
+static void print_words(FILE *out, word_buffer *wb, mcd *m)
+{
+  int form_col = mcd_get_form_col(m);
+  int pos_col = mcd_get_pos_col(m);
+  int lemma_col = mcd_get_lemma_col(m);
+  int gov_col = mcd_get_gov_col(m);
+  int label_col = mcd_get_label_col(m);
+  int sent_seg_col = mcd_get_sent_seg_col(m);
+
+  /* the speaker, start and end fields are read from the mcd a, b and c columns */
+  int spkr_col = mcd_get_a_col(m);
+  int start_col = mcd_get_b_col(m);
+  int end_col = mcd_get_c_col(m);
+
+  int index = 1;
+  word *w;
+
+  do{
+    w = word_buffer_b0(wb);
+    if(w){
+      fprintf(out, "%d\t", index);
+
+      print_col_or_blank(out, w, form_col);
+      print_col_or_blank(out, w, lemma_col);
+      /* the pos value is printed twice; the next field is always "_" */
+      print_col_or_blank(out, w, pos_col);
+      print_col_or_blank(out, w, pos_col);
+      fprintf(out, "_\t");
+
+      /* the other columns use -1 for "absent"; assumed to hold here too, so test against -1 and not 0 */
+      if(gov_col != -1){
+        /* word_get_gov() is added to the word index, i.e. used as a relative offset */
+        int gov = word_get_gov(w);
+        if((gov == 0) || ((gov + index) < 0))
+          fprintf(out, "0\t");
+        else
+          fprintf(out, "%d\t", gov + index);
+      }
+      else
+        fprintf(out, "_\t");
+
+      print_col_or_blank(out, w, label_col);
+      fprintf(out, "_\t");
+      fprintf(out, "_\t");
+
+      print_col_or_blank(out, w, start_col);
+      print_col_or_blank(out, w, end_col);
+      print_col_or_blank(out, w, spkr_col);
+      fprintf(out, "\n");
+
+      if((sent_seg_col != -1) && (word_get_sent_seg(w))){
+        fprintf(out, "\n");
+        index = 0; /* becomes 1 again with the increment below */
+      }
+      index ++;
+    }
+  } while(word_buffer_move_right(wb));
+}
+
+/*
+ * Entry point: parses the options, loads the mcf input, prints it on the
+ * requested output and releases everything.
+ * Returns 0 on success and 1 when the input or the output cannot be used.
+ */
+int main(int argc, char *argv[])
+{
+  FILE *output_file;
+  int status = 0;
+  context *ctx = context_read_options(argc, argv);
+  mcf2conll_check_options(ctx);
+
+  word_buffer *wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
+  if(wb == NULL){
+    fprintf(stderr, "%s: cannot load the mcf input\n", ctx->program_name);
+    context_free(ctx);
+    return 1;
+  }
+
+  output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
+  if(output_file == NULL){
+    /* NOTE(review): assumes myfopen_no_exit() returns NULL on failure - confirm */
+    fprintf(stderr, "%s: cannot open %s for writing\n", ctx->program_name, ctx->conll_filename);
+    status = 1;
+  }
+  else{
+    print_words(output_file, wb, ctx->mcd_struct);
+  }
+
+  if (wb->input_file != stdin)
+    fclose(wb->input_file);
+
+  word_buffer_free(wb);
+
+  /* stdout is left open; only a file opened above is closed */
+  if(ctx->conll_filename && output_file){
+    if(fclose(output_file) != 0){
+      fprintf(stderr, "%s: error while closing %s\n", ctx->program_name, ctx->conll_filename);
+      status = 1;
+    }
+  }
+
+  context_free(ctx);
+  return status;
+}