diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index aa4ade656c36921b030a52c3cd8314c1b7daa4d9..560fd6cb4baa6edcd911d9eee32f9b054f37513e 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -61,9 +61,9 @@ #define MCD_WF_Person 45 #define MCD_WF_Tense 46 -#define MCD_WF_FILE 47 -#define MCD_WF_DIRECTORY 48 -#define MCD_WF_SPEAKER 49 +#define MCD_WF_FILE 48 +#define MCD_WF_DIRECTORY 49 +#define MCD_WF_SPEAKER 50 diff --git a/maca_common/include/word.h b/maca_common/include/word.h index 7e8f30bdbb048ac2767d692d1cbc182fc3ed0c99..dac612deb263dd47a31fe4865187e5a6929eab52 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -63,6 +63,11 @@ typedef struct _word { #define word_get_label(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LABEL]) #define word_get_stag(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_STAG]) #define word_get_sent_seg(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_SENT_SEG]) + +#define word_get_file(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_FILE]) +#define word_get_directory(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_DIRECTORY]) +#define word_get_speaker(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_SPEAKER]) + #define word_get_A(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_A]) #define word_get_B(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_B]) #define word_get_C(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_C]) diff --git a/maca_tools/src/mcf2json.c b/maca_tools/src/mcf2json.c index 4381177cfd388d8012380286f515dc1ae5fd19a8..a194d9bfe29340a82d24a540db1f8e08786bd873 100644 --- a/maca_tools/src/mcf2json.c +++ b/maca_tools/src/mcf2json.c @@ -2,6 +2,9 @@ #include<stdlib.h> #include<string.h> #include<getopt.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> #include"mcd.h" #include"util.h" @@ -16,6 +19,7 @@ typedef struct { char *mcf_filename; char *mcd_filename; mcd *mcd_struct; + char *root_dir; } context; void mcf2json_context_free(context *ctx) @@ -31,6 +35,8 @@ void mcf2json_context_free(context *ctx) free(ctx->mcd_filename); if(ctx->mcd_struct) mcd_free(ctx->mcd_struct); + if(ctx->root_dir) + free(ctx->root_dir); free(ctx); } } @@ -47,6 +53,7 @@ context *mcf2json_context_new(void) ctx->mcf_filename = NULL; ctx->mcd_filename = NULL; ctx->mcd_struct = NULL; + ctx->root_dir = NULL; return ctx; } @@ -58,7 +65,7 @@ void mcf2json_context_general_help_message(context *ctx) fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); fprintf(stderr, "\t-C --mcd : mcd filename\n"); fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n"); - fprintf(stderr, "\t-o --conll : conll filename (write to stdout if absent)\n"); + fprintf(stderr, "\t-r --root : root directory of the json files\n"); } void mcf2json_check_options(context *ctx){ @@ -81,14 +88,14 @@ context *mcf2json_context_read_options(int argc, char *argv[]) {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, {"debug", no_argument, 0, 'd'}, - {"conll", required_argument, 0, 'o'}, {"mcd", required_argument, 0, 'C'}, {"mcf", required_argument, 0, 'i'}, + {"root", required_argument, 0, 'r'}, }; optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "hvdC:i:r:", long_options, &option_index)) != -1){ switch (c) { case 'd': @@ -100,15 +107,15 @@ context *mcf2json_context_read_options(int argc, char *argv[]) case 'v': ctx->verbose = 1; break; - case 'o': - ctx->conll_filename = strdup(optarg); - break; case 'i': ctx->mcf_filename = strdup(optarg); break; case 'C': ctx->mcd_filename = strdup(optarg); break; + case 'r': + ctx->root_dir = strdup(optarg); + break; } } @@ -118,7 +125,6 @@ context *mcf2json_context_read_options(int argc, char *argv[]) else{ ctx->mcd_struct = mcd_build_wpmlgfs(); } - return ctx; } @@ -317,7 +323,7 @@ void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int ind int main(int argc, char *argv[]) { - FILE *output_file; + FILE *output_file = NULL; context *ctx = mcf2json_context_read_options(argc, argv); word_buffer *wb = NULL; word *w = NULL; @@ -326,41 +332,101 @@ int main(int argc, char *argv[]) int index_first_word; int index_last_word; int sentence_nb = 0; - + char current_directory[1000]; + char current_file[1000]; + char previous_directory[1000]; + char previous_file[1000]; + char *root_directory = NULL; + char destination_file[1000]; + char destination_dir[1000]; + struct stat st = {0}; + mcf2json_check_options(ctx); mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename); - - output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout; wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct); - print_header(output_file, ctx->mcd_struct); - do{ - w = word_buffer_b0(wb); - if(new_sentence){ - new_sentence = 0; - sentence_nb++; - index_first_word = word_buffer_get_current_index(wb); - } - if(word_get_sent_seg(w)){ - index_last_word = word_buffer_get_current_index(wb); - new_sentence = 1; - - if(first_sentence == 1) - first_sentence = 0; - else - fprintf(output_file, ","); - fprintf(output_file, "\n"); - print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word); + if(ctx->root_dir){ + if(stat(ctx->root_dir, &st) == -1) { + mkdir(ctx->root_dir, 0700); + fprintf(stderr, "creating directory %s\n", ctx->root_dir); } - } while(word_buffer_move_right(wb)); - - print_footer(output_file); - if(ctx->conll_filename) + do{ + w = word_buffer_b0(wb); + if(w == NULL) break; + word_sprint_col_n(current_directory, w, ctx->mcd_struct->wf2col[MCD_WF_DIRECTORY]); + word_sprint_col_n(current_file, w, ctx->mcd_struct->wf2col[MCD_WF_FILE]); + + if(strcmp(current_directory, previous_directory)){ + strcpy(destination_dir, ctx->root_dir); + strcat(destination_dir, "/"); + strcat(destination_dir, current_directory); + if (stat(destination_dir, &st) == -1) { + mkdir(destination_dir, 0700); + fprintf(stderr, "creating directory %s\n", destination_dir); + } + } + if(strcmp(current_file, previous_file)){ + strcpy(destination_file, destination_dir); + strcat(destination_file, "/"); + strcat(destination_file, current_file); + strcat(destination_file, ".json"); + fprintf(stderr, "creating file %s\n", destination_file); + if(output_file){ + print_footer(output_file); + fclose(output_file); + } + output_file = myfopen_no_exit(destination_file, "w"); + print_header(output_file, ctx->mcd_struct); + } + if(new_sentence){ + new_sentence = 0; + sentence_nb++; + index_first_word = word_buffer_get_current_index(wb); + } + if(word_get_sent_seg(w)){ + index_last_word = word_buffer_get_current_index(wb); + new_sentence = 1; + + if(first_sentence == 1) + first_sentence = 0; + else + fprintf(output_file, ","); + fprintf(output_file, "\n"); + print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word); + } + strcpy(previous_file, current_file); + strcpy(previous_directory, current_directory); + } while(word_buffer_move_right(wb)); + print_footer(output_file); fclose(output_file); - mcf2json_context_free(ctx); - - + } + else{ //ctx->root_dir is NULL dump everything to stdout + output_file = stdout; + print_header(output_file, ctx->mcd_struct); + do{ + w = word_buffer_b0(wb); + if(new_sentence){ + new_sentence = 0; + sentence_nb++; + index_first_word = word_buffer_get_current_index(wb); + } + if(word_get_sent_seg(w)){ + index_last_word = word_buffer_get_current_index(wb); + new_sentence = 1; + + if(first_sentence == 1) + first_sentence = 0; + else + fprintf(output_file, ","); + fprintf(output_file, "\n"); + print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word); + } + } while(word_buffer_move_right(wb)); + print_footer(output_file); + } + + mcf2json_context_free(ctx); return 0; }