diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index a86626fe4fa43f269579b86bb3ba769f525bf2ec..560fd6cb4baa6edcd911d9eee32f9b054f37513e 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -8,7 +8,7 @@ #define MCD_INVALID_VALUE -1 -#define MCD_WF_NB 48 +#define MCD_WF_NB 51 #define MCD_WF_ID 0 #define MCD_WF_OFFSET 0 /* ID and OFFSET are synonymous */ @@ -61,6 +61,12 @@ #define MCD_WF_Person 45 #define MCD_WF_Tense 46 +#define MCD_WF_FILE 48 +#define MCD_WF_DIRECTORY 49 +#define MCD_WF_SPEAKER 50 + + + /*Abbr AdpType AdvType diff --git a/maca_common/include/word.h b/maca_common/include/word.h index 51b962200a24538c1c2840906ec3585735aad827..dcf18dc38b52b93a22209ccb73733a5eddd91b08 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -63,6 +63,11 @@ typedef struct _word { #define word_get_label(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LABEL]) #define word_get_stag(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_STAG]) #define word_get_sent_seg(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_SENT_SEG]) + +#define word_get_file(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_FILE]) +#define word_get_directory(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_DIRECTORY]) +#define word_get_speaker(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_SPEAKER]) + #define word_get_A(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_A]) #define word_get_B(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_B]) #define word_get_C(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_C]) diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index 4a2348e14f4f3c406c1af3636ff3b5f3a35822d5..34fd53b7bfbae62d856189c29183af4f735cd3c3 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -512,6 +512,12 @@ int mcd_wf_code(char *wf) /* if(!strcmp(wf, "INT")) return MCD_WF_INT; */ if(!strcmp(wf, "GOV")) return MCD_WF_GOV; if(!strcmp(wf, "SENT_SEG")) return MCD_WF_SENT_SEG; + + if(!strcmp(wf, "FILE")) return MCD_WF_FILE; + if(!strcmp(wf, "DIRECTORY")) return MCD_WF_DIRECTORY; + if(!strcmp(wf, "SPEAKER")) return MCD_WF_SPEAKER; + + if(!strcmp(wf, "A")) return MCD_WF_A; if(!strcmp(wf, "B")) return MCD_WF_B; if(!strcmp(wf, "C")) return MCD_WF_C; diff --git a/maca_common/src/word.c b/maca_common/src/word.c index 21e303e844bc5e7fb9030a56995420bf55504a3f..33f27da9cc0865cfa55130e4dd1b5e2964b496a3 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -22,7 +22,6 @@ word *word_new(char *input) w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV; w->form = NULL; w->form_char16 = NULL; - w->index = -1; w->signature = -1; w->is_root = 0; diff --git a/maca_tokenizer/src/en_tok_rules.l b/maca_tokenizer/src/en_tok_rules.l index 891e8f30fcf0473d476ac63f6ff8a0e3712d586d..5ae382b86d38d454d0627fec446e54048bd5d88b 100644 --- a/maca_tokenizer/src/en_tok_rules.l +++ b/maca_tokenizer/src/en_tok_rules.l @@ -12,6 +12,7 @@ extern char *token; /*%option noyywrap*/ %% +#.* ECHO; \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} [ \t]+ {maca_tokenizer_segment((char *)"", yytext);} [ ]*\. {maca_tokenizer_segment((char *)".", yytext);} diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index 907beaaed67fccb2883872a844d1feaa36b5f8b1..964b702fbaa3749834e286966655d18f696eb04f 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -26,7 +26,7 @@ nosepar [^ \t\n] if(defait_amalgames){ BEGIN(state_defait_amalgames); } - +#.* ECHO; \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} {separ}+ {maca_tokenizer_segment((char *)"", yytext);} \. {maca_tokenizer_segment((char *)".", yytext);} diff --git a/maca_tools/src/mcf2json.c b/maca_tools/src/mcf2json.c index 4381177cfd388d8012380286f515dc1ae5fd19a8..bfca900cd8763a8ec7ed11fb4f07e0c16efe1db5 100644 --- a/maca_tools/src/mcf2json.c +++ b/maca_tools/src/mcf2json.c @@ -2,6 +2,9 @@ #include<stdlib.h> #include<string.h> #include<getopt.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> #include"mcd.h" #include"util.h" @@ -16,6 +19,7 @@ typedef struct { char *mcf_filename; char *mcd_filename; mcd *mcd_struct; + char *root_dir; } context; void mcf2json_context_free(context *ctx) @@ -31,6 +35,8 @@ void mcf2json_context_free(context *ctx) free(ctx->mcd_filename); if(ctx->mcd_struct) mcd_free(ctx->mcd_struct); + if(ctx->root_dir) + free(ctx->root_dir); free(ctx); } } @@ -47,6 +53,7 @@ context *mcf2json_context_new(void) ctx->mcf_filename = NULL; ctx->mcd_filename = NULL; ctx->mcd_struct = NULL; + ctx->root_dir = NULL; return ctx; } @@ -58,7 +65,7 @@ void mcf2json_context_general_help_message(context *ctx) fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); fprintf(stderr, "\t-C --mcd : mcd filename\n"); fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n"); - fprintf(stderr, "\t-o --conll : conll filename (write to stdout if absent)\n"); + fprintf(stderr, "\t-r --root : root directory of the json files\n"); } void mcf2json_check_options(context *ctx){ @@ -81,14 +88,14 @@ context *mcf2json_context_read_options(int argc, char *argv[]) {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, {"debug", no_argument, 0, 'd'}, - {"conll", required_argument, 0, 'o'}, {"mcd", required_argument, 0, 'C'}, {"mcf", required_argument, 0, 'i'}, + {"root", required_argument, 0, 'r'}, }; optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "hvdC:i:r:", long_options, &option_index)) != -1){ switch (c) { case 'd': @@ -100,15 +107,15 @@ context *mcf2json_context_read_options(int argc, char *argv[]) case 'v': ctx->verbose = 1; break; - case 'o': - ctx->conll_filename = strdup(optarg); - break; case 'i': ctx->mcf_filename = strdup(optarg); break; case 'C': ctx->mcd_filename = strdup(optarg); break; + case 'r': + ctx->root_dir = strdup(optarg); + break; } } @@ -118,7 +125,6 @@ context *mcf2json_context_read_options(int argc, char *argv[]) else{ ctx->mcd_struct = mcd_build_wpmlgfs(); } - return ctx; } @@ -129,7 +135,7 @@ void print_footer(FILE *output_file) } -void print_header(FILE *output_file, mcd *mcd_struct) +void print_header(FILE *output_file, mcd *mcd_struct, char *filename) { int pos_col = mcd_get_pos_col(mcd_struct); int label_col = mcd_get_label_col(mcd_struct); @@ -143,6 +149,7 @@ void print_header(FILE *output_file, mcd *mcd_struct) fprintf(output_file, "\"header\":{\n"); fprintf(output_file, "\"id\": \"\",\n"); fprintf(output_file, "\"timestamp\": \"\",\n"); + fprintf(output_file, "\"filename\": \"%s\",\n", filename); fprintf(output_file, "\"labels_segment\": ["); for(i=0; i < dico_pos->nbelem; i++){ @@ -160,7 +167,7 @@ void print_header(FILE *output_file, mcd *mcd_struct) fprintf(output_file, "},\n"); - fprintf(output_file, "\"annotations\":{\n"); + fprintf(output_file, "\"annotation\":{\n"); fprintf(output_file, "\"name\": \"\",\n"); fprintf(output_file, "\"time_start\": \"\",\n"); fprintf(output_file, "\"time_end\": \"\"\n"); @@ -227,16 +234,16 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i } -void print_segment(FILE *output_file, word_buffer *wb, int index) +void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int index) { int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb)); word *w = word_buffer_get_word_n(wb, index); fprintf(output_file, "{ "); /* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */ - fprintf(output_file, "\"start\": %d, ", index); + fprintf(output_file, "\"start\": %d, ", index - index_first_word); /* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */ - fprintf(output_file, "\"end\": %d, ", index); + fprintf(output_file, "\"end\": %d, ", index - index_first_word); fprintf(output_file, "\"label\": \""); if(pos_col != -1) @@ -258,12 +265,12 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in { int index; int first_segment = 1; - + fprintf(output_file, "\"segments\": ["); for(index = index_first_word; index <= index_last_word; index++){ if(first_segment == 1) first_segment = 0; else fprintf(output_file, ","); fprintf(output_file, "\n"); - print_segment(output_file, wb, index); + print_segment(output_file, wb, index_first_word, index); } fprintf(output_file," ],\n"); } @@ -317,7 +324,7 @@ void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int ind int main(int argc, char *argv[]) { - FILE *output_file; + FILE *output_file = NULL; context *ctx = mcf2json_context_read_options(argc, argv); word_buffer *wb = NULL; word *w = NULL; @@ -326,41 +333,107 @@ int main(int argc, char *argv[]) int index_first_word; int index_last_word; int sentence_nb = 0; - + char current_directory[1000]; + char current_file[1000]; + char previous_directory[1000]; + char previous_file[1000]; + char filename_for_header[1000]; + char *root_directory = NULL; + char destination_file[1000]; + char destination_dir[1000]; + struct stat st = {0}; + mcf2json_check_options(ctx); mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename); - - output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout; wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct); - print_header(output_file, ctx->mcd_struct); - do{ - w = word_buffer_b0(wb); - if(new_sentence){ - new_sentence = 0; - sentence_nb++; - index_first_word = word_buffer_get_current_index(wb); - } - if(word_get_sent_seg(w)){ - index_last_word = word_buffer_get_current_index(wb); - new_sentence = 1; - - if(first_sentence == 1) - first_sentence = 0; - else - fprintf(output_file, ","); - fprintf(output_file, "\n"); - print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word); + if(ctx->root_dir){ + if(stat(ctx->root_dir, &st) == -1) { + mkdir(ctx->root_dir, 0700); + fprintf(stderr, "creating directory %s\n", ctx->root_dir); } - } while(word_buffer_move_right(wb)); - - print_footer(output_file); - if(ctx->conll_filename) + do{ + w = word_buffer_b0(wb); + if(w == NULL) break; + word_sprint_col_n(current_directory, w, ctx->mcd_struct->wf2col[MCD_WF_DIRECTORY]); + word_sprint_col_n(current_file, w, ctx->mcd_struct->wf2col[MCD_WF_FILE]); + + if(strcmp(current_directory, previous_directory)){ + strcpy(destination_dir, ctx->root_dir); + strcat(destination_dir, "/"); + strcat(destination_dir, current_directory); + if (stat(destination_dir, &st) == -1) { + mkdir(destination_dir, 0700); + fprintf(stderr, "creating directory %s\n", destination_dir); + } + } + if(strcmp(current_file, previous_file)){ + strcpy(destination_file, destination_dir); + strcat(destination_file, "/"); + strcat(destination_file, current_file); + strcat(destination_file, ".json"); + fprintf(stderr, "creating file %s\n", destination_file); + if(output_file){ + print_footer(output_file); + fclose(output_file); + } + output_file = myfopen_no_exit(destination_file, "w"); + strcpy(filename_for_header, current_directory); + strcat(filename_for_header, "/"); + strcat(filename_for_header, current_file); + strcat(filename_for_header, ".json"); + print_header(output_file, ctx->mcd_struct, filename_for_header); + first_sentence = 1; + } + if(new_sentence){ + new_sentence = 0; + sentence_nb++; + index_first_word = word_buffer_get_current_index(wb); + } + if(word_get_sent_seg(w)){ + index_last_word = word_buffer_get_current_index(wb); + new_sentence = 1; + + if(first_sentence == 1) + first_sentence = 0; + else + fprintf(output_file, ","); + fprintf(output_file, "\n"); + print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word); + } + strcpy(previous_file, current_file); + strcpy(previous_directory, current_directory); + } while(word_buffer_move_right(wb)); + print_footer(output_file); fclose(output_file); - mcf2json_context_free(ctx); - - + } + else{ //ctx->root_dir is NULL dump everything to stdout + output_file = stdout; + print_header(output_file, ctx->mcd_struct, ""); + do{ + w = word_buffer_b0(wb); + if(new_sentence){ + new_sentence = 0; + sentence_nb++; + index_first_word = word_buffer_get_current_index(wb); + } + if(word_get_sent_seg(w)){ + index_last_word = word_buffer_get_current_index(wb); + new_sentence = 1; + + if(first_sentence == 1) + first_sentence = 0; + else + fprintf(output_file, ","); + fprintf(output_file, "\n"); + print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word); + } + } while(word_buffer_move_right(wb)); + print_footer(output_file); + } + + mcf2json_context_free(ctx); return 0; } diff --git a/maca_trans_parser/src/oracle_parser_arc_eager.c b/maca_trans_parser/src/oracle_parser_arc_eager.c index e4aae787c960e5874ac87422af876ab9ae51db64..fd8199afb538789724edd59672825b787485252c 100644 --- a/maca_trans_parser/src/oracle_parser_arc_eager.c +++ b/maca_trans_parser/src/oracle_parser_arc_eager.c @@ -63,18 +63,19 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref, int root_label) /* s0 is the root of the sentence */ if((s0_label == root_label) // && (word_get_label(word_buffer_get_word_n(config_get_buffer(c), s0_index)) != root_label) - && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) + && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) ){ return MVT_PARSER_ROOT; } - + /* word on the top of the stack is an end of sentence marker */ if((word_get_sent_seg(word_buffer_get_word_n(ref, s0_index)) == 1) // && (word_get_sent_seg(word_buffer_get_word_n(config_get_buffer(c), s0_index)) != 1) - && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) + && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) ){ return MVT_PARSER_EOS; } + /* LEFT ARC b0 is the governor and s0 the dependent */ if(s0_gov_index == b0_index){ return movement_parser_left_code(word_get_label(word_buffer_get_word_n(ref, s0_index)));