From ee98b0818457d93d391cb286808a53e76956672a Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Wed, 7 Feb 2018 16:34:11 +0100 Subject: [PATCH] fixed some details in mcf2json modified tokenizer to ignore comments --- maca_tokenizer/src/en_tok_rules.l | 1 + maca_tokenizer/src/fr_tok_rules.l | 2 +- maca_tools/src/mcf2json.c | 25 ++++++++++++++++--------- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/maca_tokenizer/src/en_tok_rules.l b/maca_tokenizer/src/en_tok_rules.l index 891e8f3..5ae382b 100644 --- a/maca_tokenizer/src/en_tok_rules.l +++ b/maca_tokenizer/src/en_tok_rules.l @@ -12,6 +12,7 @@ extern char *token; /*%option noyywrap*/ %% +#.* ECHO; \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} [ \t]+ {maca_tokenizer_segment((char *)"", yytext);} [ ]*\. {maca_tokenizer_segment((char *)".", yytext);} diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index 907beaa..964b702 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -26,7 +26,7 @@ nosepar [^ \t\n] if(defait_amalgames){ BEGIN(state_defait_amalgames); } - +#.* ECHO; \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} {separ}+ {maca_tokenizer_segment((char *)"", yytext);} \. {maca_tokenizer_segment((char *)".", yytext);} diff --git a/maca_tools/src/mcf2json.c b/maca_tools/src/mcf2json.c index a194d9b..bfca900 100644 --- a/maca_tools/src/mcf2json.c +++ b/maca_tools/src/mcf2json.c @@ -135,7 +135,7 @@ void print_footer(FILE *output_file) } -void print_header(FILE *output_file, mcd *mcd_struct) +void print_header(FILE *output_file, mcd *mcd_struct, char *filename) { int pos_col = mcd_get_pos_col(mcd_struct); int label_col = mcd_get_label_col(mcd_struct); @@ -149,6 +149,7 @@ void print_header(FILE *output_file, mcd *mcd_struct) fprintf(output_file, "\"header\":{\n"); fprintf(output_file, "\"id\": \"\",\n"); fprintf(output_file, "\"timestamp\": \"\",\n"); + fprintf(output_file, "\"filename\": \"%s\",\n", filename); fprintf(output_file, "\"labels_segment\": ["); for(i=0; i < dico_pos->nbelem; i++){ @@ -166,7 +167,7 @@ void print_header(FILE *output_file, mcd *mcd_struct) fprintf(output_file, "},\n"); - fprintf(output_file, "\"annotations\":{\n"); + fprintf(output_file, "\"annotation\":{\n"); fprintf(output_file, "\"name\": \"\",\n"); fprintf(output_file, "\"time_start\": \"\",\n"); fprintf(output_file, "\"time_end\": \"\"\n"); @@ -233,16 +234,16 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i } -void print_segment(FILE *output_file, word_buffer *wb, int index) +void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int index) { int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb)); word *w = word_buffer_get_word_n(wb, index); fprintf(output_file, "{ "); /* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */ - fprintf(output_file, "\"start\": %d, ", index); + fprintf(output_file, "\"start\": %d, ", index - index_first_word); /* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */ - fprintf(output_file, "\"end\": %d, ", index); + fprintf(output_file, "\"end\": %d, ", index - index_first_word); fprintf(output_file, "\"label\": \""); if(pos_col != -1) @@ -264,12 +265,12 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in { int index; int first_segment = 1; - + fprintf(output_file, "\"segments\": ["); for(index = index_first_word; index <= index_last_word; index++){ if(first_segment == 1) first_segment = 0; else fprintf(output_file, ","); fprintf(output_file, "\n"); - print_segment(output_file, wb, index); + print_segment(output_file, wb, index_first_word, index); } fprintf(output_file," ],\n"); } @@ -336,6 +337,7 @@ int main(int argc, char *argv[]) char current_file[1000]; char previous_directory[1000]; char previous_file[1000]; + char filename_for_header[1000]; char *root_directory = NULL; char destination_file[1000]; char destination_dir[1000]; @@ -377,7 +379,12 @@ int main(int argc, char *argv[]) fclose(output_file); } output_file = myfopen_no_exit(destination_file, "w"); - print_header(output_file, ctx->mcd_struct); + strcpy(filename_for_header, current_directory); + strcat(filename_for_header, "/"); + strcat(filename_for_header, current_file); + strcat(filename_for_header, ".json"); + print_header(output_file, ctx->mcd_struct, filename_for_header); + first_sentence = 1; } if(new_sentence){ new_sentence = 0; @@ -404,7 +411,7 @@ int main(int argc, char *argv[]) else{ //ctx->root_dir is NULL dump everything to stdout output_file = stdout; - print_header(output_file, ctx->mcd_struct); + print_header(output_file, ctx->mcd_struct, ""); do{ w = word_buffer_b0(wb); if(new_sentence){ -- GitLab