Commit ee98b081 authored by Alexis Nasr
Browse files

Fixed some details in mcf2json; modified the tokenizer to ignore comments.

parent 35f18a2d
......@@ -12,6 +12,7 @@ extern char *token;
/*%option noyywrap*/
%%
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
[ \t]+ {maca_tokenizer_segment((char *)"", yytext);}
[ ]*\. {maca_tokenizer_segment((char *)".", yytext);}
......
......@@ -26,7 +26,7 @@ nosepar [^ \t\n]
if(defait_amalgames){
BEGIN(state_defait_amalgames);
}
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
{separ}+ {maca_tokenizer_segment((char *)"", yytext);}
\. {maca_tokenizer_segment((char *)".", yytext);}
......
......@@ -135,7 +135,7 @@ void print_footer(FILE *output_file)
}
void print_header(FILE *output_file, mcd *mcd_struct)
void print_header(FILE *output_file, mcd *mcd_struct, char *filename)
{
int pos_col = mcd_get_pos_col(mcd_struct);
int label_col = mcd_get_label_col(mcd_struct);
......@@ -149,6 +149,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf(output_file, "\"header\":{\n");
fprintf(output_file, "\"id\": \"\",\n");
fprintf(output_file, "\"timestamp\": \"\",\n");
fprintf(output_file, "\"filename\": \"%s\",\n", filename);
fprintf(output_file, "\"labels_segment\": [");
for(i=0; i < dico_pos->nbelem; i++){
......@@ -166,7 +167,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf(output_file, "},\n");
fprintf(output_file, "\"annotations\":{\n");
fprintf(output_file, "\"annotation\":{\n");
fprintf(output_file, "\"name\": \"\",\n");
fprintf(output_file, "\"time_start\": \"\",\n");
fprintf(output_file, "\"time_end\": \"\"\n");
......@@ -233,16 +234,16 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i
}
void print_segment(FILE *output_file, word_buffer *wb, int index)
void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int index)
{
int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{ ");
/* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */
fprintf(output_file, "\"start\": %d, ", index);
fprintf(output_file, "\"start\": %d, ", index - index_first_word);
/* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */
fprintf(output_file, "\"end\": %d, ", index);
fprintf(output_file, "\"end\": %d, ", index - index_first_word);
fprintf(output_file, "\"label\": \"");
if(pos_col != -1)
......@@ -264,12 +265,12 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in
{
int index;
int first_segment = 1;
fprintf(output_file, "\"segments\": [");
for(index = index_first_word; index <= index_last_word; index++){
if(first_segment == 1) first_segment = 0; else fprintf(output_file, ",");
fprintf(output_file, "\n");
print_segment(output_file, wb, index);
print_segment(output_file, wb, index_first_word, index);
}
fprintf(output_file," ],\n");
}
......@@ -336,6 +337,7 @@ int main(int argc, char *argv[])
char current_file[1000];
char previous_directory[1000];
char previous_file[1000];
char filename_for_header[1000];
char *root_directory = NULL;
char destination_file[1000];
char destination_dir[1000];
......@@ -377,7 +379,12 @@ int main(int argc, char *argv[])
fclose(output_file);
}
output_file = myfopen_no_exit(destination_file, "w");
print_header(output_file, ctx->mcd_struct);
strcpy(filename_for_header, current_directory);
strcat(filename_for_header, "/");
strcat(filename_for_header, current_file);
strcat(filename_for_header, ".json");
print_header(output_file, ctx->mcd_struct, filename_for_header);
first_sentence = 1;
}
if(new_sentence){
new_sentence = 0;
......@@ -404,7 +411,7 @@ int main(int argc, char *argv[])
else{ //ctx->root_dir is NULL dump everything to stdout
output_file = stdout;
print_header(output_file, ctx->mcd_struct);
print_header(output_file, ctx->mcd_struct, "");
do{
w = word_buffer_b0(wb);
if(new_sentence){
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment