Skip to content
Snippets Groups Projects
Commit ee98b081 authored by Alexis Nasr
Browse files

Fixed some details in mcf2json; modified the tokenizer to ignore comments.

parent 35f18a2d
No related branches found
No related tags found
No related merge requests found
......@@ -12,6 +12,7 @@ extern char *token;
/*%option noyywrap*/
%%
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
[ \t]+ {maca_tokenizer_segment((char *)"", yytext);}
[ ]*\. {maca_tokenizer_segment((char *)".", yytext);}
......
......@@ -26,7 +26,7 @@ nosepar [^ \t\n]
if(defait_amalgames){
BEGIN(state_defait_amalgames);
}
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
{separ}+ {maca_tokenizer_segment((char *)"", yytext);}
\. {maca_tokenizer_segment((char *)".", yytext);}
......
......@@ -135,7 +135,7 @@ void print_footer(FILE *output_file)
}
void print_header(FILE *output_file, mcd *mcd_struct)
void print_header(FILE *output_file, mcd *mcd_struct, char *filename)
{
int pos_col = mcd_get_pos_col(mcd_struct);
int label_col = mcd_get_label_col(mcd_struct);
......@@ -149,6 +149,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf(output_file, "\"header\":{\n");
fprintf(output_file, "\"id\": \"\",\n");
fprintf(output_file, "\"timestamp\": \"\",\n");
fprintf(output_file, "\"filename\": \"%s\",\n", filename);
fprintf(output_file, "\"labels_segment\": [");
for(i=0; i < dico_pos->nbelem; i++){
......@@ -166,7 +167,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf(output_file, "},\n");
fprintf(output_file, "\"annotations\":{\n");
fprintf(output_file, "\"annotation\":{\n");
fprintf(output_file, "\"name\": \"\",\n");
fprintf(output_file, "\"time_start\": \"\",\n");
fprintf(output_file, "\"time_end\": \"\"\n");
......@@ -233,16 +234,16 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i
}
void print_segment(FILE *output_file, word_buffer *wb, int index)
void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int index)
{
int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{ ");
/* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */
fprintf(output_file, "\"start\": %d, ", index);
fprintf(output_file, "\"start\": %d, ", index - index_first_word);
/* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */
fprintf(output_file, "\"end\": %d, ", index);
fprintf(output_file, "\"end\": %d, ", index - index_first_word);
fprintf(output_file, "\"label\": \"");
if(pos_col != -1)
......@@ -264,12 +265,12 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in
{
int index;
int first_segment = 1;
fprintf(output_file, "\"segments\": [");
for(index = index_first_word; index <= index_last_word; index++){
if(first_segment == 1) first_segment = 0; else fprintf(output_file, ",");
fprintf(output_file, "\n");
print_segment(output_file, wb, index);
print_segment(output_file, wb, index_first_word, index);
}
fprintf(output_file," ],\n");
}
......@@ -336,6 +337,7 @@ int main(int argc, char *argv[])
char current_file[1000];
char previous_directory[1000];
char previous_file[1000];
char filename_for_header[1000];
char *root_directory = NULL;
char destination_file[1000];
char destination_dir[1000];
......@@ -377,7 +379,12 @@ int main(int argc, char *argv[])
fclose(output_file);
}
output_file = myfopen_no_exit(destination_file, "w");
print_header(output_file, ctx->mcd_struct);
strcpy(filename_for_header, current_directory);
strcat(filename_for_header, "/");
strcat(filename_for_header, current_file);
strcat(filename_for_header, ".json");
print_header(output_file, ctx->mcd_struct, filename_for_header);
first_sentence = 1;
}
if(new_sentence){
new_sentence = 0;
......@@ -404,7 +411,7 @@ int main(int argc, char *argv[])
else{ //ctx->root_dir is NULL dump everything to stdout
output_file = stdout;
print_header(output_file, ctx->mcd_struct);
print_header(output_file, ctx->mcd_struct, "");
do{
w = word_buffer_b0(wb);
if(new_sentence){
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment