Skip to content
Snippets Groups Projects
Commit 68ee1829 authored by Alexis Nasr
Browse files

fixed a bug in word.h

parents 1d536e6c ee98b081
Branches
No related tags found
No related merge requests found
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#define MCD_INVALID_VALUE -1 #define MCD_INVALID_VALUE -1
#define MCD_WF_NB 48 #define MCD_WF_NB 51
#define MCD_WF_ID 0 #define MCD_WF_ID 0
#define MCD_WF_OFFSET 0 /* ID and OFFSET are synonymous */ #define MCD_WF_OFFSET 0 /* ID and OFFSET are synonymous */
...@@ -61,6 +61,12 @@ ...@@ -61,6 +61,12 @@
#define MCD_WF_Person 45 #define MCD_WF_Person 45
#define MCD_WF_Tense 46 #define MCD_WF_Tense 46
#define MCD_WF_FILE 48
#define MCD_WF_DIRECTORY 49
#define MCD_WF_SPEAKER 50
/*Abbr /*Abbr
AdpType AdpType
AdvType AdvType
......
...@@ -63,6 +63,11 @@ typedef struct _word { ...@@ -63,6 +63,11 @@ typedef struct _word {
#define word_get_label(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LABEL]) #define word_get_label(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LABEL])
#define word_get_stag(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_STAG]) #define word_get_stag(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_STAG])
#define word_get_sent_seg(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_SENT_SEG]) #define word_get_sent_seg(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_SENT_SEG])
#define word_get_file(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_FILE])
#define word_get_directory(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_DIRECTORY])
#define word_get_speaker(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_SPEAKER])
#define word_get_A(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_A]) #define word_get_A(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_A])
#define word_get_B(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_B]) #define word_get_B(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_B])
#define word_get_C(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_C]) #define word_get_C(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_C])
......
...@@ -512,6 +512,12 @@ int mcd_wf_code(char *wf) ...@@ -512,6 +512,12 @@ int mcd_wf_code(char *wf)
/* if(!strcmp(wf, "INT")) return MCD_WF_INT; */ /* if(!strcmp(wf, "INT")) return MCD_WF_INT; */
if(!strcmp(wf, "GOV")) return MCD_WF_GOV; if(!strcmp(wf, "GOV")) return MCD_WF_GOV;
if(!strcmp(wf, "SENT_SEG")) return MCD_WF_SENT_SEG; if(!strcmp(wf, "SENT_SEG")) return MCD_WF_SENT_SEG;
if(!strcmp(wf, "FILE")) return MCD_WF_FILE;
if(!strcmp(wf, "DIRECTORY")) return MCD_WF_DIRECTORY;
if(!strcmp(wf, "SPEAKER")) return MCD_WF_SPEAKER;
if(!strcmp(wf, "A")) return MCD_WF_A; if(!strcmp(wf, "A")) return MCD_WF_A;
if(!strcmp(wf, "B")) return MCD_WF_B; if(!strcmp(wf, "B")) return MCD_WF_B;
if(!strcmp(wf, "C")) return MCD_WF_C; if(!strcmp(wf, "C")) return MCD_WF_C;
......
...@@ -22,7 +22,6 @@ word *word_new(char *input) ...@@ -22,7 +22,6 @@ word *word_new(char *input)
w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV; w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV;
w->form = NULL; w->form = NULL;
w->form_char16 = NULL; w->form_char16 = NULL;
w->index = -1; w->index = -1;
w->signature = -1; w->signature = -1;
w->is_root = 0; w->is_root = 0;
......
...@@ -12,6 +12,7 @@ extern char *token; ...@@ -12,6 +12,7 @@ extern char *token;
/*%option noyywrap*/ /*%option noyywrap*/
%% %%
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
[ \t]+ {maca_tokenizer_segment((char *)"", yytext);} [ \t]+ {maca_tokenizer_segment((char *)"", yytext);}
[ ]*\. {maca_tokenizer_segment((char *)".", yytext);} [ ]*\. {maca_tokenizer_segment((char *)".", yytext);}
......
...@@ -26,7 +26,7 @@ nosepar [^ \t\n] ...@@ -26,7 +26,7 @@ nosepar [^ \t\n]
if(defait_amalgames){ if(defait_amalgames){
BEGIN(state_defait_amalgames); BEGIN(state_defait_amalgames);
} }
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
{separ}+ {maca_tokenizer_segment((char *)"", yytext);} {separ}+ {maca_tokenizer_segment((char *)"", yytext);}
\. {maca_tokenizer_segment((char *)".", yytext);} \. {maca_tokenizer_segment((char *)".", yytext);}
......
...@@ -2,6 +2,9 @@ ...@@ -2,6 +2,9 @@
#include<stdlib.h> #include<stdlib.h>
#include<string.h> #include<string.h>
#include<getopt.h> #include<getopt.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include"mcd.h" #include"mcd.h"
#include"util.h" #include"util.h"
...@@ -16,6 +19,7 @@ typedef struct { ...@@ -16,6 +19,7 @@ typedef struct {
char *mcf_filename; char *mcf_filename;
char *mcd_filename; char *mcd_filename;
mcd *mcd_struct; mcd *mcd_struct;
char *root_dir;
} context; } context;
void mcf2json_context_free(context *ctx) void mcf2json_context_free(context *ctx)
...@@ -31,6 +35,8 @@ void mcf2json_context_free(context *ctx) ...@@ -31,6 +35,8 @@ void mcf2json_context_free(context *ctx)
free(ctx->mcd_filename); free(ctx->mcd_filename);
if(ctx->mcd_struct) if(ctx->mcd_struct)
mcd_free(ctx->mcd_struct); mcd_free(ctx->mcd_struct);
if(ctx->root_dir)
free(ctx->root_dir);
free(ctx); free(ctx);
} }
} }
...@@ -47,6 +53,7 @@ context *mcf2json_context_new(void) ...@@ -47,6 +53,7 @@ context *mcf2json_context_new(void)
ctx->mcf_filename = NULL; ctx->mcf_filename = NULL;
ctx->mcd_filename = NULL; ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL; ctx->mcd_struct = NULL;
ctx->root_dir = NULL;
return ctx; return ctx;
} }
...@@ -58,7 +65,7 @@ void mcf2json_context_general_help_message(context *ctx) ...@@ -58,7 +65,7 @@ void mcf2json_context_general_help_message(context *ctx)
fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
fprintf(stderr, "\t-C --mcd : mcd filename\n"); fprintf(stderr, "\t-C --mcd : mcd filename\n");
fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n"); fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n");
fprintf(stderr, "\t-o --conll : conll filename (write to stdout if absent)\n"); fprintf(stderr, "\t-r --root : root directory of the json files\n");
} }
void mcf2json_check_options(context *ctx){ void mcf2json_check_options(context *ctx){
...@@ -81,14 +88,14 @@ context *mcf2json_context_read_options(int argc, char *argv[]) ...@@ -81,14 +88,14 @@ context *mcf2json_context_read_options(int argc, char *argv[])
{"help", no_argument, 0, 'h'}, {"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'}, {"verbose", no_argument, 0, 'v'},
{"debug", no_argument, 0, 'd'}, {"debug", no_argument, 0, 'd'},
{"conll", required_argument, 0, 'o'},
{"mcd", required_argument, 0, 'C'}, {"mcd", required_argument, 0, 'C'},
{"mcf", required_argument, 0, 'i'}, {"mcf", required_argument, 0, 'i'},
{"root", required_argument, 0, 'r'},
}; };
optind = 0; optind = 0;
opterr = 0; opterr = 0;
while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){ while ((c = getopt_long (argc, argv, "hvdC:i:r:", long_options, &option_index)) != -1){
switch (c) switch (c)
{ {
case 'd': case 'd':
...@@ -100,15 +107,15 @@ context *mcf2json_context_read_options(int argc, char *argv[]) ...@@ -100,15 +107,15 @@ context *mcf2json_context_read_options(int argc, char *argv[])
case 'v': case 'v':
ctx->verbose = 1; ctx->verbose = 1;
break; break;
case 'o':
ctx->conll_filename = strdup(optarg);
break;
case 'i': case 'i':
ctx->mcf_filename = strdup(optarg); ctx->mcf_filename = strdup(optarg);
break; break;
case 'C': case 'C':
ctx->mcd_filename = strdup(optarg); ctx->mcd_filename = strdup(optarg);
break; break;
case 'r':
ctx->root_dir = strdup(optarg);
break;
} }
} }
...@@ -118,7 +125,6 @@ context *mcf2json_context_read_options(int argc, char *argv[]) ...@@ -118,7 +125,6 @@ context *mcf2json_context_read_options(int argc, char *argv[])
else{ else{
ctx->mcd_struct = mcd_build_wpmlgfs(); ctx->mcd_struct = mcd_build_wpmlgfs();
} }
return ctx; return ctx;
} }
...@@ -129,7 +135,7 @@ void print_footer(FILE *output_file) ...@@ -129,7 +135,7 @@ void print_footer(FILE *output_file)
} }
void print_header(FILE *output_file, mcd *mcd_struct) void print_header(FILE *output_file, mcd *mcd_struct, char *filename)
{ {
int pos_col = mcd_get_pos_col(mcd_struct); int pos_col = mcd_get_pos_col(mcd_struct);
int label_col = mcd_get_label_col(mcd_struct); int label_col = mcd_get_label_col(mcd_struct);
...@@ -143,6 +149,7 @@ void print_header(FILE *output_file, mcd *mcd_struct) ...@@ -143,6 +149,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf(output_file, "\"header\":{\n"); fprintf(output_file, "\"header\":{\n");
fprintf(output_file, "\"id\": \"\",\n"); fprintf(output_file, "\"id\": \"\",\n");
fprintf(output_file, "\"timestamp\": \"\",\n"); fprintf(output_file, "\"timestamp\": \"\",\n");
fprintf(output_file, "\"filename\": \"%s\",\n", filename);
fprintf(output_file, "\"labels_segment\": ["); fprintf(output_file, "\"labels_segment\": [");
for(i=0; i < dico_pos->nbelem; i++){ for(i=0; i < dico_pos->nbelem; i++){
...@@ -160,7 +167,7 @@ void print_header(FILE *output_file, mcd *mcd_struct) ...@@ -160,7 +167,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf(output_file, "},\n"); fprintf(output_file, "},\n");
fprintf(output_file, "\"annotations\":{\n"); fprintf(output_file, "\"annotation\":{\n");
fprintf(output_file, "\"name\": \"\",\n"); fprintf(output_file, "\"name\": \"\",\n");
fprintf(output_file, "\"time_start\": \"\",\n"); fprintf(output_file, "\"time_start\": \"\",\n");
fprintf(output_file, "\"time_end\": \"\"\n"); fprintf(output_file, "\"time_end\": \"\"\n");
...@@ -227,16 +234,16 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i ...@@ -227,16 +234,16 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i
} }
void print_segment(FILE *output_file, word_buffer *wb, int index) void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int index)
{ {
int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb)); int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index); word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{ "); fprintf(output_file, "{ ");
/* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */ /* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */
fprintf(output_file, "\"start\": %d, ", index); fprintf(output_file, "\"start\": %d, ", index - index_first_word);
/* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */ /* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */
fprintf(output_file, "\"end\": %d, ", index); fprintf(output_file, "\"end\": %d, ", index - index_first_word);
fprintf(output_file, "\"label\": \""); fprintf(output_file, "\"label\": \"");
if(pos_col != -1) if(pos_col != -1)
...@@ -263,7 +270,7 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in ...@@ -263,7 +270,7 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in
for(index = index_first_word; index <= index_last_word; index++){ for(index = index_first_word; index <= index_last_word; index++){
if(first_segment == 1) first_segment = 0; else fprintf(output_file, ","); if(first_segment == 1) first_segment = 0; else fprintf(output_file, ",");
fprintf(output_file, "\n"); fprintf(output_file, "\n");
print_segment(output_file, wb, index); print_segment(output_file, wb, index_first_word, index);
} }
fprintf(output_file," ],\n"); fprintf(output_file," ],\n");
} }
...@@ -317,7 +324,7 @@ void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int ind ...@@ -317,7 +324,7 @@ void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int ind
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
FILE *output_file; FILE *output_file = NULL;
context *ctx = mcf2json_context_read_options(argc, argv); context *ctx = mcf2json_context_read_options(argc, argv);
word_buffer *wb = NULL; word_buffer *wb = NULL;
word *w = NULL; word *w = NULL;
...@@ -326,17 +333,59 @@ int main(int argc, char *argv[]) ...@@ -326,17 +333,59 @@ int main(int argc, char *argv[])
int index_first_word; int index_first_word;
int index_last_word; int index_last_word;
int sentence_nb = 0; int sentence_nb = 0;
char current_directory[1000];
char current_file[1000];
char previous_directory[1000];
char previous_file[1000];
char filename_for_header[1000];
char *root_directory = NULL;
char destination_file[1000];
char destination_dir[1000];
struct stat st = {0};
mcf2json_check_options(ctx); mcf2json_check_options(ctx);
mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename); mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename);
output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct); wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
print_header(output_file, ctx->mcd_struct); if(ctx->root_dir){
if(stat(ctx->root_dir, &st) == -1) {
mkdir(ctx->root_dir, 0700);
fprintf(stderr, "creating directory %s\n", ctx->root_dir);
}
do{ do{
w = word_buffer_b0(wb); w = word_buffer_b0(wb);
if(w == NULL) break;
word_sprint_col_n(current_directory, w, ctx->mcd_struct->wf2col[MCD_WF_DIRECTORY]);
word_sprint_col_n(current_file, w, ctx->mcd_struct->wf2col[MCD_WF_FILE]);
if(strcmp(current_directory, previous_directory)){
strcpy(destination_dir, ctx->root_dir);
strcat(destination_dir, "/");
strcat(destination_dir, current_directory);
if (stat(destination_dir, &st) == -1) {
mkdir(destination_dir, 0700);
fprintf(stderr, "creating directory %s\n", destination_dir);
}
}
if(strcmp(current_file, previous_file)){
strcpy(destination_file, destination_dir);
strcat(destination_file, "/");
strcat(destination_file, current_file);
strcat(destination_file, ".json");
fprintf(stderr, "creating file %s\n", destination_file);
if(output_file){
print_footer(output_file);
fclose(output_file);
}
output_file = myfopen_no_exit(destination_file, "w");
strcpy(filename_for_header, current_directory);
strcat(filename_for_header, "/");
strcat(filename_for_header, current_file);
strcat(filename_for_header, ".json");
print_header(output_file, ctx->mcd_struct, filename_for_header);
first_sentence = 1;
}
if(new_sentence){ if(new_sentence){
new_sentence = 0; new_sentence = 0;
sentence_nb++; sentence_nb++;
...@@ -353,14 +402,38 @@ int main(int argc, char *argv[]) ...@@ -353,14 +402,38 @@ int main(int argc, char *argv[])
fprintf(output_file, "\n"); fprintf(output_file, "\n");
print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word); print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
} }
strcpy(previous_file, current_file);
strcpy(previous_directory, current_directory);
} while(word_buffer_move_right(wb)); } while(word_buffer_move_right(wb));
print_footer(output_file); print_footer(output_file);
if(ctx->conll_filename)
fclose(output_file); fclose(output_file);
mcf2json_context_free(ctx); }
else{ //ctx->root_dir is NULL dump everything to stdout
output_file = stdout;
print_header(output_file, ctx->mcd_struct, "");
do{
w = word_buffer_b0(wb);
if(new_sentence){
new_sentence = 0;
sentence_nb++;
index_first_word = word_buffer_get_current_index(wb);
}
if(word_get_sent_seg(w)){
index_last_word = word_buffer_get_current_index(wb);
new_sentence = 1;
if(first_sentence == 1)
first_sentence = 0;
else
fprintf(output_file, ",");
fprintf(output_file, "\n");
print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
}
} while(word_buffer_move_right(wb));
print_footer(output_file);
}
mcf2json_context_free(ctx);
return 0; return 0;
} }
...@@ -75,6 +75,7 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref, int root_label) ...@@ -75,6 +75,7 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref, int root_label)
){ ){
return MVT_PARSER_EOS; return MVT_PARSER_EOS;
} }
/* LEFT ARC b0 is the governor and s0 the dependent */ /* LEFT ARC b0 is the governor and s0 the dependent */
if(s0_gov_index == b0_index){ if(s0_gov_index == b0_index){
return movement_parser_left_code(word_get_label(word_buffer_get_word_n(ref, s0_index))); return movement_parser_left_code(word_get_label(word_buffer_get_word_n(ref, s0_index)));
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment