diff --git a/CMakeLists.txt b/CMakeLists.txt index 903cc582a74196aabbb3bc58fbf6b9d50fbffb1d..009441f83bf81a0615aba3c9c42a665d1de53e58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,9 @@ SET(CMAKE_C_COMPILER g++) SET(CMAKE_CXX_COMPILER g++) -SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -DUSE_CBLAS") +SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -DUSE_CBLAS -ggdb") SET( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lm -lopenblas" ) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11 -ggdb") if (${CMAKE_C_COMPILER_VERSION} VERSION_LESS 5.3) diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index 560fd6cb4baa6edcd911d9eee32f9b054f37513e..9bc126fe0f9902838a2953472a17936be92bb187 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -8,7 +8,7 @@ #define MCD_INVALID_VALUE -1 -#define MCD_WF_NB 51 +#define MCD_WF_NB 55 #define MCD_WF_ID 0 #define MCD_WF_OFFSET 0 /* ID and OFFSET are synonymous */ @@ -65,6 +65,11 @@ #define MCD_WF_DIRECTORY 49 #define MCD_WF_SPEAKER 50 +#define MCD_WF_WORD_SPAN_STATUS 51 +#define MCD_WF_POS_LABEL_STATUS 52 +#define MCD_WF_DEP_SPAN_STATUS 53 +#define MCD_WF_DEP_LABEL_STATUS 54 + /*Abbr @@ -148,6 +153,13 @@ Xtra*/ #define mcd_get_label_col(m) (m)->wf2col[MCD_WF_LABEL] #define mcd_get_stag_col(m) (m)->wf2col[MCD_WF_STAG] #define mcd_get_sent_seg_col(m) (m)->wf2col[MCD_WF_SENT_SEG] + +#define mcd_get_word_span_status_col(m) (m)->wf2col[MCD_WF_WORD_SPAN_STATUS] +#define mcd_get_pos_label_status_col(m) (m)->wf2col[MCD_WF_POS_LABEL_STATUS] +#define mcd_get_dep_span_status_col(m) (m)->wf2col[MCD_WF_DEP_SPAN_STATUS] +#define mcd_get_dep_label_status_col(m) (m)->wf2col[MCD_WF_DEP_LABEL_STATUS] + + #define mcd_get_a_col(m) (m)->wf2col[MCD_WF_A] #define mcd_get_b_col(m) (m)->wf2col[MCD_WF_B] #define mcd_get_c_col(m) (m)->wf2col[MCD_WF_C] diff --git a/maca_common/include/word.h b/maca_common/include/word.h index dcf18dc38b52b93a22209ccb73733a5eddd91b08..c0ee708daaced5b0d1dd74eda52b1c4d27fae34d 
100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -109,6 +109,15 @@ typedef struct _word { #define word_set_label(w, val) ((w)->wf_array[MCD_WF_LABEL] = (val)) #define word_set_stag(w, val) ((w)->wf_array[MCD_WF_STAG] = (val)) #define word_set_sent_seg(w, val) ((w)->wf_array[MCD_WF_SENT_SEG] = (val)) + + +#define word_set_word_span_status(w, val) ((w)->wf_array[MCD_WF_WORD_SPAN_STATUS] = (val)) +#define word_set_pos_label_status(w, val) ((w)->wf_array[MCD_WF_POS_LABEL_STATUS] = (val)) +#define word_set_dep_span_status(w, val) ((w)->wf_array[MCD_WF_DEP_SPAN_STATUS] = (val)) +#define word_set_dep_label_status(w, val) ((w)->wf_array[MCD_WF_DEP_LABEL_STATUS] = (val)) + + + #define word_set_A(w, val) ((w)->wf_array[MCD_WF_A] = (val)) #define word_set_B(w, val) ((w)->wf_array[MCD_WF_B] = (val)) #define word_set_C(w, val) ((w)->wf_array[MCD_WF_C] = (val)) diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index 34fd53b7bfbae62d856189c29183af4f735cd3c3..4eff7b1a147438c15f1385a118d71640ebc38f68 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -517,6 +517,11 @@ int mcd_wf_code(char *wf) if(!strcmp(wf, "DIRECTORY")) return MCD_WF_DIRECTORY; if(!strcmp(wf, "SPEAKER")) return MCD_WF_SPEAKER; + if(!strcmp(wf, "WORD_SPAN_STATUS")) return MCD_WF_WORD_SPAN_STATUS; + if(!strcmp(wf, "POS_LABEL_STATUS")) return MCD_WF_POS_LABEL_STATUS; + if(!strcmp(wf, "DEP_SPAN_STATUS")) return MCD_WF_DEP_SPAN_STATUS; + if(!strcmp(wf, "DEP_LABEL_STATUS")) return MCD_WF_DEP_LABEL_STATUS; + if(!strcmp(wf, "A")) return MCD_WF_A; if(!strcmp(wf, "B")) return MCD_WF_B; diff --git a/maca_tools/src/json2mcf.c b/maca_tools/src/json2mcf.c index 21e481c6a8a8b27a609f8ec456c9751999831ec1..738a6ac451db1f48db98372453e282ab8935ef14 100644 --- a/maca_tools/src/json2mcf.c +++ b/maca_tools/src/json2mcf.c @@ -239,25 +239,52 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat mcd_struct = word_buffer_get_mcd(wb); d = 
mcd_struct->dico_array[mcd_get_pos_col(mcd_struct)]; + w = word_buffer_get_word_n(wb, offset + start); + if(w == NULL){ + fprintf(stderr, "WARNING cannot access segment %d\n", offset + start); + return; + } if(status_lab && !strcmp(status_lab, "G")){ fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label); - w = word_buffer_get_word_n(wb, offset + start); + /* -------------------------------------*/ - /* added by alexis 210/07/18 for datcha */ - word_set_F(w, 1); + /* added by alexis 21/07/18 for datcha */ + // word_set_F(w, 1); /* -------------------------------------*/ - if(w == NULL){ - fprintf(stderr, "WARNING cannot access segment %d\n", offset + start); - } - else{ - if(d) - label_code = dico_string2int(d, label); + if(d) + label_code = dico_string2int(d, label); + if(label_code == -1) + fprintf(stderr, "label %s unknown\n", label); + else + word_set_pos(w, label_code); + } + + + /* -------------------------------------*/ + /* added by alexis 04/05/2021 for orfeo: store the segment-span and POS-label status codes on the word; a failed lookup reports the status string that was looked up */ + if(status_seg){ + dico *d1 = mcd_struct->dico_array[mcd_get_word_span_status_col(mcd_struct)]; + if(d1){ + label_code = dico_string2int(d1, status_seg); if(label_code == -1) - fprintf(stderr, "label %s unknown\n", label); + fprintf(stderr, "label %s unknown\n", status_seg); else - word_set_pos(w, label_code); + word_set_word_span_status(w, label_code); + } } + if(status_lab){ + dico *d1 = mcd_struct->dico_array[mcd_get_pos_label_status_col(mcd_struct)]; + if(d1){ + label_code = dico_string2int(d1, status_lab); + if(label_code == -1) + fprintf(stderr, "label %s unknown\n", status_lab); + else + word_set_pos_label_status(w, label_code); + + } + } + /* -------------------------------------*/ } void process_segment(json_attr_val *avl, word_buffer *wb, int offset) @@ -301,41 +328,57 @@ void update_link(word_buffer *wb, int orig, int dest, char *label, char *status mcd_struct = word_buffer_get_mcd(wb); d = mcd_struct->dico_array[mcd_get_label_col(mcd_struct)]; + + w = word_buffer_get_word_n(wb, offset + orig); + + if(w == NULL){ + 
fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig); + return; + } if(status_lab && !strcmp(status_lab, "G")){ fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, label); - w = word_buffer_get_word_n(wb, offset + orig); - - if(w == NULL){ - fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig); - } - - else{ if(d) label_code = dico_string2int(d, label); if(label_code == -1) fprintf(stderr, "WARNING : label %s unknown\n", label); // else word_set_label(w, label_code); - } } if(status_link && !strcmp(status_link, "G")){ fprintf(stderr, "updating governor of token %d with %d\n", orig, dest); - w = word_buffer_get_word_n(wb, offset + orig); - - if(w == NULL){ - fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig); - } - - else{ - if(dest == -1) /* -1 is for root */ word_set_gov(w, 0); else word_set_gov(w, dest - orig); + } + /* -------------------------------------*/ + /* added by alexis 04/05/2021 for orfeo */ + if(status_link){ + dico *d1 = mcd_struct->dico_array[mcd_get_dep_span_status_col(mcd_struct)]; + if(d1){ + label_code = dico_string2int(d1, status_link); + if(label_code == -1) + fprintf(stderr, "label %s unknown\n", status_link); + else + word_set_dep_span_status(w, label_code); + } } + if(status_lab){ + dico *d1 = mcd_struct->dico_array[mcd_get_dep_label_status_col(mcd_struct)]; + if(d1){ + label_code = dico_string2int(d1, status_lab); + if(label_code == -1) + fprintf(stderr, "label %s unknown\n", status_lab); + else + word_set_dep_label_status(w, label_code); + + } + } + /* -------------------------------------*/ + } @@ -344,7 +387,10 @@ void update_link(word_buffer *wb, int orig, int dest, char *label, char *status void process_link(json_attr_val *avl, word_buffer *wb, int offset) { int orig, dest; - char *label, *status_link, *status_lab; + char *label = NULL; + char *status_link = NULL; + char *status_lab = NULL; + char *status_label = NULL; json_attr_val *av; for(av = avl; av != 
NULL; av = av->next){ @@ -353,11 +399,15 @@ void process_link(json_attr_val *avl, word_buffer *wb, int offset) if(!strcmp(av->attr, "dest")){dest = (int)(av->val->u.number); continue;} if(!strcmp(av->attr, "label")){label = av->val->u.string; continue;} if(!strcmp(av->attr, "status_link")){status_link = av->val->u.string; continue;} + if(!strcmp(av->attr, "status_label")){status_label = av->val->u.string; continue;} if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;} } // fprintf(stderr, "link : orig = %d dest = %d label = %s status_link = %s status_lab = %s\n", orig, dest, label, status_link, status_lab); - update_link(wb, orig, dest, label, status_link, status_lab, offset); + if(status_label == NULL) + update_link(wb, orig, dest, label, status_link, status_lab, offset); + else + update_link(wb, orig, dest, label, status_link, status_label, offset); } @@ -448,6 +498,10 @@ void check_token(json_attr_val *avl, word_buffer *wb, int offset) } w = word_buffer_get_word_n(wb, id); + if(w == NULL){ + fprintf(stderr, "attention, w vaut nul, id = %d form = %s\n", id, form_json); + exit(1); + } form_mcf = w->form; fprintf(stderr, "id : %d \t json : %s \t mcf : %s\n", id, form_json, form_mcf); if(!string_equal(form_json, form_mcf)){ @@ -506,7 +560,45 @@ int main(int argc, char *argv[]) json2mcf_check_options(ctx); mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename); + + + /*---------------------------------------------------*/ + /* added by Alexis on 04/05/2021 to take status annotations in mcf files into account */ + /* add the labels "_", "G" and "C" to each status dictionary that exists, before the mcf file is loaded */ + + dico *d; + d = ctx->mcd_struct->dico_array[mcd_get_word_span_status_col(ctx->mcd_struct)]; + if(d){ + dico_add(d, "_"); + dico_add(d, "G"); + dico_add(d, "C"); + } + + d = ctx->mcd_struct->dico_array[mcd_get_pos_label_status_col(ctx->mcd_struct)]; + if(d){ + dico_add(d, "_"); + dico_add(d, "G"); + dico_add(d, "C"); + } + + d = 
ctx->mcd_struct->dico_array[mcd_get_dep_span_status_col(ctx->mcd_struct)]; + if(d){ + dico_add(d, "_"); + dico_add(d, "G"); + dico_add(d, "C"); + } + + d = ctx->mcd_struct->dico_array[mcd_get_dep_label_status_col(ctx->mcd_struct)]; + if(d){ + dico_add(d, "_"); + dico_add(d, "G"); + dico_add(d, "C"); + } + + /*---------------------------------------------------*/ + wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct); + root = json_parse_full(ctx->json_filename, 0); if(root->type != JSON_AVL){ diff --git a/maca_tools/src/mcf2json.c b/maca_tools/src/mcf2json.c index 08e523b22570a704ff91d75def8d6a85d6036d5e..0182a52a6ce387ae5b504fc1a0bcde90e558d530 100644 --- a/maca_tools/src/mcf2json.c +++ b/maca_tools/src/mcf2json.c @@ -181,6 +181,11 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb)); int label_col = mcd_get_label_col(word_buffer_get_mcd(wb)); int index_col = mcd_get_index_col(word_buffer_get_mcd(wb)); + + int dep_span_status_col = mcd_get_dep_span_status_col(word_buffer_get_mcd(wb)); + int dep_label_status_col = mcd_get_dep_label_status_col(word_buffer_get_mcd(wb)); + + word *w = word_buffer_get_word_n(wb, index); fprintf(output_file, "{"); @@ -206,8 +211,23 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in fprintf(output_file, "_"); fprintf(output_file, "\", "); - fprintf(output_file, "\"status_link\": \"\", "); - fprintf(output_file, "\"status_lab\": \"\", "); + /* ajoute le 04/05/2021 par Alexis pour Orfeo */ + + fprintf(output_file, "\"status_link\": \""); + if(dep_span_status_col != -1) + word_print_col_n(output_file, w, dep_span_status_col); + else + fprintf(output_file, "_"); + fprintf(output_file, "\", "); + + fprintf(output_file, "\"status_lab\": \""); + + if(dep_label_status_col != -1) + word_print_col_n(output_file, w, dep_label_status_col); + else + fprintf(output_file, "_"); + fprintf(output_file, "\", "); + /* 
------------------------------------------- */ fprintf(output_file, "\"timestamp\": \"\", "); fprintf(output_file, "\"author\": \"\", "); fprintf(output_file, "\"target\": \"\""); @@ -234,6 +254,8 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int index) { int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb)); + int word_span_status_col = mcd_get_word_span_status_col(word_buffer_get_mcd(wb)); + int pos_label_status_col = mcd_get_pos_label_status_col(word_buffer_get_mcd(wb)); word *w = word_buffer_get_word_n(wb, index); fprintf(output_file, "{ "); @@ -247,8 +269,21 @@ void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int fprintf(output_file, "_"); fprintf(output_file, "\", "); - fprintf(output_file, "\"status_seg\": \"\", "); - fprintf(output_file, "\"status_lab\": \"\", "); + fprintf(output_file, "\"status_seg\": \""); + if(word_span_status_col != -1) + word_print_col_n(output_file, w, word_span_status_col); + else + fprintf(output_file, "_"); + fprintf(output_file, "\", "); + + fprintf(output_file, "\"status_lab\": \""); + if(pos_label_status_col != -1) + word_print_col_n(output_file, w, pos_label_status_col); + else + fprintf(output_file, "_"); + fprintf(output_file, "\", "); + + fprintf(output_file, "\"timestamp\": \"\", "); fprintf(output_file, "\"author\": \"\", "); fprintf(output_file, "\"target\": \"\", ");