Commit 28e92d31 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

modification de json2mcf et mcf2json pour la prise en compte du statut des...

modification de json2mcf et mcf2json pour la prise en compte du statut des annotations. Les annotations corrigées par les annotateurs sont identifiées, ainsi que les annotations automatiques reconnues comme erronées
parent 0a141111
......@@ -10,8 +10,9 @@ SET(CMAKE_C_COMPILER g++)
SET(CMAKE_CXX_COMPILER g++)
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -DUSE_CBLAS")
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -DUSE_CBLAS -ggdb")
SET( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lm -lopenblas" )
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11 -ggdb")
if (${CMAKE_C_COMPILER_VERSION} VERSION_LESS 5.3)
......
......@@ -8,7 +8,7 @@
#define MCD_INVALID_VALUE -1
#define MCD_WF_NB 51
#define MCD_WF_NB 55
#define MCD_WF_ID 0
#define MCD_WF_OFFSET 0 /* ID and OFFSET are synonymous */
......@@ -65,6 +65,11 @@
#define MCD_WF_DIRECTORY 49
#define MCD_WF_SPEAKER 50
#define MCD_WF_WORD_SPAN_STATUS 51
#define MCD_WF_POS_LABEL_STATUS 52
#define MCD_WF_DEP_SPAN_STATUS 53
#define MCD_WF_DEP_LABEL_STATUS 54
/*Abbr
......@@ -148,6 +153,13 @@ Xtra*/
#define mcd_get_label_col(m) (m)->wf2col[MCD_WF_LABEL]
#define mcd_get_stag_col(m) (m)->wf2col[MCD_WF_STAG]
#define mcd_get_sent_seg_col(m) (m)->wf2col[MCD_WF_SENT_SEG]
#define mcd_get_word_span_status_col(m) (m)->wf2col[MCD_WF_WORD_SPAN_STATUS]
#define mcd_get_pos_label_status_col(m) (m)->wf2col[MCD_WF_POS_LABEL_STATUS]
#define mcd_get_dep_span_status_col(m) (m)->wf2col[MCD_WF_DEP_SPAN_STATUS]
#define mcd_get_dep_label_status_col(m) (m)->wf2col[MCD_WF_DEP_LABEL_STATUS]
#define mcd_get_a_col(m) (m)->wf2col[MCD_WF_A]
#define mcd_get_b_col(m) (m)->wf2col[MCD_WF_B]
#define mcd_get_c_col(m) (m)->wf2col[MCD_WF_C]
......
......@@ -109,6 +109,15 @@ typedef struct _word {
#define word_set_label(w, val) ((w)->wf_array[MCD_WF_LABEL] = (val))
#define word_set_stag(w, val) ((w)->wf_array[MCD_WF_STAG] = (val))
#define word_set_sent_seg(w, val) ((w)->wf_array[MCD_WF_SENT_SEG] = (val))
#define word_set_word_span_status(w, val) ((w)->wf_array[MCD_WF_WORD_SPAN_STATUS] = (val))
#define word_set_pos_label_status(w, val) ((w)->wf_array[MCD_WF_POS_LABEL_STATUS] = (val))
#define word_set_dep_span_status(w, val) ((w)->wf_array[MCD_WF_DEP_SPAN_STATUS] = (val))
#define word_set_dep_label_status(w, val) ((w)->wf_array[MCD_WF_DEP_LABEL_STATUS] = (val))
#define word_set_A(w, val) ((w)->wf_array[MCD_WF_A] = (val))
#define word_set_B(w, val) ((w)->wf_array[MCD_WF_B] = (val))
#define word_set_C(w, val) ((w)->wf_array[MCD_WF_C] = (val))
......
......@@ -517,6 +517,11 @@ int mcd_wf_code(char *wf)
if(!strcmp(wf, "DIRECTORY")) return MCD_WF_DIRECTORY;
if(!strcmp(wf, "SPEAKER")) return MCD_WF_SPEAKER;
if(!strcmp(wf, "WORD_SPAN_STATUS")) return MCD_WF_WORD_SPAN_STATUS;
if(!strcmp(wf, "POS_LABEL_STATUS")) return MCD_WF_POS_LABEL_STATUS;
if(!strcmp(wf, "DEP_SPAN_STATUS")) return MCD_WF_DEP_SPAN_STATUS;
if(!strcmp(wf, "DEP_LABEL_STATUS")) return MCD_WF_DEP_LABEL_STATUS;
if(!strcmp(wf, "A")) return MCD_WF_A;
if(!strcmp(wf, "B")) return MCD_WF_B;
......
......@@ -239,25 +239,52 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
mcd_struct = word_buffer_get_mcd(wb);
d = mcd_struct->dico_array[mcd_get_pos_col(mcd_struct)];
w = word_buffer_get_word_n(wb, offset + start);
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + start);
return;
}
if(status_lab && !strcmp(status_lab, "G")){
fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label);
w = word_buffer_get_word_n(wb, offset + start);
/* -------------------------------------*/
/* added by alexis 210/07/18 for datcha */
word_set_F(w, 1);
/* added by alexis 21/07/18 for datcha */
// word_set_F(w, 1);
/* -------------------------------------*/
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + start);
}
else{
if(d)
label_code = dico_string2int(d, label);
if(d)
label_code = dico_string2int(d, label);
if(label_code == -1)
fprintf(stderr, "label %s unknown\n", label);
else
word_set_pos(w, label_code);
}
/* -------------------------------------*/
/* added by alexis 04/05/2021 for orfeo */
if(status_seg){
dico *d1 = mcd_struct->dico_array[mcd_get_word_span_status_col(mcd_struct)];
if(d1){
label_code = dico_string2int(d1, status_seg);
if(label_code == -1)
fprintf(stderr, "label %s unknown\n", label);
else
word_set_pos(w, label_code);
word_set_word_span_status(w, label_code);
}
}
if(status_lab){
dico *d1 = mcd_struct->dico_array[mcd_get_pos_label_status_col(mcd_struct)];
if(d1){
label_code = dico_string2int(d1, status_lab);
if(label_code == -1)
fprintf(stderr, "label %s unknown\n", label);
else
word_set_pos_label_status(w, label_code);
}
}
/* -------------------------------------*/
}
void process_segment(json_attr_val *avl, word_buffer *wb, int offset)
......@@ -301,41 +328,57 @@ void update_link(word_buffer *wb, int orig, int dest, char *label, char *status
mcd_struct = word_buffer_get_mcd(wb);
d = mcd_struct->dico_array[mcd_get_label_col(mcd_struct)];
w = word_buffer_get_word_n(wb, offset + orig);
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
return;
}
if(status_lab && !strcmp(status_lab, "G")){
fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, label);
w = word_buffer_get_word_n(wb, offset + orig);
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
}
else{
if(d)
label_code = dico_string2int(d, label);
if(label_code == -1)
fprintf(stderr, "WARNING : label %s unknown\n", label);
// else
word_set_label(w, label_code);
}
}
if(status_link && !strcmp(status_link, "G")){
fprintf(stderr, "updating governor of token %d with %d\n", orig, dest);
w = word_buffer_get_word_n(wb, offset + orig);
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
}
else{
if(dest == -1) /* -1 is for root */
word_set_gov(w, 0);
else
word_set_gov(w, dest - orig);
}
/* -------------------------------------*/
/* added by alexis 04/05/2021 for orfeo */
if(status_link){
dico *d1 = mcd_struct->dico_array[mcd_get_dep_span_status_col(mcd_struct)];
if(d1){
label_code = dico_string2int(d1, status_link);
if(label_code == -1)
fprintf(stderr, "label %s unknown\n", status_link);
else
word_set_dep_span_status(w, label_code);
}
}
if(status_lab){
dico *d1 = mcd_struct->dico_array[mcd_get_dep_label_status_col(mcd_struct)];
if(d1){
label_code = dico_string2int(d1, status_lab);
if(label_code == -1)
fprintf(stderr, "label %s unknown\n", status_lab);
else
word_set_dep_label_status(w, label_code);
}
}
/* -------------------------------------*/
}
......@@ -344,7 +387,10 @@ void update_link(word_buffer *wb, int orig, int dest, char *label, char *status
void process_link(json_attr_val *avl, word_buffer *wb, int offset)
{
int orig, dest;
char *label, *status_link, *status_lab;
char *label = NULL;
char *status_link = NULL;
char *status_lab = NULL;
char *status_label = NULL;
json_attr_val *av;
for(av = avl; av != NULL; av = av->next){
......@@ -353,11 +399,15 @@ void process_link(json_attr_val *avl, word_buffer *wb, int offset)
if(!strcmp(av->attr, "dest")){dest = (int)(av->val->u.number); continue;}
if(!strcmp(av->attr, "label")){label = av->val->u.string; continue;}
if(!strcmp(av->attr, "status_link")){status_link = av->val->u.string; continue;}
if(!strcmp(av->attr, "status_label")){status_label = av->val->u.string; continue;}
if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;}
}
// fprintf(stderr, "link : orig = %d dest = %d label = %s status_link = %s status_lab = %s\n", orig, dest, label, status_link, status_lab);
update_link(wb, orig, dest, label, status_link, status_lab, offset);
if(status_label == NULL)
update_link(wb, orig, dest, label, status_link, status_lab, offset);
else
update_link(wb, orig, dest, label, status_link, status_label, offset);
}
......@@ -448,6 +498,10 @@ void check_token(json_attr_val *avl, word_buffer *wb, int offset)
}
w = word_buffer_get_word_n(wb, id);
if(w == NULL){
fprintf(stderr, "attention, w vaut nul, id = %d form = %s", id, form_json);
exit(1);
}
form_mcf = w->form;
fprintf(stderr, "id : %d \t json : %s \t mcf : %s\n", id, form_json, form_mcf);
if(!string_equal(form_json, form_mcf)){
......@@ -506,7 +560,45 @@ int main(int argc, char *argv[])
json2mcf_check_options(ctx);
mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename);
/*---------------------------------------------------*/
/* ajouté par Alexis le 04/05/2021 pour prendre en compte la notion de statut dans les fichiers mcf*/
/* add any missing labels to the status dictionaries */
dico *d;
d = ctx->mcd_struct->dico_array[mcd_get_word_span_status_col(ctx->mcd_struct)];
if(d){
dico_add(d, "_");
dico_add(d, "G");
dico_add(d, "C");
}
d = ctx->mcd_struct->dico_array[mcd_get_pos_label_status_col(ctx->mcd_struct)];
if(d){
dico_add(d, "_");
dico_add(d, "G");
dico_add(d, "C");
}
d = ctx->mcd_struct->dico_array[mcd_get_dep_span_status_col(ctx->mcd_struct)];
if(d){
dico_add(d, "_");
dico_add(d, "G");
dico_add(d, "C");
}
d = ctx->mcd_struct->dico_array[mcd_get_dep_label_status_col(ctx->mcd_struct)];
if(d){
dico_add(d, "_");
dico_add(d, "G");
dico_add(d, "C");
}
/*---------------------------------------------------*/
wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
root = json_parse_full(ctx->json_filename, 0);
if(root->type != JSON_AVL){
......
......@@ -181,6 +181,11 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in
int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb));
int label_col = mcd_get_label_col(word_buffer_get_mcd(wb));
int index_col = mcd_get_index_col(word_buffer_get_mcd(wb));
int dep_span_status_col = mcd_get_dep_span_status_col(word_buffer_get_mcd(wb));
int dep_label_status_col = mcd_get_dep_label_status_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{");
......@@ -206,8 +211,23 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in
fprintf(output_file, "_");
fprintf(output_file, "\", ");
fprintf(output_file, "\"status_link\": \"\", ");
fprintf(output_file, "\"status_lab\": \"\", ");
/* ajoute le 04/05/2021 par Alexis pour Orfeo */
fprintf(output_file, "\"status_link\": \"");
if(dep_span_status_col != -1)
word_print_col_n(output_file, w, dep_span_status_col);
else
fprintf(output_file, "_");
fprintf(output_file, "\", ");
fprintf(output_file, "\"status_lab\": \"");
if(dep_label_status_col != -1)
word_print_col_n(output_file, w, dep_label_status_col);
else
fprintf(output_file, "_");
fprintf(output_file, "\", ");
/* ------------------------------------------- */
fprintf(output_file, "\"timestamp\": \"\", ");
fprintf(output_file, "\"author\": \"\", ");
fprintf(output_file, "\"target\": \"\"");
......@@ -234,6 +254,8 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i
void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int index)
{
int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
int word_span_status_col = mcd_get_word_span_status_col(word_buffer_get_mcd(wb));
int pos_label_status_col = mcd_get_pos_label_status_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{ ");
......@@ -247,8 +269,21 @@ void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int
fprintf(output_file, "_");
fprintf(output_file, "\", ");
fprintf(output_file, "\"status_seg\": \"\", ");
fprintf(output_file, "\"status_lab\": \"\", ");
fprintf(output_file, "\"status_seg\": \"");
if(word_span_status_col != -1)
word_print_col_n(output_file, w, word_span_status_col);
else
fprintf(output_file, "_");
fprintf(output_file, "\", ");
fprintf(output_file, "\"status_lab\": \"");
if(pos_label_status_col != -1)
word_print_col_n(output_file, w, pos_label_status_col);
else
fprintf(output_file, "_");
fprintf(output_file, "\", ");
fprintf(output_file, "\"timestamp\": \"\", ");
fprintf(output_file, "\"author\": \"\", ");
fprintf(output_file, "\"target\": \"\", ");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment