diff --git a/maca_common/include/word_buffer.h b/maca_common/include/word_buffer.h index 136b5606bf8e3008687bd8f4f5aeebdd0cf76ab3..184608f1fa06277abe805ae448628e396a16f8fd 100644 --- a/maca_common/include/word_buffer.h +++ b/maca_common/include/word_buffer.h @@ -54,7 +54,7 @@ void word_buffer_print(FILE *f, word_buffer *wb); void word_buffer_print_compact(FILE *f, word_buffer *wb); int word_buffer_read_sentence(word_buffer *bw); word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct); -int word_buffer_locate_token(word_buffer *wb, int offset); +int word_buffer_locate_token_with_offset(word_buffer *wb, int offset); /* int word_buffer_is_empty(word_buffer *wb); int word_buffer_is_last(word_buffer *wb); diff --git a/maca_common/src/word_buffer.c b/maca_common/src/word_buffer.c index a15542bdaab0670230e13d44e96823cb970b888b..7e52dd3c1227ef1c0e955bb7383d0c4a97bfc4f3 100644 --- a/maca_common/src/word_buffer.c +++ b/maca_common/src/word_buffer.c @@ -156,7 +156,7 @@ int word_buffer_read_sentence(word_buffer *wb) return wb->nbelem ; } -int word_buffer_locate_token(word_buffer *wb, int offset) +int word_buffer_locate_token_with_offset(word_buffer *wb, int offset) { int c, first, last, middle; word *w_middle; @@ -165,7 +165,7 @@ int word_buffer_locate_token(word_buffer *wb, int offset) middle = (first+last)/2; while (first <= last) { - // printf("first = %d middle = %d last = %d\n", first, middle, last); + // printf("first = %d middle = %d last = %d\n", first, middle, last); w_middle = word_buffer_get_word_n(wb, middle); // printf("w middle = %d current offset = %d\n", w_middle, word_get_offset(w_middle)); if (word_get_offset(w_middle) < offset) diff --git a/maca_tools/src/json2mcf.c b/maca_tools/src/json2mcf.c index d44a70a551cce18713226ef03649b3a1116c7516..55109851ac73d13dc8e9299fd23b70adec760dba 100644 --- a/maca_tools/src/json2mcf.c +++ b/maca_tools/src/json2mcf.c @@ -228,7 +228,7 @@ void json2mcf_print_word_buffer(FILE *f, word_buffer *wb) */ -void update_segment(word_buffer *wb, int start, int end, char *label, char *status_seg, char *status_lab) +void update_segment(word_buffer *wb, int start, int end, char *label, char *status_seg, char *status_lab, int offset) { int index; word *w; @@ -241,9 +241,7 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat if(status_lab && !strcmp(status_lab, "G")){ fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label); - index = word_buffer_locate_token(wb, start); - w = word_buffer_get_word_n(wb, index); - + w = word_buffer_get_word_n(wb, offset + start); if(d) label_code = dico_string2int(d, label); if(label_code == -1) @@ -251,13 +249,9 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat else word_set_pos(w, label_code); } - } - - - -void process_segment(json_attr_val *avl, word_buffer *wb) +void process_segment(json_attr_val *avl, word_buffer *wb, int offset) { int start, end; char *label, *status_seg, *status_lab; @@ -271,24 +265,56 @@ void process_segment(json_attr_val *avl, word_buffer *wb) if(!strcmp(av->attr, "status_seg")){status_seg = av->val->u.string; continue;} if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;} } - update_segment(wb, start, end, label, status_seg, status_lab); + update_segment(wb, start, end, label, status_seg, status_lab, offset); // printf("segment : start = %d end = %d label = %s status_seg = %s status_lab = %s\n", start, end, label, status_seg, status_lab); } -void process_segments(json_struct *segments, word_buffer *wb) +void process_segments(json_struct *segments, word_buffer *wb, int offset) { json_struct *segment; // printf("process_segments\n"); for(segment = segments->u.first; segment != NULL; segment = segment->next){ - process_segment(segment->u.attr_val_list, wb); + process_segment(segment->u.attr_val_list, wb, offset); } } // {"orig": 1, "dest":2, "label": "suj", "status_link": "", "status_lab": "", "timestamp": "", "author": "", "target": ""}, +void update_link(word_buffer *wb, int orig, int dest, char *label, char *status_link, char *status_lab, int offset) +{ + + int index; + word *w = NULL; + int label_code = -1; + dico *d; + mcd *mcd_struct = NULL; + + mcd_struct = word_buffer_get_mcd(wb); + d = mcd_struct->dico_array[mcd_get_label_col(mcd_struct)]; + + if(status_lab && !strcmp(status_lab, "G")){ + fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, label); + w = word_buffer_get_word_n(wb, offset + orig); + + if(d) + label_code = dico_string2int(d, label); + if(label_code == -1) + fprintf(stderr, "label %s unknown\n", label); + else + word_set_label(w, label_code); + } + + if(status_link && !strcmp(status_link, "G")){ + fprintf(stderr, "updating governor of token %d with %d\n", orig, dest); + w = word_buffer_get_word_n(wb, offset + orig); + word_set_gov(w, dest - orig); + } + + +} -void process_link(json_attr_val *avl, word_buffer *wb) +void process_link(json_attr_val *avl, word_buffer *wb, int offset) { int orig, dest; char *label, *status_link, *status_lab; @@ -302,27 +328,53 @@ void process_link(json_attr_val *avl, word_buffer *wb) if(!strcmp(av->attr, "status_link")){status_link = av->val->u.string; continue;} if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;} } - // printf("link : orig = %d dest = %d label = %s status_link = %s status_lab = %s\n", orig, dest, label, status_link, status_lab); + // fprintf(stderr, "link : orig = %d dest = %d label = %s status_link = %s status_lab = %s\n", orig, dest, label, status_link, status_lab); + update_link(wb, orig, dest, label, status_link, status_lab, offset); + + } -void process_links(json_struct *segments, word_buffer *wb) +void process_links(json_struct *links, word_buffer *wb, int offset) { json_struct *link; // printf("process_links\n"); - for(link = segments->u.first; link != NULL; link = link->next){ - process_link(link->u.attr_val_list, wb); + for(link = links->u.first; link != NULL; link = link->next){ + process_link(link->u.attr_val_list, wb, offset); } } +int get_id_of_first_token_in_document(json_struct *document) +{ + json_attr_val *avl = NULL; + json_struct *tokens, *token; + json_attr_val *avl2 = NULL; + for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){ + if(!strcmp(avl->attr, (char *)"tokens")){ + tokens = avl->val; + if(tokens){ + token = tokens->u.first; + if(token){ + for(avl2 = token->u.attr_val_list; avl2 != NULL; avl2 = avl2->next){ + if(!strcmp(avl2->attr, (char *)"id")) + return (int)avl2->val->u.number; + } + } + } + } + } + return -1; +} + void process_document(json_struct *document, word_buffer *wb) { json_attr_val *avl = NULL; - // printf("process_document\n"); + int offset = get_id_of_first_token_in_document(document); + // printf("process_document, offset = %d\n", offset); for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){ // if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string); - if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb); - if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb); + if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb, offset); + if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb, offset); } } diff --git a/maca_tools/src/mcf2json.c b/maca_tools/src/mcf2json.c index fc5bb5d5e8a5b974e76de0874ed50856e47ace61..08e523b22570a704ff91d75def8d6a85d6036d5e 100644 --- a/maca_tools/src/mcf2json.c +++ b/maca_tools/src/mcf2json.c @@ -184,8 +184,6 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in word *w = word_buffer_get_word_n(wb, index); fprintf(output_file, "{"); - - // fprintf(output_file, "\"orig\": %d, ", word_get_offset(w)); fprintf(output_file, "\"orig\": %d, ", index - index_first_word); fprintf(output_file, "\"dest\":"); if(gov_col){ @@ -193,7 +191,6 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in fprintf(output_file, "-1"); else{ word *gov = word_buffer_get_word_n(wb, word_get_gov(w) + index); -// fprintf(output_file, "%d", word_get_offset(gov)); fprintf(output_file, "%d", word_get_gov(w) + index - index_first_word); } } @@ -240,9 +237,7 @@ void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int word *w = word_buffer_get_word_n(wb, index); fprintf(output_file, "{ "); - /* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */ fprintf(output_file, "\"start\": %d, ", index - index_first_word); - /* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */ fprintf(output_file, "\"end\": %d, ", index - index_first_word); fprintf(output_file, "\"label\": \""); @@ -278,15 +273,24 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in void print_token(FILE *output_file, word_buffer *wb, int index) { int form_col = mcd_get_form_col(word_buffer_get_mcd(wb)); - int offset_col = mcd_get_offset_col(word_buffer_get_mcd(wb)); int length_col = mcd_get_length_col(word_buffer_get_mcd(wb)); word *w = word_buffer_get_word_n(wb, index); + char token[5000]; + int length_token, i; fprintf(output_file, "{ "); - fprintf(output_file, "\"id\": %d, ", word_get_offset(w)); + fprintf(output_file, "\"id\": %d, ", word_get_index(w)); fprintf(output_file, "\"word\": \""); - if(form_col != -1) - word_print_col_n(output_file, w, form_col); + if(form_col != -1){ + word_sprint_col_n(token, w, form_col); + length_token = strlen(token); + for(i=0; i < length_token; i++){ + if(token[i] == '"') + fprintf(output_file, """); + else + fprintf(output_file, "%c", token[i]); + } + } else fprintf(output_file, "_"); fprintf(output_file, "\", "); diff --git a/maca_trans_parser/src/movements.c b/maca_trans_parser/src/movements.c index d4c6c8e0eb47e2d7ca7365b50eee5ff0e4947981..afaf9320fe085ed00a28e2b9b0cf713b4f2de3a7 100644 --- a/maca_trans_parser/src/movements.c +++ b/maca_trans_parser/src/movements.c @@ -187,7 +187,7 @@ int movement_eos_old(config *c, int movement_code) /* set word on the top of the stack to eos */ word_set_sent_seg(s0, 1); - stack_pop(config_get_stack(c)); + stack_pop(config_get_stack(c)); config_push_mvt(c, movement_code, s0, NULL); return 1; }