diff --git a/maca_common/include/json_parser.h b/maca_common/include/json_parser.h
index 82664e5c006c931630c1ac97ce5686f2d2757c96..a254a09ae142936796314b550631d1d05ef24228 100644
--- a/maca_common/include/json_parser.h
+++ b/maca_common/include/json_parser.h
@@ -3,7 +3,7 @@
 
 #include "json_tree.h"
 
-#define YYTEXT_MAX 100
+#define YYTEXT_MAX 5000
 #define EPSILON 0
 
 /* symboles non terminaux */
diff --git a/maca_common/include/word_buffer.h b/maca_common/include/word_buffer.h
index 73f191638e98e2bd889b27b4303a2b1345c7f581..9efbaf09f8e90422907c4a85954fcc6cd62fb0ec 100644
--- a/maca_common/include/word_buffer.h
+++ b/maca_common/include/word_buffer.h
@@ -56,8 +56,8 @@ int word_buffer_read_sentence(word_buffer *bw);
 word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct);
 
 int word_buffer_locate_token_with_offset(word_buffer *wb, int offset);
-word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov);
-word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root);
+word *word_buffer_get_rightmost_child_of_s0(word_buffer *wb, word *gov);
+word *word_buffer_get_rightmost_descendent_of_s0(word_buffer *wb, word *root);
 
 
 
diff --git a/maca_common/src/word_buffer.c b/maca_common/src/word_buffer.c
index 715729cd936bdafc98ad4466d0ee5ceaee6d9e0e..971c03781657755f150f63e9e3b6e15b1336558a 100644
--- a/maca_common/src/word_buffer.c
+++ b/maca_common/src/word_buffer.c
@@ -187,7 +187,7 @@ int word_buffer_locate_token_with_offset(word_buffer *wb, int offset)
 }
 
 
-word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov)
+word *word_buffer_get_rightmost_child_of_s0(word_buffer *wb, word *gov)
 {
   word *bm1 = word_buffer_bm1(wb);
   int rightmost_child_index = word_get_index(gov);
@@ -198,7 +198,7 @@ word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov)
 }
 
 
-word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root)
+word *word_buffer_get_rightmost_descendent_of_s0(word_buffer *wb, word *root)
 {
   word *rightmost_descendent = root;
   word *rightmost_child = NULL;
@@ -206,7 +206,7 @@ word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root)
 
   while(change){
     change = 0;
-    rightmost_child = word_buffer_get_rightmost_child(wb, rightmost_descendent);
+    rightmost_child = word_buffer_get_rightmost_child_of_s0(wb, rightmost_descendent);
     if(word_get_index(rightmost_child) > word_get_index(rightmost_descendent)){
       rightmost_descendent = rightmost_child;
       change = 1;
diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l
index 964b702fbaa3749834e286966655d18f696eb04f..f34cc006e4c6075c9ae2fd71e2f8b4cced81ada2 100644
--- a/maca_tokenizer/src/fr_tok_rules.l
+++ b/maca_tokenizer/src/fr_tok_rules.l
@@ -26,7 +26,7 @@ nosepar [^ \t\n]
   if(defait_amalgames){
     BEGIN(state_defait_amalgames);
   }
-#.* ECHO;
+#.* {ECHO; printf("\n");}
 \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
 {separ}+ {maca_tokenizer_segment((char *)"", yytext);}
 \. {maca_tokenizer_segment((char *)".", yytext);}
diff --git a/maca_tools/src/json2mcf.c b/maca_tools/src/json2mcf.c
index 55109851ac73d13dc8e9299fd23b70adec760dba..a5bf8758cb272bd96122de1ae67c5acabcd3da1a 100644
--- a/maca_tools/src/json2mcf.c
+++ b/maca_tools/src/json2mcf.c
@@ -242,12 +242,21 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
   if(status_lab && !strcmp(status_lab, "G")){
     fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label);
     w = word_buffer_get_word_n(wb, offset + start);
-    if(d)
-      label_code = dico_string2int(d, label);
-    if(label_code == -1)
-      fprintf(stderr, "label %s unknown\n", label);
-    else
-      word_set_pos(w, label_code);
+    if(w == NULL){
+      fprintf(stderr, "WARNING cannot access segment %d\n", offset + start);
+    }
+    else{
+      /* -------------------------------------*/
+      /* added by alexis 210/07/18 for datcha */
+      word_set_F(w, 1);
+      /* -------------------------------------*/
+      if(d)
+        label_code = dico_string2int(d, label);
+      if(label_code == -1)
+        fprintf(stderr, "label %s unknown\n", label);
+      else
+        word_set_pos(w, label_code);
+    }
   }
 }
 
@@ -296,24 +305,42 @@ void update_link(word_buffer *wb, int orig, int dest, char *label, char *status
   if(status_lab && !strcmp(status_lab, "G")){
     fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, label);
     w = word_buffer_get_word_n(wb, offset + orig);
+
+    if(w == NULL){
+      fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
+    }
 
-    if(d)
-      label_code = dico_string2int(d, label);
-    if(label_code == -1)
-      fprintf(stderr, "label %s unknown\n", label);
-    else
+    else{
+      if(d)
+        label_code = dico_string2int(d, label);
+      if(label_code == -1)
+        fprintf(stderr, "WARNING : label %s unknown\n", label);
+      //      else
       word_set_label(w, label_code);
+    }
   }
 
   if(status_link && !strcmp(status_link, "G")){
     fprintf(stderr, "updating governor of token %d with %d\n", orig, dest);
     w = word_buffer_get_word_n(wb, offset + orig);
-    word_set_gov(w, dest - orig);
-  }
-
+    if(w == NULL){
+      fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
+    }
+
+    else{
+
+      if(dest == -1) /* -1 is for root */
+        word_set_gov(w, 0);
+      else
+        word_set_gov(w, dest - orig);
+    }
+  }
 }
+
+
+
 
 
 void process_link(json_attr_val *avl, word_buffer *wb, int offset)
 {
   int orig, dest;
@@ -366,13 +393,74 @@ int get_id_of_first_token_in_document(json_struct *document)
   return -1;
 }
 
+
+/* Tell whether a token form read from the json file matches the form found
+   in the mcf file: returns 1 when they correspond, 0 otherwise.
+   NOTE(review): the entity literals below were restored from HTML-decoded
+   text in which "&quot;" had been rendered as a bare quote -- confirm. */
+int string_equal(char *s_json, char *s_mcf)
+{
+  if(!strcmp(s_json, "&quot;") && !strcmp(s_mcf, "\"")) return 1;
+  if(!strcmp(s_json, "&quot;&quot;") && !strcmp(s_mcf, "\";\"")) return 1;
+  if(!strcmp(s_json, s_mcf)) return 1;
+  return 0;
+}
+
+//{"id":337,"word":"Bonjour","bold":0,"newline":0}
+
+/* Check that one json token (attribute list avl) carries the same form as
+   the word of index id in wb; exits with status 1 when the forms differ or
+   the word cannot be read. Tokens lacking "id" or "word" are skipped with
+   a warning. The offset parameter is not used here. */
+void check_token(json_attr_val *avl, word_buffer *wb, int offset)
+{
+  int id = -1;
+  char *form_json = NULL;
+  char *form_mcf = NULL;
+  json_attr_val *av;
+  word *w = NULL;
+  for(av = avl; av != NULL; av = av->next){
+    //    printf("attr = %s\n", av->attr);
+    if(!strcmp(av->attr, "id")){id = (int)(av->val->u.number); continue;}
+    if(!strcmp(av->attr, "word")){form_json = av->val->u.string; continue;}
+
+  }
+  if(id < 0 || form_json == NULL){
+    fprintf(stderr, "WARNING token without id or word, check skipped\n");
+    return;
+  }
+  w = word_buffer_get_word_n(wb, id);
+  if(w == NULL || w->form == NULL){
+    fprintf(stderr, "ERROR, cannot access token %d in mcf file\n", id);
+    exit(1);
+  }
+  form_mcf = w->form;
+  fprintf(stderr, "id : %d \t json : %s \t mcf : %s\n", id, form_json, form_mcf);
+  if(!string_equal(form_json, form_mcf)){
+    fprintf(stderr, "ERROR, tokens do not correspond in json and mcf files\n");
+    exit(1);
+  }
+}
+
+void check_tokens(json_struct *tokens, word_buffer *wb, int offset)
+{
+  json_struct *token;
+  //  printf("process_tokens\n");
+  for(token = tokens->u.first; token != NULL; token = token->next){
+    check_token(token->u.attr_val_list, wb, offset);
+  }
+}
+
+
+
 void process_document(json_struct *document, word_buffer *wb)
 {
   json_attr_val *avl = NULL;
   int offset = get_id_of_first_token_in_document(document);
-  //  printf("process_document, offset = %d\n", offset);
+  fprintf(stderr, "process_document, offset = %d\n", offset);
   for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){
     //    if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string);
+    if(!strcmp(avl->attr, (char *)"tokens")) check_tokens(avl->val, wb, offset);
     if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb, offset);
     if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb, offset);
   }
diff --git a/maca_tools/src/mcf2conll.c b/maca_tools/src/mcf2conll.c
index 8228022002369a1103bd568f07b3e40011d0c917..809adb92af0a21124c139919a236ea49b4a90f47 100644
--- a/maca_tools/src/mcf2conll.c
+++ b/maca_tools/src/mcf2conll.c
@@ -122,6 +122,38 @@ context *context_read_options(int argc, char *argv[])
   return ctx;
 }
 
+
+word *get_rightmost_child(word_buffer *wb, word *gov)
+{
+  word *bm1 = word_buffer_bm1(wb);
+  int rightmost_child_index = word_get_index(gov);
+  for(int current_index = word_get_index(gov); current_index < wb->nbelem; current_index++)
+    if(word_get_gov_index(word_buffer_get_word_n(wb, current_index)) == word_get_index(gov))
+      rightmost_child_index = current_index;
+  return word_buffer_get_word_n(wb, rightmost_child_index);
+}
+
+
+word *get_rightmost_descendent(word_buffer *wb, word *root)
+{
+  word *rightmost_descendent = root;
+  word *rightmost_child = NULL;
+  int change = 1;
+
+  while(change){
+    change = 0;
+    rightmost_child = get_rightmost_child(wb, rightmost_descendent);
+    if(word_get_index(rightmost_child) > word_get_index(rightmost_descendent)){
+      rightmost_descendent = rightmost_child;
+      change = 1;
+    }
+  }
+
+  return rightmost_descendent;
+}
+
+
+
 int main(int argc, char *argv[])
 {
   FILE *output_file;
@@ -140,6 +172,8 @@ int main(int argc, char *argv[])
   int feats_col = mcd_get_feats_col(ctx->mcd_struct);
   int sent_seg_col = mcd_get_sent_seg_col(ctx->mcd_struct);
   int index = 1;
+  char label[100];
+  word *rightmost_descendent = NULL;
 
   output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
 
@@ -147,7 +181,6 @@ int main(int argc, char *argv[])
   do{
     w = word_buffer_b0(wb);
     if(w){
-
       fprintf(output_file, "%d\t", index);
 
       if(form_col != -1)
@@ -189,8 +222,18 @@ int main(int argc, char *argv[])
       else
         fprintf(output_file, "_\t");
 
-      if(label_col != -1)
+      if(label_col != -1){
         word_print_col_n(output_file, w, label_col);
+        word_sprint_col_n(label, w, label_col);
+        if(!strcmp(label, "root")){
+
+          rightmost_descendent = get_rightmost_descendent(wb, w);
+          //      printf("index rightmost desc = %d form = %s\n", rightmost_descendent->index, rightmost_descendent->form);
+          if(rightmost_descendent){
+            word_set_sent_seg(rightmost_descendent, 1);
+          }
+        }
+      }
       else
         fprintf(output_file, "_");
       fprintf(output_file, "\t");
diff --git a/maca_trans_parser/src/movements.c b/maca_trans_parser/src/movements.c
index 14a005defbc9cb11033799c4ab1be3bcc751fb2c..610eb7c5fbf289ed475b8bbcfe82a67d3f207d9d 100644
--- a/maca_trans_parser/src/movements.c
+++ b/maca_trans_parser/src/movements.c
@@ -148,7 +148,7 @@ int movement_root(config *c, int movement_code, int root_code)
 
   word_set_sent_seg(bm1, 1);
 
-  /*  word *rd = word_buffer_get_rightmost_descendent(config_get_buffer(c), s0);
+  /*  word *rd = word_buffer_get_rightmost_descendent_of_s0(config_get_buffer(c), s0);
   if(rd)
     word_set_sent_seg(rd, 1);
   */