diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index 0d7761d753741e12621a4590015fc10bb24ee437..a86626fe4fa43f269579b86bb3ba769f525bf2ec 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -8,9 +8,11 @@ #define MCD_INVALID_VALUE -1 -#define MCD_WF_NB 47 +#define MCD_WF_NB 48 #define MCD_WF_ID 0 +#define MCD_WF_OFFSET 0 /* ID and OFFSET are synonymous */ +#define MCD_WF_LENGTH 47 #define MCD_WF_FORM 1 #define MCD_WF_LEMMA 2 #define MCD_WF_CPOS 3 @@ -129,6 +131,8 @@ Xtra*/ #define mcd_get_dico_label(m) (m)->dico_array[MCD_WF_LABEL] #define mcd_get_index_col(m) (m)->wf2col[MCD_WF_ID] +#define mcd_get_offset_col(m) (m)->wf2col[MCD_WF_OFFSET] +#define mcd_get_length_col(m) (m)->wf2col[MCD_WF_LENGTH] #define mcd_get_form_col(m) (m)->wf2col[MCD_WF_FORM] #define mcd_get_lemma_col(m) (m)->wf2col[MCD_WF_LEMMA] #define mcd_get_cpos_col(m) (m)->wf2col[MCD_WF_CPOS] diff --git a/maca_common/include/word.h b/maca_common/include/word.h index c6a4b56db3b7d6e1760ae59bf3b5d6f521688f9b..88f32b6a658251bc7a8a50e698220f840c3d443f 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -51,6 +51,8 @@ typedef struct _word { #define word_get_p6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[5]) #define word_get_id(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_ID]) +#define word_get_offset(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_OFFSET]) +#define word_get_length(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LENGTH]) #define word_get_form(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FORM]) #define word_get_lemma(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LEMMA]) #define word_get_cpos(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_CPOS]) diff --git a/maca_common/include/word_buffer.h b/maca_common/include/word_buffer.h index a2786386f32eed1b2413e42b575cb132c1d0a0dc..eb995bd12b6b699e19a4ad7543bbe9747916e5d7 100644 --- a/maca_common/include/word_buffer.h +++ b/maca_common/include/word_buffer.h @@ -54,7 +54,7 @@ void word_buffer_print(FILE *f, word_buffer *wb); void word_buffer_print_compact(FILE *f, word_buffer *wb); int word_buffer_read_sentence(word_buffer *bw); word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct); - +int word_buffer_locate_token(word_buffer *wb, int offset); /* int word_buffer_is_empty(word_buffer *wb); int word_buffer_is_last(word_buffer *wb); diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index 69117407b18c2303402578a5499d18685f0fb30d..4a2348e14f4f3c406c1af3636ff3b5f3a35822d5 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -500,6 +500,8 @@ dico_vec *mcd_build_dico_vec(mcd *mcd_struct) int mcd_wf_code(char *wf) { if(!strcmp(wf, "INDEX")) return MCD_WF_ID; + if(!strcmp(wf, "OFFSET")) return MCD_WF_OFFSET; + if(!strcmp(wf, "LENGTH")) return MCD_WF_LENGTH; if(!strcmp(wf, "FORM")) return MCD_WF_FORM; if(!strcmp(wf, "LEMMA")) return MCD_WF_LEMMA; if(!strcmp(wf, "CPOS")) return MCD_WF_CPOS; diff --git a/maca_common/src/word_buffer.c b/maca_common/src/word_buffer.c index b72f72e14efacfd65d46880cc2be82aa7a224095..a15542bdaab0670230e13d44e96823cb970b888b 100644 --- a/maca_common/src/word_buffer.c +++ b/maca_common/src/word_buffer.c @@ -156,6 +156,37 @@ int word_buffer_read_sentence(word_buffer *wb) return wb->nbelem ; } +int word_buffer_locate_token(word_buffer *wb, int offset) +{ + int c, first, last, middle; + word *w_middle; + first = 0; + last = wb->nbelem - 1; + middle = (first+last)/2; + + while (first <= last) { + // printf("first = %d middle = %d last = %d\n", first, middle, last); + w_middle = word_buffer_get_word_n(wb, middle); + // printf("w middle = %d current offset = %d\n", w_middle, word_get_offset(w_middle)); + if (word_get_offset(w_middle) < offset) + first = middle + 1; + else if (word_get_offset(w_middle) == offset) { + // printf("%d found at location %d.\n", offset, middle+1); + break; + } + else + last = middle - 1; + middle = (first + last)/2; + } + if (first > last){ + + // printf("Not found! %d is not present in the list.\n", offset); + return -1; + } + return middle; +} + + /*int word_buffer_end(word_buffer *wb) { return (wb->current_index >= wb->nbelem)? 1 : 0; diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c index 6e503c4503a106c788a30ebbaba0cf6d7fa76b13..2ffca7e4cb725e0dcba1d5e29331b7efe5c22f27 100644 --- a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c +++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c @@ -10,27 +10,39 @@ extern int print_offset; extern int print_token_length; void maca_tokenizer_segment(char *separator, char *text_matched){ + int first = 1; if(token_length != 0){ + if(print_offset){ + if(first == 1) first = 0; else printf("\t"); + printf("%d", offset); + } + if(print_token_length){ + if(first == 1) first = 0; else printf("\t"); + printf("%d", utf8_strlen(token)); + } + if(first == 1) first = 0; else printf("\t"); printf("%s", token); - if(print_offset) - printf("\t%d", offset); - if(print_token_length) - printf("\t%d", utf8_strlen(token)); printf("\n"); } offset += utf8_strlen(token); token_length = 0; token[0] = 0; - + first = 1; if(strlen(separator) != 0){ + if(print_offset){ + if(first == 1) first = 0; else printf("\t"); + printf("%d", offset); + } + if(print_token_length){ + if(first == 1) first = 0; else printf("\t"); + printf("%d", (int) utf8_strlen(separator)); + } + if(first == 1) first = 0; else printf("\t"); printf("%s", separator); - if(print_offset) - printf("\t%d", offset); - if(print_token_length) - printf("\t%d", (int) utf8_strlen(separator)); printf("\n"); } + offset += utf8_strlen(text_matched); } diff --git a/maca_tools/src/json2mcf.c b/maca_tools/src/json2mcf.c index c19d3b78e6ded56a8ba2eec5825d12cf0f048bc9..97da47daa426538dad9f9400faf2a14be527e589 100644 --- a/maca_tools/src/json2mcf.c +++ b/maca_tools/src/json2mcf.c @@ -129,11 +129,130 @@ context *json2mcf_context_read_options(int argc, char *argv[]) return ctx; } +void json2mcf_print_word_buffer(FILE *f, word_buffer *wb) +{ + int i; + word *w; + int col_nb = 0; + mcd *mcd_struct = word_buffer_get_mcd(wb); + char *string; + int wf; + + for(i=0; i < wb->nbelem; i++){ + w = word_buffer_get_word_n(wb, i); + for(col_nb=0; col_nb < mcd_struct->nb_col;col_nb++){ + wf = mcd_struct->wf[col_nb]; + if(col_nb > 0) {fprintf(f, "\t");} + if(mcd_struct->representation[col_nb] == MCD_REPRESENTATION_INT){ + fprintf(f, "%d", w->wf_array[wf]); + } + if(mcd_struct->representation[col_nb] == MCD_REPRESENTATION_VOCAB){ + string = mcd_get_str(mcd_struct, w->wf_array[wf], col_nb); + // string = dico_int2string(mcd_struct->dico_array[col_nb], w->wf_array[wf]); + if(string) + fprintf(f, "%s", string); + else + fprintf(f, "_"); + } + } + fprintf(f, "\n"); + } +} + +/* if((mcd_get_gov_col(mcd_struct) == -1) + && (mcd_get_label_col(mcd_struct) == -1) + && (mcd_get_sent_seg_col(mcd_struct) == -1)){ + printf("%s\t", word_get_input(w)); + printf("%d\t", word_get_gov(w)); + label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w)); + if(label != NULL) + printf("%s\t", label) ; + else + printf("_\t"); + if(word_get_sent_seg(w) == 1) + printf("1\n") ; + else + printf("0\n"); + } + else{ + buffer = strdup(w->input); + token = strtok(buffer, "\t"); + col_nb = 0; + while(token){ + if(col_nb != 0) printf("\t"); + if(col_nb == mcd_get_gov_col(mcd_struct)){ + printf("%d", word_get_gov(w)); + } + else + if(col_nb == mcd_get_label_col(mcd_struct)){ + label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w)); + if(label != NULL) + printf("%s", label) ; + else + printf("_"); + } + else + if(col_nb == mcd_get_sent_seg_col(mcd_struct)){ + if(word_get_sent_seg(w) == 1) + printf("1") ; + else + printf("0"); + } + else{ + word_print_col_n(stdout, w, col_nb); + } + col_nb++; + token = strtok(NULL, "\t"); + } + if((col_nb <= mcd_get_gov_col(mcd_struct)) || (mcd_get_gov_col(mcd_struct) == -1)){ + printf("\t%d", word_get_gov(w)); + } + if((col_nb <= mcd_get_label_col(mcd_struct)) || (mcd_get_label_col(mcd_struct) == -1)){ + label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w)); + if(label != NULL) + printf("\t%s", label) ; + else + printf("\t_"); + } + if((col_nb <= mcd_get_sent_seg_col(mcd_struct)) || (mcd_get_sent_seg_col(mcd_struct) == -1)){ + if(word_get_sent_seg(w) == 1) + printf("\t1") ; + else + printf("\t0"); + } + printf("\n"); + free(buffer); + } + } +} + +*/ + void update_segment(word_buffer *wb, int start, int end, char *label, char *status_seg, char *status_lab) { + int index; + word *w; + int label_code; + dico *d; + + if(status_lab && !strcmp(status_lab, "GOLD")){ + printf("updating label of segment [%d-%d] with \"%s\"\n", start, end, label); + index = word_buffer_locate_token(wb, start); + w = word_buffer_get_word_n(wb, index); + d = word_buffer_get_mcd(wb)->dico_array[MCD_WF_CPOS]; + if(d) + label_code = dico_string2int(d, label); + if(label_code == -1) + fprintf(stderr, "label %s unknown\n", label); + else + word_set_pos(w, label_code); + } } + + + void process_segment(json_attr_val *avl, word_buffer *wb) { int start, end; @@ -149,14 +268,14 @@ void process_segment(json_attr_val *avl, word_buffer *wb) if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;} } update_segment(wb, start, end, label, status_seg, status_lab); - // printf("segment : start = %d end = %d label = %s status_seg = %s status_lab = %s\n", start, end, label, status_seg, status_lab); + // printf("segment : start = %d end = %d label = %s status_seg = %s status_lab = %s\n", start, end, label, status_seg, status_lab); } void process_segments(json_struct *segments, word_buffer *wb) { json_struct *segment; - printf("process_segments\n"); + // printf("process_segments\n"); for(segment = segments->u.first; segment != NULL; segment = segment->next){ process_segment(segment->u.attr_val_list, wb); } @@ -185,7 +304,7 @@ void process_link(json_attr_val *avl, word_buffer *wb) void process_links(json_struct *segments, word_buffer *wb) { json_struct *link; - printf("process_links\n"); + // printf("process_links\n"); for(link = segments->u.first; link != NULL; link = link->next){ process_link(link->u.attr_val_list, wb); } @@ -195,9 +314,9 @@ void process_links(json_struct *segments, word_buffer *wb) void process_document(json_struct *document, word_buffer *wb) { json_attr_val *avl = NULL; - printf("process_document\n"); + // printf("process_document\n"); for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){ - if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string); + // if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string); if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb); if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb); } @@ -206,7 +325,7 @@ void process_document(json_struct *document, word_buffer *wb) void process_documents(json_struct *documents, word_buffer *wb) { json_struct *document; - printf("process_documents\n"); + // printf("process_documents\n"); for(document = documents->u.first; document != NULL; document = document->next){ process_document(document, wb); } @@ -228,6 +347,7 @@ int main(int argc, char *argv[]) json_attr_val *avl = NULL; json2mcf_check_options(ctx); + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename); wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct); root = json_parse_full(ctx->json_filename, 0); @@ -236,14 +356,14 @@ int main(int argc, char *argv[]) exit(1); } for(avl = root->u.attr_val_list; avl != NULL; avl = avl->next){ - printf("section %s\n", avl->attr); + // printf("section %s\n", avl->attr); if(!strcmp(avl->attr, (char *)"documents")){ process_documents(avl->val, wb); } } //json_print_struct(stdout, root); - + json2mcf_print_word_buffer(stdout, wb); json_free_struct(root); json2mcf_context_free(ctx); return 0; diff --git a/maca_tools/src/mcf2json.c b/maca_tools/src/mcf2json.c index ba56ded57b6ce1262a9d87265f8ce9d7e004eb16..4381177cfd388d8012380286f515dc1ae5fd19a8 100644 --- a/maca_tools/src/mcf2json.c +++ b/maca_tools/src/mcf2json.c @@ -144,17 +144,19 @@ void print_header(FILE *output_file, mcd *mcd_struct) fprintf(output_file, "\"id\": \"\",\n"); fprintf(output_file, "\"timestamp\": \"\",\n"); - fprintf(output_file, "\"labels_segment\": \""); + fprintf(output_file, "\"labels_segment\": ["); for(i=0; i < dico_pos->nbelem; i++){ - fprintf(output_file, " %s", dico_pos->array[i]); + if(i != 0) fprintf(output_file, ", "); + fprintf(output_file, "\"%s\"", dico_pos->array[i]); } - fprintf(output_file, "\",\n"); + fprintf(output_file, "],\n"); - fprintf(output_file, "\"labels_link\": \""); + fprintf(output_file, "\"labels_link\": ["); for(i=0; i < dico_label->nbelem; i++){ - fprintf(output_file, " %s", dico_label->array[i]); + if(i != 0) fprintf(output_file, ", "); + fprintf(output_file, "\"%s\"", dico_label->array[i]); } - fprintf(output_file, "\"\n"); + fprintf(output_file, "]\n"); fprintf(output_file, "},\n"); @@ -167,67 +169,73 @@ void print_header(FILE *output_file, mcd *mcd_struct) } -void print_link(FILE *output_file, word *w, int index, int gov_col, int label_col) +void print_link(FILE *output_file, word_buffer *wb, int index) { - fprintf(output_file, "{"); - - fprintf(output_file, "\"orig\": %d, ", index); - fprintf(output_file, "\"dest\":"); - if(gov_col){ - if((word_get_gov(w) == 0) || ((word_get_gov(w) + index) < 0)) - fprintf(output_file, "0"); - else - fprintf(output_file, "%d", word_get_gov(w) + index); - } - else - fprintf(output_file, "_"); - fprintf(output_file, ", "); - - - fprintf(output_file, "\"label\": \""); - if(label_col != -1) - word_print_col_n(output_file, w, label_col); - else - fprintf(output_file, "_"); - fprintf(output_file, "\", "); - - fprintf(output_file, "\"status_link\": \"\", "); - fprintf(output_file, "\"status_lab\": \"\", "); - fprintf(output_file, "\"timestamp\": \"\", "); - fprintf(output_file, "\"author\": \"\", "); - fprintf(output_file, "\"target\": \"\""); - fprintf(output_file, "}"); - + int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb)); + int label_col = mcd_get_label_col(word_buffer_get_mcd(wb)); + int index_col = mcd_get_index_col(word_buffer_get_mcd(wb)); + word *w = word_buffer_get_word_n(wb, index); + + fprintf(output_file, "{"); + + // fprintf(output_file, "\"orig\": %d, ", word_get_offset(w)); + fprintf(output_file, "\"orig\": %d, ", index); + fprintf(output_file, "\"dest\":"); + if(gov_col){ + if((word_get_gov(w) == 0) || ((word_get_gov(w) + index) < 0)) + fprintf(output_file, "0"); + else{ + word *gov = word_buffer_get_word_n(wb, word_get_gov(w) + index); +// fprintf(output_file, "%d", word_get_offset(gov)); + fprintf(output_file, "%d", word_get_gov(w) + index); } + } + else + fprintf(output_file, "_"); + fprintf(output_file, ", "); + + + fprintf(output_file, "\"label\": \""); + if(label_col != -1) + word_print_col_n(output_file, w, label_col); + else + fprintf(output_file, "_"); + fprintf(output_file, "\", "); + + fprintf(output_file, "\"status_link\": \"\", "); + fprintf(output_file, "\"status_lab\": \"\", "); + fprintf(output_file, "\"timestamp\": \"\", "); + fprintf(output_file, "\"author\": \"\", "); + fprintf(output_file, "\"target\": \"\""); + fprintf(output_file, "}"); + +} void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word) { - word *w; int index; - int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb)); - int label_col = mcd_get_label_col(word_buffer_get_mcd(wb)); int first_link = 1; fprintf(output_file, "\"links\": ["); for(index = index_first_word; index <= index_last_word; index++){ - w = word_buffer_get_word_n(wb, index); - if(first_link == 1) - first_link = 0; - else - fprintf(output_file, ","); + if(first_link == 1) first_link = 0; else fprintf(output_file, ","); fprintf(output_file, "\n"); - - print_link(output_file, w, index - index_first_word + 1, gov_col, label_col); + print_link(output_file, wb, index); } fprintf(output_file," ]"); } -void print_segment(FILE *output_file, word *w, int index, int pos_col) +void print_segment(FILE *output_file, word_buffer *wb, int index) { + int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb)); + word *w = word_buffer_get_word_n(wb, index); + fprintf(output_file, "{ "); + /* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */ fprintf(output_file, "\"start\": %d, ", index); + /* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */ fprintf(output_file, "\"end\": %d, ", index); fprintf(output_file, "\"label\": \""); @@ -248,28 +256,27 @@ void print_segment(FILE *output_file, word *w, int index, int pos_col) void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word) { - word *w; int index; - int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb)); int first_segment = 1; fprintf(output_file, "\"segments\": ["); for(index = index_first_word; index <= index_last_word; index++){ - w = word_buffer_get_word_n(wb, index); - if(first_segment == 1) - first_segment = 0; - else - fprintf(output_file, ","); + if(first_segment == 1) first_segment = 0; else fprintf(output_file, ","); fprintf(output_file, "\n"); - print_segment(output_file, w, index - index_first_word + 1, pos_col); + print_segment(output_file, wb, index); } fprintf(output_file," ],\n"); } -void print_token(FILE *output_file, word *w, int index, int form_col) +void print_token(FILE *output_file, word_buffer *wb, int index) { + int form_col = mcd_get_form_col(word_buffer_get_mcd(wb)); + int offset_col = mcd_get_offset_col(word_buffer_get_mcd(wb)); + int length_col = mcd_get_length_col(word_buffer_get_mcd(wb)); + word *w = word_buffer_get_word_n(wb, index); + fprintf(output_file, "{ "); - fprintf(output_file, "\"id\": %d, ", index); + fprintf(output_file, "\"id\": %d, ", word_get_offset(w)); fprintf(output_file, "\"word\": \""); if(form_col != -1) word_print_col_n(output_file, w, form_col); @@ -285,20 +292,14 @@ void print_token(FILE *output_file, word *w, int index, int form_col) void print_tokens(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word) { - word *w; int index; - int form_col = mcd_get_form_col(word_buffer_get_mcd(wb)); int first_token = 1; fprintf(output_file, "\"tokens\": ["); for(index = index_first_word; index <= index_last_word; index++){ - w = word_buffer_get_word_n(wb, index); - if(first_token == 1) - first_token = 0; - else - fprintf(output_file, ","); + if(first_token == 1) first_token = 0; else fprintf(output_file, ","); fprintf(output_file, "\n"); - print_token(output_file, w, index - index_first_word + 1, form_col); + print_token(output_file, wb, index); } fprintf(output_file," ],\n"); }