Skip to content
Snippets Groups Projects
Commit 0d6b8af7 authored by Alexis Nasr
Browse files

Merge branch 'master' into ignore_punct

parents fa7d3b1f 4b6343c8
Branches
No related tags found
No related merge requests found
...@@ -54,7 +54,7 @@ void word_buffer_print(FILE *f, word_buffer *wb); ...@@ -54,7 +54,7 @@ void word_buffer_print(FILE *f, word_buffer *wb);
void word_buffer_print_compact(FILE *f, word_buffer *wb); void word_buffer_print_compact(FILE *f, word_buffer *wb);
int word_buffer_read_sentence(word_buffer *bw); int word_buffer_read_sentence(word_buffer *bw);
word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct); word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct);
int word_buffer_locate_token(word_buffer *wb, int offset); int word_buffer_locate_token_with_offset(word_buffer *wb, int offset);
/* /*
int word_buffer_is_empty(word_buffer *wb); int word_buffer_is_empty(word_buffer *wb);
int word_buffer_is_last(word_buffer *wb); int word_buffer_is_last(word_buffer *wb);
......
...@@ -156,7 +156,7 @@ int word_buffer_read_sentence(word_buffer *wb) ...@@ -156,7 +156,7 @@ int word_buffer_read_sentence(word_buffer *wb)
return wb->nbelem ; return wb->nbelem ;
} }
int word_buffer_locate_token(word_buffer *wb, int offset) int word_buffer_locate_token_with_offset(word_buffer *wb, int offset)
{ {
int c, first, last, middle; int c, first, last, middle;
word *w_middle; word *w_middle;
......
...@@ -228,7 +228,7 @@ void json2mcf_print_word_buffer(FILE *f, word_buffer *wb) ...@@ -228,7 +228,7 @@ void json2mcf_print_word_buffer(FILE *f, word_buffer *wb)
*/ */
void update_segment(word_buffer *wb, int start, int end, char *label, char *status_seg, char *status_lab) void update_segment(word_buffer *wb, int start, int end, char *label, char *status_seg, char *status_lab, int offset)
{ {
int index; int index;
word *w; word *w;
...@@ -241,9 +241,7 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat ...@@ -241,9 +241,7 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
if(status_lab && !strcmp(status_lab, "G")){ if(status_lab && !strcmp(status_lab, "G")){
fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label); fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label);
index = word_buffer_locate_token(wb, start); w = word_buffer_get_word_n(wb, offset + start);
w = word_buffer_get_word_n(wb, index);
if(d) if(d)
label_code = dico_string2int(d, label); label_code = dico_string2int(d, label);
if(label_code == -1) if(label_code == -1)
...@@ -251,13 +249,9 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat ...@@ -251,13 +249,9 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
else else
word_set_pos(w, label_code); word_set_pos(w, label_code);
} }
} }
void process_segment(json_attr_val *avl, word_buffer *wb, int offset)
void process_segment(json_attr_val *avl, word_buffer *wb)
{ {
int start, end; int start, end;
char *label, *status_seg, *status_lab; char *label, *status_seg, *status_lab;
...@@ -271,24 +265,56 @@ void process_segment(json_attr_val *avl, word_buffer *wb) ...@@ -271,24 +265,56 @@ void process_segment(json_attr_val *avl, word_buffer *wb)
if(!strcmp(av->attr, "status_seg")){status_seg = av->val->u.string; continue;} if(!strcmp(av->attr, "status_seg")){status_seg = av->val->u.string; continue;}
if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;} if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;}
} }
update_segment(wb, start, end, label, status_seg, status_lab); update_segment(wb, start, end, label, status_seg, status_lab, offset);
// printf("segment : start = %d end = %d label = %s status_seg = %s status_lab = %s\n", start, end, label, status_seg, status_lab); // printf("segment : start = %d end = %d label = %s status_seg = %s status_lab = %s\n", start, end, label, status_seg, status_lab);
} }
/*
 * process_segments: walk the list starting at segments->u.first (linked
 * through ->next) and hand each element's attribute/value list to
 * process_segment().
 *
 * NOTE(review): each line below appears to carry two diff columns fused
 * together (earlier version first, then the version that adds the `offset`
 * parameter and forwards it unchanged to process_segment()).
 */
void process_segments(json_struct *segments, word_buffer *wb) void process_segments(json_struct *segments, word_buffer *wb, int offset)
{ {
json_struct *segment; json_struct *segment;
// printf("process_segments\n"); // printf("process_segments\n");
for(segment = segments->u.first; segment != NULL; segment = segment->next){ for(segment = segments->u.first; segment != NULL; segment = segment->next){
process_segment(segment->u.attr_val_list, wb); process_segment(segment->u.attr_val_list, wb, offset);
} }
} }
// {"orig": 1, "dest":2, "label": "suj", "status_link": "", "status_lab": "", "timestamp": "", "author": "", "target": ""}, // {"orig": 1, "dest":2, "label": "suj", "status_link": "", "status_lab": "", "timestamp": "", "author": "", "target": ""},
/*
 * Apply one corrected dependency link to the word buffer.
 *
 * wb          : word buffer holding the words to update
 * orig        : position of the dependent token as given in the link; the
 *               word is fetched at index offset + orig
 * dest        : position of the governor token, in the same numbering as
 *               orig; stored as the relative distance dest - orig
 * label       : label string, looked up in the dictionary of the label column
 * status_link : the governor is overwritten only when this equals "G"
 * status_lab  : the label is overwritten only when this equals "G"
 * offset      : value added to orig to obtain the index of the word in wb
 *
 * A NULL status string is treated as "not G". Nothing is modified when the
 * word cannot be found in the buffer.
 */
void update_link(word_buffer *wb, int orig, int dest, char *label, char *status_link, char *status_lab, int offset)
{
  word *w = NULL;
  int label_code = -1;
  int label_col = -1;
  dico *d = NULL;
  mcd *mcd_struct = NULL;
  int update_label = (status_lab != NULL) && (strcmp(status_lab, "G") == 0);
  int update_gov = (status_link != NULL) && (strcmp(status_link, "G") == 0);

  if(!update_label && !update_gov)
    return;

  /* offset may be -1 when the document has no usable first token id, so the
     lookup can fall outside the buffer; assumes word_buffer_get_word_n
     returns NULL in that case -- TODO confirm */
  w = word_buffer_get_word_n(wb, offset + orig);
  if(w == NULL){
    fprintf(stderr, "token %d not found in word buffer, link ignored\n", orig);
    return;
  }

  if(update_label){
    fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, label);
    mcd_struct = word_buffer_get_mcd(wb);
    if(mcd_struct != NULL)
      label_col = mcd_get_label_col(mcd_struct);
    /* a negative column means the mcd has no label column (the file tests
       other columns against -1); never index dico_array with it */
    if(label_col >= 0)
      d = mcd_struct->dico_array[label_col];
    if(d)
      label_code = dico_string2int(d, label);
    if(label_code == -1)
      fprintf(stderr, "label %s unknown\n", label);
    else
      word_set_label(w, label_code);
  }

  if(update_gov){
    fprintf(stderr, "updating governor of token %d with %d\n", orig, dest);
    /* NOTE(review): a dest of -1 (written by print_link in one branch) would
       be stored as the distance -1 - orig -- confirm this is intended */
    word_set_gov(w, dest - orig);
  }
}
void process_link(json_attr_val *avl, word_buffer *wb) void process_link(json_attr_val *avl, word_buffer *wb, int offset)
{ {
int orig, dest; int orig, dest;
char *label, *status_link, *status_lab; char *label, *status_link, *status_lab;
...@@ -302,27 +328,53 @@ void process_link(json_attr_val *avl, word_buffer *wb) ...@@ -302,27 +328,53 @@ void process_link(json_attr_val *avl, word_buffer *wb)
if(!strcmp(av->attr, "status_link")){status_link = av->val->u.string; continue;} if(!strcmp(av->attr, "status_link")){status_link = av->val->u.string; continue;}
if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;} if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;}
} }
// printf("link : orig = %d dest = %d label = %s status_link = %s status_lab = %s\n", orig, dest, label, status_link, status_lab); // fprintf(stderr, "link : orig = %d dest = %d label = %s status_link = %s status_lab = %s\n", orig, dest, label, status_link, status_lab);
update_link(wb, orig, dest, label, status_link, status_lab, offset);
} }
/*
 * process_links: walk the list starting at ->u.first of the given JSON node
 * (linked through ->next) and hand each element's attribute/value list to
 * process_link().
 *
 * NOTE(review): each line below appears to carry two diff columns fused
 * together (earlier version first, then the version that renames the first
 * parameter to `links`, adds `offset` and forwards it to process_link()).
 */
void process_links(json_struct *segments, word_buffer *wb) void process_links(json_struct *links, word_buffer *wb, int offset)
{ {
json_struct *link; json_struct *link;
// printf("process_links\n"); // printf("process_links\n");
for(link = segments->u.first; link != NULL; link = link->next){ for(link = links->u.first; link != NULL; link = link->next){
process_link(link->u.attr_val_list, wb); process_link(link->u.attr_val_list, wb, offset);
} }
} }
/*
 * Return the "id" of the first token of a JSON document.
 *
 * The attribute/value list of `document` is scanned for an attribute named
 * "tokens"; the first element of that value (->u.first) is then scanned for
 * an attribute named "id", whose numeric value is returned converted to int.
 *
 * Returns -1 when document is NULL, when no "tokens" attribute with a first
 * element exists, or when that element has no "id" attribute carrying a value.
 * Callers cannot tell this sentinel apart from a genuine id of -1.
 */
int get_id_of_first_token_in_document(json_struct *document)
{
  json_attr_val *doc_av = NULL;
  json_attr_val *tok_av = NULL;
  json_struct *tokens = NULL;
  json_struct *first_token = NULL;

  if(document == NULL)
    return -1;

  for(doc_av = document->u.attr_val_list; doc_av != NULL; doc_av = doc_av->next){
    if(strcmp(doc_av->attr, "tokens") != 0)
      continue;
    tokens = doc_av->val;
    if(tokens == NULL)
      continue;
    first_token = tokens->u.first;
    if(first_token == NULL)
      continue;
    for(tok_av = first_token->u.attr_val_list; tok_av != NULL; tok_av = tok_av->next){
      /* a value pointer may be NULL (the "tokens" value is checked the same
         way), so skip an "id" attribute that carries no value */
      if(strcmp(tok_av->attr, "id") == 0 && tok_av->val != NULL)
        return (int)tok_av->val->u.number;
    }
  }
  return -1;
}
/*
 * process_document: scan the attribute/value list of `document` and dispatch
 * the value of the "segments" attribute to process_segments() and the value
 * of the "links" attribute to process_links().
 *
 * NOTE(review): each line below appears to carry two diff columns fused
 * together. In the second column `offset` is initialised from
 * get_id_of_first_token_in_document(), which returns -1 when no id is found;
 * that value is passed on without being checked.
 */
void process_document(json_struct *document, word_buffer *wb) void process_document(json_struct *document, word_buffer *wb)
{ {
json_attr_val *avl = NULL; json_attr_val *avl = NULL;
// printf("process_document\n"); int offset = get_id_of_first_token_in_document(document);
// printf("process_document, offset = %d\n", offset);
for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){ for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){
// if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string); // if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string);
if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb); if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb, offset);
if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb); if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb, offset);
} }
} }
......
...@@ -184,8 +184,6 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in ...@@ -184,8 +184,6 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in
word *w = word_buffer_get_word_n(wb, index); word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{"); fprintf(output_file, "{");
// fprintf(output_file, "\"orig\": %d, ", word_get_offset(w));
fprintf(output_file, "\"orig\": %d, ", index - index_first_word); fprintf(output_file, "\"orig\": %d, ", index - index_first_word);
fprintf(output_file, "\"dest\":"); fprintf(output_file, "\"dest\":");
if(gov_col){ if(gov_col){
...@@ -193,7 +191,6 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in ...@@ -193,7 +191,6 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in
fprintf(output_file, "-1"); fprintf(output_file, "-1");
else{ else{
word *gov = word_buffer_get_word_n(wb, word_get_gov(w) + index); word *gov = word_buffer_get_word_n(wb, word_get_gov(w) + index);
// fprintf(output_file, "%d", word_get_offset(gov));
fprintf(output_file, "%d", word_get_gov(w) + index - index_first_word); fprintf(output_file, "%d", word_get_gov(w) + index - index_first_word);
} }
} }
...@@ -240,9 +237,7 @@ void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int ...@@ -240,9 +237,7 @@ void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int
word *w = word_buffer_get_word_n(wb, index); word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{ "); fprintf(output_file, "{ ");
/* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */
fprintf(output_file, "\"start\": %d, ", index - index_first_word); fprintf(output_file, "\"start\": %d, ", index - index_first_word);
/* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */
fprintf(output_file, "\"end\": %d, ", index - index_first_word); fprintf(output_file, "\"end\": %d, ", index - index_first_word);
fprintf(output_file, "\"label\": \""); fprintf(output_file, "\"label\": \"");
...@@ -278,15 +273,24 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in ...@@ -278,15 +273,24 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in
void print_token(FILE *output_file, word_buffer *wb, int index) void print_token(FILE *output_file, word_buffer *wb, int index)
{ {
int form_col = mcd_get_form_col(word_buffer_get_mcd(wb)); int form_col = mcd_get_form_col(word_buffer_get_mcd(wb));
int offset_col = mcd_get_offset_col(word_buffer_get_mcd(wb));
int length_col = mcd_get_length_col(word_buffer_get_mcd(wb)); int length_col = mcd_get_length_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index); word *w = word_buffer_get_word_n(wb, index);
char token[5000];
int length_token, i;
fprintf(output_file, "{ "); fprintf(output_file, "{ ");
fprintf(output_file, "\"id\": %d, ", word_get_offset(w)); fprintf(output_file, "\"id\": %d, ", word_get_index(w));
fprintf(output_file, "\"word\": \""); fprintf(output_file, "\"word\": \"");
if(form_col != -1) if(form_col != -1){
word_print_col_n(output_file, w, form_col); word_sprint_col_n(token, w, form_col);
length_token = strlen(token);
for(i=0; i < length_token; i++){
if(token[i] == '"')
fprintf(output_file, "&quot");
else
fprintf(output_file, "%c", token[i]);
}
}
else else
fprintf(output_file, "_"); fprintf(output_file, "_");
fprintf(output_file, "\", "); fprintf(output_file, "\", ");
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment