Skip to content
Snippets Groups Projects
Commit 0d6b8af7 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

Merge branch 'master' into ignore_punct

parents fa7d3b1f 4b6343c8
Branches
No related tags found
No related merge requests found
......@@ -54,7 +54,7 @@ void word_buffer_print(FILE *f, word_buffer *wb);
void word_buffer_print_compact(FILE *f, word_buffer *wb);
int word_buffer_read_sentence(word_buffer *bw);
word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct);
int word_buffer_locate_token(word_buffer *wb, int offset);
int word_buffer_locate_token_with_offset(word_buffer *wb, int offset);
/*
int word_buffer_is_empty(word_buffer *wb);
int word_buffer_is_last(word_buffer *wb);
......
......@@ -156,7 +156,7 @@ int word_buffer_read_sentence(word_buffer *wb)
return wb->nbelem ;
}
int word_buffer_locate_token(word_buffer *wb, int offset)
int word_buffer_locate_token_with_offset(word_buffer *wb, int offset)
{
int c, first, last, middle;
word *w_middle;
......
......@@ -228,7 +228,7 @@ void json2mcf_print_word_buffer(FILE *f, word_buffer *wb)
*/
void update_segment(word_buffer *wb, int start, int end, char *label, char *status_seg, char *status_lab)
void update_segment(word_buffer *wb, int start, int end, char *label, char *status_seg, char *status_lab, int offset)
{
int index;
word *w;
......@@ -241,9 +241,7 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
if(status_lab && !strcmp(status_lab, "G")){
fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label);
index = word_buffer_locate_token(wb, start);
w = word_buffer_get_word_n(wb, index);
w = word_buffer_get_word_n(wb, offset + start);
if(d)
label_code = dico_string2int(d, label);
if(label_code == -1)
......@@ -251,13 +249,9 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
else
word_set_pos(w, label_code);
}
}
void process_segment(json_attr_val *avl, word_buffer *wb)
void process_segment(json_attr_val *avl, word_buffer *wb, int offset)
{
int start, end;
char *label, *status_seg, *status_lab;
......@@ -271,24 +265,56 @@ void process_segment(json_attr_val *avl, word_buffer *wb)
if(!strcmp(av->attr, "status_seg")){status_seg = av->val->u.string; continue;}
if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;}
}
update_segment(wb, start, end, label, status_seg, status_lab);
update_segment(wb, start, end, label, status_seg, status_lab, offset);
// printf("segment : start = %d end = %d label = %s status_seg = %s status_lab = %s\n", start, end, label, status_seg, status_lab);
}
/*
 * Walk the `segments` list (starting at u.first, following next) and pass the
 * attribute/value list of every element to process_segment().
 *
 * NOTE(review): this span is rendered diff output. Each pair of near-identical
 * lines looks like the pre-change line followed by its replacement, which adds
 * the `offset` argument that is forwarded to process_segment() -- confirm
 * against the real source file.
 */
void process_segments(json_struct *segments, word_buffer *wb)
void process_segments(json_struct *segments, word_buffer *wb, int offset)
{
json_struct *segment;
// printf("process_segments\n");
for(segment = segments->u.first; segment != NULL; segment = segment->next){
process_segment(segment->u.attr_val_list, wb);
process_segment(segment->u.attr_val_list, wb, offset);
}
}
// {"orig": 1, "dest":2, "label": "suj", "status_link": "", "status_lab": "", "timestamp": "", "author": "", "target": ""},
/*
 * Apply one annotated dependency link to the word buffer.
 *
 * Parameters:
 *   wb          word buffer whose word is updated
 *   orig        position of the dependent token; offset + orig is the index
 *               handed to word_buffer_get_word_n()
 *   dest        position of the governor, in the same numbering as orig
 *   label       dependency label string, looked up in the label dictionary
 *   status_link the governor is rewritten only when this equals "G"
 *   status_lab  the label is rewritten only when this equals "G"
 *   offset      value added to orig to obtain the word index in wb
 *
 * The governor is stored as the relative distance dest - orig.
 * All messages are written to stderr. Nothing is modified when neither
 * status is "G", when no word is found at offset + orig, or (for the label)
 * when the label cannot be resolved to a dictionary code.
 */
void update_link(word_buffer *wb, int orig, int dest, char *label, char *status_link, char *status_lab, int offset)
{
    int set_label = (status_lab != NULL) && (strcmp(status_lab, "G") == 0);
    int set_gov = (status_link != NULL) && (strcmp(status_link, "G") == 0);
    word *w = NULL;

    if (!set_label && !set_gov)
        return;

    /* Both updates target the same word, so it is looked up once. */
    w = word_buffer_get_word_n(wb, offset + orig);
    if (w == NULL) {
        /* NOTE(review): presumably NULL means "index out of range" -- confirm. */
        fprintf(stderr, "no token at index %d (offset %d + orig %d), link ignored\n",
                offset + orig, offset, orig);
        return;
    }

    if (set_label) {
        int label_code = -1;
        dico *d = NULL;
        mcd *mcd_struct = word_buffer_get_mcd(wb);
        /* Never hand a null pointer to "%s". */
        const char *shown_label = (label != NULL) ? label : "(null)";

        fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, shown_label);

        if (mcd_struct != NULL) {
            /* NOTE(review): assumes a negative column means "no label column",
               as the form column is tested against -1 elsewhere -- confirm. */
            int label_col = mcd_get_label_col(mcd_struct);
            if (label_col >= 0)
                d = mcd_struct->dico_array[label_col];
        }

        if (d != NULL && label != NULL)
            label_code = dico_string2int(d, label);

        if (label_code == -1)
            fprintf(stderr, "label %s unknown\n", shown_label);
        else
            word_set_label(w, label_code);
    }

    if (set_gov) {
        fprintf(stderr, "updating governor of token %d with %d\n", orig, dest);
        word_set_gov(w, dest - orig);
    }
}
void process_link(json_attr_val *avl, word_buffer *wb)
void process_link(json_attr_val *avl, word_buffer *wb, int offset)
{
int orig, dest;
char *label, *status_link, *status_lab;
......@@ -302,27 +328,53 @@ void process_link(json_attr_val *avl, word_buffer *wb)
if(!strcmp(av->attr, "status_link")){status_link = av->val->u.string; continue;}
if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;}
}
// printf("link : orig = %d dest = %d label = %s status_link = %s status_lab = %s\n", orig, dest, label, status_link, status_lab);
// fprintf(stderr, "link : orig = %d dest = %d label = %s status_link = %s status_lab = %s\n", orig, dest, label, status_link, status_lab);
update_link(wb, orig, dest, label, status_link, status_lab, offset);
}
/*
 * Walk the list of links (starting at u.first, following next) and pass the
 * attribute/value list of every element to process_link().
 *
 * NOTE(review): this span is rendered diff output. Each pair of near-identical
 * lines looks like the pre-change line followed by its replacement; the
 * replacement renames the first parameter from `segments` to `links` and
 * forwards the added `offset` argument -- confirm against the real source file.
 */
void process_links(json_struct *segments, word_buffer *wb)
void process_links(json_struct *links, word_buffer *wb, int offset)
{
json_struct *link;
// printf("process_links\n");
for(link = segments->u.first; link != NULL; link = link->next){
process_link(link->u.attr_val_list, wb);
for(link = links->u.first; link != NULL; link = link->next){
process_link(link->u.attr_val_list, wb, offset);
}
}
/*
 * Look up the "id" attribute of the first element of the document's "tokens"
 * attribute and return it converted to int.
 *
 * Returns -1 when no "tokens" attribute yields a first element carrying an
 * "id" attribute.
 */
int get_id_of_first_token_in_document(json_struct *document)
{
    json_attr_val *doc_attr;

    for (doc_attr = document->u.attr_val_list; doc_attr != NULL; doc_attr = doc_attr->next) {
        json_struct *first_token;
        json_attr_val *tok_attr;

        /* Only the "tokens" attribute is of interest here. */
        if (strcmp(doc_attr->attr, "tokens") != 0)
            continue;
        if (doc_attr->val == NULL)
            continue;

        first_token = doc_attr->val->u.first;
        if (first_token == NULL)
            continue;

        /* Scan the attributes of the first token for its "id". */
        for (tok_attr = first_token->u.attr_val_list; tok_attr != NULL; tok_attr = tok_attr->next) {
            if (strcmp(tok_attr->attr, "id") == 0)
                return (int)tok_attr->val->u.number;
        }
    }

    return -1;
}
/*
 * Apply the annotations of one document to the word buffer: every attribute
 * named "segments" is handed to process_segments() and every attribute named
 * "links" to process_links().
 *
 * NOTE(review): this span is rendered diff output. The calls without `offset`
 * look like the pre-change lines and the calls with `offset` like their
 * replacements -- confirm against the real source file.
 */
void process_document(json_struct *document, word_buffer *wb)
{
json_attr_val *avl = NULL;
// printf("process_document\n");
/* id of the document's first token; get_id_of_first_token_in_document() returns -1 when it finds none, and that value is passed on below without a check */
int offset = get_id_of_first_token_in_document(document);
// printf("process_document, offset = %d\n", offset);
for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){
// if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string);
if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb);
if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb);
if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb, offset);
if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb, offset);
}
}
......
......@@ -184,8 +184,6 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{");
// fprintf(output_file, "\"orig\": %d, ", word_get_offset(w));
fprintf(output_file, "\"orig\": %d, ", index - index_first_word);
fprintf(output_file, "\"dest\":");
if(gov_col){
......@@ -193,7 +191,6 @@ void print_link(FILE *output_file, word_buffer *wb, int index_first_word, int in
fprintf(output_file, "-1");
else{
word *gov = word_buffer_get_word_n(wb, word_get_gov(w) + index);
// fprintf(output_file, "%d", word_get_offset(gov));
fprintf(output_file, "%d", word_get_gov(w) + index - index_first_word);
}
}
......@@ -240,9 +237,7 @@ void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{ ");
/* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */
fprintf(output_file, "\"start\": %d, ", index - index_first_word);
/* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */
fprintf(output_file, "\"end\": %d, ", index - index_first_word);
fprintf(output_file, "\"label\": \"");
......@@ -278,15 +273,24 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in
void print_token(FILE *output_file, word_buffer *wb, int index)
{
int form_col = mcd_get_form_col(word_buffer_get_mcd(wb));
int offset_col = mcd_get_offset_col(word_buffer_get_mcd(wb));
int length_col = mcd_get_length_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index);
char token[5000];
int length_token, i;
fprintf(output_file, "{ ");
fprintf(output_file, "\"id\": %d, ", word_get_offset(w));
fprintf(output_file, "\"id\": %d, ", word_get_index(w));
fprintf(output_file, "\"word\": \"");
if(form_col != -1)
word_print_col_n(output_file, w, form_col);
if(form_col != -1){
word_sprint_col_n(token, w, form_col);
length_token = strlen(token);
for(i=0; i < length_token; i++){
if(token[i] == '"')
fprintf(output_file, "&quot");
else
fprintf(output_file, "%c", token[i]);
}
}
else
fprintf(output_file, "_");
fprintf(output_file, "\", ");
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment