Skip to content
Snippets Groups Projects
Commit d1620403 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

added fields DIRECTORY and FILE in mcd files

parent fba33332
No related branches found
No related tags found
No related merge requests found
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "json_tree.h" #include "json_tree.h"
#define YYTEXT_MAX 100 #define YYTEXT_MAX 5000
#define EPSILON 0 #define EPSILON 0
/* symboles non terminaux */ /* symboles non terminaux */
......
...@@ -56,8 +56,8 @@ int word_buffer_read_sentence(word_buffer *bw); ...@@ -56,8 +56,8 @@ int word_buffer_read_sentence(word_buffer *bw);
word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct); word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct);
int word_buffer_locate_token_with_offset(word_buffer *wb, int offset); int word_buffer_locate_token_with_offset(word_buffer *wb, int offset);
word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov); word *word_buffer_get_rightmost_child_of_s0(word_buffer *wb, word *gov);
word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root); word *word_buffer_get_rightmost_descendent_of_s0(word_buffer *wb, word *root);
......
...@@ -187,7 +187,7 @@ int word_buffer_locate_token_with_offset(word_buffer *wb, int offset) ...@@ -187,7 +187,7 @@ int word_buffer_locate_token_with_offset(word_buffer *wb, int offset)
} }
word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov) word *word_buffer_get_rightmost_child_of_s0(word_buffer *wb, word *gov)
{ {
word *bm1 = word_buffer_bm1(wb); word *bm1 = word_buffer_bm1(wb);
int rightmost_child_index = word_get_index(gov); int rightmost_child_index = word_get_index(gov);
...@@ -198,7 +198,7 @@ word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov) ...@@ -198,7 +198,7 @@ word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov)
} }
word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root) word *word_buffer_get_rightmost_descendent_of_s0(word_buffer *wb, word *root)
{ {
word *rightmost_descendent = root; word *rightmost_descendent = root;
word *rightmost_child = NULL; word *rightmost_child = NULL;
...@@ -206,7 +206,7 @@ word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root) ...@@ -206,7 +206,7 @@ word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root)
while(change){ while(change){
change = 0; change = 0;
rightmost_child = word_buffer_get_rightmost_child(wb, rightmost_descendent); rightmost_child = word_buffer_get_rightmost_child_of_s0(wb, rightmost_descendent);
if(word_get_index(rightmost_child) > word_get_index(rightmost_descendent)){ if(word_get_index(rightmost_child) > word_get_index(rightmost_descendent)){
rightmost_descendent = rightmost_child; rightmost_descendent = rightmost_child;
change = 1; change = 1;
......
...@@ -26,7 +26,7 @@ nosepar [^ \t\n] ...@@ -26,7 +26,7 @@ nosepar [^ \t\n]
if(defait_amalgames){ if(defait_amalgames){
BEGIN(state_defait_amalgames); BEGIN(state_defait_amalgames);
} }
#.* ECHO; #.* {ECHO; printf("\n");}
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
{separ}+ {maca_tokenizer_segment((char *)"", yytext);} {separ}+ {maca_tokenizer_segment((char *)"", yytext);}
\. {maca_tokenizer_segment((char *)".", yytext);} \. {maca_tokenizer_segment((char *)".", yytext);}
......
...@@ -242,6 +242,14 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat ...@@ -242,6 +242,14 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
if(status_lab && !strcmp(status_lab, "G")){ if(status_lab && !strcmp(status_lab, "G")){
fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label); fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label);
w = word_buffer_get_word_n(wb, offset + start); w = word_buffer_get_word_n(wb, offset + start);
/* -------------------------------------*/
/* added by alexis 210/07/18 for datcha */
word_set_F(w, 1);
/* -------------------------------------*/
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + start);
}
else{
if(d) if(d)
label_code = dico_string2int(d, label); label_code = dico_string2int(d, label);
if(label_code == -1) if(label_code == -1)
...@@ -250,6 +258,7 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat ...@@ -250,6 +258,7 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
word_set_pos(w, label_code); word_set_pos(w, label_code);
} }
} }
}
void process_segment(json_attr_val *avl, word_buffer *wb, int offset) void process_segment(json_attr_val *avl, word_buffer *wb, int offset)
{ {
...@@ -297,23 +306,41 @@ void update_link(word_buffer *wb, int orig, int dest, char *label, char *status ...@@ -297,23 +306,41 @@ void update_link(word_buffer *wb, int orig, int dest, char *label, char *status
fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, label); fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, label);
w = word_buffer_get_word_n(wb, offset + orig); w = word_buffer_get_word_n(wb, offset + orig);
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
}
else{
if(d) if(d)
label_code = dico_string2int(d, label); label_code = dico_string2int(d, label);
if(label_code == -1) if(label_code == -1)
fprintf(stderr, "label %s unknown\n", label); fprintf(stderr, "WARNING : label %s unknown\n", label);
else // else
word_set_label(w, label_code); word_set_label(w, label_code);
} }
}
if(status_link && !strcmp(status_link, "G")){ if(status_link && !strcmp(status_link, "G")){
fprintf(stderr, "updating governor of token %d with %d\n", orig, dest); fprintf(stderr, "updating governor of token %d with %d\n", orig, dest);
w = word_buffer_get_word_n(wb, offset + orig); w = word_buffer_get_word_n(wb, offset + orig);
word_set_gov(w, dest - orig);
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
} }
else{
if(dest == -1) /* -1 is for root */
word_set_gov(w, 0);
else
word_set_gov(w, dest - orig);
}
}
} }
void process_link(json_attr_val *avl, word_buffer *wb, int offset) void process_link(json_attr_val *avl, word_buffer *wb, int offset)
{ {
int orig, dest; int orig, dest;
...@@ -366,13 +393,58 @@ int get_id_of_first_token_in_document(json_struct *document) ...@@ -366,13 +393,58 @@ int get_id_of_first_token_in_document(json_struct *document)
return -1; return -1;
} }
/*
 * Tell whether a token form read from the json file (s_json) and the form
 * read from the mcf file (s_mcf) designate the same token.
 *
 * Two forms match when they are byte-identical, or when they are one of the
 * two special pairs below, in which the json side spells a double quote as
 * the entity "&quot" while the mcf side holds the literal '"' character.
 *
 * Returns 1 when the forms match, 0 otherwise. A NULL form never matches.
 */
int string_equal(char *s_json, char *s_mcf)
{
    /* strcmp() on a NULL pointer is undefined behaviour: reject missing forms */
    if(s_json == NULL || s_mcf == NULL) return 0;

    if(!strcmp(s_json, s_mcf)) return 1;
    if(!strcmp(s_json, "&quot") && !strcmp(s_mcf, "\"")) return 1;
    if(!strcmp(s_json, "&quot;&quot") && !strcmp(s_mcf, "\";\"")) return 1;
    return 0;
}
//{"id":337,"word":"Bonjour","bold":0,"newline":0}
/*
 * Check that one json token carries the same form as the matching word of
 * the word buffer.
 *
 * avl    : attribute/value list of the token; the "id" and "word"
 *          attributes are read, every other attribute is ignored
 * wb     : word buffer in which the word is looked up
 * offset : not used; the word is looked up with the raw json id.
 *          NOTE(review): update_segment() and update_link() add offset to
 *          their indices - confirm that token ids must be used as is here.
 *
 * A token lacking an "id" or a "word" attribute cannot be compared and is
 * skipped with a warning. The program is terminated with exit(1) when the
 * word cannot be found in the buffer or when the two forms differ.
 */
void check_token(json_attr_val *avl, word_buffer *wb, int offset)
{
    int id = -1;
    int id_found = 0;
    char *form_json = NULL;
    char *form_mcf = NULL;
    json_attr_val *av;
    word *w = NULL;

    (void)offset;

    for(av = avl; av != NULL; av = av->next){
        if(!strcmp(av->attr, "id")){id = (int)(av->val->u.number); id_found = 1; continue;}
        if(!strcmp(av->attr, "word")){form_json = av->val->u.string; continue;}
    }

    /* without both attributes there is nothing to look up or to compare */
    if(!id_found || form_json == NULL){
        fprintf(stderr, "WARNING : token without id or word attribute, skipped\n");
        return;
    }

    w = word_buffer_get_word_n(wb, id);
    if(w == NULL){
        fprintf(stderr, "ERROR, cannot access token %d in mcf file\n", id);
        exit(1);
    }

    form_mcf = w->form;
    fprintf(stderr, "id : %d \t json : %s \t mcf : %s\n", id, form_json, form_mcf ? form_mcf : "(null)");

    /* a word without form cannot correspond to a json token */
    if(form_mcf == NULL || !string_equal(form_json, form_mcf)){
        fprintf(stderr, "ERROR, tokens do not correspond in json and mcf files\n");
        exit(1);
    }
}
/*
 * Run check_token() on every element of a json token list.
 *
 * tokens : json structure whose u.first field heads the chain of tokens
 * wb     : word buffer handed over to check_token()
 * offset : value handed over unchanged to check_token()
 */
void check_tokens(json_struct *tokens, word_buffer *wb, int offset)
{
    json_struct *current = tokens->u.first;

    while(current != NULL){
        check_token(current->u.attr_val_list, wb, offset);
        current = current->next;
    }
}
void process_document(json_struct *document, word_buffer *wb) void process_document(json_struct *document, word_buffer *wb)
{ {
json_attr_val *avl = NULL; json_attr_val *avl = NULL;
int offset = get_id_of_first_token_in_document(document); int offset = get_id_of_first_token_in_document(document);
// printf("process_document, offset = %d\n", offset); fprintf(stderr, "process_document, offset = %d\n", offset);
for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){ for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){
// if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string); // if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string);
if(!strcmp(avl->attr, (char *)"tokens")) check_tokens(avl->val, wb, offset);
if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb, offset); if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb, offset);
if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb, offset); if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb, offset);
} }
......
...@@ -122,6 +122,38 @@ context *context_read_options(int argc, char *argv[]) ...@@ -122,6 +122,38 @@ context *context_read_options(int argc, char *argv[])
return ctx; return ctx;
} }
/*
 * Return the rightmost child of gov in the word buffer.
 *
 * The buffer is scanned from the index of gov up to wb->nbelem (excluded);
 * the last word whose governor index equals the index of gov is kept.
 * When no such word exists, the word stored at the index of gov is returned.
 */
word *get_rightmost_child(word_buffer *wb, word *gov)
{
    /* the index of gov does not change during the scan: read it once */
    const int gov_index = word_get_index(gov);
    int rightmost_child_index = gov_index;

    for(int current_index = gov_index; current_index < wb->nbelem; current_index++){
        if(word_get_gov_index(word_buffer_get_word_n(wb, current_index)) == gov_index)
            rightmost_child_index = current_index;
    }
    return word_buffer_get_word_n(wb, rightmost_child_index);
}
/*
 * Return the rightmost descendent of root in the word buffer.
 *
 * Starting from root, the function repeatedly moves to the word returned by
 * get_rightmost_child() as long as that word has a strictly greater index
 * than the current one. The last word reached is returned; it is root
 * itself when the first step does not move to the right.
 */
word *get_rightmost_descendent(word_buffer *wb, word *root)
{
    word *current = root;

    for(;;){
        word *candidate = get_rightmost_child(wb, current);

        /* stop as soon as the candidate is not further to the right */
        if(word_get_index(candidate) <= word_get_index(current))
            break;
        current = candidate;
    }
    return current;
}
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
FILE *output_file; FILE *output_file;
...@@ -140,6 +172,8 @@ int main(int argc, char *argv[]) ...@@ -140,6 +172,8 @@ int main(int argc, char *argv[])
int feats_col = mcd_get_feats_col(ctx->mcd_struct); int feats_col = mcd_get_feats_col(ctx->mcd_struct);
int sent_seg_col = mcd_get_sent_seg_col(ctx->mcd_struct); int sent_seg_col = mcd_get_sent_seg_col(ctx->mcd_struct);
int index = 1; int index = 1;
char label[100];
word *rightmost_descendent = NULL;
output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout; output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
...@@ -147,7 +181,6 @@ int main(int argc, char *argv[]) ...@@ -147,7 +181,6 @@ int main(int argc, char *argv[])
do{ do{
w = word_buffer_b0(wb); w = word_buffer_b0(wb);
if(w){ if(w){
fprintf(output_file, "%d\t", index); fprintf(output_file, "%d\t", index);
if(form_col != -1) if(form_col != -1)
...@@ -189,8 +222,18 @@ int main(int argc, char *argv[]) ...@@ -189,8 +222,18 @@ int main(int argc, char *argv[])
else else
fprintf(output_file, "_\t"); fprintf(output_file, "_\t");
if(label_col != -1) if(label_col != -1){
word_print_col_n(output_file, w, label_col); word_print_col_n(output_file, w, label_col);
word_sprint_col_n(label, w, label_col);
if(!strcmp(label, "root")){
rightmost_descendent = get_rightmost_descendent(wb, w);
// printf("index rightmost desc = %d form = %s\n", rightmost_descendent->index, rightmost_descendent->form);
if(rightmost_descendent){
word_set_sent_seg(rightmost_descendent, 1);
}
}
}
else else
fprintf(output_file, "_"); fprintf(output_file, "_");
fprintf(output_file, "\t"); fprintf(output_file, "\t");
......
...@@ -148,7 +148,7 @@ int movement_root(config *c, int movement_code, int root_code) ...@@ -148,7 +148,7 @@ int movement_root(config *c, int movement_code, int root_code)
word_set_sent_seg(bm1, 1); word_set_sent_seg(bm1, 1);
/* word *rd = word_buffer_get_rightmost_descendent(config_get_buffer(c), s0); /* word *rd = word_buffer_get_rightmost_descendent_of_s0(config_get_buffer(c), s0);
if(rd) if(rd)
word_set_sent_seg(rd, 1); */ word_set_sent_seg(rd, 1); */
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment