Commit d1620403 authored by Alexis Nasr
Browse files

added fields DIRECTORY and FILE in mcd files

parent fba33332
......@@ -3,7 +3,7 @@
#include "json_tree.h"
#define YYTEXT_MAX 100
#define YYTEXT_MAX 5000
#define EPSILON 0
/* symboles non terminaux */
......
......@@ -56,8 +56,8 @@ int word_buffer_read_sentence(word_buffer *bw);
word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct);
int word_buffer_locate_token_with_offset(word_buffer *wb, int offset);
word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov);
word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root);
word *word_buffer_get_rightmost_child_of_s0(word_buffer *wb, word *gov);
word *word_buffer_get_rightmost_descendent_of_s0(word_buffer *wb, word *root);
......
......@@ -187,7 +187,7 @@ int word_buffer_locate_token_with_offset(word_buffer *wb, int offset)
}
word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov)
word *word_buffer_get_rightmost_child_of_s0(word_buffer *wb, word *gov)
{
word *bm1 = word_buffer_bm1(wb);
int rightmost_child_index = word_get_index(gov);
......@@ -198,7 +198,7 @@ word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov)
}
word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root)
word *word_buffer_get_rightmost_descendent_of_s0(word_buffer *wb, word *root)
{
word *rightmost_descendent = root;
word *rightmost_child = NULL;
......@@ -206,7 +206,7 @@ word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root)
while(change){
change = 0;
rightmost_child = word_buffer_get_rightmost_child(wb, rightmost_descendent);
rightmost_child = word_buffer_get_rightmost_child_of_s0(wb, rightmost_descendent);
if(word_get_index(rightmost_child) > word_get_index(rightmost_descendent)){
rightmost_descendent = rightmost_child;
change = 1;
......
......@@ -26,7 +26,7 @@ nosepar [^ \t\n]
if(defait_amalgames){
BEGIN(state_defait_amalgames);
}
#.* ECHO;
#.* {ECHO; printf("\n");}
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
{separ}+ {maca_tokenizer_segment((char *)"", yytext);}
\. {maca_tokenizer_segment((char *)".", yytext);}
......
......@@ -242,12 +242,21 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
if(status_lab && !strcmp(status_lab, "G")){
fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label);
w = word_buffer_get_word_n(wb, offset + start);
if(d)
label_code = dico_string2int(d, label);
if(label_code == -1)
fprintf(stderr, "label %s unknown\n", label);
else
word_set_pos(w, label_code);
/* -------------------------------------*/
/* added by alexis 210/07/18 for datcha */
word_set_F(w, 1);
/* -------------------------------------*/
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + start);
}
else{
if(d)
label_code = dico_string2int(d, label);
if(label_code == -1)
fprintf(stderr, "label %s unknown\n", label);
else
word_set_pos(w, label_code);
}
}
}
......@@ -296,24 +305,42 @@ void update_link(word_buffer *wb, int orig, int dest, char *label, char *status
if(status_lab && !strcmp(status_lab, "G")){
fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, label);
w = word_buffer_get_word_n(wb, offset + orig);
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
}
if(d)
label_code = dico_string2int(d, label);
if(label_code == -1)
fprintf(stderr, "label %s unknown\n", label);
else
else{
if(d)
label_code = dico_string2int(d, label);
if(label_code == -1)
fprintf(stderr, "WARNING : label %s unknown\n", label);
// else
word_set_label(w, label_code);
}
}
if(status_link && !strcmp(status_link, "G")){
fprintf(stderr, "updating governor of token %d with %d\n", orig, dest);
w = word_buffer_get_word_n(wb, offset + orig);
word_set_gov(w, dest - orig);
}
if(w == NULL){
fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
}
else{
if(dest == -1) /* -1 is for root */
word_set_gov(w, 0);
else
word_set_gov(w, dest - orig);
}
}
}
void process_link(json_attr_val *avl, word_buffer *wb, int offset)
{
int orig, dest;
......@@ -366,13 +393,58 @@ int get_id_of_first_token_in_document(json_struct *document)
return -1;
}
/*
 * Tell whether a token form read from the json file matches the form read
 * from the mcf file.
 *
 * s_json : form as it appears in the json file
 * s_mcf  : form as it appears in the mcf file
 *
 * Returns 1 when the two forms correspond, 0 otherwise. Besides plain string
 * equality, two spellings are accepted as equivalent:
 *   json "&quot"       <-> mcf "\""
 *   json "&quot;&quot" <-> mcf "\";\""
 * A NULL form never matches anything (not even another NULL form).
 */
int string_equal(char *s_json, char *s_mcf)
{
  /* strcmp() on a null pointer is undefined behaviour: reject missing forms first */
  if(s_json == NULL || s_mcf == NULL) return 0;

  if(!strcmp(s_json, "&quot") && !strcmp(s_mcf, "\"")) return 1;
  if(!strcmp(s_json, "&quot;&quot") && !strcmp(s_mcf, "\";\"")) return 1;
  return !strcmp(s_json, s_mcf);
}
//{"id":337,"word":"Bonjour","bold":0,"newline":0}
/*
 * Check that one token of the json document carries the same form as the
 * word stored under the same id in the word buffer.
 *
 * avl    : attribute/value list of the json token; the "id" and "word"
 *          attributes are read, every other attribute is ignored
 * wb     : word buffer in which the word is looked up with the token id
 * offset : not used by this function (the word is looked up with the id
 *          itself)
 *
 * The json form, the mcf form and the id are traced on stderr. The program
 * exits with status 1 when the token has no "id" or no "word" attribute,
 * when no word with a form can be found for that id, or when the two forms
 * do not correspond according to string_equal().
 */
void check_token(json_attr_val *avl, word_buffer *wb, int offset)
{
  int id = -1;
  int id_found = 0;
  char *form_json = NULL;
  char *form_mcf = NULL;
  json_attr_val *av;
  word *w = NULL;

  (void)offset;

  for(av = avl; av != NULL; av = av->next){
    if(!strcmp(av->attr, "id")){id = (int)(av->val->u.number); id_found = 1; continue;}
    if(!strcmp(av->attr, "word")){form_json = av->val->u.string; continue;}
  }

  /* without both attributes there is nothing to compare (and id would be garbage) */
  if(!id_found || form_json == NULL){
    fprintf(stderr, "ERROR, json token without \"id\" or \"word\" attribute\n");
    exit(1);
  }

  w = word_buffer_get_word_n(wb, id);
  /* the lookup can fail: never dereference the result blindly */
  if(w == NULL || w->form == NULL){
    fprintf(stderr, "ERROR, cannot access token %d in mcf file\n", id);
    exit(1);
  }
  form_mcf = w->form;

  fprintf(stderr, "id : %d \t json : %s \t mcf : %s\n", id, form_json, form_mcf);
  if(!string_equal(form_json, form_mcf)){
    fprintf(stderr, "ERROR, tokens do not correspond in json and mcf files\n");
    exit(1);
  }
}
/*
 * Run check_token() on every token of a json token list.
 *
 * tokens : json structure whose u.first field heads the list of tokens;
 *          the list is followed through the next field
 * wb     : word buffer, handed unchanged to check_token()
 * offset : handed unchanged to check_token()
 */
void check_tokens(json_struct *tokens, word_buffer *wb, int offset)
{
  json_struct *current = tokens->u.first;

  while(current != NULL){
    check_token(current->u.attr_val_list, wb, offset);
    current = current->next;
  }
}
void process_document(json_struct *document, word_buffer *wb)
{
json_attr_val *avl = NULL;
int offset = get_id_of_first_token_in_document(document);
// printf("process_document, offset = %d\n", offset);
fprintf(stderr, "process_document, offset = %d\n", offset);
for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){
// if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string);
if(!strcmp(avl->attr, (char *)"tokens")) check_tokens(avl->val, wb, offset);
if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb, offset);
if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb, offset);
}
......
......@@ -122,6 +122,38 @@ context *context_read_options(int argc, char *argv[])
return ctx;
}
/*
 * Return the rightmost child of gov in the word buffer.
 *
 * wb  : word buffer; positions from the index of gov up to wb->nbelem - 1
 *       are scanned
 * gov : governor whose children are searched
 *
 * A word is a child of gov when its governor index equals the index of gov.
 * The child found at the highest position is returned. When gov has no
 * child in the scanned range, the word stored at the index of gov is
 * returned.
 */
word *get_rightmost_child(word_buffer *wb, word *gov)
{
  /* the index of the governor does not change while scanning: read it once */
  int gov_index = word_get_index(gov);
  int rightmost_child_index = gov_index;

  for(int current_index = gov_index; current_index < wb->nbelem; current_index++){
    word *candidate = word_buffer_get_word_n(wb, current_index);
    /* a position that cannot be accessed cannot hold a child */
    if(candidate && word_get_gov_index(candidate) == gov_index)
      rightmost_child_index = current_index;
  }
  return word_buffer_get_word_n(wb, rightmost_child_index);
}
/*
 * Return the rightmost descendent of root in the word buffer.
 *
 * wb   : word buffer, handed to get_rightmost_child()
 * root : word from which the descent starts
 *
 * Starting from root, the function repeatedly moves to the rightmost child
 * of the current word for as long as that child has a strictly greater
 * index than the current word. The last word reached is returned; it is
 * root itself when the first step does not move to the right.
 */
word *get_rightmost_descendent(word_buffer *wb, word *root)
{
  word *current = root;

  for(;;){
    word *candidate = get_rightmost_child(wb, current);
    /* no child further to the right: current is the answer */
    if(word_get_index(candidate) <= word_get_index(current))
      break;
    current = candidate;
  }
  return current;
}
int main(int argc, char *argv[])
{
FILE *output_file;
......@@ -140,6 +172,8 @@ int main(int argc, char *argv[])
int feats_col = mcd_get_feats_col(ctx->mcd_struct);
int sent_seg_col = mcd_get_sent_seg_col(ctx->mcd_struct);
int index = 1;
char label[100];
word *rightmost_descendent = NULL;
output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
......@@ -147,7 +181,6 @@ int main(int argc, char *argv[])
do{
w = word_buffer_b0(wb);
if(w){
fprintf(output_file, "%d\t", index);
if(form_col != -1)
......@@ -189,8 +222,18 @@ int main(int argc, char *argv[])
else
fprintf(output_file, "_\t");
if(label_col != -1)
if(label_col != -1){
word_print_col_n(output_file, w, label_col);
word_sprint_col_n(label, w, label_col);
if(!strcmp(label, "root")){
rightmost_descendent = get_rightmost_descendent(wb, w);
// printf("index rightmost desc = %d form = %s\n", rightmost_descendent->index, rightmost_descendent->form);
if(rightmost_descendent){
word_set_sent_seg(rightmost_descendent, 1);
}
}
}
else
fprintf(output_file, "_");
fprintf(output_file, "\t");
......
......@@ -148,7 +148,7 @@ int movement_root(config *c, int movement_code, int root_code)
word_set_sent_seg(bm1, 1);
/* word *rd = word_buffer_get_rightmost_descendent(config_get_buffer(c), s0);
/* word *rd = word_buffer_get_rightmost_descendent_of_s0(config_get_buffer(c), s0);
if(rd)
word_set_sent_seg(rd, 1); */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment