added fields DIRECTORY and FILE in mcd files

d1620403 · Alexis Nasr · fba33332 · d1620403 · d1620403 · d1620403
Commit d1620403 authored 6 years ago by Alexis Nasr
--- a/maca_common/include/json_parser.h
+++ b/maca_common/include/json_parser.h
@@ -3,7 +3,7 @@
 #include "json_tree.h"
-#define YYTEXT_MAX 100
+#define YYTEXT_MAX 5000
 #define EPSILON 0
 /* symboles non terminaux */

--- a/maca_common/include/word_buffer.h
+++ b/maca_common/include/word_buffer.h
@@ -56,8 +56,8 @@ int          word_buffer_read_sentence(word_buffer *bw);
 word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct);
 int          word_buffer_locate_token_with_offset(word_buffer *wb, int offset);
-word        *word_buffer_get_rightmost_child(word_buffer *wb, word *gov);
+word        *word_buffer_get_rightmost_child_of_s0(word_buffer *wb, word *gov);
-word        *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root);
+word        *word_buffer_get_rightmost_descendent_of_s0(word_buffer *wb, word *root);

--- a/maca_common/src/word_buffer.c
+++ b/maca_common/src/word_buffer.c
@@ -187,7 +187,7 @@ int word_buffer_locate_token_with_offset(word_buffer *wb, int offset)
 } 
-word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov)
+word *word_buffer_get_rightmost_child_of_s0(word_buffer *wb, word *gov)
 {
  word *bm1 = word_buffer_bm1(wb);
  int rightmost_child_index = word_get_index(gov);
@@ -198,7 +198,7 @@ word *word_buffer_get_rightmost_child(word_buffer *wb, word *gov)
 }
-word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root)
+word *word_buffer_get_rightmost_descendent_of_s0(word_buffer *wb, word *root)
 {
  word *rightmost_descendent = root;
  word *rightmost_child = NULL;
@@ -206,7 +206,7 @@ word *word_buffer_get_rightmost_descendent(word_buffer *wb, word *root)
  while(change){
    change = 0;
-    rightmost_child = word_buffer_get_rightmost_child(wb, rightmost_descendent);
+    rightmost_child = word_buffer_get_rightmost_child_of_s0(wb, rightmost_descendent);
    if(word_get_index(rightmost_child) > word_get_index(rightmost_descendent)){
      rightmost_descendent = rightmost_child;
      change = 1;

--- a/maca_tokenizer/src/fr_tok_rules.l
+++ b/maca_tokenizer/src/fr_tok_rules.l
@@ -26,7 +26,7 @@ nosepar [^ \t\n]
 	if(defait_amalgames){
 	BEGIN(state_defait_amalgames);
        }
-#.*    ECHO;
+#.*    {ECHO; printf("\n");}
 \<[^\>]*\>   {maca_tokenizer_segment((char *)"", yytext);}
 {separ}+     {maca_tokenizer_segment((char *)"", yytext);}
 \.   {maca_tokenizer_segment((char *)".", yytext);}

--- a/maca_tools/src/json2mcf.c
+++ b/maca_tools/src/json2mcf.c
@@ -242,6 +242,14 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
  if(status_lab && !strcmp(status_lab, "G")){
    fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label);
    w = word_buffer_get_word_n(wb, offset + start);
+    /* -------------------------------------*/
+    /* added by alexis 210/07/18 for datcha */
+    word_set_F(w, 1);
+    /* -------------------------------------*/
+    if(w == NULL){
+      fprintf(stderr, "WARNING cannot access segment %d\n", offset + start);
+    }
+    else{
      if(d)
 	label_code = dico_string2int(d, label);
      if(label_code == -1)
@@ -250,6 +258,7 @@ void update_segment(word_buffer *wb, int start, int end, char *label, char *stat
 	word_set_pos(w, label_code);
    }
  }
+}
 void process_segment(json_attr_val *avl, word_buffer *wb, int offset)
 {
@@ -297,23 +306,41 @@ void update_link(word_buffer *wb,  int orig, int dest, char *label, char *status
    fprintf(stderr, "updating label of link %d -> %d with \"%s\"\n", orig, dest, label);
    w = word_buffer_get_word_n(wb, offset + orig);
+    if(w == NULL){
+      fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
+    }
+    else{
      if(d)
 	label_code = dico_string2int(d, label);
      if(label_code == -1)
-      fprintf(stderr, "label %s unknown\n", label);
+	fprintf(stderr, "WARNING : label %s unknown\n", label);
-    else
+      //    else
      word_set_label(w, label_code);
    }
+  }
  if(status_link && !strcmp(status_link, "G")){
    fprintf(stderr, "updating governor of token %d with %d\n", orig, dest);
    w = word_buffer_get_word_n(wb, offset + orig);
-    word_set_gov(w, dest - orig);
+    if(w == NULL){
+      fprintf(stderr, "WARNING cannot access segment %d\n", offset + orig);
    }
+    else{
+      if(dest == -1) /* -1 is for root */
+	word_set_gov(w, 0);
+      else
+	word_set_gov(w, dest - orig);
+    }
+  }
 }
 void process_link(json_attr_val *avl, word_buffer *wb, int offset)
 {
  int orig, dest;
@@ -366,13 +393,58 @@ int get_id_of_first_token_in_document(json_struct *document)
  return -1;
 }
+/*
+ * Compare a token form read from the json file with a form read from the
+ * mcf file.
+ *
+ * Besides exact equality, two special cases are accepted:
+ *   json "&quot"       corresponds to mcf "\""
+ *   json "&quot;&quot" corresponds to mcf "\";\""
+ *
+ * Returns 1 when the two forms correspond, 0 otherwise.
+ * A NULL form never corresponds to anything: strcmp must not be called
+ * with a NULL argument.
+ */
+int string_equal(char *s_json, char *s_mcf)
+{
+  if(s_json == NULL || s_mcf == NULL) return 0;
+  if(!strcmp(s_json, s_mcf)) return 1;
+  if(!strcmp(s_json, "&quot") && !strcmp(s_mcf, "\"")) return 1;
+  if(!strcmp(s_json, "&quot;&quot") && !strcmp(s_mcf, "\";\"")) return 1;
+  return 0;
+}
+/*
+ * Check that one token of the json file corresponds to the word stored in
+ * the word buffer.
+ *
+ * avl is the attribute/value list of a json token, for example
+ *   {"id":337,"word":"Bonjour","bold":0,"newline":0}
+ * Only the attributes "id" and "word" are read. The word is fetched from wb
+ * using the json id directly as index; offset is currently unused.
+ *
+ * A token lacking "id" or "word" is reported with a WARNING and skipped.
+ * If the word cannot be accessed in wb, or if the two forms differ
+ * (see string_equal), the json and mcf files do not correspond and the
+ * program exits with status 1.
+ */
+void check_token(json_attr_val *avl, word_buffer *wb, int offset)
+{
+  int id = -1;
+  int id_found = 0; /* id is only meaningful once an "id" attribute was seen */
+  char *form_json = NULL;
+  char *form_mcf = NULL;
+  json_attr_val *av;
+  word *w = NULL;
+
+  (void)offset;
+
+  for(av = avl; av != NULL; av = av->next){
+    if(!strcmp(av->attr, "id")){id = (int)(av->val->u.number); id_found = 1; continue;}
+    if(!strcmp(av->attr, "word")){form_json = av->val->u.string; continue;}
+  }
+
+  if(!id_found || form_json == NULL){
+    fprintf(stderr, "WARNING token without id or word in json file\n");
+    return;
+  }
+
+  /* word_buffer_get_word_n may return NULL, do not dereference blindly */
+  w = word_buffer_get_word_n(wb, id);
+  if(w != NULL)
+    form_mcf = w->form;
+  if(form_mcf == NULL){
+    fprintf(stderr, "ERROR, cannot access token %d in mcf file\n", id);
+    exit(1);
+  }
+
+  fprintf(stderr, "id : %d \t json : %s \t mcf : %s\n", id, form_json, form_mcf);
+  if(!string_equal(form_json, form_mcf)){
+    fprintf(stderr, "ERROR, tokens do not correspond in json and mcf files\n");
+    exit(1);
+  }
+}
+/*
+ * Apply check_token to every element of the json list tokens.
+ *
+ * The list is walked from tokens->u.first through the next links; the
+ * attribute/value list of each element is handed to check_token together
+ * with wb and offset.
+ */
+void check_tokens(json_struct *tokens, word_buffer *wb, int offset)
+{
+  json_struct *current = tokens->u.first;
+
+  while(current != NULL){
+    check_token(current->u.attr_val_list, wb, offset);
+    current = current->next;
+  }
+}
 void process_document(json_struct *document, word_buffer *wb)
 {
  json_attr_val *avl = NULL;
  int offset = get_id_of_first_token_in_document(document);
-  //  printf("process_document, offset = %d\n", offset);
+  fprintf(stderr, "process_document, offset = %d\n", offset);
  for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){
    //    if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string);
+    if(!strcmp(avl->attr, (char *)"tokens")) check_tokens(avl->val, wb, offset);
    if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb, offset);
    if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb, offset);
  }

--- a/maca_tools/src/mcf2conll.c
+++ b/maca_tools/src/mcf2conll.c
@@ -122,6 +122,38 @@ context *context_read_options(int argc, char *argv[])
  return ctx;
 }
+/*
+ * Return the word of wb with the highest position whose governor index
+ * equals the index of gov.
+ *
+ * Positions from the index of gov up to wb->nbelem - 1 are scanned, so
+ * only words located at or after gov are considered. When no such word is
+ * found, the word stored at the index of gov is returned.
+ *
+ * NOTE(review): the index of gov is used as a position for
+ * word_buffer_get_word_n, as in the original code - confirm both use the
+ * same numbering.
+ */
+word *get_rightmost_child(word_buffer *wb, word *gov)
+{
+  int gov_index = word_get_index(gov); /* invariant, computed once */
+  int rightmost_child_index = gov_index;
+
+  for(int current_index = gov_index; current_index < wb->nbelem; current_index++){
+    word *candidate = word_buffer_get_word_n(wb, current_index);
+    /* positions that cannot be accessed are skipped */
+    if(candidate != NULL && word_get_gov_index(candidate) == gov_index)
+      rightmost_child_index = current_index;
+  }
+  return word_buffer_get_word_n(wb, rightmost_child_index);
+}
+/*
+ * Starting from root, repeatedly move to the rightmost child (see
+ * get_rightmost_child) for as long as that child has a strictly greater
+ * index than the current word, and return the word reached.
+ *
+ * root itself is returned when its rightmost child does not have a
+ * greater index.
+ */
+word *get_rightmost_descendent(word_buffer *wb, word *root)
+{
+  word *current = root;
+
+  for(;;){
+    word *child = get_rightmost_child(wb, current);
+    if(word_get_index(child) <= word_get_index(current))
+      break; /* no progress to the right: current is the answer */
+    current = child;
+  }
+  return current;
+}
 int main(int argc, char *argv[])
 {
  FILE *output_file;
@@ -140,6 +172,8 @@ int main(int argc, char *argv[])
  int feats_col = mcd_get_feats_col(ctx->mcd_struct);
  int sent_seg_col = mcd_get_sent_seg_col(ctx->mcd_struct);
  int index = 1;
+  char label[100];
+  word *rightmost_descendent = NULL;
  output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
@@ -147,7 +181,6 @@ int main(int argc, char *argv[])
  do{
    w = word_buffer_b0(wb);
    if(w){
      fprintf(output_file, "%d\t", index);
      if(form_col != -1)
@@ -189,8 +222,18 @@ int main(int argc, char *argv[])
      else
 	fprintf(output_file, "_\t");
-      if(label_col != -1)
+      if(label_col != -1){
 	word_print_col_n(output_file, w, label_col);
+	word_sprint_col_n(label, w, label_col);
+	if(!strcmp(label, "root")){
+	  rightmost_descendent = get_rightmost_descendent(wb, w);
+	  //	  printf("index rightmost desc = %d form = %s\n", rightmost_descendent->index, rightmost_descendent->form);
+	  if(rightmost_descendent){
+	    word_set_sent_seg(rightmost_descendent, 1);
+	  }
+	}
+      }
      else
 	fprintf(output_file, "_");
      fprintf(output_file, "\t");

--- a/maca_trans_parser/src/movements.c
+++ b/maca_trans_parser/src/movements.c
@@ -148,7 +148,7 @@ int movement_root(config *c, int movement_code, int root_code)
  word_set_sent_seg(bm1, 1); 
-  /*  word *rd = word_buffer_get_rightmost_descendent(config_get_buffer(c), s0);
+  /*  word *rd = word_buffer_get_rightmost_descendent_of_s0(config_get_buffer(c), s0);
  if(rd)
  word_set_sent_seg(rd, 1); */