first version of datcha available

88ff0e4e · Alexis Nasr · d8808930 · 88ff0e4e · 88ff0e4e · 88ff0e4e
Commit 88ff0e4e authored Oct 13, 2016 by Alexis Nasr
--- a/datcha/data/morpho-lexicon/fplm_change_pos.pl
+++ b/datcha/data/morpho-lexicon/fplm_change_pos.pl
@@ -23,7 +23,8 @@ $orfeo2datcha{"VRB"} = "VER";

 while(<>) # read every input line; rewrite the POS column through %orfeo2datcha
 {
-    ($form, $pos, $lemma, $morpho) = split;
+    chop; # needed now: split /\t/ keeps the line terminator in the last field. NOTE(review): chop drops the last character unconditionally, chomp would drop only a trailing newline (matters if the final line is unterminated)
+    ($form, $pos, $lemma, $morpho) = split /\t/; # tab-only split, so a field may itself contain spaces
    $pos_datcha = $orfeo2datcha{$pos}; # undefined when $pos has no entry in the table
    print "$form\t$pos_datcha\t$lemma\t$morpho\n";
 }
--- a/datcha/eval/Makefile
+++ b/datcha/eval/Makefile
-TEST=../data/treebank/test.conll07
-DEV=../data/treebank/dev.conll07
+TEST=../data/treebank/tchat91.mcf
+#TEST=../data/treebank/test.mcf
+DEV=../data/treebank/dev.mcf
 LANGUAGE=datcha
 MCD_FILE=eval.mcd


--- a/en/maca_trans_parser/Makefile
+++ b/en/maca_trans_parser/Makefile
@@ -7,11 +7,10 @@ CFF_CUTOFF_TRAIN=train.cutoff.cff
 PERCEPTRON_ITERATIONS=5
 CFF_CUTOFF=1
 FEATURES_MODEL_FILENAME=maca_trans_parser.fm
-MCD_FILENAME=maca_trans_parser.mcd
+MCD_FILENAME=plgfs.mcd
 VOCABS_FILENAME=maca_trans_parser.vocab 
 MODEL_FILENAME=maca_trans_parser.model 
 NUMBER_OF_SENTENCES=10000000
-STREAM_MODE=
-#STREAM_MODE= -S
+STREAM_MODE= -S

 include ../../makefiles/maca_trans_parser.makefile
--- a/fr_stream/maca_trans_parser/Makefile
+++ b/fr_stream/maca_trans_parser/Makefile
@@ -9,7 +9,8 @@ CFF_CUTOFF=1
 FEATURES_MODEL_FILENAME=maca_trans_parser.fm
 VOCABS_FILENAME=maca_trans_parser.vocab 
 MODEL_FILENAME=maca_trans_parser.model 
-NUMBER_OF_SENTENCES=10000000
+#NUMBER_OF_SENTENCES=10000000
+NUMBER_OF_SENTENCES=1000
 MCD_FILENAME=plgfs.mcd
 STREAM_MODE= -S


--- a/fr_stream/maca_trans_parser/maca_trans_parser.fm
+++ b/fr_stream/maca_trans_parser/maca_trans_parser.fm
+s0f
 s0l
 s0p
 s1p
+b0f
 b0l
 b0p
+b1f
 b1l
 b1p
 b2p
@@ -11,6 +14,9 @@ ldep_s0r
 rdep_s0r
 ldep_b0r
 rdep_b0r
+s0f b0f
+s0f b0l
+s0l b0f
 s0l b0l
 s0p b0p
 b0p b0l
@@ -18,15 +24,21 @@ b0p ldep_b0r
 s1p b1p
 b1p b2p
 s0p b0p b0l
+s0p b0p b0f
 s0p ldep_s0r rdep_s0r
 s0p s0l b0p
+s0p s0f b0p
 s0p b0p dist_s0_b0
 s1p s0p b0p
 b0p b1p b2p
 b1p b2p b3p
 s0p b0p b1p
 b1p b1l b2p b3p
+b1p b1f b2p b3p
 b1p b1l b2p b2l b3p
+b1p b1l b2p b2f b3p
+b1p b1f b2p b2l b3p
+b1p b1f b2p b2f b3p
 t1
 t2
 t3

--- a/jh/Makefile
+++ b/jh/Makefile
@@ -3,7 +3,7 @@
 all: compile install evaluation

 compile: 
-#	$(MAKE) -C data/morpho-lexicon compile
+	$(MAKE) -C data/morpho-lexicon compile
 	$(MAKE) -C data/treebank compile
 	$(MAKE) -C maca_trans_parser compile
 	$(MAKE) -C maca_trans_tagger compile
@@ -13,7 +13,7 @@ install:
 	-mkdir -p bin
 	$(MAKE) -C maca_trans_parser install
 	$(MAKE) -C maca_trans_tagger install
-#	$(MAKE) -C maca_lemmatizer install
+	$(MAKE) -C maca_lemmatizer install
 #	$(MAKE) -C maca_crf_tagger install
 #	@tar -cvzf ./maca_datas.tgz bin

@@ -21,7 +21,7 @@ evaluation:
 	$(MAKE) -C eval

 clean:
-#	$(MAKE) -C data/morpho-lexicon clean
+	$(MAKE) -C data/morpho-lexicon clean
 	$(MAKE) -C data/treebank clean
 	$(MAKE) -C maca_trans_parser clean
 	$(MAKE) -C maca_trans_tagger clean

--- a/makefiles/eval.makefile
+++ b/makefiles/eval.makefile
@@ -21,10 +21,10 @@ test_Wp: test_W
 	$(TAGGER) -L $(LANGUAGE) -C $(MCD_FILE) -i $< -S > $@

 test_Wpl: test_Wp
-	$(LEMMATIZER) -C $(MCD_FILE) -i $< > $@
+	$(LEMMATIZER) -C $(MCD_FILE) -L $(LANGUAGE) -i $< > $@

 test_WPl: test_WP
-	$(LEMMATIZER) -C $(MCD_FILE) -i $< > $@
+	$(LEMMATIZER) -C $(MCD_FILE) -L $(LANGUAGE) -i $< > $@

 test_WPLgfs: test_WPL
 	$(PARSER) -L $(LANGUAGE) -C $(MCD_FILE) -S -i $< > $@

--- a/tools/conll_lib.c
+++ b/tools/conll_lib.c
@@ -25,9 +25,17 @@ int parse_line(FILE *f, sentence *s);
 void renumber_sentence(sentence *s) /* Reassign word ids from their positions in s->words, then rebuild each head from its mother link. */
 {
  int i;
+  word *w;
  for(i=0 ; i < s->l; i++){ /* pass 1: id becomes the word's index in s->words */
    s->words[i]->id = i;
  }
+  for(i=0 ; i < s->l; i++){ /* pass 2 runs only after all ids are assigned, so mother->id is the new value (assuming the mother is one of s->words) */
+    w = s->words[i];
+    if(w->mother)
+      w->head = w->mother->id;
+    else
+      w->head = 0; /* no mother link: head is set to 0 */
+  }
 }



--- a/tools/conll_lib.h
+++ b/tools/conll_lib.h
@@ -21,7 +21,7 @@
 #include <stdio.h>
 #include "hash_str.h"

-#define MAX_WORDS_IN_SENTENCE 500
+#define MAX_WORDS_IN_SENTENCE 700
 #define MAX_STR 10000
 #define MAX_LINE_LENGTH 50000

@@ -43,7 +43,7 @@ typedef struct w
                 /* or identical to the coarse-grained part-of-speech tag if not available.*/
  char feats[MAX_STR];  /* Unordered set of syntactic and/or morphological features (depending on the particular language)*/
                 /*, separated by a vertical bar (|), or an underscore if not available.*/
-  unsigned head; /* Head of the current token, which is either a value of ID or zero ('0').*/
+  int head; /* Head of the current token, which is either a value of ID or zero ('0').*/
  char deprel[MAX_STR];  /* Dependency relation to the HEAD. The set of dependency relations depends on the particular language.*/
                 /* Note that depending on the original treebank annotation, the dependency relation may be meaningful or simply 'ROOT'.*/
  unsigned phead;/* Projective head of current token, which is either a value of ID or zero ('0'), or an underscore if not available. */

--- a/tools/ftb2datcha.c
+++ b/tools/ftb2datcha.c
@@ -219,6 +219,8 @@ int main(int argc, char *argv[])
    change_pos(s, op.h_pos);
    change_cpos(s, op.h_pos);

+    renumber_sentence(s);
+    compute_relative_index_of_heads(s);

    print_sentence_no_newline(s);
    

--- a/tools/ftb2fr_stream.c
+++ b/tools/ftb2fr_stream.c
@@ -18,8 +18,6 @@ typedef struct options
  hash_str *h_fct;
 } options;

-
-
 void change_pos_fr(sentence *s, hash_str *h_pos)
 {
  int i;
@@ -207,16 +205,22 @@ int main(int argc, char *argv[])
  sentence *s = allocate_sentence();
  int snum = 0;
  int res;
+  int root_index;
+  int root_to_end = 0;
+  int new_root_head;
  
  parse_options(argc, argv, &op);
  print_options(&op); 
  for(res = load_sentence(op.fd_parses, s); res && (snum < op.snum); res = load_sentence(op.fd_parses, s)){
    s->num = snum;
-    snum++;
+
+    if(s->l > 200) continue;
    if(!sentence_ends_with_poncts(s)) continue;
    if(number_of_roots_in_sentence(s) != 1) continue;
    if(sentence_contains_missinghead(s)) continue;

+    snum++;
+    
    /* change_pos_and_cpos_of_dot(s, &op); */
    change_form_and_lemma_of_numbers(s);
    change_pos_fr(s, op.h_pos);
@@ -224,6 +228,16 @@ int main(int argc, char *argv[])
    retokenize_three_dots(s);
    tokenize_dot(s, "titre", "poncts", "abbrev");
    renumber_sentence(s);
+    compute_relative_index_of_heads(s);
+
+
+    /*    root_index = get_root_index(s);
+    new_root_head = - root_index - root_to_end;
+    change_root_head(s, new_root_head);
+    root_to_end = s->l - root_index - 1;
+    */
+
+    
    print_sentence_no_newline(s);
    
  }

--- a/tools/ftb_lib.c
+++ b/tools/ftb_lib.c
@@ -5,6 +5,46 @@
 #include "conll_lib.h"
 #include "ftb_lib.h"

+void compute_relative_index_of_heads(sentence *s) /* Rewrite each head in place as an offset: head minus the word's own index in s->words. Each call subtracts again, so apply once per sentence. */
+{
+  int i;
+  word *w;
+  
+  for(i=1; i<s->l; i++){ /* starts at 1, so s->words[0] keeps its head unchanged */
+    w = s->words[i];
+    w->head = w->head - i; /* may be negative; a head of 0 becomes -i. NOTE(review): confirm consumers expect -i rather than 0 for such words */
+  }
+}
+
+
+void change_root_head(sentence *s, int new_head_index) /* Set the head of the first word whose deprel is "root" to new_head_index. */
+{
+  int i;
+  word *w;
+  
+  for(i=1; i<s->l; i++){ /* s->words[0] is never examined */
+    w = s->words[i];
+    if(!strcmp(w->deprel, "root")){ /* exact, case-sensitive match on the dependency label */
+      w->head = new_head_index;
+      break; /* only the first match is changed; with no match the sentence is left untouched */
+    }
+  }
+}
+
+int get_root_index(sentence *s) /* Return the index in s->words of the first word whose deprel is "root". */
+{
+  int i;
+  word *w;
+  
+  for(i=1; i<s->l; i++){ /* search starts at index 1; s->words[0] is skipped */
+    w = s->words[i];
+    if(!strcmp(w->deprel, "root"))
+      return i;
+  }
+  return -1; /* no word labelled "root" at index 1 or above */
+}
+
+
 int sentence_ends_with_poncts(sentence *s)
 {
  word *w;
@@ -133,10 +173,13 @@ void print_sentence_no_newline(sentence *s)
    fprintf(stdout, "\t%s", w->lemma);
    /* fprintf(stdout, "\t%s", w->cpostag); */
    /* fprintf(stdout, "\t%s", w->feats); */
-    if(w->mother == NULL)
+
+    fprintf(stdout, "\t%d", w->head);
+
+    /*    if(w->mother == NULL)
      fprintf(stdout, "\t0");
    else
-      fprintf(stdout, "\t%d", w->mother->id - w->id);
+    fprintf(stdout, "\t%d", w->mother->id - w->id);*/
    /*
    else{
      if(strcmp(w->deprel, "root"))

--- a/tools/ftb_lib.h
+++ b/tools/ftb_lib.h
@@ -32,5 +32,7 @@ void change_label_of_last_dep(sentence *s);
 void retokenize_three_dots(sentence *s);
 void tokenize_dot(sentence *s, char *gov_postag, char *dep_postag, char *label);
 void print_sentence_no_newline(sentence *s);
-
+int get_root_index(sentence *s); /* index (>= 1) of the first word with deprel "root", or -1 if there is none */
+void change_root_head(sentence *s, int new_head_index); /* sets head of the first word with deprel "root" to new_head_index */
+void compute_relative_index_of_heads(sentence *s); /* replaces head by head - index for every word from index 1 on */
 #endif