Skip to content
Snippets Groups Projects
Commit 88ff0e4e authored by Alexis Nasr's avatar Alexis Nasr
Browse files

first version of datcha available

parent d8808930
No related branches found
No related tags found
No related merge requests found
......@@ -23,7 +23,8 @@ $orfeo2datcha{"VRB"} = "VER";
# Read tab-separated records (form, POS, lemma, morphology) from the input,
# map the POS tag through %orfeo2datcha and print the record with the
# translated tag, again tab-separated.
while(<>)
{
    # chomp removes only a trailing newline; chop would delete a real
    # character from a final line that has no newline terminator.
    chomp;
    # Split strictly on tabs so that fields containing spaces stay intact.
    ($form, $pos, $lemma, $morpho) = split /\t/;
    $pos_datcha = $orfeo2datcha{$pos};
    print "$form\t$pos_datcha\t$lemma\t$morpho\n";
}
TEST=../data/treebank/test.conll07
DEV=../data/treebank/dev.conll07
TEST=../data/treebank/tchat91.mcf
#TEST=../data/treebank/test.mcf
DEV=../data/treebank/dev.mcf
LANGUAGE=datcha
MCD_FILE=eval.mcd
......
......@@ -7,11 +7,10 @@ CFF_CUTOFF_TRAIN=train.cutoff.cff
PERCEPTRON_ITERATIONS=5
CFF_CUTOFF=1
FEATURES_MODEL_FILENAME=maca_trans_parser.fm
MCD_FILENAME=maca_trans_parser.mcd
MCD_FILENAME=plgfs.mcd
VOCABS_FILENAME=maca_trans_parser.vocab
MODEL_FILENAME=maca_trans_parser.model
NUMBER_OF_SENTENCES=10000000
STREAM_MODE=
#STREAM_MODE= -S
STREAM_MODE= -S
include ../../makefiles/maca_trans_parser.makefile
......@@ -9,7 +9,8 @@ CFF_CUTOFF=1
FEATURES_MODEL_FILENAME=maca_trans_parser.fm
VOCABS_FILENAME=maca_trans_parser.vocab
MODEL_FILENAME=maca_trans_parser.model
NUMBER_OF_SENTENCES=10000000
#NUMBER_OF_SENTENCES=10000000
NUMBER_OF_SENTENCES=1000
MCD_FILENAME=plgfs.mcd
STREAM_MODE= -S
......
s0f
s0l
s0p
s1p
b0f
b0l
b0p
b1f
b1l
b1p
b2p
......@@ -11,6 +14,9 @@ ldep_s0r
rdep_s0r
ldep_b0r
rdep_b0r
s0f b0f
s0f b0l
s0l b0f
s0l b0l
s0p b0p
b0p b0l
......@@ -18,15 +24,21 @@ b0p ldep_b0r
s1p b1p
b1p b2p
s0p b0p b0l
s0p b0p b0f
s0p ldep_s0r rdep_s0r
s0p s0l b0p
s0p s0f b0p
s0p b0p dist_s0_b0
s1p s0p b0p
b0p b1p b2p
b1p b2p b3p
s0p b0p b1p
b1p b1l b2p b3p
b1p b1f b2p b3p
b1p b1l b2p b2l b3p
b1p b1l b2p b2f b3p
b1p b1f b2p b2l b3p
b1p b1f b2p b2f b3p
t1
t2
t3
......
......@@ -3,7 +3,7 @@
all: compile install evaluation
compile:
# $(MAKE) -C data/morpho-lexicon compile
$(MAKE) -C data/morpho-lexicon compile
$(MAKE) -C data/treebank compile
$(MAKE) -C maca_trans_parser compile
$(MAKE) -C maca_trans_tagger compile
......@@ -13,7 +13,7 @@ install:
-mkdir -p bin
$(MAKE) -C maca_trans_parser install
$(MAKE) -C maca_trans_tagger install
# $(MAKE) -C maca_lemmatizer install
$(MAKE) -C maca_lemmatizer install
# $(MAKE) -C maca_crf_tagger install
# @tar -cvzf ./maca_datas.tgz bin
......@@ -21,7 +21,7 @@ evaluation:
$(MAKE) -C eval
clean:
# $(MAKE) -C data/morpho-lexicon clean
$(MAKE) -C data/morpho-lexicon clean
$(MAKE) -C data/treebank clean
$(MAKE) -C maca_trans_parser clean
$(MAKE) -C maca_trans_tagger clean
......
......@@ -21,10 +21,10 @@ test_Wp: test_W
$(TAGGER) -L $(LANGUAGE) -C $(MCD_FILE) -i $< -S > $@
test_Wpl: test_Wp
$(LEMMATIZER) -C $(MCD_FILE) -i $< > $@
$(LEMMATIZER) -C $(MCD_FILE) -L $(LANGUAGE) -i $< > $@
test_WPl: test_WP
$(LEMMATIZER) -C $(MCD_FILE) -i $< > $@
$(LEMMATIZER) -C $(MCD_FILE) -L $(LANGUAGE) -i $< > $@
test_WPLgfs: test_WPL
$(PARSER) -L $(LANGUAGE) -C $(MCD_FILE) -S -i $< > $@
......
......@@ -25,9 +25,17 @@ int parse_line(FILE *f, sentence *s);
/* Reassign word ids so that they match each word's position in s->words,
 * then refresh every head field from the mother pointer.
 *
 * The two passes must stay separate: the head of a word is read from
 * mother->id, so every id has to be renumbered before any head is set.
 * A word without a mother gets head 0.
 */
void renumber_sentence(sentence *s)
{
  int pos;

  /* Pass 1: id becomes the index in the words array. */
  for(pos = 0; pos < s->l; pos++)
    s->words[pos]->id = pos;

  /* Pass 2: head becomes the (new) id of the mother, or 0 if none. */
  for(pos = 0; pos < s->l; pos++){
    word *cur = s->words[pos];
    if(cur->mother == NULL)
      cur->head = 0;
    else
      cur->head = cur->mother->id;
  }
}
......
......@@ -21,7 +21,7 @@
#include <stdio.h>
#include "hash_str.h"
#define MAX_WORDS_IN_SENTENCE 500
#define MAX_WORDS_IN_SENTENCE 700
#define MAX_STR 10000
#define MAX_LINE_LENGTH 50000
......@@ -43,7 +43,7 @@ typedef struct w
/* or identical to the coarse-grained part-of-speech tag if not available.*/
char feats[MAX_STR]; /* Unordered set of syntactic and/or morphological features (depending on the particular language)*/
/*, separated by a vertical bar (|), or an underscore if not available.*/
unsigned head; /* Head of the current token, which is either a value of ID or zero ('0').*/
int head; /* Head of the current token, which is either a value of ID or zero ('0').*/
char deprel[MAX_STR]; /* Dependency relation to the HEAD. The set of dependency relations depends on the particular language.*/
/* Note that depending on the original treebank annotation, the dependency relation may be meaningful or simply 'ROOT'.*/
unsigned phead;/* Projective head of current token, which is either a value of ID or zero ('0'), or an underscore if not available. */
......
......@@ -219,6 +219,8 @@ int main(int argc, char *argv[])
change_pos(s, op.h_pos);
change_cpos(s, op.h_pos);
renumber_sentence(s);
compute_relative_index_of_heads(s);
print_sentence_no_newline(s);
......
......@@ -18,8 +18,6 @@ typedef struct options
hash_str *h_fct;
} options;
void change_pos_fr(sentence *s, hash_str *h_pos)
{
int i;
......@@ -207,16 +205,22 @@ int main(int argc, char *argv[])
sentence *s = allocate_sentence();
int snum = 0;
int res;
int root_index;
int root_to_end = 0;
int new_root_head;
parse_options(argc, argv, &op);
print_options(&op);
for(res = load_sentence(op.fd_parses, s); res && (snum < op.snum); res = load_sentence(op.fd_parses, s)){
s->num = snum;
snum++;
if(s->l > 200) continue;
if(!sentence_ends_with_poncts(s)) continue;
if(number_of_roots_in_sentence(s) != 1) continue;
if(sentence_contains_missinghead(s)) continue;
snum++;
/* change_pos_and_cpos_of_dot(s, &op); */
change_form_and_lemma_of_numbers(s);
change_pos_fr(s, op.h_pos);
......@@ -224,6 +228,16 @@ int main(int argc, char *argv[])
retokenize_three_dots(s);
tokenize_dot(s, "titre", "poncts", "abbrev");
renumber_sentence(s);
compute_relative_index_of_heads(s);
/* root_index = get_root_index(s);
new_root_head = - root_index - root_to_end;
change_root_head(s, new_root_head);
root_to_end = s->l - root_index - 1;
*/
print_sentence_no_newline(s);
}
......
......@@ -5,6 +5,46 @@
#include "conll_lib.h"
#include "ftb_lib.h"
/* Turn the head field of each word into an offset relative to the word's
 * own position: head becomes (head - index).
 *
 * Only the words at indices 1 .. s->l - 1 are rewritten; the word at
 * index 0 is left untouched.
 */
void compute_relative_index_of_heads(sentence *s)
{
  int pos = 1;

  while(pos < s->l){
    s->words[pos]->head -= pos;
    pos++;
  }
}
/* Set the head of the first word labelled "root" to new_head_index.
 *
 * The scan starts at index 1 and stops at the first word whose deprel is
 * exactly "root"; if no such word exists the sentence is left unchanged.
 */
void change_root_head(sentence *s, int new_head_index)
{
  int pos;

  for(pos = 1; pos < s->l; pos++){
    word *cand = s->words[pos];

    if(strcmp(cand->deprel, "root") != 0)
      continue;

    /* Only the first match is updated. */
    cand->head = new_head_index;
    return;
  }
}
/* Return the index of the first word (searching from index 1) whose deprel
 * is exactly "root", or -1 when the sentence contains no such word.
 */
int get_root_index(sentence *s)
{
  int found = -1;
  int pos;

  for(pos = 1; found < 0 && pos < s->l; pos++){
    if(strcmp(s->words[pos]->deprel, "root") == 0)
      found = pos;
  }
  return found;
}
int sentence_ends_with_poncts(sentence *s)
{
word *w;
......@@ -133,10 +173,13 @@ void print_sentence_no_newline(sentence *s)
fprintf(stdout, "\t%s", w->lemma);
/* fprintf(stdout, "\t%s", w->cpostag); */
/* fprintf(stdout, "\t%s", w->feats); */
if(w->mother == NULL)
fprintf(stdout, "\t%d", w->head);
/* if(w->mother == NULL)
fprintf(stdout, "\t0");
else
fprintf(stdout, "\t%d", w->mother->id - w->id);
fprintf(stdout, "\t%d", w->mother->id - w->id);*/
/*
else{
if(strcmp(w->deprel, "root"))
......
......@@ -32,5 +32,7 @@ void change_label_of_last_dep(sentence *s);
void retokenize_three_dots(sentence *s);
void tokenize_dot(sentence *s, char *gov_postag, char *dep_postag, char *label);
void print_sentence_no_newline(sentence *s);
int get_root_index(sentence *s);
void change_root_head(sentence *s, int new_head_index);
void compute_relative_index_of_heads(sentence *s);
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment