Skip to content
Snippets Groups Projects
Commit 5591defd authored by Franck Dary
Browse files

tape_alphabets is now created using a TM

parent aef0880f
Branches
No related tags found
No related merge requests found
...@@ -107,8 +107,8 @@ word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct) ...@@ -107,8 +107,8 @@ word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct)
while(word_buffer_read_next_word(wb)){ while(word_buffer_read_next_word(wb)){
/* printf("load word %d\n", wb->nbelem - 1); */ /* printf("load word %d\n", wb->nbelem - 1); */
} }
//if(mcf_filename != NULL) if(mcf_filename != NULL)
// fclose(f); fclose(f);
return wb; return wb;
} }
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
#include<string.h> #include<string.h>
#include<unistd.h> #include<unistd.h>
#include<getopt.h> #include<getopt.h>
/* #include"movement_parser_arc_eager.h" */
#include"movements.h" #include"movements.h"
#include"oracle_parser_arc_eager.h" #include"oracle_parser_arc_eager.h"
#include"oracle_tagger.h" #include"oracle_tagger.h"
...@@ -15,7 +14,6 @@ ...@@ -15,7 +14,6 @@
#include"feat_types.h" #include"feat_types.h"
#include"dico.h" #include"dico.h"
#include"dico_vec.h" #include"dico_vec.h"
#include"word_emb.h"
#include"config2feat_vec.h" #include"config2feat_vec.h"
#include"classifier.h" #include"classifier.h"
...@@ -44,7 +42,7 @@ void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p) ...@@ -44,7 +42,7 @@ void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
} }
} }
void maca_tm_mcf2scf_help_message(context *ctx) void maca_tm_extract_tape_alphabets_help_message(context *ctx)
{ {
context_general_help_message(ctx); context_general_help_message(ctx);
context_mode_help_message(ctx); context_mode_help_message(ctx);
...@@ -52,7 +50,6 @@ void maca_tm_mcf2scf_help_message(context *ctx) ...@@ -52,7 +50,6 @@ void maca_tm_mcf2scf_help_message(context *ctx)
context_tm_help_message(ctx); context_tm_help_message(ctx);
fprintf(stderr, "INPUT\n"); fprintf(stderr, "INPUT\n");
context_conll_help_message(ctx); context_conll_help_message(ctx);
fprintf(stderr, "IN TEST MODE\n"); fprintf(stderr, "IN TEST MODE\n");
...@@ -62,22 +59,20 @@ void maca_tm_mcf2scf_help_message(context *ctx) ...@@ -62,22 +59,20 @@ void maca_tm_mcf2scf_help_message(context *ctx)
context_cff_help_message(ctx); context_cff_help_message(ctx);
fprintf(stderr, "IN TRAIN MODE\n"); fprintf(stderr, "IN TRAIN MODE\n");
context_vocabs_help_message(ctx); context_vocabs_help_message(ctx);
} }
void maca_tm_mcf2scf_check_options(context *ctx) void maca_tm_extract_tape_alphabets_check_options(context *ctx)
{ {
if(!ctx->input_filename if(!ctx->input_filename
|| ctx->help || ctx->help
/* || !ctx->mcd_filename */ || !ctx->mcd_filename
/* || !(ctx->cff_filename || ctx->fann_filename) */
){ ){
maca_tm_mcf2scf_help_message(ctx); maca_tm_extract_tape_alphabets_help_message(ctx);
exit(1); exit(1);
} }
} }
void generate_scf_file(context *ctx) void generate_tape_alphabets(context *ctx)
{ {
config *c; config *c;
int mvt_code; int mvt_code;
...@@ -123,18 +118,22 @@ void generate_scf_file(context *ctx) ...@@ -123,18 +118,22 @@ void generate_scf_file(context *ctx)
dico *d_lemma = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"LEMMA"); dico *d_lemma = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"LEMMA");
dico *d_rules = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"d_rules"); dico *d_rules = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"d_rules");
dico *d_pos = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"POS"); dico *d_pos = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"POS");
int word_nb = 0;
while(!word_buffer_end(config_get_buffer(c)) && (sentence_nb < ctx->sent_nb)){ while(!word_buffer_end(config_get_buffer(c)) && (sentence_nb < ctx->sent_nb)){
current_state = machine->state_array[c->current_state_nb]; current_state = machine->state_array[c->current_state_nb];
classif = current_state->classif; classif = current_state->classif;
mvt_tagset *tagset = classifier_get_output_tagset(classif); mvt_tagset *tagset = classifier_get_output_tagset(classif);
mvt_tagset_update(tagset); /*
fprintf(stderr, "Classif=%s\n", classif->name); fprintf(stderr, "Classif=%s\n", classif->name);
for(tm_transition *trans = current_state->trans_list; trans; trans = trans->next)
fprintf(stderr, "%d %d\n", trans->label, trans->destination);
if (!classif){ if (!classif){
fprintf(stderr, "ERROR %s : classifier is NULL\n", __func__); fprintf(stderr, "ERROR %s : classifier is NULL\n", __func__);
exit(1); exit(1);
} }
*/
if(ctx->f2p) if(ctx->f2p)
add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);
...@@ -178,7 +177,10 @@ void generate_scf_file(context *ctx) ...@@ -178,7 +177,10 @@ void generate_scf_file(context *ctx)
mvt_type = mvt_tagset_get_type(tagset, mvt_code); mvt_type = mvt_tagset_get_type(tagset, mvt_code);
if(classif->type == classifier::Type::Classifier) if(classif->type == classifier::Type::Classifier)
{
config2feat_vec_fann(classif->fm, c, classif->d_features, classif->fv, ctx->mode); config2feat_vec_fann(classif->fm, c, classif->d_features, classif->fv, ctx->mode);
feat_vec_add_values_to_dicos(classif->fv, classif->fm, machine->d_tapes, ctx->mcd_struct);
}
if(ctx->debug_mode){ if(ctx->debug_mode){
config_print(stdout,c); config_print(stdout,c);
...@@ -191,13 +193,15 @@ void generate_scf_file(context *ctx) ...@@ -191,13 +193,15 @@ void generate_scf_file(context *ctx)
/* advance head in ref word buffer */ /* advance head in ref word buffer */
if((mvt_type == MVT_RIGHT) || (mvt_type == MVT_SHIFT) || (mvt_type == MVT_FWD)){ if((mvt_type == MVT_RIGHT) || (mvt_type == MVT_SHIFT) || (mvt_type == MVT_FWD)){
word_buffer_move_right(ref); word_buffer_move_right(ref);
if((++word_nb % 1000) == 0)
fprintf(stderr, "\rword %d", word_nb);
} }
else if (mvt_type == MVT_BKWD){ else if (mvt_type == MVT_BKWD){
word_buffer_move_left(ref); word_buffer_move_left(ref);
} }
if(mvt_type == MVT_EOS) if(mvt_type == MVT_EOS)
if((++sentence_nb % 100) == 0) fprintf(stderr, "\rsentence %d", sentence_nb); sentence_nb++;
} }
fprintf(stderr, "\n"); fprintf(stderr, "\n");
...@@ -208,7 +212,7 @@ void generate_scf_file(context *ctx) ...@@ -208,7 +212,7 @@ void generate_scf_file(context *ctx)
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
context *ctx = context_read_options(argc, argv); context *ctx = context_read_options(argc, argv);
maca_tm_mcf2scf_check_options(ctx); maca_tm_extract_tape_alphabets_check_options(ctx);
ctx->vocabs = dico_vec_read(ctx->vocabs_filename, 0.5); ctx->vocabs = dico_vec_read(ctx->vocabs_filename, 0.5);
for(unsigned int i = 0; i <= FEAT_TYPE_NB; i++){ for(unsigned int i = 0; i <= FEAT_TYPE_NB; i++){
...@@ -217,6 +221,11 @@ int main(int argc, char *argv[]) ...@@ -217,6 +221,11 @@ int main(int argc, char *argv[])
dico_vec_add(ctx->vocabs, dico_new(name,1000)); dico_vec_add(ctx->vocabs, dico_new(name,1000));
} }
mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
dico_set_add_unknown_strings();
word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct);
word_buffer_free(ref);
dico_vec_print(ctx->vocabs_filename, ctx->vocabs); dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
context_free(ctx); context_free(ctx);
...@@ -226,8 +235,7 @@ int main(int argc, char *argv[]) ...@@ -226,8 +235,7 @@ int main(int argc, char *argv[])
ctx->machine = machine; ctx->machine = machine;
mcd_link_to_dico(ctx->mcd_struct, machine->d_tapes, ctx->verbose); mcd_link_to_dico(ctx->mcd_struct, machine->d_tapes, ctx->verbose);
dico_set_add_unknown_strings(); generate_tape_alphabets(ctx);
generate_scf_file(ctx);
dico_unset_add_unknown_strings(); dico_unset_add_unknown_strings();
tm_save_d_tapes(machine); tm_save_d_tapes(machine);
......
...@@ -234,9 +234,8 @@ int main(int argc, char *argv[]) ...@@ -234,9 +234,8 @@ int main(int argc, char *argv[])
classifier_set_d_features(classif, dico_new(string, 1000000)); classifier_set_d_features(classif, dico_new(string, 1000000));
} }
dico_set_add_unknown_strings();
generate_scf_file(ctx);
dico_unset_add_unknown_strings(); dico_unset_add_unknown_strings();
generate_scf_file(ctx);
/* in train mode print all feature dictionnaries that have been created as well as classifiers descriptions */ /* in train mode print all feature dictionnaries that have been created as well as classifiers descriptions */
if(ctx->mode == TRAIN_MODE){ if(ctx->mode == TRAIN_MODE){
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment