Skip to content
Snippets Groups Projects
Commit aef0880f authored by Franck Dary's avatar Franck Dary
Browse files

Added a new executable to create tape_alphabets.dic

parent d0ca3bf0
No related branches found
No related tags found
No related merge requests found
......@@ -110,6 +110,13 @@ target_link_libraries(maca_extract_tape_alphabets transparse)
target_link_libraries(maca_extract_tape_alphabets maca_common)
install (TARGETS maca_extract_tape_alphabets DESTINATION bin)
# maca_tm_extract_tape_alphabets: executable added to create tape_alphabets.dic.
# Built from a single source file, linked against the project libraries listed
# below (order kept as in the original per-library calls), installed into bin.
add_executable(maca_tm_extract_tape_alphabets ./src/maca_tm_extract_tape_alphabets.c)
target_link_libraries(maca_tm_extract_tape_alphabets
  perceptron
  mlp
  transparse
  maca_common)
install(TARGETS maca_tm_extract_tape_alphabets DESTINATION bin)
#=======
#add_executable(maca_trans_chunker_mcf2cff ./src/maca_trans_chunker_mcf2cff.c)
#target_link_libraries(maca_trans_chunker_mcf2cff perceptron)
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
/* #include"movement_parser_arc_eager.h" */
#include"movements.h"
#include"oracle_parser_arc_eager.h"
#include"oracle_tagger.h"
#include"oracle_lemmatizer_rules.h"
#include"oracle_lemmatizer_lookup.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"feat_types.h"
#include"dico.h"
#include"dico_vec.h"
#include"word_emb.h"
#include"config2feat_vec.h"
#include"classifier.h"
/*
 * Oracle for the MORPHO classifier.
 *
 * Reads the morphological features stored on the b0 word of the
 * configuration's buffer and returns the code that `tagset` associates
 * with an MVT_MORPHO movement carrying those features.
 */
int oracle_morpho(config *c, mvt_tagset *tagset)
{
  const int feats = word_get_feats(word_buffer_b0(config_get_buffer(c)));

  return mvt_tagset_get_code(tagset, MVT_MORPHO, feats);
}
/*
 * Fill in the `signature` field of the words of `bf` that do not have one yet.
 *
 * The buffer is scanned from its last element towards the first and the scan
 * stops at the first word whose signature is already set (different from -1).
 * For each visited word the signature is looked up in `f2p` using the form as
 * is; if that lookup yields -1, a second lookup is made with a lower-cased
 * copy of the form.
 *
 * The lower-cased copy lives in a heap buffer sized from the form itself, so
 * forms of any length are handled (the previous fixed 100-byte stack buffer
 * overflowed on long forms). The buffer is never smaller than 100 bytes so
 * that to_lower_string() keeps at least the room it had before.
 */
void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
{
  int i;

  for(i = word_buffer_get_nbelem(bf) - 1; i >= 0; i--){
    word *w = word_buffer_get_word_n(bf, i);

    /* words before this one are assumed to be already processed */
    if(word_get_signature(w) != -1) break;

    w->signature = form2pos_get_signature(f2p, w->form);
    if(w->signature == -1){
      size_t form_size = strlen(w->form) + 1; /* includes the terminating NUL */
      size_t buf_size = (form_size < 100) ? 100 : form_size;
      char *lower_form = (char *)malloc(buf_size);

      if(lower_form == NULL){
        fprintf(stderr, "ERROR %s : memory allocation failed\n", __func__);
        exit(1);
      }
      memcpy(lower_form, w->form, form_size);
      to_lower_string(lower_form);
      w->signature = form2pos_get_signature(f2p, lower_form);
      free(lower_form);
    }
  }
}
/*
 * Print the usage message of the program.
 *
 * Section titles are written to stderr; the description of each option group
 * is delegated to the matching context_*_help_message() helper. The order of
 * the calls is the order in which the sections are emitted.
 */
void maca_tm_mcf2scf_help_message(context *ctx)
{
  /* general options */
  context_general_help_message(ctx);
  context_mode_help_message(ctx);
  context_sent_nb_help_message(ctx);
  context_tm_help_message(ctx);

  /* input section */
  fputs("INPUT\n", stderr);
  context_conll_help_message(ctx);

  fputs("IN TEST MODE\n", stderr);
  context_vocabs_help_message(ctx);

  /* output section */
  fputs("OUTPUT\n", stderr);
  context_cff_help_message(ctx);

  fputs("IN TRAIN MODE\n", stderr);
  context_vocabs_help_message(ctx);
}
/*
 * Validate the command line options held in `ctx`.
 *
 * The options are accepted when an input file name is present and help was
 * not requested. Otherwise the help message is printed and the program
 * terminates with exit status 1.
 *
 * Checks on ctx->mcd_filename and on ctx->cff_filename / ctx->fann_filename
 * existed as commented-out conditions and remain disabled.
 */
void maca_tm_mcf2scf_check_options(context *ctx)
{
  if(ctx->input_filename && !ctx->help)
    return;

  maca_tm_mcf2scf_help_message(ctx);
  exit(1);
}
/*
 * Replay the oracle of the transition machine over the reference corpus.
 *
 * The input mcf file (ctx->input_filename) is read twice: once entirely into
 * `ref`, using the full column description ctx->mcd_struct, and once
 * incrementally through the configuration `c`, using a copy of the column
 * description from which the GOV, LABEL and SENT_SEG columns were removed.
 *
 * At each step the classifier attached to the current state of the machine
 * selects which oracle computes the next movement; the movement is applied
 * to the configuration and the head of `ref` is moved accordingly. The loop
 * ends when the buffer of the configuration is exhausted or when
 * ctx->sent_nb sentences have been processed.
 *
 * Parameters:
 *   ctx  program context; the fields read here are input_filename,
 *        mcd_struct, machine, cff_filename, verbose, root_label, sent_nb,
 *        f2p, mode and debug_mode.
 *
 * Exits with status 1 when the current state has no classifier, when the
 * classifier has an unknown type, or when no oracle matches its oracle name.
 *
 * NOTE(review): the output file is opened and closed but nothing in this
 * function writes to it; presumably the useful effect is the population of
 * the dictionaries touched by the oracles and by config2feat_vec_fann() --
 * confirm against the caller.
 */
void generate_scf_file(context *ctx)
{
  config *c;
  int mvt_code;
  int sentence_nb = 0;
  int root_label = 0;
  word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct);
  FILE *mcf_file = myfopen(ctx->input_filename, "r");
  Tm *machine = ctx->machine;
  tm_state *current_state = NULL;
  int mvt_type;
  dico *d_synt_labels;
  classifier *classif = NULL;
  FILE *output_file;

  /* create an mcd that corresponds to ctx->mcd_struct, but without gov and label */
  /* the idea is to ignore syntax in the mcf file that will be read */
  /* it is ugly !!! */
  mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct);
  /*mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_POS); */
  mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV);
  mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL);
  mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_SENT_SEG);

  /* open output file */
  output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w", ctx->verbose) : stdout;

  d_synt_labels = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"LABEL");
  if(d_synt_labels == NULL){
    /* not fatal: root_label keeps its default value 0 */
    fprintf(stderr, "cannot find syntactic label alphabet in transition machine tape dictionaries\n");
  }
  else{
    root_label = dico_string2int(d_synt_labels, (char *) ctx->root_label);
  }

  /* c = config_new(mcf_file, ctx->mcd_struct, 5); */
  c = config_new(mcf_file, mcd_struct_hyp, 5);
  /* c = config_new_load_all_mcf(ctx->input_filename, ctx->mcd_struct); */

  dico *d_form = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"FORM");
  dico *d_lemma = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"LEMMA");
  dico *d_rules = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"d_rules");
  dico *d_pos = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"POS");

  while(!word_buffer_end(config_get_buffer(c)) && (sentence_nb < ctx->sent_nb)){
    current_state = machine->state_array[c->current_state_nb];
    classif = current_state->classif;

    /* must be checked before the classifier is dereferenced below */
    if (!classif){
      fprintf(stderr, "ERROR %s : classifier is NULL\n", __func__);
      exit(1);
    }

    /* the label dictionaries may have grown since the last step */
    mvt_tagset *tagset = classifier_get_output_tagset(classif);
    mvt_tagset_update(tagset);

    fprintf(stderr, "Classif=%s\n", classif->name);

    if(ctx->f2p)
      add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);

    /* choose the oracle from the classifier type and its oracle name */
    if(classif->type == classifier::Type::Classifier){
      if(!strcmp("PARSER", classifier_get_oracle_name(classif))){
        mvt_code = oracle_parser_arc_eager(c, ref, root_label, tagset);
      }
      else if(!strcmp("MORPHO", classifier_get_oracle_name(classif))){
        mvt_code = oracle_morpho(c, tagset);
      }
      else if(!strcmp("TAGGER", classifier_get_oracle_name(classif))){
        mvt_code = oracle_tagger(c, tagset);
      }
      else if(!strcmp("LEMMATIZER_RULES", classifier_get_oracle_name(classif))){
        mvt_code = oracle_lemmatizer_rules(c, tagset, d_form, d_lemma, d_rules);
      }
      else{
        fprintf(stderr, "do not know which oracle to use for state %s, oracle_name = %s\n", current_state->name, classifier_get_oracle_name(classif));
        exit(1);
      }
    }
    else if(classif->type == classifier::Type::Forced){
      mvt_code = 0;
    }
    else if(classif->type == classifier::Type::Lookup){
      if(!strcmp("LEMMATIZER_LOOKUP", classifier_get_oracle_name(classif))){
        mvt_code = oracle_lemmatizer_lookup(c, tagset, d_form, d_lemma, d_pos, classif->fplm);
      }
      else{
        fprintf(stderr, "do not know which oracle to use for state %s, oracle_name = %s\n", current_state->name, classifier_get_oracle_name(classif));
        exit(1);
      }
    }
    else{
      fprintf(stderr, "ERROR %s : classifier '%s' wrong type '%s'\n", __func__, classif->name, type2string(classif->type));
      exit(1);
    }

    mvt_type = mvt_tagset_get_type(tagset, mvt_code);

    if(classif->type == classifier::Type::Classifier)
      config2feat_vec_fann(classif->fm, c, classif->d_features, classif->fv, ctx->mode);

    if(ctx->debug_mode){
      config_print(stdout,c);
      mvt_tagset_print_mvt(stdout, tagset, mvt_code);
      fprintf(stdout, "\n");
    }

    movement_apply(c, mvt_code, tagset, root_label, machine);

    /* advance head in ref word buffer */
    if((mvt_type == MVT_RIGHT) || (mvt_type == MVT_SHIFT) || (mvt_type == MVT_FWD)){
      word_buffer_move_right(ref);
    }
    else if (mvt_type == MVT_BKWD){
      word_buffer_move_left(ref);
    }

    /* progress report every 100 sentences */
    if(mvt_type == MVT_EOS){
      if((++sentence_nb % 100) == 0) fprintf(stderr, "\rsentence %d", sentence_nb);
    }
  }
  fprintf(stderr, "\n");

  /* the configuration is not used past this point, its input stream can go */
  if(mcf_file)
    fclose(mcf_file);

  /* stdout is left open; the file may be NULL, passing NULL to fclose is undefined */
  if(ctx->cff_filename && output_file)
    fclose(output_file);
}
/*
 * Entry point.
 *
 * Pass 1: read the vocabularies file, add an empty dictionary for every
 * feature type name that has none yet, and write the file back.
 * Pass 2: re-read the options into a fresh context, load the transition
 * machine, replay the oracle over the input corpus with the dictionaries
 * set to accept unknown strings, then save the tape dictionaries of the
 * machine.
 */
int main(int argc, char *argv[])
{
  context *ctx = context_read_options(argc, argv);
  maca_tm_mcf2scf_check_options(ctx);

  /* pass 1: complete the vocabularies file */
  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, 0.5);

  /* NOTE(review): the bound is inclusive (<= FEAT_TYPE_NB); confirm that
     feat_type_int2string() is defined for FEAT_TYPE_NB itself */
  unsigned int type_idx = 0;
  while(type_idx <= FEAT_TYPE_NB){
    char *type_name = (char*)feat_type_int2string(type_idx);
    if(dico_vec_get_dico(ctx->vocabs, type_name) == NULL)
      dico_vec_add(ctx->vocabs, dico_new(type_name,1000));
    type_idx++;
  }

  dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
  context_free(ctx);

  /* pass 2: start again from a fresh context built from the same arguments */
  ctx = context_read_options(argc, argv);

  Tm *machine = tm_load(ctx->tm_filename, ctx->maca_data_path, ctx->verbose);
  ctx->machine = machine;
  mcd_link_to_dico(ctx->mcd_struct, machine->d_tapes, ctx->verbose);

  /* unknown strings are added to the dictionaries only while the oracle runs */
  dico_set_add_unknown_strings();
  generate_scf_file(ctx);
  dico_unset_add_unknown_strings();

  tm_save_d_tapes(machine);

  context_free(ctx);
  return 0;
}
......@@ -18,6 +18,23 @@ mvt_tagset *mvt_tagset_new(char *name, dico *d_labels)
return t;
}
/*
 * Release the memory of the tagset structure itself.
 * Only `t` is passed to free(); the objects it points to (name, d_labels)
 * are not released here.
 */
void mvt_tagset_free(mvt_tagset *t)
{
free(t);
}
/*
 * Rebuild the tagset `t` in place from its label dictionary.
 *
 * A fresh tagset is built by calling t->create_itself on t->d_labels, its
 * content is copied over `t`, and the temporary structure is released. Call
 * this after t->d_labels has changed.
 *
 * A tagset whose create_itself is NULL cannot be rebuilt: an error message
 * is printed on stderr and `t` is left untouched. The function returns
 * instead of calling through the NULL pointer.
 */
void mvt_tagset_update(mvt_tagset *t)
{
  if(!t->create_itself)
  {
    fprintf(stderr, "ERROR (%s) : tagset '%s' cannot be updated, 'create_itself' is NULL\n", __func__, t->name);
    return;
  }

  mvt_tagset *new_tagset = t->create_itself(t->d_labels);
  if(!new_tagset)
  {
    fprintf(stderr, "ERROR (%s) : tagset '%s' cannot be updated, rebuilding it failed\n", __func__, t->name);
    return;
  }

  /* shallow copy: `t` takes over the pointers held by the new tagset, */
  /* so only the temporary structure itself is released afterwards */
  memcpy(t, new_tagset, sizeof *t);
  mvt_tagset_free(new_tagset);
}
int mvt_tagset_get_label(mvt_tagset *t, int code)
{
return code - t->start[mvt_tagset_get_type(t, code)];
......@@ -156,6 +173,9 @@ mvt_tagset *mvt_tagset_parser(dico *d_labels)
t->end[MVT_RIGHT] = t->nbelem;
//mvt_tagset_print(stderr, t);
t->create_itself = &mvt_tagset_parser;
return t;
}
......@@ -168,6 +188,9 @@ mvt_tagset *mvt_tagset_tagger(dico *d_labels)
t->end[MVT_POS] = t->nbelem - 1;
//mvt_tagset_print(stderr, t);
t->create_itself = &mvt_tagset_tagger;
return t;
}
......@@ -180,6 +203,9 @@ mvt_tagset *mvt_tagset_morpho(dico *d_labels)
t->end[MVT_MORPHO] = t->nbelem - 1;
//mvt_tagset_print(stderr, t);
t->create_itself = &mvt_tagset_morpho;
return t;
}
......@@ -192,6 +218,9 @@ mvt_tagset *mvt_tagset_lemmatizer_rules(dico *d_labels)
t->end[MVT_LEMMATIZER_RULES] = t->nbelem - 1;
//mvt_tagset_print(stderr, t);
t->create_itself = &mvt_tagset_lemmatizer_rules;
return t;
}
......@@ -201,6 +230,9 @@ mvt_tagset *mvt_tagset_backtracker(dico *d_labels)
t->start[MVT_BKWD] = t->end[MVT_BKWD] = t->nbelem++;
//mvt_tagset_print(stderr, t);
t->create_itself = &mvt_tagset_backtracker;
return t;
}
......@@ -210,6 +242,9 @@ mvt_tagset *mvt_tagset_forward(dico *d_labels)
t->start[MVT_FWD] = t->end[MVT_FWD] = t->nbelem++;
//mvt_tagset_print(stderr, t);
t->create_itself = &mvt_tagset_forward;
return t;
}
......@@ -222,6 +257,9 @@ mvt_tagset *mvt_tagset_lookup(dico *d_labels)
t->end[MVT_LOOKUP_FOUND] = t->nbelem - 1;
//mvt_tagset_print(stderr, t);
t->create_itself = &mvt_tagset_lookup;
return t;
}
......@@ -231,5 +269,8 @@ mvt_tagset *mvt_tagset_std(void)
t->start[MVT_FWD] = t->end[MVT_FWD] = t->nbelem++;
//mvt_tagset_print(stderr, t);
t->create_itself = NULL;
return t;
}
......@@ -20,16 +20,20 @@
#include"dico.h"
typedef struct{
struct mvt_tagset{
char *name;
dico *d_labels;
int nbelem;
/* int array[MVT_TYPES_NB]; */
int start[MVT_TYPES_NB];
int end[MVT_TYPES_NB];
}mvt_tagset;
mvt_tagset * (*create_itself)(dico *); //The function used to create this instance, usefull for updates
};
mvt_tagset *mvt_tagset_new(char *name, dico *d_labels);
void mvt_tagset_free(mvt_tagset *t);
void mvt_tagset_update(mvt_tagset *t); //If d_labels change you need to call this
int mvt_tagset_get_label(mvt_tagset *t, int code);
int mvt_tagset_get_type(mvt_tagset *t, int code);
int mvt_tagset_get_code(mvt_tagset *t, int type, int label);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment