Commit 01502e5d authored by Alexis Nasr
Browse files

added executable maca_trans_parser_conll2fann

parent cd2b4cf3
......@@ -32,14 +32,17 @@ void tag_sentence(macaon::Decoder& decoder, macaon::BinaryLexicon* lexicon, cons
}
std::cout << "\n";
} else {
std::cout << lines[i][wordField] << "\t" << tagged[i] << "\n";
for(size_t j = 0; j < lines[i].size(); j++) {
std::cout << lines[i][j] << "\t";
}
std::cout << tagged[i] << "\n";
}
}
std::cout << "\n";
}
// Print the command line synopsis on std::cerr and terminate the process
// with exit status 1.
//
// argv0: program name, inserted after "usage: ".
//
// NOTE(review): the two usage lines below look like the pre-change and the
// post-change version of the same statement displayed together by the diff
// view -- confirm which one the file really contains.
void usage(const char* argv0) {
std::cerr << "usage: " << argv0 << " [--conll07] <model> [lexicon]\n";
std::cerr << "usage: " << argv0 << " [--conll07|--column <num>] <model> [lexicon]\n";
exit(1);
}
......@@ -56,6 +59,10 @@ int main(int argc, char** argv) {
} else if(arg == "--conll07") {
isConll07 = true;
word_offset = 1;
} else if(arg == "--column" && i < argc - 1) {
arg = argv[i + 1];
word_offset = strtol(arg.c_str(), NULL, 10) - 1;
i++;
} else if(modelName == "") {
modelName = arg;
} else if(lexiconName =="") {
......@@ -64,7 +71,7 @@ int main(int argc, char** argv) {
usage(argv[0]);
}
}
if(modelName == "") usage(argv[0]);
if(modelName == "" || word_offset < 0) usage(argv[0]);
macaon::Decoder decoder(modelName);
macaon::BinaryLexicon *lexicon = NULL;
......
......@@ -33,6 +33,8 @@ context *context_new(void)
ctx->mcd_struct = NULL;
ctx->language = strdup("fr");
ctx->maca_data_path = NULL;
ctx->form_column = -1;
ctx->pos_column = -1;
return ctx;
}
......@@ -48,6 +50,16 @@ void context_general_help_message(context *ctx)
/* Write the help line describing the -i / --conll option to stderr. */
void context_conll_help_message(context *ctx)
{
  (void)ctx; /* the message does not depend on the context */
  fputs("\t-i --conll <file> : conll file name\n", stderr);
}
/* Write the help line describing the -F / --form_column option to stderr. */
void context_form_column_help_message(context *ctx)
{
  (void)ctx; /* the message does not depend on the context */
  fputs("\t-F --form_column <int> : column containing form\n", stderr);
}
/* Write the help line describing the -P / --pos_column option to stderr. */
void context_pos_column_help_message(context *ctx)
{
  (void)ctx; /* the message does not depend on the context */
  fputs("\t-P --pos_column <int> : column containing part of speech tag\n", stderr);
}
/* Write the help line describing the -f / --fplm option to stderr. */
void context_fplm_help_message(context *ctx)
{
  (void)ctx; /* the message does not depend on the context */
  fputs("\t-f --fplm <file> : fplm (form pos lemma morpho) file\n", stderr);
}
......@@ -78,12 +90,14 @@ context *context_read_options(int argc, char *argv[])
{"mcd", required_argument, 0, 'm'},
{"language", required_argument, 0, 'C'},
{"fplm", required_argument, 0, 'f'},
{"form_column", required_argument, 0, 'F'},
{"pos_column", required_argument, 0, 'P'},
{"maca_data_path", required_argument, 0, 'M'}
};
optind = 0;
opterr = 0;
while ((c = getopt_long (argc, argv, "hvdi:f:m:C:M:", long_options, &option_index)) != -1){
while ((c = getopt_long (argc, argv, "hvdi:f:m:C:M:F:P:", long_options, &option_index)) != -1){
switch (c)
{
case 'd':
......@@ -95,6 +109,12 @@ context *context_read_options(int argc, char *argv[])
case 'v':
ctx->verbose = 1;
break;
case 'F':
ctx->form_column = atoi(optarg);
break;
case 'P':
ctx->pos_column = atoi(optarg);
break;
case 'f':
ctx->fplm_filename = strdup(optarg);
break;
......@@ -116,7 +136,7 @@ context *context_read_options(int argc, char *argv[])
context_set_linguistic_resources_filenames(ctx);
if(ctx->mcd_filename == NULL)
if((ctx->mcd_filename == NULL) && ((ctx->form_column == -1) || (ctx->pos_column == -1)))
ctx->mcd_struct = mcd_build_conll07();
return ctx;
......
......@@ -19,6 +19,8 @@ typedef struct {
char *maca_data_path;
char *mcd_filename;
mcd *mcd_struct;
int form_column;
int pos_column;
} context;
......@@ -33,6 +35,7 @@ void context_language_help_message(context *ctx);
void context_fplm_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx);
void context_mcd_help_message(context *ctx);
void context_form_column_help_message(context *ctx);
void context_pos_column_help_message(context *ctx);
#endif
......@@ -17,16 +17,18 @@ void maca_lemmatizer_help_message(context *ctx)
context_language_help_message(ctx);
context_maca_data_path_help_message(ctx);
context_fplm_help_message(ctx);
context_form_column_help_message(ctx);
context_pos_column_help_message(ctx);
}
void maca_lemmatizer_check_options(context *ctx){
if(!ctx->conll_filename
if(ctx->help
/*!ctx->conll_filename*/
/* || !ctx->perc_model_filename
|| !ctx->mcd_filename
|| !ctx->vocabs_filename
|| !ctx->features_model_filename*/
|| ctx->help
){
maca_lemmatizer_help_message(ctx);
exit(1);
......@@ -93,12 +95,29 @@ int main(int argc, char *argv[])
int index_form_pos;
char **lemma_array;
context *ctx;
int form_column;
int pos_column;
FILE *f = NULL;
ctx = context_read_options(argc, argv);
maca_lemmatizer_check_options(ctx);
FILE *f = myfopen(ctx->conll_filename, "r");
if(ctx->pos_column != -1)
pos_column = ctx->pos_column;
else
pos_column = ctx->mcd_struct->type2col[FEAT_TYPE_POS];
if(ctx->form_column != -1)
form_column = ctx->form_column;
else
form_column = ctx->mcd_struct->type2col[FEAT_TYPE_FORM];
if(ctx->conll_filename == NULL)
f = stdin;
else
f = myfopen(ctx->conll_filename, "r");
lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht);
/* look for a valid word */
......@@ -116,9 +135,11 @@ int main(int argc, char *argv[])
form = NULL;
pos = NULL;
do{
if((column_nb < ctx->mcd_struct->nb_col) && (ctx->mcd_struct->type[column_nb] == FEAT_TYPE_FORM))
/* if((column_nb < ctx->mcd_struct->nb_col) && (column_nb == form_column)) */
if(column_nb == form_column)
form = strdup(token);
if((column_nb < ctx->mcd_struct->nb_col) && (ctx->mcd_struct->type[column_nb] == FEAT_TYPE_POS))
/* if((column_nb < ctx->mcd_struct->nb_col) && (column_nb == pos_column)) */
if(column_nb == pos_column)
pos = strdup(token);
column_nb++;
} while((token = strtok(NULL , "\t")));
......@@ -148,6 +169,9 @@ int main(int argc, char *argv[])
}
free(lemma_array);
hash_free(form_pos_ht);
if(ctx->conll_filename)
fclose(f);
return 0;
}
......
......@@ -27,6 +27,11 @@ add_library(transparse STATIC ${SOURCES})
#compiling, linking and installing executables
add_executable(maca_trans_parser_conll2fann ./src/maca_trans_parser_conll2fann.c)
target_link_libraries(maca_trans_parser_conll2fann transparse)
target_link_libraries(maca_trans_parser_conll2fann maca_common)
install (TARGETS maca_trans_parser_conll2fann DESTINATION bin)
add_executable(maca_trans_parser_conll2cff ./src/transform_treebank.c)
target_link_libraries(maca_trans_parser_conll2cff transparse)
target_link_libraries(maca_trans_parser_conll2cff maca_common)
......
......@@ -29,8 +29,10 @@ void config_add_next_word_to_buffer(config *c)
w = word_read(c->f, c->mcd_struct);
if(w == NULL) return;
if(word_get_index(w) == -1)
if(word_get_index(w) == -1){
w->feat_array[FEAT_TYPE_INDEX] = c->current_index++;
printf("current index = %d\n", c->current_index);
}
queue_add(c->bf, w);
}
......
......@@ -11,6 +11,10 @@
#include"feat_vec.h"
#include"global_feat_vec.h"
/* Field accessors for a parser configuration c (c must be a config pointer). */
/* the stack (field st) */
#define config_get_stack(c) (c)->st
/* the buffer (field bf) */
#define config_get_buffer(c) (c)->bf
/* the field ds */
#define config_get_depset(c) (c)->ds
typedef struct {
stack *st; /* the stack */
queue *bf; /* the buffer */
......
......@@ -273,6 +273,9 @@ context *context_read_options(int argc, char *argv[])
}
}
context_set_linguistic_resources_filenames(ctx);
if(ctx->features_model_filename){
ctx->features_model = feat_model_read(ctx->features_model_filename);
}
......@@ -286,7 +289,6 @@ context *context_read_options(int argc, char *argv[])
if(ctx->features_model && ctx->mcd_struct)
feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);
*/
context_set_linguistic_resources_filenames(ctx);
if(ctx->mcd_filename == NULL){
ctx->mcd_struct = mcd_build_conll07();
......
......@@ -30,12 +30,12 @@ void decode_help_message(context *ctx)
}
void decode_check_options(context *ctx){
if(!ctx->conll_filename
if(ctx->help
/*!ctx->conll_filename*/
/* || !ctx->perc_model_filename
|| !ctx->mcd_filename
|| !ctx->vocabs_filename
|| !ctx->features_model_filename*/
|| ctx->help
){
decode_help_message(ctx);
exit(1);
......@@ -93,8 +93,11 @@ int main(int argc, char *argv[])
}
else{*/
conll_file= myfopen(ctx->conll_filename, "r");
if(ctx->conll_filename)
conll_file= myfopen(ctx->conll_filename, "r");
else
conll_file = stdin;
if(ctx->perc_model_filename){
if(ctx->beam_width == 1){
simple_decoder(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, ctx->dico_labels, ft, ctx->features_model, ctx->verbose, root_label, ctx->stream_mode);
......
......@@ -77,7 +77,7 @@ void depset_print2(FILE *f, depset *d, dico *dico_labels)
for(i=1; i < d->length; i++){
if((d->array[i].gov) && (d->array[i].dep)){
fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label));
fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep), dico_int2string(dico_labels, d->array[i].label));
}
}
fprintf(f, "\n");
......
This diff is collapsed.
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"movement.h"
#include"oracle.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"corpus.h"
#include"word_emb.h"
#include"config2feat_vec.h"
/*
 * Print the complete help of the program: the general options first, then
 * the options grouped under the section titles INPUT, IN TEST MODE, OUTPUT
 * and IN TRAIN MODE. The section titles are written on stderr.
 */
void transform_treebank_help_message(context *ctx)
{
  /* options that do not belong to a section */
  context_general_help_message(ctx);
  context_mode_help_message(ctx);
  context_sent_nb_help_message(ctx);

  fputs("INPUT\n", stderr);
  context_conll_help_message(ctx);
  context_mcd_help_message(ctx);
  context_features_model_help_message(ctx);

  fputs("IN TEST MODE\n", stderr);
  context_alphabet_help_message(ctx);

  fputs("OUTPUT\n", stderr);
  context_cff_help_message(ctx);
  context_fann_help_message(ctx);

  /* the alphabet option is listed a second time, for train mode */
  fputs("IN TRAIN MODE\n", stderr);
  context_alphabet_help_message(ctx);
}
/*
 * Validate the command line. The options are accepted when a conll file was
 * given, help was not requested and at least one of the cff / fann output
 * files was given. Otherwise the help message is printed and the program
 * exits with status 1.
 */
void transform_treebank_check_options(context *ctx)
{
  /* the mcd file is optional, so it is not checked here */
  if(ctx->conll_filename
     && !ctx->help
     && (ctx->cff_filename || ctx->fann_filename))
    return;

  transform_treebank_help_message(ctx);
  exit(1);
}
/*
 * Write the movement mvt_code on f as one line of space separated 0/1
 * values: position mvt_code holds 1, every other position holds 0.
 *
 * f        : stream the line is written to
 * mvt_nb   : number of values on the line (the first value is written even
 *            when mvt_nb is smaller than 1)
 * mvt_code : position of the 1; when it is outside [0, mvt_nb - 1] the line
 *            contains only zeros
 */
void print_mvt_fann(FILE *f, int mvt_nb, int mvt_code)
{
  int pos;

  /* first value, written without a leading separator */
  fputc((mvt_code == 0) ? '1' : '0', f);

  /* remaining values, each one preceded by a space */
  for(pos = 1; pos < mvt_nb; pos++)
    fputs((pos == mvt_code) ? " 1" : " 0", f);

  fputc('\n', f);
}
/*
 * Generate the training examples of the corpus ctx->conll_filename.
 *
 * The corpus is opened twice: one handle feeds the reference sentences given
 * to the oracle, the other one fills the buffer of the configuration. For
 * every configuration met while following the oracle, one example (feature
 * vector + movement) is written on output_file.
 *
 * The output format is chosen once and used both for building and for
 * printing the feature vector: fann when ctx->fann_filename is set, cff
 * otherwise (when ctx->cff_filename is set).
 *
 * output_file : stream the examples are written to
 * ctx         : options and linguistic resources; at most ctx->sent_nb
 *               sentences are processed
 *
 * Returns the number of examples (transitions) generated.
 *
 * NOTE(review): fv and the reference sentences are not released here, no
 * matching release function is visible from this file -- confirm.
 */
int generate_training_file_buffer(FILE *output_file, context *ctx)
{
  config *c;
  int mvt_code;
  char mvt_type;
  int mvt_label;
  feat_vec *fv = feat_vec_new(feature_types_nb);
  sentence *ref = NULL;
  int nb_trans = 0;
  int sentence_nb = 0;
  /* fann has priority when both output filenames are given */
  int fann_output = (ctx->fann_filename != NULL);
  FILE *conll_file = myfopen(ctx->conll_filename, "r");
  FILE *conll_file_ref = myfopen(ctx->conll_filename, "r");

  c = config_initial(conll_file, ctx->mcd_struct, 1000, 0);

  /* the limit is tested first, so that no sentence is read past it */
  while((sentence_nb < ctx->sent_nb) && (ref = sentence_read(conll_file_ref, ctx->mcd_struct))){
    /* sentence_print(stdout, ref, NULL); */
    queue_read_sentence(c->bf, conll_file, ctx->mcd_struct);
    while(!config_is_terminal(c)){
      /* config_print(stdout,c); */

      /* describe the current configuration */
      if(fann_output)
        config2feat_vec_fann(ctx->features_model, c, fv, ctx->mode);
      else
        config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);

      /* ask the oracle which movement leads to the reference */
      mvt_code = oracle(c, ref);
      nb_trans++;
      mvt_type = movement_type(mvt_code);
      mvt_label = movement_label(mvt_code);
      /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */

      /* write the example, in the same format the features were built for */
      if(fann_output){
        feat_vec_print_dnn(output_file, fv, ctx->features_model, ctx->mcd_struct);
        print_mvt_fann(output_file, ctx->mvt_nb, mvt_code);
        fprintf(output_file, "\n\n");
      }
      else if(ctx->cff_filename){
        fprintf(output_file, "%d", mvt_code);
        feat_vec_print(output_file, fv);
      }

      /* apply the movement to reach the next configuration */
      if(mvt_type == MVT_LEFT){
        movement_left_arc(c, mvt_label, 0);
      }
      else if(mvt_type == MVT_RIGHT){
        movement_right_arc(c, mvt_label, 0);
      }
      else if(mvt_type == MVT_SHIFT){
        movement_shift(c, 0, 0);
      }
      else{
        /* the configuration cannot change any more: give up on this
           sentence instead of looping forever */
        fprintf(stderr, "unknown movement type %d, sentence skipped\n", mvt_type);
        break;
      }
    }
    config_free(c);
    c = config_initial(conll_file, ctx->mcd_struct, 1000, 0);
    sentence_nb++;
  }

  /* release the last configuration and both handles on the corpus */
  config_free(c);
  fclose(conll_file);
  fclose(conll_file_ref);
  return nb_trans;
}
int main(int argc, char *argv[])
{
context *ctx;
int input_size, output_size;
int nb_trans = 0;
FILE *output_file;
ctx = context_read_options(argc, argv);
transform_treebank_check_options(ctx);
if(ctx->mode == TRAIN_MODE){
mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename);
ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
}
else if(ctx->mode == TEST_MODE){
ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs);
}
ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
if(ctx->dico_labels == NULL){
fprintf(stderr, "cannot find label names\n");
return 1;
}
ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1;
feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);
input_size = ctx->features_model->dim;
output_size = ctx->mvt_nb;
fprintf(stderr, "input size = %d\n", input_size);
fprintf(stderr, "output size = %d\n", output_size);
/* open output file */
output_file = myfopen(ctx->fann_filename, "w");
/* spaces are here to leave room to write number of lines in the file when it will be known */
fprintf(output_file, " %d %d\n", input_size, output_size);
nb_trans = generate_training_file_buffer(output_file, ctx);
rewind(output_file);
fprintf(output_file, "%d", nb_trans);
fclose(output_file);
context_free(ctx);
return 0;
}
k
......@@ -17,12 +17,16 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct)
{
char buffer[10000];
word *w = NULL;
int index = 1;
while(fgets(buffer, 10000, f)){
if(feof(f)) break;
/* fprintf(stderr, "%s", buffer); */
/* fprintf(stderr, "%s", buffer); */
if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence */
w = word_parse_buffer(buffer, mcd_struct);
if(word_get_index(w) == -1){
w->feat_array[FEAT_TYPE_INDEX] = index++;
}
queue_add(bf, w);
}
return bf->nbelem - 1; /* because of the dummy word */
......
......@@ -59,7 +59,7 @@ void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico *
config_connect_subtrees(c, root_label);
depset_print2(stdout, c->ds, dico_labels);
config_free(c);
/* config_free(c); */
c = config_initial(f, mcd_struct, 1000, 0);
}
}
......
......@@ -5,6 +5,7 @@
#include"word.h"
/* stack_height and stack_nbelem are synonyms: both read the top field of s */
#define stack_height(s) (s)->top
#define stack_nbelem(s) (s)->top
/* element number n counted from the top: n = 0 is array[top - 1] */
#define stack_elt_n(s, n) (s)->array[(s)->top - (n) - 1]
typedef struct {
......
......@@ -59,15 +59,6 @@ void print_mvt_fann(FILE *f, int mvt_nb, int mvt_code)
}
fprintf(f, "\n");
}
/*
 * Tell whether one of the first s->size slots of the array of s holds a
 * word whose index is 1.
 *
 * Returns 1 when such a word is found, 0 otherwise.
 *
 * NOTE(review): the scan is bounded by s->size, not by s->top -- confirm
 * that every slot below size holds a valid word.
 */
int word_one_is_in_the_stack(stack *s)
{
  int pos = 0;

  while(pos < s->size){
    if(word_get_index(s->array[pos]) == 1)
      return 1;
    pos++;
  }
  return 0;
}
int generate_training_file_stream(FILE *output_file, context *ctx)
{
......@@ -122,7 +113,6 @@ int generate_training_file_stream(FILE *output_file, context *ctx)
fprintf(output_file, "\n\n");
}
s0 = stack_top(c->st);
if(s0)s0_index = word_get_index(s0);
......@@ -132,53 +122,25 @@ int generate_training_file_stream(FILE *output_file, context *ctx)
if(queue_is_empty(c->bf)) break;
#if 0
if((b0_index == 1) && ((premier_mot_consomme == 1) || word_one_is_in_the_stack(c->st))){
printf("end of sentence\n");
printf("config : ");
config_print(stdout,c);
if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */
/* printf("sentence complete config : ");
config_print(stdout,c); */
/* printf("sentence is complete\n"); */
/* create the root arc */
movement_right_arc(c, mvt_label, 0);
/* shift dummy word in stack */
movement_shift(c, 1, 0);
/* config_print(stdout,c); */
/* empty depset */
depset_free(c->ds);
c->ds = depset_new();
/* sentence_free(ref); */
sentence_nb++;
premier_mot_consomme = 0;
/* create a new stack */
stack_free(c->st);
c->st = stack_new();
/* push a dummy word */
stack_push(c->st, word_create_dummy(ctx->mcd_struct));
break;
}
#endif
#if 1
if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */
/* printf("sentence complete config : ");
config_print(stdout,c); */
/* printf("sentence is complete\n"); */
/* create the root arc */
movement_right_arc(c, mvt_label, 0);
/* shift dummy word in stack */
movement_shift(c, 1, 0);
/* config_print(stdout,c); */
/* empty depset */
depset_free(c->ds);
c->ds = depset_new();
/* sentence_free(ref); */
sentence_nb++;
break;
}
#endif
if(mvt_type == MVT_LEFT){
/* printf("LEFT\n"); */
......@@ -218,7 +180,7 @@ int generate_training_file_buffer(FILE *output_file, context *ctx)
c = config_initial(conll_file, ctx->mcd_struct, 1000, 0);
while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){
/* sentence_print(stdout, ref, NULL); */