Add the mcf2conll tool; read the fplm lexicon line by line in maca_lemmatizer.

Review changes made to this patch:
 * maca_lemmatizer.c: the verbose message had been inserted between a
   brace-less "else" and its statement, which made "lemma = form;"
   unconditional; the else branch is now braced.
 * maca_lemmatizer.c: a rejected fplm line is reported from the raw buffer,
   because sscanf() may not have filled form, pos and lemma.
 * mcf2conll.c: long_options[] ends with the all zero entry that
   getopt_long() requires.
 * mcf2conll.c: the word index is written to the output file (it went to
   stdout), absent columns are detected with "!= -1" for gov and sent_seg,
   every line has ten columns, the output file and the first word are checked.
 * mcf2conll.c: str_print_col_n() no longer copies the end of line.
 * maca_trans_parser_mcf2cff.c is no longer touched: the hunks only added
   blank lines and "#if 0" blocks holding merge conflict residue.
 * Line breaks and indentation were rebuilt from a whitespace-collapsed copy;
   apply with whitespace-tolerant options if the context does not match.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36a4ce1bf623f5917eeed906f80c4b64813a074a..517442d92e09f86fe97a70297de6d031faec2710 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,7 @@ include_directories(maca_common/include)
 include_directories(perceptron/lib/include)
 
 add_subdirectory(maca_common)
+add_subdirectory(maca_tools)
 add_subdirectory(perceptron)
 add_subdirectory(maca_lemmatizer)
 add_subdirectory(maca_trans_parser)
diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c
--- a/maca_lemmatizer/src/maca_lemmatizer.c
+++ b/maca_lemmatizer/src/maca_lemmatizer.c
@@ -44,17 +44,20 @@ char **read_fplm_file(char *fplm_filename, hash *form_pos_ht)
   int num = 0;
   char **lemma_array;
   int lemma_array_size = 10000;
-
+  char buffer[10000];
   FILE *f= myfopen(fplm_filename, "r");
   int fields_nb;
 
   lemma_array = (char **)memalloc(lemma_array_size * sizeof(char *));
 
-  while(!feof(f)){
-    fields_nb = fscanf(f, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho);
+  while(fgets(buffer, sizeof(buffer), f)){
+
+    fields_nb = sscanf(buffer, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho);
     /* if(!strcmp(form, "d")) */
-    /* fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); */
+    /*   fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); */
     if(fields_nb != 4){
+      /* form, pos and lemma may not have been filled: report the raw line instead */
+      fprintf(stderr, "cannot parse fplm line: %s", buffer);
       fprintf(stderr, "incorrect fplm entry, skipping it\n");
       continue;
     }
@@ -161,6 +164,12 @@ int main(int argc, char *argv[])
         lemma = lemma_array[index_form_pos];
       }
       else
+        {
+          /* the braces keep the fallback below inside the else branch */
+          if(ctx->verbose){
+            fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
+          }
         lemma = form;
+        }
     }
 
diff --git a/maca_tools/CMakeLists.txt b/maca_tools/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9b5a40e8317025b1b7915b23b3d9b6a10a87bfb5
--- /dev/null
+++ b/maca_tools/CMakeLists.txt
@@ -0,0 +1,8 @@
+#compiling, linking and installing executables
+
+add_executable(mcf2conll ./src/mcf2conll.c)
+target_link_libraries(mcf2conll perceptron)
+target_link_libraries(mcf2conll transparse)
+target_link_libraries(mcf2conll maca_common)
+install (TARGETS mcf2conll DESTINATION bin)
+
diff --git a/maca_tools/src/mcf2conll.c b/maca_tools/src/mcf2conll.c
new file mode 100644
--- /dev/null
+++ b/maca_tools/src/mcf2conll.c
@@ -0,0 +1,262 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<getopt.h>
+
+#include"mcd.h"
+#include"util.h"
+#include"word_buffer.h"
+
+/* command line options and the resources derived from them */
+typedef struct {
+  int help;
+  int verbose;
+  int debug_mode;
+  char *program_name;
+  char *conll_filename; /* output file, stdout when NULL */
+  char *mcf_filename;   /* input file, stdin when NULL */
+  char *mcd_filename;   /* column description, built-in one when NULL */
+  mcd *mcd_struct;
+} context;
+
+/* allocate a context with every option unset */
+context *context_new(void)
+{
+  context *ctx = (context *)memalloc(sizeof(context));
+
+  ctx->help = 0;
+  ctx->verbose = 0;
+  ctx->debug_mode = 0;
+  ctx->program_name = NULL;
+  ctx->conll_filename = NULL;
+  ctx->mcf_filename = NULL;
+  ctx->mcd_filename = NULL;
+  ctx->mcd_struct = NULL;
+  return ctx;
+}
+
+/* print the usage message on stderr */
+void context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-C --mcd : mcd filename\n");
+  fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n");
+  fprintf(stderr, "\t-o --conll : conll filename (write to stdout if absent)\n");
+}
+
+/* describe column col of m: its word feature, its name and its representation */
+static void mcd_set_col(mcd *m, int col, int wf, const char *name, int representation)
+{
+  m->wf[col] = wf;
+  m->wf_str[col] = strdup(name);
+  m->representation[col] = representation;
+  m->filename[col] = strdup("_");
+  m->wf2col[wf] = col;
+}
+
+/* build the default description: FORM POS LEMMA GOV LABEL SENT_SEG, in that order */
+mcd *mcd_build_wplgfs(void)
+{
+  mcd *m = mcd_new(6);
+
+  mcd_set_col(m, 0, MCD_WF_FORM, "FORM", MCD_REPRESENTATION_VOCAB);
+  mcd_set_col(m, 1, MCD_WF_POS, "POS", MCD_REPRESENTATION_VOCAB);
+  mcd_set_col(m, 2, MCD_WF_LEMMA, "LEMMA", MCD_REPRESENTATION_VOCAB);
+  mcd_set_col(m, 3, MCD_WF_GOV, "GOV", MCD_REPRESENTATION_INT);
+  mcd_set_col(m, 4, MCD_WF_LABEL, "LABEL", MCD_REPRESENTATION_VOCAB);
+  mcd_set_col(m, 5, MCD_WF_SENT_SEG, "SENT_SEG", MCD_REPRESENTATION_INT);
+
+  return m;
+}
+
+/* print the usage message and exit when help was requested */
+void mcf2conll_check_options(context *ctx){
+  if(ctx->help){
+    context_general_help_message(ctx);
+    exit(1);
+  }
+}
+
+/* write on f the n-th (0 based) tab separated column of buffer */
+/* the end of line is never copied, so the last column can be printed as well */
+void str_print_col_n(FILE *f, char *buffer, int n)
+{
+  int i;
+  int col = 0;
+
+  /* stop as soon as column n has been passed */
+  for(i=0; buffer[i] && (col <= n); i++){
+    if(buffer[i] == '\t')
+      col++;
+    else if((col == n) && (buffer[i] != '\n') && (buffer[i] != '\r'))
+      fputc(buffer[i], f);
+  }
+}
+
+/* write column col of input on f, or "_" when the column is absent (col == -1) */
+static void print_col_or_blank(FILE *f, char *input, int col)
+{
+  if(col != -1)
+    str_print_col_n(f, input, col);
+  else
+    fprintf(f, "_");
+}
+
+/* parse the command line, then read the mcd file or build the default one */
+context *context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = context_new();
+
+  ctx->program_name = strdup(argv[0]);
+
+  /* getopt_long() expects the array to end with an all zero entry */
+  static struct option long_options[] =
+    {
+      {"help",    no_argument,       0, 'h'},
+      {"verbose", no_argument,       0, 'v'},
+      {"debug",   no_argument,       0, 'd'},
+      {"conll",   required_argument, 0, 'o'},
+      {"mcd",     required_argument, 0, 'C'},
+      {"mcf",     required_argument, 0, 'i'},
+      {0, 0, 0, 0}
+    };
+  optind = 0;
+  opterr = 0;
+
+  while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'd':
+        ctx->debug_mode = 1;
+        break;
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'o':
+        ctx->conll_filename = strdup(optarg);
+        break;
+      case 'i':
+        ctx->mcf_filename = strdup(optarg);
+        break;
+      case 'C':
+        ctx->mcd_filename = strdup(optarg);
+        break;
+      default: /* unknown options are ignored */
+        break;
+      }
+  }
+
+  if(ctx->mcd_filename){
+    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
+  }
+  else{
+    ctx->mcd_struct = mcd_build_wplgfs();
+  }
+
+  return ctx;
+}
+
+/* read every word of the mcf file (stdin when mcf_filename is NULL) into a new word buffer */
+word_buffer *word_buffer_load_mcf2(char *mcf_filename, mcd *mcd_struct)
+{
+  FILE *f;
+
+  if(mcf_filename == NULL)
+    f = stdin;
+  else
+    f = myfopen(mcf_filename, "r");
+  word_buffer *wb = word_buffer_new(f, mcd_struct, 0);
+  while(word_buffer_read_next_word(wb) != -1){
+    /* printf("load word %d\n", wb->nbelem - 1); */
+  }
+  if(mcf_filename != NULL)
+    fclose(f);
+  return wb;
+}
+
+/* string associated to code in the vocabulary of column col */
+/* NULL when col is out of range, is not a vocabulary column or has no vocabulary */
+char *mcd_get_str(mcd *m, int code, int col)
+{
+  if((col < 0) || (col >= m->nb_col)) return NULL;
+  if(m->representation[col] == MCD_REPRESENTATION_VOCAB)
+    return (m->dico_array[col])? dico_int2string(m->dico_array[col], code) : NULL;
+  return NULL;
+}
+
+/* convert mcf into conll: one word per line, ten tab separated columns */
+/* and an empty line after every word flagged as end of sentence */
+int main(int argc, char *argv[])
+{
+  FILE *output_file;
+  context *ctx = context_read_options(argc, argv);
+  mcf2conll_check_options(ctx);
+
+  word_buffer *wb = word_buffer_load_mcf2(ctx->mcf_filename, ctx->mcd_struct);
+  word *w = NULL;
+  int form_col = mcd_get_form_col(ctx->mcd_struct);
+  int pos_col = mcd_get_pos_col(ctx->mcd_struct);
+  int cpos_col = mcd_get_cpos_col(ctx->mcd_struct);
+  int lemma_col = mcd_get_lemma_col(ctx->mcd_struct);
+  int gov_col = mcd_get_gov_col(ctx->mcd_struct);
+  int label_col = mcd_get_label_col(ctx->mcd_struct);
+  int feats_col = mcd_get_feats_col(ctx->mcd_struct);
+  int sent_seg_col = mcd_get_sent_seg_col(ctx->mcd_struct);
+  int index = 1;
+
+  output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
+  if(output_file == NULL){
+    fprintf(stderr, "cannot open file %s\n", ctx->conll_filename);
+    exit(1);
+  }
+
+  do{
+    w = word_buffer_b0(wb);
+    if(w == NULL) break; /* no word to print */
+
+    /* ID FORM LEMMA CPOSTAG POSTAG FEATS */
+    fprintf(output_file, "%d\t", index);
+    print_col_or_blank(output_file, w->input, form_col);
+    fprintf(output_file, "\t");
+    print_col_or_blank(output_file, w->input, lemma_col);
+    fprintf(output_file, "\t");
+    print_col_or_blank(output_file, w->input, cpos_col);
+    fprintf(output_file, "\t");
+    print_col_or_blank(output_file, w->input, pos_col);
+    fprintf(output_file, "\t");
+    print_col_or_blank(output_file, w->input, feats_col);
+    fprintf(output_file, "\t");
+
+    /* HEAD: governor of the word plus its index in the sentence */
+    if(gov_col != -1)
+      fprintf(output_file, "%d\t", word_get_gov(w) + index);
+    else
+      fprintf(output_file, "_\t");
+
+    /* DEPREL */
+    print_col_or_blank(output_file, w->input, label_col);
+    fprintf(output_file, "\t");
+
+    /* PHEAD and PDEPREL are left unspecified */
+    fprintf(output_file, "_\t_\n");
+
+    if((sent_seg_col != -1) && (word_get_sent_seg(w))){
+      fprintf(output_file, "\n");
+      index = 0; /* incremented below, so the next sentence starts at 1 */
+    }
+
+    index ++;
+  } while(word_buffer_move_right(wb));
+
+  if(ctx->conll_filename)
+    fclose(output_file);
+  return 0;
+}