diff --git a/CMakeLists.txt b/CMakeLists.txt index 517442d92e09f86fe97a70297de6d031faec2710..2ebd0e07bed36d276dad909ced25fa57f20b43db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ include_directories(perceptron/lib/include) add_subdirectory(maca_common) add_subdirectory(maca_tools) add_subdirectory(perceptron) -add_subdirectory(maca_lemmatizer) +#add_subdirectory(maca_lemmatizer) add_subdirectory(maca_trans_parser) add_subdirectory(maca_crf_tagger) diff --git a/maca_common/include/form2pos.h b/maca_common/include/form2pos.h index 515618ca240bfd76a32818b47078b83df48b4d12..56f5cbc39c83e624aaf5b41f69b4f520a2cb60dd 100644 --- a/maca_common/include/form2pos.h +++ b/maca_common/include/form2pos.h @@ -19,6 +19,6 @@ void form2pos_free(form2pos *f2p); form2pos *form2pos_read(char *filename); int form2pos_get_signature(form2pos *f2p, char *form); int form2pos_form_has_pos(form2pos *f2p, char *form, char *pos); - +int form2pos_word_is_non_ambiguous(form2pos *f2p, char *form, char **pos); #endif diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index 622d4c3e3f8b7a1446f1cf01c00aa5cc9f74a5c7..fe4eecf2e8f13aac08e418f3973606db8e1ce32e 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -92,6 +92,7 @@ #define mcd_set_form_col(m, v) (m)->wf[MCD_WF_FORM] = (v) + /* mcd (multi column description) files describe the format of corpus files */ /* every line of an mcd file describes the content of a column of the corpus file */ /* every line contains four fields separated by a space character */ diff --git a/maca_common/include/util.h b/maca_common/include/util.h index 26c0952c1798dc24103f20a5e9e6a6619f3a8cce..7046269758ef894325a2209bbcd8c89ed4c3755b 100644 --- a/maca_common/include/util.h +++ b/maca_common/include/util.h @@ -6,4 +6,5 @@ void myfree(void *ptr); void *memalloc(size_t s); FILE *myfopen(const char *path, const char *mode); FILE *myfopen_no_exit(const char *path, const char *mode); +char *to_lower_string(char *s); #endif diff --git a/maca_common/include/word.h b/maca_common/include/word.h index 3990714bf0e6ec67c62dbbc1a5a7fe83a25e23e3..00e78085174f63911bb8f673e39ca9b5ca22e3af 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -109,5 +109,7 @@ word *word_read(FILE *f, mcd *mcd_struct); word *word_parse_buffer(char *buffer, mcd *mcd_struct); int word_is_eos(word *w, mcd *mcd_struct); int word_get_gov_index(word *w); +void word_print_col_n(FILE *f, word *w, int n); +void word_sprint_col_n(char *s, word *w, int n); #endif diff --git a/maca_common/src/form2pos.c b/maca_common/src/form2pos.c index b2efb19294a1af2416128925693e2b6bc2e01a86..dccc016eeb197852c866e9a59adbaefde3b39b6b 100644 --- a/maca_common/src/form2pos.c +++ b/maca_common/src/form2pos.c @@ -29,6 +29,29 @@ void form2pos_free(form2pos *f2p) free(f2p); } + +int form2pos_word_is_non_ambiguous(form2pos *f2p, char *form, char **pos) +{ + int pos_code; + int signature = form2pos_get_signature(f2p, form); + char *signature_str = dico_int2string(f2p->d_signature, signature); + if(signature_str == NULL) return 0; + int l = strlen(signature_str); + int sum = 0; + /* printf("form = %s signature = %s\n", form, signature_str); */ + for(int i = 0; i < l; i++){ + sum += signature_str[i] - '0'; + if(signature_str[i] != '0') pos_code = i; + /* printf("sum = %d\n", sum); */ + } + if(sum == 1) + *pos = dico_int2string(f2p->d_pos, pos_code); + else + *pos = NULL; + + return (sum ==1)? 1 : 0; +} + form2pos *form2pos_read(char *filename) { FILE *f = myfopen_no_exit(filename, "r"); diff --git a/maca_common/src/util.c b/maca_common/src/util.c index 4ff03522692bd20ddba0020216d2c1cf44bec0d4..9b16c7536b15e35a14458c663cf0f3c38e817d69 100644 --- a/maca_common/src/util.c +++ b/maca_common/src/util.c @@ -1,5 +1,7 @@ #include<stdlib.h> #include<stdio.h> +#include<string.h> +#include<ctype.h> void myfree(void *ptr) { @@ -34,3 +36,11 @@ FILE *myfopen_no_exit(const char *path, const char *mode) } return f; } + +char *to_lower_string(char *s) +{ + int i; + for(i=0; i < strlen(s); i++) + s[i] = tolower(s[i]); + return s; +} diff --git a/maca_common/src/word.c b/maca_common/src/word.c index ebf00c6e47670a4988e86be11a1004eebb9f2e0e..b6420932d994a9d343fe01651984a2dd53b87b02 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -59,7 +59,7 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct) w = word_new(buffer); token = strtok(buffer, "\t"); do{ - if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1)){ + if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1) && (strcmp(token, "_"))){ w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col); } if(mcd_struct->wf[col] == MCD_WF_FORM){ @@ -148,3 +148,39 @@ int word_get_gov_index(word *w) index = (word_get_index(w)) + (word_get_gov(w)); return index; } + +void word_print_col_n(FILE *f, word *w, int n) +{ + int i; + int col = 0; + char *buffer = w->input; + if(buffer == NULL) return; + int l= strlen(buffer); + for(i=0; i < l; i++){ + if(buffer[i] == '\t') { + col++; + continue; + } + if(col == n) + fprintf(f, "%c", buffer[i]); + } +} + +void word_sprint_col_n(char *s, word *w, int n) +{ + int i; + int col = 0; + int j = 0; + char *buffer = w->input; + if(buffer == NULL) return; + int l= strlen(buffer); + for(i=0; i < l; i++){ + if(buffer[i] == '\t') { + col++; + continue; + } + if(col == n) + s[j++] = buffer[i]; + } + s[j] = '\0'; +} diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c index 76b640a267d8d635519b380a49936e8c8e85db8d..5d9cacdf050d2526a7b9bc046fae50ad3e525a10 100644 --- a/maca_lemmatizer/src/maca_lemmatizer.c +++ b/maca_lemmatizer/src/maca_lemmatizer.c @@ -86,14 +86,46 @@ char *to_lower_string(char *s) s[i] = tolower(s[i]); return s; } +/* +void print_word(char *input, mcd *mcd_struct, char *lemma) +{ + char *buffer = NULL; + char *token = NULL; + int col_nb = 0; + if(mcd_get_lemma_col(mcd_struct) == -1){ + printf("%s\t%s\n", input, lemma); + } + else{ + buffer = strdup(input); + token = strtok(buffer, "\t"); + col_nb = 0; + while(token){ + if(col_nb != 0) printf("\t"); + if(col_nb == mcd_get_lemma_col(mcd_struct)) + printf("%s", lemma); + else + word_print_col_n(stdout, w->input, col_nb); + col_nb++; + token = strtok(NULL, "\t"); + } + if(col_nb <= mcd_get_lemma_col(mcd_struct)) + printf("\t%s", lemma); + printf("\n"); + free(buffer); + } +} + +*/ int main(int argc, char *argv[]) { hash *form_pos_ht = hash_new(1000000); char buffer[10000]; + char *buffer_copy; char *form; char *pos; + char *token; int column_nb; char form_pos[500]; @@ -103,6 +135,7 @@ int main(int argc, char *argv[]) context *ctx; int form_column; int pos_column; + int lemma_column; FILE *f = NULL; ctx = context_read_options(argc, argv); @@ -123,24 +156,31 @@ int main(int argc, char *argv[]) f = stdin; else f = myfopen(ctx->conll_filename, "r"); - + + lemma_column = ctx->mcd_struct->wf2col[MCD_WF_LEMMA]; + lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode); /* look for a valid word */ - while(fgets(buffer, 10000, f)){ + buffer_copy = strdup(buffer); + while(fgets(buffer_copy, 10000, f)){ if(feof(f)) return 0; /* no more words to read */ - if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){ + if((buffer_copy[0] == '\n') || (buffer_copy[0] == ' ') || (buffer_copy[0] == '\t')){ printf("\n"); continue; } - buffer[strlen(buffer)-1] = '\0'; - printf("%s", buffer); - token = strtok(buffer, "\t"); + buffer_copy[strlen(buffer_copy)-1] = '\0'; + printf("%s", buffer_copy); + token = strtok(buffer_copy, "\t"); column_nb = 0; form = NULL; pos = NULL; + lemma = NULL; do{ + if(column_nb == lemma_column) /* lemma is present in the input file */ + if(strcmp(token, "_")) /* and it is not an underscore */ + lemma = strdup(token); /* if((column_nb < ctx->mcd_struct->nb_col) && (column_nb == form_column)) */ if(column_nb == form_column) form = strdup(token); @@ -151,32 +191,37 @@ int main(int argc, char *argv[]) column_nb++; } while((token = strtok(NULL , "\t"))); - strcpy(form_pos, form); - strcat(form_pos, "/"); - strcat(form_pos, pos); - index_form_pos = hash_get_val(form_pos_ht, form_pos); - if(index_form_pos != HASH_INVALID_VAL){ - lemma = lemma_array[index_form_pos]; - } - else{ - to_lower_string(form_pos); + if(lemma == NULL){ + strcpy(form_pos, form); + strcat(form_pos, "/"); + strcat(form_pos, pos); index_form_pos = hash_get_val(form_pos_ht, form_pos); if(index_form_pos != HASH_INVALID_VAL){ lemma = lemma_array[index_form_pos]; } - else - if(ctx->verbose){ - fprintf(stderr, "cannot find an entry for %s %s\n", form, pos); + else{ + to_lower_string(form_pos); + index_form_pos = hash_get_val(form_pos_ht, form_pos); + if(index_form_pos != HASH_INVALID_VAL){ + lemma = lemma_array[index_form_pos]; } + else + if(ctx->verbose){ + fprintf(stderr, "cannot find an entry for %s %s\n", form, pos); + } lemma = form; + } } - /* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma); */ + /* print_word(buffer, ctx->mcd_struct, lemma); */ + + /* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma); */ printf("\t%s\n", lemma); if(pos)free(pos); if(form)free(form); } + free(buffer_copy); free(lemma_array); hash_free(form_pos_ht); diff --git a/maca_tools/src/mcf2conll.c b/maca_tools/src/mcf2conll.c index fa5f7cf7e3629e3b6b26971f2f24f7490af0f7bc..49f3e34f9b431acdc5dee6a7205d429680040fe9 100644 --- a/maca_tools/src/mcf2conll.c +++ b/maca_tools/src/mcf2conll.c @@ -52,20 +52,6 @@ void mcf2conll_check_options(context *ctx){ } } -void str_print_col_n(FILE *f, char *buffer, int n) -{ - int i; - int col = 0; - int l= strlen(buffer); - for(i=0; i < l; i++){ - if(buffer[i] == '\t') { - col++; - continue; - } - if(col == n) - fprintf(f, "%c", buffer[i]); - } -} context *context_read_options(int argc, char *argv[]) @@ -151,31 +137,31 @@ int main(int argc, char *argv[]) printf("%d\t", index); if(form_col != -1) - str_print_col_n(output_file, w->input, form_col); + word_print_col_n(output_file, w, form_col); else fprintf(output_file, "_"); fprintf(output_file, "\t"); if(lemma_col != -1) - str_print_col_n(output_file, w->input, lemma_col); + word_print_col_n(output_file, w, lemma_col); else fprintf(output_file, "_"); fprintf(output_file, "\t"); if(cpos_col != -1) - str_print_col_n(output_file, w->input, cpos_col); + word_print_col_n(output_file, w, cpos_col); else fprintf(output_file, "_"); fprintf(output_file, "\t"); if(pos_col != -1) - str_print_col_n(output_file, w->input, pos_col); + word_print_col_n(output_file, w, pos_col); else fprintf(output_file, "_"); fprintf(output_file, "\t"); if(feats_col != -1) - str_print_col_n(output_file, w->input, feats_col); + word_print_col_n(output_file, w, feats_col); else fprintf(output_file, "_"); fprintf(output_file, "\t"); @@ -190,7 +176,7 @@ int main(int argc, char *argv[]) fprintf(output_file, "_\t"); if(label_col != -1) - str_print_col_n(output_file, w->input, label_col); + word_print_col_n(output_file, w, label_col); else fprintf(output_file, "_"); fprintf(output_file, "\t"); diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 85d67be561ad16cd6349bf20b8cbec8c62aadf57..2b199d3f393cfc5dae84fe48956876d757d9b626 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -53,11 +53,11 @@ install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin) #target_link_libraries(maca_trans_parser_mcf2cff maca_common) #install (TARGETS maca_trans_parser_mcf2cff DESTINATION bin) -#add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c) -#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron) -#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse) -#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common) -#install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin) +add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c) +target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron) +target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse) +target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common) +install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin) add_executable(maca_trans_tagparser_arc_eager_mcf2cff ./src/maca_trans_tagparser_arc_eager_mcf2cff.c) target_link_libraries(maca_trans_tagparser_arc_eager_mcf2cff perceptron) @@ -119,6 +119,18 @@ target_link_libraries(cff2fann transparse) target_link_libraries(cff2fann maca_common) install (TARGETS cff2fann DESTINATION bin) +add_executable(maca_trans_interpreter ./src/maca_trans_interpreter.c) +target_compile_options(maca_trans_interpreter INTERFACE -Wall) +target_link_libraries(maca_trans_interpreter transparse) +target_link_libraries(maca_trans_interpreter maca_common) +install (TARGETS maca_trans_interpreter DESTINATION bin) + +add_executable(maca_lemmatizer ./src/maca_trans_lemmatizer.c) +target_compile_options(maca_lemmatizer INTERFACE -Wall) +target_link_libraries(maca_lemmatizer transparse) +target_link_libraries(maca_lemmatizer maca_common) +install (TARGETS maca_lemmatizer DESTINATION bin) + #add_executable(test_w2v ./src/test_w2v.c) #target_link_libraries(test_w2v transparse) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index e1beddd40aef1cd7c60804349712cf6a4e184904..a9b9d4382b9f5480d9f0875d56726f21a35a4870 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -272,7 +272,7 @@ context *context_read_options(int argc, char *argv[]) if(ctx->conll) ctx->mcd_struct = mcd_build_conll07(); else - ctx->mcd_struct = mcd_build_wplgf(); + ctx->mcd_struct = mcd_build_wplgfs(); /* ctx->mcd_struct = mcd_build_ifpls(); */ return ctx; diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index afdcd11c44e4d4bd402eb297131daa965e43e23e..611dd10d09b88a0a5c13d0ee66cafd52a1550519 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -4,7 +4,6 @@ #define TEST_MODE 1 #define TRAIN_MODE 2 - #define DEFAULT_MULTI_COL_DESC_FILENAME "maca_trans_parser.mcd" #define DEFAULT_FEATURES_MODEL_FILENAME "maca_trans_parser.fm" #define DEFAULT_VOCABS_FILENAME "maca_trans_parser.vocab" @@ -14,13 +13,14 @@ #define DEFAULT_FEATURES_MODEL_TAGGER_FILENAME "maca_trans_tagger.fm" #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" #define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model" -#define DEFAULT_F2P_FILENAME "fP" #define DEFAULT_MULTI_COL_DESC_TAGPARSER_FILENAME "maca_trans_tagparser.mcd" #define DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.fm" #define DEFAULT_VOCABS_TAGPARSER_FILENAME "maca_trans_tagparser.vocab" #define DEFAULT_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.model" + #define DEFAULT_F2P_FILENAME "fP" +#define DEFAULT_FPLM_FILENAME "fplm" #include "dico_vec.h" #include "feat_model.h" @@ -38,6 +38,7 @@ typedef struct { char *fann_filename; char *stag_desc_filename; char *f2p_filename; + char *fplm_filename; int hidden_neurons_nb; int iteration_nb; int debug_mode; diff --git a/maca_trans_parser/src/maca_trans_interpreter.c b/maca_trans_parser/src/maca_trans_interpreter.c new file mode 100644 index 0000000000000000000000000000000000000000..281d3aacd703cc9eabd384ba2d95371da2027d74 --- /dev/null +++ b/maca_trans_parser/src/maca_trans_interpreter.c @@ -0,0 +1,150 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include"util.h" +#include"mcd.h" +#include"config.h" +#include"word_buffer.h" +#include"movements.h" + +#define LONGUEUR_LIGNE 1000 + +#define MODE_TAGGER 0 +#define MODE_PARSER 1 +#define MODE_TAGPARSER 2 + +void help_message(void) +{ + fprintf(stdout, "help\t print this message\n"); + fprintf(stdout, "verbose\t toggle verbose mode\n"); + fprintf(stdout, "quit\t quit interpreter\n"); + fprintf(stdout, "load_mcd\t load mcd file\n"); + fprintf(stdout, "load_mcf\t load mcf file\n"); + fprintf(stdout, "open_mcf\t open mcf file\n"); + fprintf(stdout, "config_new\t \n"); + fprintf(stdout, "config_print\t print configuration\n"); + fprintf(stdout, "shift\t perform a shift movement\n"); + fprintf(stdout, "shift_undo\t \n"); + fprintf(stdout, "parser\t switch to parser mode\n"); + fprintf(stdout, "tagger\t switch to tagger mode\n"); + fprintf(stdout, "tagparser\t switch to tagparser mode\n"); + +} + +int main(int argc, char *argv[]) +{ + char ligne[LONGUEUR_LIGNE]; + char commande[LONGUEUR_LIGNE], argument[LONGUEUR_LIGNE]; + int n; + mcd *mcd_struct = NULL; + char *mcd_filename = NULL; + char *mcf_filename = NULL; + FILE *mcf_file = NULL; + int verbose = 0; + word_buffer *wb = NULL; + config *c = NULL; + int mode = MODE_PARSER; + + while(1){ + printf("> "); + + if(fgets(ligne, LONGUEUR_LIGNE, stdin) == NULL) { + printf("au revoir !\n"); + exit(1); + } + commande[0] = argument[0] = '\0'; + n = sscanf(ligne, "%s %s\n", commande, argument); + /* printf("ligne = %s n = %d commande = %s argument = %s\n", ligne, n, commande, argument); */ + + if(n == -1) continue; + if(!strcmp(commande, "quit")){ + printf("au revoir !\n"); + exit(1); + } + + if(!strcmp(commande, "verbose")){ + verbose = (verbose == 0) ? 1 : 0; + printf("verbose = %d\n", verbose); + continue; + } + + if(!strcmp(commande, "help")){ + help_message(); + continue; + } + + /* set mode */ + + if(!strcmp(commande, "parser")){ + mode = MODE_PARSER; + if(verbose) + fprintf(stdout, "mode = parser\n"); + continue; + } + + if(!strcmp(commande, "tagger")){ + mode = MODE_TAGGER; + if(verbose) + fprintf(stdout, "mode = tagger\n"); + continue; + } + if(!strcmp(commande, "tagparser")){ + mode = MODE_TAGPARSER; + if(verbose) + fprintf(stdout, "mode = tagparser\n"); + continue; + } + + if(!strcmp(commande, "mode")){ + if(mode == MODE_PARSER){fprintf(stdout, "parser\n"); continue;} + if(mode == MODE_TAGGER){fprintf(stdout, "tagger\n"); continue;} + if(mode == MODE_TAGPARSER){fprintf(stdout, "tagparser\n"); continue;} + } + + if(!strcmp(commande, "load_mcd")){ + mcd_filename = strdup(argument); + mcd_struct = mcd_read(mcd_filename, verbose); + continue; + } + + if(!strcmp(commande, "load_mcf")){ + mcf_filename = strdup(argument); + word_buffer_load_mcf(mcf_filename, mcd_struct); + continue; + } + + if(!strcmp(commande, "open_mcf")){ + mcf_filename = strdup(argument); + mcf_file = myfopen(mcf_filename, "r"); + continue; + } + + if(!strcmp(commande, "config_new")){ + c = config_new(mcf_file, mcd_struct, 5); + continue; + } + + if(!strcmp(commande, "config_print")){ + config_print(stdout, c); + continue; + } + + /* movements */ + + if(!strcmp(commande, "shift")){ + movement_shift(c, 0); + config_print(stdout, c); + continue; + } + + if(!strcmp(commande, "shift_undo")){ + movement_shift_undo(c); + config_print(stdout, c); + continue; + } + + + + + } +} diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c new file mode 100644 index 0000000000000000000000000000000000000000..2cda79f19eac2fbab8cb9c424e71c5841a2fdbac --- /dev/null +++ b/maca_trans_parser/src/maca_trans_lemmatizer.c @@ -0,0 +1,198 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include<ctype.h> + +#include"context.h" +#include"dico.h" + +void maca_lemmatizer_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_beam_help_message(ctx); + context_conll_help_message(ctx); + fprintf(stderr, "INPUT\n"); + context_input_help_message(ctx); + context_mcd_help_message(ctx); +} + +void maca_lemmatizer_check_options(context *ctx){ + if(ctx->help + ){ + maca_lemmatizer_help_message(ctx); + exit(1); + } +} + +void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx) +{ + char absolute_path[500]; + char absolute_filename[500]; + + absolute_path[0] = '\0'; + + if(ctx->maca_data_path) + strcat(absolute_path, ctx->maca_data_path); + else + strcat(absolute_path, getenv("MACAON_DIR")); + + strcat(absolute_path, "/"); + strcat(absolute_path, ctx->language); + strcat(absolute_path, "/bin/"); + + if(!ctx->fplm_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, DEFAULT_FPLM_FILENAME); + ctx->fplm_filename = strdup(absolute_filename); + } + + if(ctx->verbose){ + fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename); + } +} + +char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode) +{ + char form[1000]; + char pos[1000]; + char lemma[1000]; + char morpho[1000]; + int num = 0; + char **lemma_array; + int lemma_array_size = 10000; + char buffer[10000]; + int fields_nb; + FILE *f= myfopen(fplm_filename, "r"); + + lemma_array = (char **)memalloc(lemma_array_size * sizeof(char *)); + + while(fgets(buffer, 10000, f)){ + fields_nb = sscanf(buffer, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho); + /* if(!strcmp(form, "d")) */ + /* fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); */ + if(fields_nb != 4){ + if(debug_mode){ + fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); + fprintf(stderr, "incorrect fplm entry, skipping it\n"); + } + continue; + } + strcat(form, "/"); + strcat(form, pos); + hash_add(form_pos_ht, strdup(form), num); + + if(num >= lemma_array_size){ + lemma_array_size = 2 * (lemma_array_size) + 1; + lemma_array = realloc(lemma_array, (lemma_array_size) * sizeof(char *)); + } + + /* if(lemma_array[num] == NULL) */ + lemma_array[num] = strdup(lemma); + num++; + } + /* fprintf(stderr, "%d entries loaded\n", num); */ + return lemma_array; +} + + +char *lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose) +{ + char form_pos[1000]; + int index_form_pos; + + strcpy(form_pos, form); + strcat(form_pos, "/"); + strcat(form_pos, pos); + index_form_pos = hash_get_val(form_pos_ht, form_pos); + + + if(index_form_pos != HASH_INVALID_VAL) /* couple form/pos found in the hash table */ + return lemma_array[index_form_pos]; + + strcpy(form_pos, form); + to_lower_string(form_pos); /* change form to lower case and look it up again */ + strcat(form_pos, "/"); + strcat(form_pos, pos); + index_form_pos = hash_get_val(form_pos_ht, form_pos); + if(index_form_pos != HASH_INVALID_VAL) + return lemma_array[index_form_pos]; + + /* even in lower case couple form/pos is not found, return the form as lemma */ + if(verbose) + fprintf(stderr, "cannot find an entry for %s %s\n", form, pos); + + return form; +} + +/* a bit messy */ +void print_word(word *w, mcd *mcd_struct, char *lemma) +{ + char *buffer = NULL; + char *token = NULL; + int col_nb = 0; + + if(mcd_get_lemma_col(mcd_struct) == -1){ + printf("%s\t%s\n", w->input, lemma); + } + else{ + buffer = strdup(w->input); + token = strtok(buffer, "\t"); + col_nb = 0; + while(token){ + if(col_nb != 0) printf("\t"); + if(col_nb == mcd_get_lemma_col(mcd_struct)) + printf("%s", lemma); + else + word_print_col_n(stdout, w, col_nb); + col_nb++; + token = strtok(NULL, "\t"); + } + if(col_nb <= mcd_get_lemma_col(mcd_struct)) + printf("\t%s", lemma); + printf("\n"); + free(buffer); + } +} + + +int main(int argc, char *argv[]) +{ + context *ctx = context_read_options(argc, argv); + hash *form_pos_ht = hash_new(1000000); + char **lemma_array = NULL; + word *b0; + char lemma[200]; + char form[200]; + char pos[200]; + config *c; + + maca_lemmatizer_check_options(ctx); + maca_lemmatizer_set_linguistic_resources_filenames(ctx); + + lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode); + + FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; + + c = config_new(f, ctx->mcd_struct, 5); + + while(!config_is_terminal(c)){ + b0 = word_buffer_b0(c->bf); + word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct)); + word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct)); + word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct)); + + /* if lemma is not specified in input it is looked up */ + if(strlen(lemma) && strcmp(lemma, "_")) + print_word(b0, ctx->mcd_struct, lemma); + else + print_word(b0, ctx->mcd_struct, lookup_lemma(form, pos, form_pos_ht, lemma_array, ctx->verbose)); + + word_buffer_move_right(c->bf); + } + config_free(c); + context_free(ctx); + return 0; +} + diff --git a/maca_trans_parser/src/maca_trans_parser.c b/maca_trans_parser/src/maca_trans_parser.c index 6a3e6d798c83b1d8256d2adf5a3ed21fe1ee6c8b..d48a4d8cbf776183706752f30b9b1ce5a0a4841e 100644 --- a/maca_trans_parser/src/maca_trans_parser.c +++ b/maca_trans_parser/src/maca_trans_parser.c @@ -5,12 +5,10 @@ #include<getopt.h> #include"context.h" #include"movement_parser.h" -#include"oracle_parser.h" #include"oracle_parser_arc_eager.h" #include"feat_fct.h" #include"feature_table.h" #include"dico.h" -#include"beam.h" #include"simple_decoder_parser_arc_eager.h" /*#include"dnn_decoder.h"*/ #include"config2feat_vec.h" diff --git a/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c b/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c index b990c79841230ff428c9c98484b2ca777d00bd92..f0a16c043750ee4fce542f354303dec08332e651 100644 --- a/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c @@ -42,7 +42,7 @@ void maca_trans_parser_mcf2cff_check_options(context *ctx) } } -void generate_training_file_stream(FILE *output_file, context *ctx) +void generate_training_file(FILE *output_file, context *ctx) { config *c; int mvt_code; @@ -66,14 +66,10 @@ void generate_training_file_stream(FILE *output_file, context *ctx) c = config_new(mcf_file, mcd_struct_hyp, 5); while(!word_buffer_end(ref) && (sentence_nb < ctx->sent_nb)){ - /*printf("************ REF ************\n"); - word_buffer_print(stdout, ref); - printf("*****************************\n");*/ - mvt_code = oracle_parser_arc_eager(c, ref, root_label); mvt_type = movement_parser_type(mvt_code); mvt_label = movement_parser_label(mvt_code); - + if(ctx->debug_mode){ config_print(stdout,c); movement_parser_print(stdout, mvt_code, ctx->dico_labels); @@ -92,40 +88,32 @@ void generate_training_file_stream(FILE *output_file, context *ctx) config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); feat_vec_print(output_file, fv); } - - if(mvt_type == MVT_PARSER_EOS){ + + switch(mvt_type){ + case MVT_PARSER_EOS : movement_parser_eos(c); sentence_nb++; - fprintf(stderr, "sentence %d\n", sentence_nb); - if(word_buffer_is_last(ref)) - break; - } - - if(mvt_type == MVT_PARSER_LEFT){ + if((sentence_nb % 100) == 0) + fprintf(stderr, "sentence %d\n", sentence_nb); + /* if(word_buffer_is_last(ref)) */ + break; + case MVT_PARSER_LEFT : movement_parser_left_arc(c, mvt_label); - continue; - } - - if(mvt_type == MVT_PARSER_RIGHT){ + break; + case MVT_PARSER_RIGHT : movement_parser_right_arc(c, mvt_label); word_buffer_move_right(ref); - continue; - } - - if(mvt_type == MVT_PARSER_REDUCE){ + break; + case MVT_PARSER_REDUCE : movement_parser_reduce(c); - continue; - } - - if(mvt_type == MVT_PARSER_ROOT){ + break; + case MVT_PARSER_ROOT : movement_parser_root(c, root_label); - continue; - } - - if(mvt_type == MVT_PARSER_SHIFT){ + break; + case MVT_PARSER_SHIFT : movement_parser_shift(c); word_buffer_move_right(ref); - continue; + break; } } } @@ -174,7 +162,7 @@ int main(int argc, char *argv[]) /* open output file */ output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w") : stdout; - generate_training_file_stream(output_file, ctx); + generate_training_file(output_file, ctx); if(ctx->mode == TRAIN_MODE) dico_vec_print(ctx->vocabs_filename, ctx->vocabs); diff --git a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c index 93f990a73a245950ff35a95f71b310ff5a6c2d68..01342e6628ac3ffaf1fc2ff9c1d9911e9cf1e1c2 100644 --- a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c @@ -12,17 +12,46 @@ #include"word_emb.h" #include"config2feat_vec.h" +#if 1 void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p) { int i; word *w; + char lower_form[100]; for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){ w = word_buffer_get_word_n(bf, i); if(word_get_signature(w) != -1) break; w->signature = form2pos_get_signature(f2p, w->form); + if(w->signature == -1){ + strcpy(lower_form, w->form); + to_lower_string(lower_form); + w->signature = form2pos_get_signature(f2p, lower_form); + } } } +#endif + +#if 0 +void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p, dico *dico_pos) +{ + int i; + word *w; + int signature; + char *pos; + for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){ + w = word_buffer_get_word_n(bf, i); + if(word_get_signature(w) != -1) break; + signature = form2pos_get_signature(f2p, w->form); + w->signature = signature; + if(form2pos_word_is_non_ambiguous(f2p, w->form, &pos)){ + /* printf("%s non ambigu cat = %s code = %d \n", w->form, pos, dico_string2int(dico_pos, pos)); */ + word_set_pos(w, dico_string2int(dico_pos, pos)); + + } + } +} +#endif void maca_trans_parser_mcf2cff_help_message(context *ctx) { @@ -56,64 +85,30 @@ void maca_trans_parser_mcf2cff_check_options(context *ctx) } } -void generate_training_file_stream(FILE *output_file, context *ctx) +void generate_training_file(FILE *output_file, context *ctx) { config *c; feat_vec *fv = feat_vec_new(feature_types_nb); FILE *conll_file = myfopen(ctx->input_filename, "r"); int postag; - + /* dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); */ + c = config_new(conll_file, ctx->mcd_struct, 5); while(!config_is_terminal(c)){ /* config_print(stdout,c); */ if(ctx->f2p) - add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); + /*add_signature_to_words_in_word_buffer(c->bf, ctx->f2p, dico_pos); */ + add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); postag = oracle_tagger(c, NULL); fprintf(output_file, "%d", postag); feat_vec_print(output_file, fv); - int res = movement_tagger(c, postag, 0, 1); + int res = movement_tagger(c, postag); if(res == 0) break; } } -void generate_training_file_buffer(FILE *output_file, context *ctx) -{ - config *c; - feat_vec *fv = feat_vec_new(feature_types_nb); - sentence *ref = NULL; - int sentence_nb = 0; - FILE *conll_file = myfopen(ctx->input_filename, "r"); - FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); - int postag; - c = config_new(conll_file, ctx->mcd_struct, 0); - - while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ - /* sentence_print(stdout, ref, NULL); */ - word_buffer_read_sentence(c->bf); - /* get rid of dummy token */ - /* queue_remove(c->bf); */ - - if(ctx->f2p) - add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); - - while(!config_is_terminal(c)){ - /* config_print(stdout, c); */ - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - postag = oracle_tagger(c, ref); - fprintf(output_file, "%d", postag); - feat_vec_print(output_file, fv); - - if(postag != -1) - movement_tagger(c, postag, 0, 0); - } - config_free(c); - c = config_new(conll_file, ctx->mcd_struct, 0); - sentence_nb++; - } -} - int main(int argc, char *argv[]) { context *ctx; @@ -152,12 +147,9 @@ int main(int argc, char *argv[]) output_file = myfopen(ctx->cff_filename, "w"); else output_file = stdout; - - if(ctx->stream_mode) - generate_training_file_stream(output_file, ctx); - else - generate_training_file_buffer(output_file, ctx); + generate_training_file(output_file, ctx); + if(ctx->mode == TRAIN_MODE){ /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ dico_vec_print(ctx->vocabs_filename, ctx->vocabs); diff --git a/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c index fdadaeead7901243dad7907e0d163df68f9a9705..11c163cd93e06b20c4ac2d6729b936d2132961b9 100644 --- a/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c @@ -54,7 +54,7 @@ void maca_trans_parser_mcf2cff_check_options(context *ctx) } } -void generate_training_file_stream(FILE *output_file, context *ctx) +void generate_training_file(FILE *output_file, context *ctx) { config *c; int mvt_code; @@ -112,43 +112,33 @@ void generate_training_file_stream(FILE *output_file, context *ctx) feat_vec_print(output_file, fv); } - if(mvt_type == MVT_TAGPARSER_EOS){ + switch(mvt_type){ + case MVT_TAGPARSER_EOS : movement_tagparser_eos(c); sentence_nb++; - if(word_buffer_is_last(ref)) - break; - } - - if(mvt_type == MVT_TAGPARSER_POSTAG){ + if((sentence_nb % 100) == 0) + fprintf(stderr, "sentence %d\n", sentence_nb); + break; + case MVT_TAGPARSER_POSTAG : movement_tagparser_add_pos(c, mvt_label); - continue; - } - - if(mvt_type == MVT_TAGPARSER_LEFT){ + break; + case MVT_TAGPARSER_LEFT : movement_tagparser_left_arc(c, mvt_label); - continue; - } - - if(mvt_type == MVT_TAGPARSER_RIGHT){ + break; + case MVT_TAGPARSER_RIGHT : movement_tagparser_right_arc(c, mvt_label); word_buffer_move_right(ref); - continue; - } - - if(mvt_type == MVT_TAGPARSER_REDUCE){ + break; + case MVT_TAGPARSER_REDUCE : movement_tagparser_reduce(c); - continue; - } - - if(mvt_type == MVT_TAGPARSER_ROOT){ + break; + case MVT_TAGPARSER_ROOT : movement_tagparser_root(c, root_label); - continue; - } - - if(mvt_type == MVT_TAGPARSER_SHIFT){ + break; + case MVT_TAGPARSER_SHIFT : movement_tagparser_shift(c); word_buffer_move_right(ref); - continue; + break; } } } @@ -173,7 +163,6 @@ int main(int argc, char *argv[]) } ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); - if(ctx->dico_labels == NULL){ fprintf(stderr, "cannot find label names\n"); @@ -198,7 +187,7 @@ int main(int argc, char *argv[]) /* open output file */ output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w") : stdout; - generate_training_file_stream(output_file, ctx); + generate_training_file(output_file, ctx); if(ctx->mode == TRAIN_MODE) dico_vec_print(ctx->vocabs_filename, ctx->vocabs); diff --git a/maca_trans_parser/src/movement_tagger.c b/maca_trans_parser/src/movement_tagger.c index 5d33788a4b0fe791cdda53e07d30cd22c71d1fc6..8e7d89660958c78bac6efb439442c830df3ed6bd 100644 --- a/maca_trans_parser/src/movement_tagger.c +++ b/maca_trans_parser/src/movement_tagger.c @@ -4,10 +4,8 @@ #include"util.h" #include"movement_tagger.h" -int movement_tagger(config *c, int postag, float score, int stream) +int movement_tagger(config *c, int postag) { - if(word_buffer_is_last(c->bf)) return 0; - word_set_pos(word_buffer_b0(c->bf), postag); word_buffer_move_right(c->bf); diff --git a/maca_trans_parser/src/movement_tagger.h b/maca_trans_parser/src/movement_tagger.h index 7168f5aa0218ba32e05fc84b97e76e5cab9c6a80..1b7dfbe2ac21b4d68b6cb17fc4efd3d3941ec187 100644 --- a/maca_trans_parser/src/movement_tagger.h +++ b/maca_trans_parser/src/movement_tagger.h @@ -3,6 +3,6 @@ #include"config.h" #include"feat_vec.h" -int movement_tagger(config *c, int postag, float score, int stream); +int movement_tagger(config *c, int postag); #endif diff --git a/maca_trans_parser/src/movements.c b/maca_trans_parser/src/movements.c index cccdf083fc2f299f2998f8679968f2c4db3758df..a10e62b50be5e1d462689e29f67c993d2c65a902 100644 --- a/maca_trans_parser/src/movements.c +++ b/maca_trans_parser/src/movements.c @@ -8,8 +8,10 @@ int movement_eos(config *c, int movement_code) { if(stack_is_empty(config_get_stack(c))) return 0; word *s0 = stack_top(config_get_stack(c)); + + if(word_get_sent_seg(s0) == 1) return 0; - /* word on the top of the stack is sent_seg */ + /* set word on the top of the stack to sent_seg */ word_set_sent_seg(s0, 1); config_push_mvt(c, movement_code, s0, NULL); diff --git a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c index 3a81fc79d7839530580a9e6803a1e070c4dc55b5..42fee6d5e41f381167f99a2585ad7673f5cde6ad 100644 --- a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c +++ b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c @@ -44,7 +44,7 @@ void simple_decoder_parser_arc_eager(context *ctx) feat_vec *fv = feat_vec_new(feature_types_nb); config *c = NULL; int result; - float entropy; + /* float entropy; */ /* float delta; */ int argmax1, argmax2; float max1, max2; @@ -74,24 +74,6 @@ void simple_decoder_parser_arc_eager(context *ctx) } - if(ctx->debug_mode){ - fprintf(stdout, "***********************************\n"); - config_print(stdout, c); - entropy = feature_table_entropy(fv, ft); - /* delta = feature_table_diff_scores(fv, ft); */ - feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2); - movement_parser_print(stdout, argmax1, ctx->dico_labels); - printf(":\t%f\n", max1); - movement_parser_print(stdout, argmax2, ctx->dico_labels); - printf(":\t%f\n", max2); - printf("delta = %f\n", max1 - max2); - - /* delta = feature_table_first_second(fv, ft); */ - /* printf("entropy = %f delta = %f\n", entropy, delta); */ - printf("entropy = %f\n",entropy); - - /* movement_parser_print(stdout, mvt_code, ctx->dico_labels); */ - } result = 0; switch(mvt_type){ case MVT_PARSER_LEFT : diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c index bad70b6e5ff3d797f62a3dc436ea1c54a50614ed..9a50f63a81f68ff20b4d9617c7516349069ac6be 100644 --- a/maca_trans_parser/src/simple_decoder_tagger.c +++ b/maca_trans_parser/src/simple_decoder_tagger.c @@ -3,22 +3,82 @@ #include<string.h> #include<unistd.h> #include<getopt.h> +#include<ctype.h> + #include"context.h" #include"movement_tagger.h" #include"feat_fct.h" #include"config2feat_vec.h" #include"feature_table.h" #include"dico.h" +#include"mcd.h" +#if 1 void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p) { int i; word *w; + char lower_form[100]; for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){ w = word_buffer_get_word_n(bf, i); if(word_get_signature(w) != -1) break; w->signature = form2pos_get_signature(f2p, w->form); + if(w->signature == -1){ + strcpy(lower_form, w->form); + to_lower_string(lower_form); + w->signature = form2pos_get_signature(f2p, lower_form); + } + } +} +#endif + +#if 0 +void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p, dico *dico_pos) +{ + int i; + word *w; + int signature; + char *pos; + for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){ + w = word_buffer_get_word_n(bf, i); + if(word_get_signature(w) != -1) break; + signature = form2pos_get_signature(f2p, w->form); + w->signature = signature; + if(form2pos_word_is_non_ambiguous(f2p, w->form, &pos)){ + /* printf("%s non ambigu code = %d \n", pos, dico_string2int(dico_pos, pos)); */ + word_set_pos(w, dico_string2int(dico_pos, pos)); + + } + } +} +#endif + +void print_word(word *w, mcd *mcd_struct, dico *dico_pos, int postag) +{ + char *buffer = NULL; + char *token = NULL; + int col_nb = 0; + if(mcd_get_pos_col(mcd_struct) == -1){ + printf("%s\t%s\n", w->input, dico_int2string(dico_pos, postag)); + } + else{ + buffer = strdup(w->input); + token = strtok(buffer, "\t"); + col_nb = 0; + while(token){ + if(col_nb != 0) printf("\t"); + if(col_nb == mcd_get_pos_col(mcd_struct)) + printf("%s", dico_int2string(dico_pos, postag)); + else + word_print_col_n(stdout, w, col_nb); + col_nb++; + token = strtok(NULL, "\t"); + } + if(col_nb <= mcd_get_pos_col(mcd_struct)) + printf("\t%s", dico_int2string(dico_pos, postag)); + printf("\n"); + free(buffer); } } @@ -30,32 +90,48 @@ void simple_decoder_tagger(context *ctx) feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); int postag; float max; - word *w; + word *b0; dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); - int res; + c = config_new(f, ctx->mcd_struct, 5); - while(1){ + while(!config_is_terminal(c)){ if(ctx->f2p) - add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); - /* config_print(stdout, c); */ - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); - - /* feat_vec_print(stdout, fv); */ - postag = feature_table_argmax(fv, ft, &max); - /* printf("postag = %d\n", postag); */ - - w = word_buffer_b0(c->bf); - printf("%s\t%s\n", w->input, dico_int2string(dico_pos, postag)); + /* add_signature_to_words_in_word_buffer(c->bf, ctx->f2p, dico_pos); */ + add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); + + b0 = word_buffer_b0(c->bf); + postag = word_get_pos(b0); + + if(ctx->debug_mode){ + fprintf(stderr, "***********************************\n"); + config_print(stderr, c); + } - res = movement_tagger(c, postag, max, 1); + /* if postag is not specified in input it is predicted */ + if(postag == -1){ + /* config_print(stdout, c); */ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + + /* feat_vec_print(stdout, fv); */ + postag = feature_table_argmax(fv, ft, &max); + /* printf("postag = %d\n", postag); */ - /* printf(" current index = %d nb elem = %d\n", c->bf->current_index, c->bf->nbelem); */ + if(ctx->debug_mode){ + vcode *vcode_array = feature_table_get_vcode_array(fv, ft); + for(int i=0; i < 3; i++){ + fprintf(stderr, "%d\t", i); + fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score); + } + free(vcode_array); + } + } - if(res == 0) break; - } + print_word(b0, ctx->mcd_struct, dico_pos, postag); + + movement_tagger(c, postag); + } /* config_print(stdout, c); */ - /* config_free(c); */ + config_free(c); } - diff --git a/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c b/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c index 4b6aac10052fa34c8247075fcd9bdee71e407ec6..15b8767ae0b1a70b3f1f1d88f374995166131a92 100644 --- a/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c +++ b/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c @@ -63,7 +63,7 @@ void simple_decoder_tagparser_arc_eager(context *ctx) feat_vec *fv = feat_vec_new(feature_types_nb); config *c = NULL; int result; - float entropy; + /* float entropy; */ /* float delta; */ int argmax1, argmax2; float max1, max2; @@ -99,6 +99,19 @@ void simple_decoder_tagparser_arc_eager(context *ctx) if(ctx->debug_mode){ fprintf(stdout, "***********************************\n"); config_print(stdout, c); + + vcode *vcode_array = feature_table_get_vcode_array(fv, ft); + + for(int i=0; i < 5; i++){ + printf("%d\t", i); + movement_tagparser_print(stdout, vcode_array[i].class_code, ctx->dico_labels, ctx->dico_postags); + printf("\t%.4f\n", vcode_array[i].score); + } + free(vcode_array); + + +#if 0 + entropy = feature_table_entropy(fv, ft); /* delta = feature_table_diff_scores(fv, ft); */ feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2); @@ -113,11 +126,17 @@ void simple_decoder_tagparser_arc_eager(context *ctx) printf("entropy = %f\n",entropy); /* movement_tagparser_print(stdout, mvt_code, ctx->dico_labels); */ +#endif } result = 0; switch(mvt_type){ case MVT_TAGPARSER_POSTAG : result = movement_tagparser_add_pos(c, mvt_label); + /* if(result){ + int code_pos = word_get_pos(word_buffer_b0(config_get_buffer(c))); + int code_form = word_get_form(word_buffer_b0(config_get_buffer(c))); + printf("code pos = %d code form = %d\n", code_pos, code_form); + }*/ break; case MVT_TAGPARSER_LEFT : result = movement_tagparser_left_arc(c, mvt_label); diff --git a/perceptron/exec/cff_cutoff.c b/perceptron/exec/cff_cutoff.c index 029701b697e887715a99fdf1d47a67cbd908cabb..64ad7e05231568704e16ee14fdb9602843217c8a 100644 --- a/perceptron/exec/cff_cutoff.c +++ b/perceptron/exec/cff_cutoff.c @@ -174,9 +174,9 @@ int main(int argc, char *argv[]) fprintf(stderr, "after thresholding : %d\n", n_feat - feat_removed); fprintf(stderr, "ratio : %.3f\n\n", (float)(n_feat - feat_removed) / n_feat); - fprintf(stderr, "total number of feature occurrences : %d\n", f_occ); + /* fprintf(stderr, "total number of feature occurrences : %d\n", f_occ); fprintf(stderr, "atfer thresholding : %d\n", f_occ - occ_removed); - fprintf(stderr, "ratio : %.3f\n", (float)(f_occ - occ_removed) / f_occ); + fprintf(stderr, "ratio : %.3f\n", (float)(f_occ - occ_removed) / f_occ);*/ dico_vec_replace_dico(ctx->vocabs, old_d_feat, new_d_feat); diff --git a/perceptron/lib/src/feature_table.c b/perceptron/lib/src/feature_table.c index 82c6be2c5d88bb2cb3e6f4525b0084f7870b37dc..99ac711a528f01a306a22bc921e9e5f3f2c5b0c2 100644 --- a/perceptron/lib/src/feature_table.c +++ b/perceptron/lib/src/feature_table.c @@ -194,6 +194,7 @@ float feature_table_argmax_1_2(feat_vec *fv, feature_table *ft, int *argmax1, fl return (*max1 - *max2); } + float feature_table_entropy(feat_vec *fv, feature_table *ft) { float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float));