diff --git a/maca_common/src/dico.c b/maca_common/src/dico.c index eb0a1cd3a8555d67bbe47f4405807d7d0cc47ba0..01a18da8cd0af84670ff89facb68ead405d1bd65 100644 --- a/maca_common/src/dico.c +++ b/maca_common/src/dico.c @@ -157,8 +157,8 @@ dico *dico_extract_from_corpus(char *filename, int column, char *dico_name) while(fgets(buffer, 10000, f)){ if(feof(f)) return NULL; /* no more words to read */ - if((buffer[0] == '\n') || (buffer[0] == ' ')) continue; - if(buffer[strlen(buffer) - 1] == '\n') + if((buffer[0] == '\n') || (buffer[0] == '\r') ||(buffer[0] == ' ')) continue; + while(buffer[strlen(buffer) - 1] == '\n' || buffer[strlen(buffer) - 1] == '\r') /* strip every trailing LF/CR so CRLF lines are fully cleaned */ buffer[strlen(buffer) - 1] = '\0'; token = strtok(buffer, "\t"); /* get index */ column_nb = 0; diff --git a/maca_common/src/feat_model.c b/maca_common/src/feat_model.c index 4bf28ca9c1777129708bd26cb5f66cb3a7b3d288..5279e687bc7375e3b6b717ee12991e8702d1fc50 100644 --- a/maca_common/src/feat_model.c +++ b/maca_common/src/feat_model.c @@ -67,16 +67,16 @@ feat_model *feat_model_read(char *filename, feat_lib *fl, int verbose) while(fgets(buffer, 1000, f)){ if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == '#')) continue; + if((buffer[0] == '\n') || (buffer[0] == '\r') || (buffer[0] == '#')) continue; if(verbose) fprintf(stderr, "%d", feature_number + 1); fd = feat_desc_new(); - feat_name = strtok(buffer, " \n"); + feat_name = strtok(buffer, " \n\r"); do{ if(verbose) fprintf(stderr, "\t%s", feat_name); sfd = feat_lib_get_simple_feat_desc(fm->fl, feat_name); if(sfd) feat_desc_add(fd, sfd); - }while((feat_name = strtok(NULL, " \n"))); + }while((feat_name = strtok(NULL, " \n\r"))); if(verbose) fprintf(stderr, "\n"); feat_model_add(fm, fd); feature_number++; diff --git a/maca_common/src/hash.c b/maca_common/src/hash.c index 9b8ccf899fa853258c4a4c368727ce7fa2b02c92..2f80f2fbf0a15484af9b92cba6e944f8ab195b2b 100644 --- a/maca_common/src/hash.c +++ b/maca_common/src/hash.c @@ -60,6 +60,9 @@ cell *hash_lookup(hash *h, char 
*key) for(c=h->array[index]; c; c = c->next) if(!strcmp(key, c->key)) return c; + + //printf("<key:%s>\n",key); + return NULL; } diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index 69117407b18c2303402578a5499d18685f0fb30d..d288f33142e86db06b32a75cda9820fcf38bb8dd 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -117,7 +117,7 @@ int mcd_max_column_index_in_file(char *mcd_filename) while(fgets(buffer, 1000, f)){ line_number++; if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == '#')) continue; + if((buffer[0] == '\n') || (buffer[0] == '\r') ||(buffer[0] == '#')) continue; fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename); if(fields_number != 4){ fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); @@ -183,7 +183,7 @@ mcd *mcd_read(char *mcd_filename, int verbose) while(fgets(buffer, 1000, f)){ line_number++; if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == '#')) continue; + if((buffer[0] == '\n') || (buffer[0] == '\r') ||(buffer[0] == '#')) continue; fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename); if(fields_number != 4){ /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */ diff --git a/maca_common/src/sentence.c b/maca_common/src/sentence.c index a97486b02c565d33b0b87073e005d3ca0834c71e..f82a0a16321788adf958adda72fdf080b741c7a3 100644 --- a/maca_common/src/sentence.c +++ b/maca_common/src/sentence.c @@ -73,7 +73,7 @@ sentence *sentence_read(FILE *f, mcd *mcd_struct) while(fgets(buffer, 1000, f)){ /* printf("buffer = %s\n", buffer); */ if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence indicated by empty line */ + if((buffer[0] == '\n') || (buffer[0] == '\r') || (buffer[0] == ' ')) break; /* end of the sentence indicated by empty line */ w = word_parse_buffer(buffer, mcd_struct); if(w) 
sentence_add_word(s, w); if(word_is_eos(w, mcd_struct)) break; @@ -95,7 +95,7 @@ sentence *sentence_read_no_dummy_word(FILE *f, mcd *mcd_struct) while(fgets(buffer, 1000, f)){ if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence */ + if((buffer[0] == '\n') || (buffer[0] == '\r') || (buffer[0] == ' ')) break; /* end of the sentence */ w = word_parse_buffer(buffer, mcd_struct); sentence_add_word(s, w); } diff --git a/maca_common/src/word.c b/maca_common/src/word.c index 21e303e844bc5e7fb9030a56995420bf55504a3f..3e3177ebed53b479ddc4c9c3405b8a96d2152413 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -57,7 +57,8 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct) int col = 0; /* remove newline from buffer */ - if(buffer[strlen(buffer)-1] == '\n') buffer[strlen(buffer)-1] = '\0'; + if(buffer[0] && buffer[strlen(buffer)-1] == '\n') buffer[strlen(buffer)-1] = '\0'; + if(buffer[0] && buffer[strlen(buffer)-1] == '\r') buffer[strlen(buffer)-1] = '\0'; /* then the CR of a CRLF pair; guards avoid reading before the buffer */ w = word_new(buffer); token = strtok(buffer, "\t"); diff --git a/maca_common/src/word_buffer.c b/maca_common/src/word_buffer.c index e9355d50e3fc8d271b13205dbbcd0f7678a4f65d..50fce208ff986cd78a9f8dd0b4a5a5aabba904de 100644 --- a/maca_common/src/word_buffer.c +++ b/maca_common/src/word_buffer.c @@ -145,7 +145,7 @@ int word_buffer_read_sentence(word_buffer *wb) int index = 1; while(fgets(buffer, 10000, word_buffer_get_input_file(wb))){ - if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')) continue; /* ignore empty lines */ + if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t') || (buffer[0] == '\r')) continue; /* ignore empty lines */ if(feof(word_buffer_get_input_file(wb))) break; w = word_parse_buffer(buffer, word_buffer_get_mcd(wb)); word_set_index(w, index); diff --git a/maca_common/src/word_emb.c b/maca_common/src/word_emb.c index b94743e97e812de6987adbfb39db065407a680cb..378809775aa0ebb640250de9fdd1a87eccf18914 
100644 --- a/maca_common/src/word_emb.c +++ b/maca_common/src/word_emb.c @@ -69,7 +69,7 @@ word_emb *word_emb_load_w2v_file(char *file_name) while (1) { word[a] = fgetc(f); if (feof(f) || (word[a] == ' ')) break; - if ((a < w2v_max_w) && (word[a] != '\n')) a++; + if ((a < w2v_max_w) && (word[a] != '\n') && (word[a] != '\r')) a++; } word[a] = 0; hash_add(we->htable, word, word_nb++); @@ -126,7 +126,7 @@ int word_emb_number_of_columns_in_file(char *filename) fgets(buffer, 10000, f); token = strtok(buffer, " "); column_nb = 1; - while((token = strtok(NULL , " \n"))) + while((token = strtok(NULL , " \n\r"))) column_nb++; fclose(f); return column_nb; diff --git a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c index d709f702b9ed113714bbac742b3634277e0b3f10..189c70f6e74006340a26335cf95d047a2834ce5b 100644 --- a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c @@ -120,7 +120,6 @@ int main(int argc, char *argv[]) ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); - if(ctx->mode == TRAIN_MODE){ mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); @@ -130,6 +129,7 @@ int main(int argc, char *argv[]) mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); } + feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); /* in train mode create feature dictionnary for perceptron */ diff --git a/maca_trans_parser/src/queue.c b/maca_trans_parser/src/queue.c index 4e487004e5f2e21caa9725a04cd1c5630442fd20..90af92621ee30c792d334a03bc5a86b8f4cab497 100644 --- a/maca_trans_parser/src/queue.c +++ b/maca_trans_parser/src/queue.c @@ -21,7 +21,7 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct) while(fgets(buffer, 10000, f)){ if(feof(f)) break; /* fprintf(stderr, "%s", buffer); */ - if((buffer[0] == '\n') || (buffer[0] == ' ') || 
(buffer[0] == '\t')) break; /* end of the sentence */ + if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t') || (buffer[0] == '\r')) break; /* end of the sentence */ w = word_parse_buffer(buffer, mcd_struct); word_set_index(w, index); index++; diff --git a/perceptron/exec/cff_cutoff.c b/perceptron/exec/cff_cutoff.c index 64ad7e05231568704e16ee14fdb9602843217c8a..4fa8e757bfb0c10d9832c4e1d2097d2deebb3b2e 100644 --- a/perceptron/exec/cff_cutoff.c +++ b/perceptron/exec/cff_cutoff.c @@ -27,6 +27,7 @@ cff_cutoff_context *cff_cutoff_context_new(void) { cff_cutoff_context *ctx = (cff_cutoff_context *)memalloc(sizeof(cff_cutoff_context)); + ctx->help = 0; ctx->verbose = 0; ctx->program_name = NULL; ctx->vocabs_filename = NULL; diff --git a/perceptron/lib/src/perceptron_context.c b/perceptron/lib/src/perceptron_context.c index 2013181d9db01d57d0f5db6d1f078804ae31be8f..34d0f7bb759855b914439e65b833fd6d354e1c86 100644 --- a/perceptron/lib/src/perceptron_context.c +++ b/perceptron/lib/src/perceptron_context.c @@ -19,6 +19,7 @@ perceptron_context *perceptron_context_new(void) { perceptron_context *ctx = (perceptron_context *)memalloc(sizeof(perceptron_context)); + ctx->help = 0; ctx->verbose = 0; ctx->program_name = NULL; ctx->perc_model_filename = NULL;