From 33d5af4913640314f00001b6f386386050d8654a Mon Sep 17 00:00:00 2001 From: "robin.perrotin" <robin.perrotin@lif.univ-mrs.fr> Date: Wed, 22 Nov 2017 03:37:55 +0100 Subject: [PATCH] solved issues with '\r' eol when running on windows --- maca_common/src/dico.c | 4 ++-- maca_common/src/feat_model.c | 6 +++--- maca_common/src/hash.c | 3 +++ maca_common/src/mcd.c | 4 ++-- maca_common/src/sentence.c | 4 ++-- maca_common/src/word.c | 3 ++- maca_common/src/word_buffer.c | 2 +- maca_common/src/word_emb.c | 4 ++-- maca_trans_parser/src/maca_trans_tagger_mcf2cff.c | 2 +- maca_trans_parser/src/queue.c | 2 +- perceptron/exec/cff_cutoff.c | 1 + perceptron/lib/src/perceptron_context.c | 1 + 12 files changed, 21 insertions(+), 15 deletions(-) diff --git a/maca_common/src/dico.c b/maca_common/src/dico.c index eb0a1cd..01a18da 100644 --- a/maca_common/src/dico.c +++ b/maca_common/src/dico.c @@ -157,8 +157,8 @@ dico *dico_extract_from_corpus(char *filename, int column, char *dico_name) while(fgets(buffer, 10000, f)){ if(feof(f)) return NULL; /* no more words to read */ - if((buffer[0] == '\n') || (buffer[0] == ' ')) continue; - if(buffer[strlen(buffer) - 1] == '\n') + if((buffer[0] == '\n') || (buffer[0] == '\r') ||(buffer[0] == ' ')) continue; + if(buffer[strlen(buffer) - 1] == '\n' || buffer[strlen(buffer) - 1] == '\r') buffer[strlen(buffer) - 1] = '\0'; token = strtok(buffer, "\t"); /* get index */ column_nb = 0; diff --git a/maca_common/src/feat_model.c b/maca_common/src/feat_model.c index 4bf28ca..5279e68 100644 --- a/maca_common/src/feat_model.c +++ b/maca_common/src/feat_model.c @@ -67,16 +67,16 @@ feat_model *feat_model_read(char *filename, feat_lib *fl, int verbose) while(fgets(buffer, 1000, f)){ if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == '#')) continue; + if((buffer[0] == '\n') || (buffer[0] == '\r') || (buffer[0] == '#')) continue; if(verbose) fprintf(stderr, "%d", feature_number + 1); fd = feat_desc_new(); - feat_name = strtok(buffer, " \n"); + 
feat_name = strtok(buffer, " \n\r"); do{ if(verbose) fprintf(stderr, "\t%s", feat_name); sfd = feat_lib_get_simple_feat_desc(fm->fl, feat_name); if(sfd) feat_desc_add(fd, sfd); - }while((feat_name = strtok(NULL, " \n"))); + }while((feat_name = strtok(NULL, " \n\r"))); if(verbose) fprintf(stderr, "\n"); feat_model_add(fm, fd); feature_number++; diff --git a/maca_common/src/hash.c b/maca_common/src/hash.c index 9b8ccf8..2f80f2f 100644 --- a/maca_common/src/hash.c +++ b/maca_common/src/hash.c @@ -60,6 +60,9 @@ cell *hash_lookup(hash *h, char *key) for(c=h->array[index]; c; c = c->next) if(!strcmp(key, c->key)) return c; + + //printf("<key:%s>\n",key); + return NULL; } diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index 6911740..d288f33 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -117,7 +117,7 @@ int mcd_max_column_index_in_file(char *mcd_filename) while(fgets(buffer, 1000, f)){ line_number++; if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == '#')) continue; + if((buffer[0] == '\n') || (buffer[0] == '\r') ||(buffer[0] == '#')) continue; fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename); if(fields_number != 4){ fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); @@ -183,7 +183,7 @@ mcd *mcd_read(char *mcd_filename, int verbose) while(fgets(buffer, 1000, f)){ line_number++; if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == '#')) continue; + if((buffer[0] == '\n') || (buffer[0] == '\r') ||(buffer[0] == '#')) continue; fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename); if(fields_number != 4){ /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */ diff --git a/maca_common/src/sentence.c b/maca_common/src/sentence.c index a97486b..f82a0a1 100644 --- a/maca_common/src/sentence.c +++ b/maca_common/src/sentence.c @@ -73,7 +73,7 @@ sentence 
*sentence_read(FILE *f, mcd *mcd_struct) while(fgets(buffer, 1000, f)){ /* printf("buffer = %s\n", buffer); */ if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence indicated by empty line */ + if((buffer[0] == '\n') || (buffer[0] == '\r') || (buffer[0] == ' ')) break; /* end of the sentence indicated by empty line */ w = word_parse_buffer(buffer, mcd_struct); if(w) sentence_add_word(s, w); if(word_is_eos(w, mcd_struct)) break; @@ -95,7 +95,7 @@ sentence *sentence_read_no_dummy_word(FILE *f, mcd *mcd_struct) while(fgets(buffer, 1000, f)){ if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence */ + if((buffer[0] == '\n') || (buffer[0] == '\r') || (buffer[0] == ' ')) break; /* end of the sentence */ w = word_parse_buffer(buffer, mcd_struct); sentence_add_word(s, w); } diff --git a/maca_common/src/word.c b/maca_common/src/word.c index 21e303e..3e3177e 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -57,7 +57,8 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct) int col = 0; /* remove newline from buffer */ - if(buffer[strlen(buffer)-1] == '\n') buffer[strlen(buffer)-1] = '\0'; + if(buffer[strlen(buffer)-1] == '\n' || buffer[strlen(buffer)-1] == '\r') buffer[strlen(buffer)-1] = '\0'; + if(buffer[strlen(buffer)-1] == '\r') buffer[strlen(buffer)-1] = '\0'; w = word_new(buffer); token = strtok(buffer, "\t"); diff --git a/maca_common/src/word_buffer.c b/maca_common/src/word_buffer.c index e9355d5..50fce20 100644 --- a/maca_common/src/word_buffer.c +++ b/maca_common/src/word_buffer.c @@ -145,7 +145,7 @@ int word_buffer_read_sentence(word_buffer *wb) int index = 1; while(fgets(buffer, 10000, word_buffer_get_input_file(wb))){ - if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')) continue; /* ignore empty lines */ + if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t') || (buffer[0] == '\r')) continue; /* ignore empty lines */ 
if(feof(word_buffer_get_input_file(wb))) break; w = word_parse_buffer(buffer, word_buffer_get_mcd(wb)); word_set_index(w, index); diff --git a/maca_common/src/word_emb.c b/maca_common/src/word_emb.c index b94743e..3788097 100644 --- a/maca_common/src/word_emb.c +++ b/maca_common/src/word_emb.c @@ -69,7 +69,7 @@ word_emb *word_emb_load_w2v_file(char *file_name) while (1) { word[a] = fgetc(f); if (feof(f) || (word[a] == ' ')) break; - if ((a < w2v_max_w) && (word[a] != '\n')) a++; + if ((a < w2v_max_w) && (word[a] != '\n') && (word[a] != '\r')) a++; } word[a] = 0; hash_add(we->htable, word, word_nb++); @@ -126,7 +126,7 @@ int word_emb_number_of_columns_in_file(char *filename) fgets(buffer, 10000, f); token = strtok(buffer, " "); column_nb = 1; - while((token = strtok(NULL , " \n"))) + while((token = strtok(NULL , " \n\r"))) column_nb++; fclose(f); return column_nb; diff --git a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c index d709f70..189c70f 100644 --- a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c @@ -120,7 +120,6 @@ int main(int argc, char *argv[]) ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); - if(ctx->mode == TRAIN_MODE){ mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); @@ -130,6 +129,7 @@ int main(int argc, char *argv[]) mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); } + feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); /* in train mode create feature dictionnary for perceptron */ diff --git a/maca_trans_parser/src/queue.c b/maca_trans_parser/src/queue.c index 4e48700..90af926 100644 --- a/maca_trans_parser/src/queue.c +++ b/maca_trans_parser/src/queue.c @@ -21,7 +21,7 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct) while(fgets(buffer, 10000, f)){ if(feof(f)) 
break; /* fprintf(stderr, "%s", buffer); */ - if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')) break; /* end of the sentence */ + if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t') || (buffer[0] == '\r')) break; /* end of the sentence */ w = word_parse_buffer(buffer, mcd_struct); word_set_index(w, index); index++; diff --git a/perceptron/exec/cff_cutoff.c b/perceptron/exec/cff_cutoff.c index 64ad7e0..4fa8e75 100644 --- a/perceptron/exec/cff_cutoff.c +++ b/perceptron/exec/cff_cutoff.c @@ -27,6 +27,7 @@ cff_cutoff_context *cff_cutoff_context_new(void) { cff_cutoff_context *ctx = (cff_cutoff_context *)memalloc(sizeof(cff_cutoff_context)); + ctx->help = 0; ctx->verbose = 0; ctx->program_name = NULL; ctx->vocabs_filename = NULL; diff --git a/perceptron/lib/src/perceptron_context.c b/perceptron/lib/src/perceptron_context.c index 2013181..34d0f7b 100644 --- a/perceptron/lib/src/perceptron_context.c +++ b/perceptron/lib/src/perceptron_context.c @@ -19,6 +19,7 @@ perceptron_context *perceptron_context_new(void) { perceptron_context *ctx = (perceptron_context *)memalloc(sizeof(perceptron_context)); + ctx->help = 0; ctx->verbose = 0; ctx->program_name = NULL; ctx->perc_model_filename = NULL; -- GitLab