#include<stdio.h> #include<stdlib.h> #include<string.h> #include<unistd.h> #include<getopt.h> #include<ctype.h> #include"context.h" #include"dico.h" void maca_lemmatizer_help_message(context *ctx) { context_general_help_message(ctx); context_beam_help_message(ctx); context_conll_help_message(ctx); fprintf(stderr, "INPUT\n"); context_input_help_message(ctx); context_mcd_help_message(ctx); } void maca_lemmatizer_check_options(context *ctx){ if(ctx->help ){ maca_lemmatizer_help_message(ctx); exit(1); } } void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx) { char absolute_filename[500]; if(!ctx->fplm_filename){ strcpy(absolute_filename, ctx->maca_data_path); strcat(absolute_filename, DEFAULT_FPLM_FILENAME); ctx->fplm_filename = strdup(absolute_filename); } if(ctx->verbose){ fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename); } } char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, int *lemma_array_size) { char form[1000]; char pos[1000]; char lemma[1000]; char morpho[1000]; int num = 0; char **lemma_array; //int lemma_array_size = 10000; *lemma_array_size = 10000; char buffer[10000]; int fields_nb; FILE *f= myfopen(fplm_filename, "r"); lemma_array = (char **)memalloc((*lemma_array_size) * sizeof(char *)); while(fgets(buffer, 10000, f)){ fields_nb = sscanf(buffer, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho); /* if(!strcmp(form, "d")) */ /* fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); */ if(fields_nb != 4){ if(debug_mode){ fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); fprintf(stderr, "incorrect fplm entry, skipping it\n"); } continue; } strcat(form, "/"); strcat(form, pos); // TODO: memory leak: if form is already in the hash, it is not added and the memory // allocated by strdup() is leaked // solutions: hash_add does the strdup() if necessary (check else where !) // or return code to indicate whether form has been added or not hash_add(form_pos_ht, strdup(form), num); if(num >= *lemma_array_size){ *lemma_array_size = 2 * (*lemma_array_size) + 1; lemma_array = realloc(lemma_array, (*lemma_array_size) * sizeof(char *)); // initialize in order to be able to free correctly and the end for(int i=num; i<*lemma_array_size; ++i) { lemma_array[i] = NULL; } } /* if(lemma_array[num] == NULL) */ lemma_array[num] = strdup(lemma); num++; } /* fprintf(stderr, "%d entries loaded\n", num); */ fclose(f); return lemma_array; } char *lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose) { char form_pos[1000]; int index_form_pos; strcpy(form_pos, form); strcat(form_pos, "/"); strcat(form_pos, pos); index_form_pos = hash_get_val(form_pos_ht, form_pos); if(index_form_pos != HASH_INVALID_VAL) /* couple form/pos found in the hash table */ return lemma_array[index_form_pos]; strcpy(form_pos, form); to_lower_string(form_pos); /* change form to lower case and look it up again */ strcat(form_pos, "/"); strcat(form_pos, pos); index_form_pos = hash_get_val(form_pos_ht, form_pos); if(index_form_pos != HASH_INVALID_VAL) return lemma_array[index_form_pos]; /* even in lower case couple form/pos is not found, return the form as lemma */ if(verbose) fprintf(stderr, "cannot find an entry for %s %s\n", form, pos); return form; } /* a bit messy */ void print_word(word *w, mcd *mcd_struct, char *lemma) { char *buffer = NULL; char *token = NULL; int col_nb = 0; if(mcd_get_lemma_col(mcd_struct) == -1){ printf("%s\t%s\n", w->input, lemma); } else{ buffer = strdup(w->input); token = strtok(buffer, "\t"); col_nb = 0; while(token){ if(col_nb != 0) printf("\t"); if(col_nb == mcd_get_lemma_col(mcd_struct)) printf("%s", lemma); else word_print_col_n(stdout, w, col_nb); col_nb++; token = strtok(NULL, "\t"); } if(col_nb <= mcd_get_lemma_col(mcd_struct)) printf("\t%s", lemma); printf("\n"); free(buffer); } } int main(int argc, char *argv[]) { context *ctx = context_read_options(argc, argv); hash *form_pos_ht = hash_new(1000000); char **lemma_array = NULL; word *b0; char lemma[200]; char form[200]; char pos[200]; config *c; maca_lemmatizer_check_options(ctx); maca_lemmatizer_set_linguistic_resources_filenames(ctx); int lemma_array_size; lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode, &lemma_array_size); FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; c = config_new(f, ctx->mcd_struct, 5); while(!config_is_terminal(c)){ b0 = word_buffer_b0(c->bf); word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct)); word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct)); word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct)); /* if lemma is not specified in input it is looked up */ if(strlen(lemma) && strcmp(lemma, "_")) print_word(b0, ctx->mcd_struct, lemma); else print_word(b0, ctx->mcd_struct, lookup_lemma(form, pos, form_pos_ht, lemma_array, ctx->verbose)); word_buffer_move_right(c->bf); } hash_free(form_pos_ht); for(int i=0; i<lemma_array_size; ++i) { if (lemma_array[i]) free(lemma_array[i]); } free(lemma_array); config_free(c); if (ctx->input_filename) fclose(f); context_free(ctx); return 0; }