#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include<ctype.h>

#include"context.h"
#include"dico.h"

/*
 * Print the usage text of the lemmatizer by calling, in order, the general,
 * beam and conll help printers, then an "INPUT" heading on stderr followed by
 * the input and mcd help printers.
 */
void maca_lemmatizer_help_message(context *ctx)
{
  /* option groups shared with the other tools */
  context_general_help_message(ctx);
  context_beam_help_message(ctx);
  context_conll_help_message(ctx);

  /* input related options, introduced by their own heading */
  fputs("INPUT\n", stderr);
  context_input_help_message(ctx);
  context_mcd_help_message(ctx);
}

/*
 * Inspect the parsed options. When help was requested (ctx->help is set),
 * print the usage text and terminate the process with exit status 1;
 * otherwise return without doing anything.
 */
void maca_lemmatizer_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  maca_lemmatizer_help_message(ctx);
  exit(1);
}

/*
 * Make sure ctx->fplm_filename is set.
 *
 * When no fplm file name is present in the context, a default one is built by
 * appending DEFAULT_FPLM_FILENAME to ctx->maca_data_path. The resulting string
 * is heap allocated and stored in ctx->fplm_filename. In verbose mode the
 * file name in use is reported on stderr.
 *
 * The name is assembled in a buffer sized from the two components, so an
 * arbitrarily long data path cannot overflow a fixed-size stack buffer.
 * The process exits with status 1 if the allocation fails.
 */
void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx)
{
  if(!ctx->fplm_filename){
    size_t path_len = strlen(ctx->maca_data_path);
    size_t name_len = strlen(DEFAULT_FPLM_FILENAME);
    char *absolute_filename = malloc(path_len + name_len + 1);

    if(!absolute_filename){
      fprintf(stderr, "cannot allocate memory for the fplm file name\n");
      exit(1);
    }

    memcpy(absolute_filename, ctx->maca_data_path, path_len);
    /* name_len + 1 also copies the terminating NUL */
    memcpy(absolute_filename + path_len, DEFAULT_FPLM_FILENAME, name_len + 1);
    ctx->fplm_filename = absolute_filename;
  }

  if(ctx->verbose){
    fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename);
  }
}

/*
 * Load an fplm lexicon file (one "form<TAB>pos<TAB>lemma<TAB>morpho" entry
 * per line).
 *
 * For every well formed entry, the key "form/pos" is added to form_pos_ht
 * with the entry number as value, and a copy of the lemma is stored at that
 * index of the returned array. Lines that do not yield four fields are
 * skipped (and reported on stderr when debug_mode is set).
 *
 * fplm_filename    file to read, opened with myfopen()
 * form_pos_ht      hash table receiving the "form/pos" -> index mapping
 * debug_mode       non zero to report skipped entries
 * lemma_array_size out: number of slots of the returned array (not the
 *                  number of entries read); unused slots are NULL
 *
 * Returns the array of lemmas. Every non NULL slot was allocated by strdup().
 * The process exits with status 1 if the array cannot be enlarged.
 */
char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, int *lemma_array_size)
{
  /* Every field is a substring of one line, so buffers as large as the line
     buffer cannot be overflowed by sscanf(). */
  enum { LINE_SIZE = 10000 };
  char buffer[LINE_SIZE];
  char form[LINE_SIZE];
  char pos[LINE_SIZE];
  char lemma[LINE_SIZE];
  char morpho[LINE_SIZE];
  char form_pos[2 * LINE_SIZE]; /* room for form + "/" + pos */
  int num = 0;
  char **lemma_array;
  FILE *f = myfopen(fplm_filename, "r");

  *lemma_array_size = 10000;
  lemma_array = (char **)memalloc((*lemma_array_size) * sizeof(char *));
  /* initialize every slot so that unused ones can be passed to free() */
  for(int i = 0; i < *lemma_array_size; ++i){
    lemma_array[i] = NULL;
  }

  while(fgets(buffer, sizeof buffer, f)){
    /* reset the fields: sscanf() leaves unmatched ones untouched and they
       are printed below when the entry is rejected */
    form[0] = pos[0] = lemma[0] = morpho[0] = '\0';

    int fields_nb = sscanf(buffer, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho);
    if(fields_nb != 4){
      if(debug_mode){
        fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma);
        fprintf(stderr, "incorrect fplm entry, skipping it\n");
      }
      continue;
    }

    snprintf(form_pos, sizeof form_pos, "%s/%s", form, pos);
    // TODO: memory leak: if the key is already in the hash, it is not added and the memory
    // allocated by strdup() is leaked
    // solutions: hash_add does the strdup() if necessary (check elsewhere!)
    // or return code to indicate whether the key has been added or not
    hash_add(form_pos_ht, strdup(form_pos), num);

    if(num >= *lemma_array_size){
      int new_size = 2 * (*lemma_array_size) + 1;
      /* keep the old pointer until the reallocation is known to have succeeded */
      char **grown = realloc(lemma_array, (size_t)new_size * sizeof(char *));

      if(!grown){
        fprintf(stderr, "cannot allocate memory for %d lemmas\n", new_size);
        exit(1);
      }
      lemma_array = grown;
      /* initialize the new slots in order to be able to free correctly at the end */
      for(int i = *lemma_array_size; i < new_size; ++i){
        lemma_array[i] = NULL;
      }
      *lemma_array_size = new_size;
    }

    lemma_array[num] = strdup(lemma);
    num++;
  }
  fclose(f);
  return lemma_array;
}


/*
 * Find the lemma associated with the couple form/pos.
 *
 * The key "form/pos" is looked up in form_pos_ht; when it is absent, the
 * form is converted with to_lower_string() and the lookup is tried again.
 * A successful lookup yields an index into lemma_array.
 *
 * Returns the entry of lemma_array for the couple, or form itself when no
 * entry is found (a message is then written on stderr if verbose is set).
 * The returned pointer is owned by the caller's data (lemma_array or form),
 * never by this function.
 *
 * The key buffer is sized from the actual lengths of form and pos: short keys
 * use a stack buffer, longer ones a temporary heap buffer, so that no input
 * can overflow it.
 */
char *lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose)
{
  char stack_key[1000];
  char *key = stack_key;
  char *heap_key = NULL;
  char *result = form;
  size_t form_len = strlen(form);
  size_t pos_len = strlen(pos);
  size_t key_size = form_len + 1 + pos_len + 1; /* form + '/' + pos + NUL */
  int index_form_pos = HASH_INVALID_VAL;

  if(key_size > sizeof stack_key){
    heap_key = malloc(key_size);
    if(heap_key)
      key = heap_key;
    else
      key = NULL; /* no buffer available: the couple is treated as unknown */
  }

  if(key){
    snprintf(key, key_size, "%s/%s", form, pos);
    index_form_pos = hash_get_val(form_pos_ht, key);

    if(index_form_pos == HASH_INVALID_VAL){
      /* change form to lower case and look it up again */
      memcpy(key, form, form_len + 1);
      to_lower_string(key);
      size_t lowered_len = strlen(key);
      /* only append when the converted form still fits in the buffer */
      if(lowered_len <= form_len){
        key[lowered_len] = '/';
        memcpy(key + lowered_len + 1, pos, pos_len + 1);
        index_form_pos = hash_get_val(form_pos_ht, key);
      }
    }
  }

  if(index_form_pos != HASH_INVALID_VAL){
    /* couple form/pos found in the hash table */
    result = lemma_array[index_form_pos];
  }
  else if(verbose){
    /* even in lower case the couple is not found, the form is used as lemma */
    fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
  }

  free(heap_key);
  return result;
}

/*
 * Print word w on stdout, followed by a newline, with its lemma set to lemma.
 *
 * When the mcd has no lemma column (mcd_get_lemma_col() returns -1), the raw
 * input line of the word is printed followed by a tab and the lemma.
 * Otherwise one column is printed for every non empty tab separated field of
 * w->input: the lemma column is replaced by lemma, the other ones are printed
 * with word_print_col_n(). If the input has no more columns than the index of
 * the lemma column, the lemma is appended as an extra last column.
 *
 * The fields of w->input are only counted, so the line is scanned in place
 * (maximal runs of non tab characters, as strtok() would split it) instead of
 * being duplicated and cut with strtok(): no allocation that may fail, and no
 * hidden tokenizer state kept across the calls to word_print_col_n().
 */
void print_word(word *w, mcd *mcd_struct, char *lemma)
{
  int lemma_col = mcd_get_lemma_col(mcd_struct);
  const char *cursor = w->input;
  int col_nb = 0;

  if(lemma_col == -1){
    printf("%s\t%s\n", w->input, lemma);
    return;
  }

  for(;;){
    cursor += strspn(cursor, "\t");  /* skip separators, empty fields do not count */
    if(*cursor == '\0')
      break;

    if(col_nb != 0)
      printf("\t");
    if(col_nb == lemma_col)
      printf("%s", lemma);
    else
      word_print_col_n(stdout, w, col_nb);
    col_nb++;

    cursor += strcspn(cursor, "\t"); /* jump to the end of the current field */
  }

  /* the input stops before the lemma column: append the lemma */
  if(col_nb <= lemma_col)
    printf("\t%s", lemma);
  printf("\n");
}


/*
 * Entry point of the lemmatizer.
 *
 * Reads the options, loads the fplm lexicon, then reads words from the input
 * file (or stdin when no input file name is given) and prints each of them
 * with its lemma: the lemma found in the input when there is one different
 * from "_", otherwise the one returned by lookup_lemma().
 */
int main(int argc, char *argv[])
{
  context *ctx = context_read_options(argc, argv);
  char **lemma_array = NULL;
  int lemma_array_size;
  word *b0;
  char lemma[200];
  char form[200];
  char pos[200];
  config *c;

  /* may print the help message and exit, hence done before any loading */
  maca_lemmatizer_check_options(ctx);
  maca_lemmatizer_set_linguistic_resources_filenames(ctx);

  hash *form_pos_ht = hash_new(1000000);
  lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode, &lemma_array_size);

  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;

  c = config_new(f, ctx->mcd_struct, 5);

  while(!config_is_terminal(c)){
    b0 = word_buffer_b0(c->bf);

    /* start from empty strings: the buffers are read below even if
       word_sprint_col_n() writes nothing for a column absent from the mcd
       (NOTE(review): behaviour of word_sprint_col_n() for such a column is
       not visible here) */
    lemma[0] = '\0';
    form[0] = '\0';
    pos[0] = '\0';

    word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
    word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
    word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));

    /* if lemma is not specified in input it is looked up */
    if(strlen(lemma) && strcmp(lemma, "_"))
      print_word(b0, ctx->mcd_struct, lemma);
    else
      print_word(b0, ctx->mcd_struct, lookup_lemma(form, pos, form_pos_ht, lemma_array, ctx->verbose));

    word_buffer_move_right(c->bf);
  }

  hash_free(form_pos_ht);

  /* free(NULL) is a no-op, unused slots need no special case */
  for(int i = 0; i < lemma_array_size; ++i){
    free(lemma_array[i]);
  }
  free(lemma_array);

  config_free(c);
  /* stdin is not ours to close */
  if(ctx->input_filename) fclose(f);
  context_free(ctx);

  return 0;
}