diff --git a/CMakeLists.txt b/CMakeLists.txt index 766110fb2776ae8d42ae6209ee1a24e0c8974226..85d4e4bcdce693d1da0be09721efe1118d02158e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,10 @@ project(macaon2) add_definitions("-Wall") +include_directories(maca_common/include) + +add_subdirectory(maca_common) +add_subdirectory(maca_lemmatizer) add_subdirectory(maca_trans_parser) #set(CMAKE_INSTALL_PREFIX ../) diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0dbc552b3f9756143c299ffe07eb309fdbda97c --- /dev/null +++ b/maca_common/CMakeLists.txt @@ -0,0 +1,11 @@ +set(SOURCES src/util.c + src/hash.c + src/dico.c + src/word_emb.c + src/mcd.c + src/dico_vec.c + src/feat_types.c +) + +#compiling library +add_library(maca_common STATIC ${SOURCES}) diff --git a/maca_trans_parser/src/dico.h b/maca_common/include/dico.h similarity index 100% rename from maca_trans_parser/src/dico.h rename to maca_common/include/dico.h diff --git a/maca_trans_parser/src/dico_vec.h b/maca_common/include/dico_vec.h similarity index 100% rename from maca_trans_parser/src/dico_vec.h rename to maca_common/include/dico_vec.h diff --git a/maca_trans_parser/src/feat_types.h b/maca_common/include/feat_types.h similarity index 100% rename from maca_trans_parser/src/feat_types.h rename to maca_common/include/feat_types.h diff --git a/maca_trans_parser/src/hash.h b/maca_common/include/hash.h similarity index 100% rename from maca_trans_parser/src/hash.h rename to maca_common/include/hash.h diff --git a/maca_trans_parser/src/mcd.h b/maca_common/include/mcd.h similarity index 72% rename from maca_trans_parser/src/mcd.h rename to maca_common/include/mcd.h index aad932aaec50f2f4ac9346d1c6edc9783d61ae04..e759789f3667de115689dc6275f032161f3d3784 100644 --- a/maca_trans_parser/src/mcd.h +++ b/maca_common/include/mcd.h @@ -18,15 +18,19 @@ typedef struct { int nb_col; int type2col[FEAT_TYPE_NB]; - int *col2type; + /* int *col2type; */ int *type; + char **type_str; int *representation; char **filename; dico **dico_array; word_emb **word_emb_array; } mcd; -mcd *mcd_read(char *mcd_filename, char *corpus_filename, dico_vec *vocabs); +mcd *mcd_build_conll07(void); +mcd *mcd_read(char *mcd_filename); +void mcd_link_to_dico(mcd *m, dico_vec *vocabs); +void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename); void mcd_free(mcd *m); int mcd_get_code(mcd *m, char *str, int col); dico_vec *mcd_build_dico_vec(mcd *mcd_struct); diff --git a/maca_trans_parser/src/util.h b/maca_common/include/util.h similarity index 100% rename from maca_trans_parser/src/util.h rename to maca_common/include/util.h diff --git a/maca_trans_parser/src/word_emb.h b/maca_common/include/word_emb.h similarity index 100% rename from maca_trans_parser/src/word_emb.h rename to maca_common/include/word_emb.h diff --git a/maca_trans_parser/src/dico.c b/maca_common/src/dico.c similarity index 100% rename from maca_trans_parser/src/dico.c rename to maca_common/src/dico.c diff --git a/maca_trans_parser/src/dico_vec.c b/maca_common/src/dico_vec.c similarity index 100% rename from maca_trans_parser/src/dico_vec.c rename to maca_common/src/dico_vec.c diff --git a/maca_trans_parser/src/feat_types.c b/maca_common/src/feat_types.c similarity index 100% rename from maca_trans_parser/src/feat_types.c rename to maca_common/src/feat_types.c diff --git a/maca_trans_parser/src/hash.c b/maca_common/src/hash.c similarity index 100% rename from maca_trans_parser/src/hash.c rename to maca_common/src/hash.c diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c new file mode 100644 index 0000000000000000000000000000000000000000..a11cae21294ab9a28cf5f51a558d0d1bb44e35fa --- /dev/null +++ b/maca_common/src/mcd.c @@ -0,0 +1,315 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> + +#include "mcd.h" +#include "util.h" +#include "dico.h" +#include "word_emb.h" + + +mcd *mcd_new(int nb_col) +{ + mcd *m = (mcd *)memalloc(sizeof(mcd)); + int i; + m->nb_col = nb_col; + + for(i=0; i < FEAT_TYPE_NB; i++) + m->type2col[i] = -1; + + m->representation = (int *)memalloc(nb_col * sizeof(int)); + m->type = (int *)memalloc(nb_col * sizeof(int)); + m->type_str = (char **)memalloc(nb_col * sizeof(char *)); + /* m->col2type = (int *)memalloc(nb_col * sizeof(int)); */ + m->filename = (char **)memalloc(nb_col * sizeof(char *)); + m->dico_array = (dico **)memalloc(nb_col * sizeof(dico *)); + m->word_emb_array = (word_emb **)memalloc(nb_col * sizeof(word_emb *)); + + for(i=0; i < nb_col; i++){ + m->representation[i] = MCD_REPRESENTATION_NULL; + m->type[i] = -1; + m->type_str[i] = NULL; + /* m->col2type[i] = -1; */ + m->filename[i] = NULL; + m->dico_array[i] = NULL; + m->word_emb_array[i] = NULL;; + } + return m; +} + +void mcd_free(mcd *m) +{ + int i; + for(i=0; i < m->nb_col; i++){ + if(m->dico_array[i]) dico_free(m->dico_array[i]); + if(m->word_emb_array[i]) word_emb_free(m->word_emb_array[i]); + if(m->type_str[i]) free(m->type_str[i]); + } + free(m->representation); + free(m->filename); + free(m->dico_array); + free(m->word_emb_array); + free(m->type_str); + free(m->type); + free(m); +} + +int mcd_get_code(mcd *m, char *str, int col){ + if(m->representation[col] == MCD_REPRESENTATION_VOCAB) + return dico_string2int(m->dico_array[col], str); + if(m->representation[col] == MCD_REPRESENTATION_EMB) + return word_emb_get_code(m->word_emb_array[col], str); + if(m->representation[col] == MCD_REPRESENTATION_INT) + return atoi(str); + return MCD_INVALID_VALUE; +} + +int mcd_max_column_index_in_file(char *mcd_filename) +{ + int max_col = -1; + FILE *f = myfopen(mcd_filename, "r"); + char buffer[1000]; /* ugly */ + int column; + char type[100]; + char representation[100]; + char filename[500]; /* ugly */ + int fields_number; + int line_number = 0; + + while(fgets(buffer, 1000, f)){ + line_number++; + if(feof(f)) break; + if((buffer[0] == '\n') || (buffer[0] == '#')) continue; + fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); + if(fields_number != 4){ + fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); + continue; + } + if(column > max_col) max_col = column; + } + return max_col; +} + + +/* takes as argument an mcd structure (m) and the name of a corpus file (corpus_filename) */ +/* populates the vocabularies of m with values found in corpus_filename */ + +void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename) +{ + int column; + + for(column=0; column < m->nb_col; column++){ + if((m->representation[column] == MCD_REPRESENTATION_VOCAB) + /* && (strcmp(m->filename[column], "_")) */ + && (m->dico_array[column] == NULL)){ + m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, m->type_str[column]); + fprintf(stderr, "extracting dico %s from corpus\n", m->type_str[column]); + } + } +} + + +/* takes as argument an mcd structure (m) and a dictionary vector (vocabs) */ +/* links the vocabularies of m to vocabularies of vocabs (based on their names) */ + +void mcd_link_to_dico(mcd *m, dico_vec *vocabs) +{ + int column; + for(column=0; column < m->nb_col; column++){ + if((m->representation[column] == MCD_REPRESENTATION_VOCAB) + && (!strcmp(m->filename[column], "_")) + && (m->dico_array[column] == NULL)){ + m->dico_array[column] = dico_vec_get_dico(vocabs, m->type_str[column]); + fprintf(stderr, "linking to dico %s\n", m->type_str[column]); + } + } +} + +/* read an multi column description file and produces an mcd structure */ + +mcd *mcd_read(char *mcd_filename) +{ + int column; + char type[100]; + char representation[100]; + char filename[500]; /* ugly */ + int fields_number; + int line_number = 0; + char buffer[1000]; /* ugly */ + int nb_col = mcd_max_column_index_in_file(mcd_filename); + mcd *m = mcd_new(nb_col + 1); + FILE *f = myfopen(mcd_filename, "r"); + /* int first = 1; */ + + while(fgets(buffer, 1000, f)){ + line_number++; + if(feof(f)) break; + if((buffer[0] == '\n') || (buffer[0] == '#')) continue; + fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); + if(fields_number != 4){ + /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */ + continue; + } + fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename); + m->type[column] = feat_type_string2int(type); + m->type_str[column] = strdup(type); + if(m->type[column] == -1){ + fprintf(stderr, "in line %d of mcd file %s invalid type, I'm skipping it\n", line_number, mcd_filename); + continue; + } + m->type2col[m->type[column]] = column; + + if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL; + else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB; + else if(!strcmp(representation, "VOCAB")) m->representation[column] = MCD_REPRESENTATION_VOCAB; + else if(!strcmp(representation, "INT")) m->representation[column] = MCD_REPRESENTATION_INT; + else{ + fprintf(stderr, "in line %d of mcd file %s invalid mode of representation, I'm skipping it\n", line_number, mcd_filename); + m->representation[column] = MCD_REPRESENTATION_NULL; + } + if(m->representation[column] != MCD_REPRESENTATION_NULL) + m->filename[column] = strdup(filename); + + if(strcmp(m->filename[column], "_")){ + if(m->representation[column] == MCD_REPRESENTATION_EMB){ + fprintf(stderr, "loading word embedding %s\n", m->filename[column]); + m->word_emb_array[column] = word_emb_load(m->filename[column]); + } + else if(m->representation[column] == MCD_REPRESENTATION_VOCAB){ + fprintf(stderr, "loading dico %s\n", m->filename[column]); + m->dico_array[column] = dico_read(m->filename[column], 0.5); + } + } + } + fclose(f); + return m; +} + + +mcd *mcd_build_conll07(void) +{ + mcd *m = mcd_new(8); + m->type[0]=FEAT_TYPE_INDEX; + m->type_str[0]=strdup("INDEX"); + m->representation[0]= MCD_REPRESENTATION_INT; + m->type2col[FEAT_TYPE_INDEX] = 0; + + m->type[1]=FEAT_TYPE_FORM; + m->type_str[1]=strdup("FORM"); + m->representation[1]= MCD_REPRESENTATION_VOCAB; + m->type2col[FEAT_TYPE_FORM] = 1; + + m->type[2]=FEAT_TYPE_LEMMA; + m->type_str[2]=strdup("LEMMA"); + m->representation[2]= MCD_REPRESENTATION_VOCAB; + m->type2col[FEAT_TYPE_LEMMA] = 2; + + m->type[3]=FEAT_TYPE_CPOS; + m->type_str[3]=strdup("CPOS"); + m->representation[3]= MCD_REPRESENTATION_VOCAB; + m->type2col[FEAT_TYPE_CPOS] = 3; + + m->type[4]=FEAT_TYPE_POS; + m->type_str[4]=strdup("POS"); + m->representation[4]= MCD_REPRESENTATION_VOCAB; + m->type2col[FEAT_TYPE_POS] = 4; + + m->type[5]=FEAT_TYPE_FEATS; + m->type_str[5]=strdup("FEATS"); + m->representation[5]= MCD_REPRESENTATION_VOCAB; + m->type2col[FEAT_TYPE_FEATS] = 5; + + m->type[6]=FEAT_TYPE_GOV; + m->type_str[6]=strdup("GOV"); + m->representation[6]= MCD_REPRESENTATION_INT; + m->type2col[FEAT_TYPE_GOV] = 6; + + m->type[7]=FEAT_TYPE_LABEL; + m->type_str[7]=strdup("LABEL"); + m->representation[7]= MCD_REPRESENTATION_VOCAB; + m->type2col[FEAT_TYPE_LABEL] = 7; + + return m; +} + +mcd *mcd_read_old(char *mcd_filename, char *corpus_filename, dico_vec *vocabs) +{ + int column; + char type[100]; + char representation[100]; + char filename[500]; /* ugly */ + int fields_number; + int line_number = 0; + char buffer[1000]; /* ugly */ + int nb_col = mcd_max_column_index_in_file(mcd_filename); + mcd *m = mcd_new(nb_col + 1); + FILE *f = myfopen(mcd_filename, "r"); + /* int first = 1; */ + + while(fgets(buffer, 1000, f)){ + line_number++; + if(feof(f)) break; + if((buffer[0] == '\n') || (buffer[0] == '#')) continue; + fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); + if(fields_number != 4){ + /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */ + continue; + } + fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename); + m->type[column] = feat_type_string2int(type); + if(m->type[column] == -1){ + fprintf(stderr, "in line %d of mcd file %s invalid type, I'm skipping it\n", line_number, mcd_filename); + continue; + } + m->type2col[m->type[column]] = column; + /* m->col2type[column] = m->type[column]; */ + if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL; + else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB; + else if(!strcmp(representation, "VOCAB")) m->representation[column] = MCD_REPRESENTATION_VOCAB; + else if(!strcmp(representation, "INT")) m->representation[column] = MCD_REPRESENTATION_INT; + else{ + fprintf(stderr, "in line %d of mcd file %s invalid mode of representation, I'm skipping it\n", line_number, mcd_filename); + m->representation[column] = MCD_REPRESENTATION_NULL; + } + if(m->representation[column] != MCD_REPRESENTATION_NULL){ + m->filename[column] = strdup(filename); + if(m->representation[column] == MCD_REPRESENTATION_EMB){ + fprintf(stderr, "loading word embedding %s\n", m->filename[column]); + m->word_emb_array[column] = word_emb_load(m->filename[column]); + } + else if(m->representation[column] == MCD_REPRESENTATION_VOCAB){ + if(!strcmp(m->filename[column], "_")){ + if(corpus_filename){ + fprintf(stderr, "extracting dico %s from corpus\n", type); + m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, type); + } + else if(vocabs){ + fprintf(stderr, "linking to dico %s\n", type); + m->dico_array[column] = dico_vec_get_dico(vocabs, type); + } + if(m->dico_array[column] == NULL) + fprintf(stderr, "cannot find dico %s\n", type); + } + else{ + fprintf(stderr, "loading dico %s\n", m->filename[column]); + m->dico_array[column] = dico_read(m->filename[column], 0.5); + } + } + } + } + fclose(f); + return m; +} + + +dico_vec *mcd_build_dico_vec(mcd *mcd_struct) +{ + dico_vec *dv = dico_vec_new(); + int i; + for(i=0; i < mcd_struct->nb_col; i++){ + if(mcd_struct->dico_array[i]){ + dico_vec_add(dv, mcd_struct->dico_array[i]); + } + } + return dv; +} diff --git a/maca_trans_parser/src/util.c b/maca_common/src/util.c similarity index 100% rename from maca_trans_parser/src/util.c rename to maca_common/src/util.c diff --git a/maca_trans_parser/src/word_emb.c b/maca_common/src/word_emb.c similarity index 100% rename from maca_trans_parser/src/word_emb.c rename to maca_common/src/word_emb.c diff --git a/maca_lemmatizer/CMakeLists.txt b/maca_lemmatizer/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4f7e9da1fccafb07525ef1d29add9ffeb7a5a6f --- /dev/null +++ b/maca_lemmatizer/CMakeLists.txt @@ -0,0 +1,12 @@ +set(SOURCES src/context.c) + +##compiling library +include_directories(src) +add_library(maca_lemmatizer_lib STATIC ${SOURCES}) + +#compiling, linking and installing executables + +add_executable(maca_lemmatizer ./src/maca_lemmatizer.c) +target_link_libraries(maca_lemmatizer maca_lemmatizer_lib) +target_link_libraries(maca_lemmatizer maca_common) +install (TARGETS maca_lemmatizer DESTINATION bin) diff --git a/maca_lemmatizer/src/context.c b/maca_lemmatizer/src/context.c new file mode 100644 index 0000000000000000000000000000000000000000..3ba024477c1376898047ca1e91da20470ec6465a --- /dev/null +++ b/maca_lemmatizer/src/context.c @@ -0,0 +1,149 @@ +#include<stdlib.h> +#include<stdio.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include "context.h" +#include "util.h" + + +#define STANDARD_FPLM_FILENAME "fplm" + +void context_set_linguistic_resources_filenames(context *ctx); + +void context_free(context *ctx) +{ + if(ctx->program_name) free(ctx->program_name); + if(ctx->conll_filename) free(ctx->conll_filename); + if(ctx->fplm_filename) free(ctx->fplm_filename); + if(ctx->language) free(ctx->language); + if(ctx->maca_data_path) free(ctx->maca_data_path); + free(ctx); +} + +context *context_new(void) +{ + context *ctx = (context *)memalloc(sizeof(context)); + + ctx->help = 0; + ctx->verbose = 0; + ctx->debug_mode = 0; + ctx->program_name = NULL; + ctx->conll_filename = NULL; + ctx->fplm_filename = NULL; + ctx->mcd_filename = NULL; + ctx->mcd_struct = NULL; + ctx->language = strdup("fr"); + ctx->maca_data_path = NULL; + return ctx; +} + +void context_general_help_message(context *ctx) +{ + fprintf(stderr, "usage: %s [options]\n", ctx->program_name); + fprintf(stderr, "Options:\n"); + fprintf(stderr, "\t-h --help : print this message\n"); + fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); + fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n"); +} + +void context_conll_help_message(context *ctx){ + fprintf(stderr, "\t-i --conll <file> : conll file name\n"); +} +void context_fplm_help_message(context *ctx){ + fprintf(stderr, "\t-f --fplm <file> : fplm (form pos lemma morpho) file\n"); +} +void context_mcd_help_message(context *ctx){ + fprintf(stderr, "\t-m --mcd <file> : multi column description file name\n"); +} +void context_language_help_message(context *ctx){ + fprintf(stderr, "\t-C --language : identifier of the language to use\n"); +} +void context_maca_data_path_help_message(context *ctx){ + fprintf(stderr, "\t-M --maca_data_path : path to maca_data directory\n"); +} + +context *context_read_options(int argc, char *argv[]) +{ + int c; + int option_index = 0; + context *ctx = context_new(); + + ctx->program_name = strdup(argv[0]); + + static struct option long_options[8] = + { + {"help", no_argument, 0, 'h'}, + {"verbose", no_argument, 0, 'v'}, + {"debug", no_argument, 0, 'd'}, + {"conll", required_argument, 0, 'i'}, + {"mcd", required_argument, 0, 'm'}, + {"language", required_argument, 0, 'C'}, + {"fplm", required_argument, 0, 'f'}, + {"maca_data_path", required_argument, 0, 'M'} + }; + optind = 0; + opterr = 0; + + while ((c = getopt_long (argc, argv, "hvdi:f:m:C:M:", long_options, &option_index)) != -1){ + switch (c) + { + case 'd': + ctx->debug_mode = 1; + break; + case 'h': + ctx->help = 1; + break; + case 'v': + ctx->verbose = 1; + break; + case 'f': + ctx->fplm_filename = strdup(optarg); + break; + case 'i': + ctx->conll_filename = strdup(optarg); + break; + case 'm': + ctx->mcd_filename = strdup(optarg); + ctx->mcd_struct = mcd_read(ctx->mcd_filename); + break; + case 'C': + ctx->language = strdup(optarg); + break; + case 'M': + ctx->maca_data_path = strdup(optarg); + break; + } + } + + context_set_linguistic_resources_filenames(ctx); + + if(ctx->mcd_filename == NULL) + ctx->mcd_struct = mcd_build_conll07(); + + return ctx; +} + +void context_set_linguistic_resources_filenames(context *ctx) +{ + char absolute_path[500]; + char absolute_filename[500]; + + absolute_path[0] = '\0'; + + if(ctx->maca_data_path) + strcat(absolute_path, ctx->maca_data_path); + else + strcat(absolute_path, getenv("MACAON_DIR")); + + strcat(absolute_path, "/"); + strcat(absolute_path, ctx->language); + strcat(absolute_path, "/bin/"); + + if(!ctx->fplm_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, STANDARD_FPLM_FILENAME); + ctx->fplm_filename = strdup(absolute_filename); + } + +} diff --git a/maca_lemmatizer/src/context.h b/maca_lemmatizer/src/context.h new file mode 100644 index 0000000000000000000000000000000000000000..5352eb3f71cebaf88b5c571a81ae7051816d207c --- /dev/null +++ b/maca_lemmatizer/src/context.h @@ -0,0 +1,31 @@ +#ifndef __MACA_LEMMATIZER_CONTEXT__ +#define __MACA_LEMMATIZER_CONTEXT__ + +#include "mcd.h" +#include <stdlib.h> + +typedef struct { + int help; + int verbose; + int debug_mode; + char *program_name; + char *conll_filename; + char *fplm_filename; + char *language; + char *maca_data_path; + char *mcd_filename; + mcd *mcd_struct; +} context; + +context *context_new(void); +void context_free(context *ctx); + +context *context_read_options(int argc, char *argv[]); +void context_general_help_message(context *ctx); +void context_conll_help_message(context *ctx); +void context_language_help_message(context *ctx); +void context_maca_data_path_help_message(context *ctx); +void context_mcd_help_message(context *ctx); + + +#endif diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c new file mode 100644 index 0000000000000000000000000000000000000000..6737bbb366ea379386d79a8a182a367943c3a9a1 --- /dev/null +++ b/maca_lemmatizer/src/maca_lemmatizer.c @@ -0,0 +1,154 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<ctype.h> + +#include"util.h" +#include"hash.h" +#include"mcd.h" +#include"context.h" + +void maca_lemmatizer_help_message(context *ctx) +{ + context_general_help_message(ctx); + fprintf(stderr, "INPUT\n"); + context_conll_help_message(ctx); + context_mcd_help_message(ctx); + context_language_help_message(ctx); + context_maca_data_path_help_message(ctx); + context_fplm_help_message(ctx); +} + + +void maca_lemmatizer_check_options(context *ctx){ + if(!ctx->conll_filename + /* || !ctx->perc_model_filename + || !ctx->mcd_filename + || !ctx->vocabs_filename + || !ctx->features_model_filename*/ + || ctx->help + ){ + maca_lemmatizer_help_message(ctx); + exit(1); + } +} + +char **read_fplm_file(char *fplm_filename, hash *form_pos_ht) +{ + char form[1000]; + char pos[1000]; + char lemma[1000]; + char morpho[1000]; + int num = 0; + char **lemma_array; + int lemma_array_size = 10000; + + FILE *f= myfopen(fplm_filename, "r"); + int fields_nb; + + lemma_array = (char **)memalloc(lemma_array_size * sizeof(char *)); + + while(!feof(f)){ + fields_nb = fscanf(f, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho); + if(fields_nb != 4){ + fprintf(stderr, "incorrect fplm entry, skipping it\n"); + continue; + } + strcat(form, "/"); + strcat(form, pos); + hash_add(form_pos_ht, strdup(form), num); + + if(num >= lemma_array_size){ + lemma_array_size = 2 * (lemma_array_size) + 1; + lemma_array = realloc(lemma_array, (lemma_array_size) * sizeof(char *)); + } + + /* if(lemma_array[num] == NULL) */ + lemma_array[num] = strdup(lemma); + num++; + } + /* fprintf(stderr, "%d entries loaded\n", num); */ + return lemma_array; +} + +char *to_lower_string(char *s) +{ + int i; + for(i=0; i < strlen(s); i++) + s[i] = tolower(s[i]); + return s; +} + + +int main(int argc, char *argv[]) +{ + hash *form_pos_ht = hash_new(1000000); + char buffer[10000]; + char *form; + char *pos; + char *token; + int column_nb; + char form_pos[500]; + char *lemma; + int index_form_pos; + char **lemma_array; + context *ctx; + + ctx = context_read_options(argc, argv); + maca_lemmatizer_check_options(ctx); + + FILE *f = myfopen(ctx->conll_filename, "r"); + + lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht); + + /* look for a valid word */ + while(fgets(buffer, 10000, f)){ + if(feof(f)) return 0; /* no more words to read */ + if((buffer[0] == '\n') || (buffer[0] == ' ')){ + printf("\n"); + continue; + } + + buffer[strlen(buffer)-1] = '\0'; + printf("%s", buffer); + token = strtok(buffer, "\t"); + column_nb = 0; + form = NULL; + pos = NULL; + do{ + if((column_nb < ctx->mcd_struct->nb_col) && (ctx->mcd_struct->type[column_nb] == FEAT_TYPE_FORM)) + form = strdup(token); + if((column_nb < ctx->mcd_struct->nb_col) && (ctx->mcd_struct->type[column_nb] == FEAT_TYPE_POS)) + pos = strdup(token); + column_nb++; + } while((token = strtok(NULL , "\t"))); + + strcpy(form_pos, form); + strcat(form_pos, "/"); + strcat(form_pos, pos); + index_form_pos = hash_get_val(form_pos_ht, form_pos); + if(index_form_pos != HASH_INVALID_VAL){ + lemma = lemma_array[index_form_pos]; + } + else{ + to_lower_string(form_pos); + index_form_pos = hash_get_val(form_pos_ht, form_pos); + if(index_form_pos != HASH_INVALID_VAL){ + lemma = lemma_array[index_form_pos]; + } + else + lemma = form; + } + + /* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma); */ + printf("\t%s\n", lemma); + + if(pos)free(pos); + if(form)free(form); + } + free(lemma_array); + hash_free(form_pos_ht); + + return 0; +} + diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 739776227acc569240ce915852f59890fdd24ae4..c638a2c3ad89a17f9036cc525123a67608e9d166 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -1,10 +1,8 @@ set(SOURCES src/context.c - src/dico_vec.c src/feat_desc.c src/feature_table.c src/movement.c src/sentence.c - src/util.c src/feat_fct.c src/feat_vec.c src/global_feat_vec.c @@ -12,24 +10,17 @@ set(SOURCES src/context.c src/simple_decoder.c src/cf_file.c src/feat_lib.c - src/hash.c src/perceptron.c src/stack.c src/word.c src/config2feat_vec.c src/depset.c src/feat_model.c - src/word_emb.c src/config.c - src/dico.c - src/feat_types.c - src/mcd.c src/queue.c src/beam.c ) - - #compiling library include_directories(src) add_library(transparse STATIC ${SOURCES}) @@ -38,23 +29,28 @@ add_library(transparse STATIC ${SOURCES}) add_executable(maca_trans_parser_conll2cff ./src/transform_treebank.c) target_link_libraries(maca_trans_parser_conll2cff transparse) +target_link_libraries(maca_trans_parser_conll2cff maca_common) install (TARGETS maca_trans_parser_conll2cff DESTINATION bin) add_executable(maca_trans_parser ./src/decode.c) target_link_libraries(maca_trans_parser transparse) +target_link_libraries(maca_trans_parser maca_common) install (TARGETS maca_trans_parser DESTINATION bin) add_executable(maca_trans_parser_train ./src/train_perceptron.c) target_compile_options(maca_trans_parser_train INTERFACE -Wall) target_link_libraries(maca_trans_parser_train transparse) +target_link_libraries(maca_trans_parser_train maca_common) install (TARGETS maca_trans_parser_train DESTINATION bin) add_executable(maca_trans_parser_train_from_cff ./src/train.c) target_link_libraries(maca_trans_parser_train_from_cff transparse) +target_link_libraries(maca_trans_parser_train_from_cff maca_common) install (TARGETS maca_trans_parser_train_from_cff DESTINATION bin) add_executable(maca_trans_parser_cff_cutoff ./src/cff_cutoff.c) target_link_libraries(maca_trans_parser_cff_cutoff transparse) +target_link_libraries(maca_trans_parser_cff_cutoff maca_common) install (TARGETS maca_trans_parser_cff_cutoff DESTINATION bin) #add_executable(test_w2v ./src/test_w2v.c) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 1d98c287d639216f45ea26206cd6c99850601c45..b39cafb1a3f01d51aaccbf1e460b2bc3c8154923 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -256,6 +256,7 @@ context *context_read_options(int argc, char *argv[]) break; case 'C': ctx->mcd_filename = strdup(optarg); + ctx->mcd_struct = mcd_read(ctx->mcd_filename); break; case 'F': ctx->features_model_filename = strdup(optarg); @@ -281,11 +282,15 @@ context *context_read_options(int argc, char *argv[]) ctx->mvt_nb = ctx->mcd_struct->dico_array[ctx->mcd_struct->type2col[FEAT_TYPE_LABEL]]->nbelem * 2 + 1; }*/ + /* if(ctx->features_model && ctx->mcd_struct) feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); - + */ context_set_linguistic_resources_filenames(ctx); + if(ctx->mcd_filename == NULL){ + ctx->mcd_struct = mcd_build_conll07(); + } return ctx; } @@ -319,11 +324,11 @@ void context_set_linguistic_resources_filenames(context *ctx) ctx->vocabs_filename = strdup(absolute_filename); } - if(!ctx->mcd_filename){ + /* if(!ctx->mcd_filename){ strcpy(absolute_filename, absolute_path); strcat(absolute_filename, STANDARD_MULTI_COL_DESC_FILENAME); ctx->mcd_filename = strdup(absolute_filename); - } + }*/ if(!ctx->features_model_filename){ strcpy(absolute_filename, absolute_path); diff --git a/maca_trans_parser/src/decode.c b/maca_trans_parser/src/decode.c index 909f923b47584acc679e0f0554d9cbb050e0e806..5f8679dc5eb01c010c890d89dd3f7ffa171e1735 100644 --- a/maca_trans_parser/src/decode.c +++ b/maca_trans_parser/src/decode.c @@ -54,7 +54,9 @@ int main(int argc, char *argv[]) decode_check_options(ctx); ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); - ctx->mcd_struct = mcd_read(ctx->mcd_filename, NULL, ctx->vocabs); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); if(ctx->dico_labels == NULL){ @@ -69,7 +71,7 @@ int main(int argc, char *argv[]) /* when in stream mode, force to renumber the tokens (ugly !) */ if(ctx->stream_mode){ - ctx->mcd_struct->col2type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; + ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; } diff --git a/maca_trans_parser/src/mcd.c b/maca_trans_parser/src/mcd.c deleted file mode 100644 index c701da7a8718f03e2772865cb4faacc0ca26f94d..0000000000000000000000000000000000000000 --- a/maca_trans_parser/src/mcd.c +++ /dev/null @@ -1,169 +0,0 @@ -#include<stdio.h> -#include<stdlib.h> -#include<string.h> - -#include "mcd.h" -#include "util.h" -#include "dico.h" -#include "word_emb.h" - - -mcd *mcd_new(int nb_col) -{ - mcd *m = (mcd *)memalloc(sizeof(mcd)); - int i; - m->nb_col = nb_col; - - for(i=0; i < FEAT_TYPE_NB; i++) - m->type2col[i] = -1; - - m->representation = (int *)memalloc(nb_col * sizeof(int)); - m->type = (int *)memalloc(nb_col * sizeof(int)); - m->col2type = (int *)memalloc(nb_col * sizeof(int)); - m->filename = (char **)memalloc(nb_col * sizeof(char *)); - m->dico_array = (dico **)memalloc(nb_col * sizeof(dico *)); - m->word_emb_array = (word_emb **)memalloc(nb_col * sizeof(word_emb *)); - - for(i=0; i < nb_col; i++){ - m->representation[i] = MCD_REPRESENTATION_NULL; - m->type[i] = -1; - m->col2type[i] = -1; - m->filename[i] = NULL; - m->dico_array[i] = NULL; - m->word_emb_array[i] = NULL;; - } - return m; -} - -void mcd_free(mcd *m) -{ - int i; - for(i=0; i < m->nb_col; i++){ - if(m->dico_array[i]) dico_free(m->dico_array[i]); - if(m->word_emb_array[i]) word_emb_free(m->word_emb_array[i]); - } - free(m->representation); - free(m->filename); - free(m->dico_array); - free(m->word_emb_array); - - free(m); -} - -int mcd_get_code(mcd *m, char *str, int col){ - if(m->representation[col] == MCD_REPRESENTATION_VOCAB) - return dico_string2int(m->dico_array[col], str); - if(m->representation[col] == MCD_REPRESENTATION_EMB) - return word_emb_get_code(m->word_emb_array[col], str); - if(m->representation[col] == MCD_REPRESENTATION_INT) - return atoi(str); - return MCD_INVALID_VALUE; -} - -int mcd_max_column_index_in_file(char *mcd_filename) -{ - int max_col = -1; - FILE *f = myfopen(mcd_filename, "r"); - char buffer[1000]; /* ugly */ - int column; - char type[100]; - char representation[100]; - char filename[500]; /* ugly */ - int fields_number; - int line_number = 0; - - while(fgets(buffer, 1000, f)){ - line_number++; - if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == '#')) continue; - fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); - if(fields_number != 4){ - fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); - continue; - } - if(column > max_col) max_col = column; - } - return max_col; -} - -mcd *mcd_read(char *mcd_filename, char *corpus_filename, dico_vec *vocabs) -{ - int column; - char type[100]; - char representation[100]; - char filename[500]; /* ugly */ - int fields_number; - int line_number = 0; - char buffer[1000]; /* ugly */ - int nb_col = mcd_max_column_index_in_file(mcd_filename); - mcd *m = mcd_new(nb_col + 1); - FILE *f = myfopen(mcd_filename, "r"); - /* int first = 1; */ - - while(fgets(buffer, 1000, f)){ - line_number++; - if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == '#')) continue; - fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); - if(fields_number != 4){ - /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */ - continue; - } - fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename); - m->type[column] = feat_type_string2int(type); - if(m->type[column] == -1){ - fprintf(stderr, "in line %d of mcd file %s invalid type, I'm skipping it\n", line_number, mcd_filename); - continue; - } - m->type2col[m->type[column]] = column; - m->col2type[column] = m->type[column]; - if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL; - else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB; - else if(!strcmp(representation, "VOCAB")) m->representation[column] = MCD_REPRESENTATION_VOCAB; - else if(!strcmp(representation, "INT")) m->representation[column] = MCD_REPRESENTATION_INT; - else{ - fprintf(stderr, "in line %d of mcd file %s invalid mode of representation, I'm skipping it\n", line_number, mcd_filename); - m->representation[column] = MCD_REPRESENTATION_NULL; - } - if(m->representation[column] != MCD_REPRESENTATION_NULL){ - m->filename[column] = strdup(filename); - if(m->representation[column] == MCD_REPRESENTATION_EMB){ - fprintf(stderr, "loading word embedding %s\n", m->filename[column]); - m->word_emb_array[column] = word_emb_load(m->filename[column]); - } - else if(m->representation[column] == MCD_REPRESENTATION_VOCAB){ - if(!strcmp(m->filename[column], "_")){ - if(corpus_filename){ - fprintf(stderr, "extracting dico %s from corpus\n", type); - m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, type); - } - else if(vocabs){ - fprintf(stderr, "linking to dico %s\n", type); - m->dico_array[column] = dico_vec_get_dico(vocabs, type); - } - if(m->dico_array[column] == NULL) - fprintf(stderr, "cannot find dico %s\n", type); - } - else{ - fprintf(stderr, "loading dico %s\n", m->filename[column]); - m->dico_array[column] = dico_read(m->filename[column], 0.5); - } - } - } - } - fclose(f); - return m; -} - - -dico_vec *mcd_build_dico_vec(mcd *mcd_struct) -{ - dico_vec *dv = dico_vec_new(); - int i; - for(i=0; i < mcd_struct->nb_col; i++){ - if(mcd_struct->dico_array[i]){ - dico_vec_add(dv, mcd_struct->dico_array[i]); - } - } - return dv; -} diff --git a/maca_trans_parser/src/train_perceptron.c b/maca_trans_parser/src/train_perceptron.c index f9c9482dabcbfa09885e9ec8e1162ff80da24b3f..54ff2990ec7318da4a84bc90dc1720b16ce77b16 100644 --- a/maca_trans_parser/src/train_perceptron.c +++ b/maca_trans_parser/src/train_perceptron.c @@ -42,7 +42,7 @@ void train_perceptron_check_options(context *ctx) { if(!ctx->conll_filename || ctx->help - || !ctx->mcd_filename + /* || !ctx->mcd_filename */ || !ctx->features_model_filename || !ctx->perc_model_filename || !ctx->vocabs_filename @@ -60,7 +60,8 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); train_perceptron_check_options(ctx); - ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->conll_filename, NULL); + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); + ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); diff --git a/maca_trans_parser/src/transform_treebank.c b/maca_trans_parser/src/transform_treebank.c index fbda610c9218d08e93298872510f4a229a08a20c..7b46049f116e490dc402e47fd4e62279d439e8f6 100644 --- a/maca_trans_parser/src/transform_treebank.c +++ b/maca_trans_parser/src/transform_treebank.c @@ -36,7 +36,7 @@ void transform_treebank_check_options(context *ctx) { if(!ctx->conll_filename || ctx->help - || !ctx->mcd_filename + /* || !ctx->mcd_filename */ || !(ctx->cff_filename || ctx->fann_filename) ){ transform_treebank_help_message(ctx); @@ -280,24 +280,22 @@ int main(int argc, char *argv[]) transform_treebank_check_options(ctx); if(ctx->mode == TRAIN_MODE){ - ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->conll_filename, NULL); + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); - ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); - /* ctx->mvt_nb = ctx->mcd_struct->dico_array[ctx->mcd_struct->type2col[FEAT_TYPE_LABEL]]->nbelem * 2 + 1; */ } else if(ctx->mode == TEST_MODE){ ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); - ctx->mcd_struct = mcd_read(ctx->mcd_filename, NULL, ctx->vocabs); - ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); } - - + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); if(ctx->dico_labels == NULL){ fprintf(stderr, "cannot find label names\n"); return 1; } ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1; + feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); diff --git a/maca_trans_parser/src/word.c b/maca_trans_parser/src/word.c index ed3912374685e646512b2abd75b64fc5f149c664..bdb63930b3fa801c4608b1529ff5ce6607f89151 100644 --- a/maca_trans_parser/src/word.c +++ b/maca_trans_parser/src/word.c @@ -48,10 +48,10 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct) w = word_new(buffer); token = strtok(buffer, "\t"); do{ - if((column_nb < mcd_struct->nb_col) && (mcd_struct->col2type[column_nb] != -1)){ - w->feat_array[mcd_struct->col2type[column_nb]] = mcd_get_code(mcd_struct, token, column_nb); + if((column_nb < mcd_struct->nb_col) && (mcd_struct->type[column_nb] != -1)){ + w->feat_array[mcd_struct->type[column_nb]] = mcd_get_code(mcd_struct, token, column_nb); } - if(mcd_struct->col2type[column_nb] == FEAT_TYPE_FORM){ + if(mcd_struct->type[column_nb] == FEAT_TYPE_FORM){ w->U1 = isupper(token[0]) ? 1 : 0; } column_nb++;