Commit a0a86014 authored by Alexis Nasr
Browse files

Merge branch 'maca_common' into 'master'

Maca common

Merge the maca_common branch; general-purpose functions now live in maca_common.

See merge request !1
parents 4d2a9550 362239fe
......@@ -3,6 +3,10 @@ project(macaon2)
add_definitions("-Wall")
include_directories(maca_common/include)
add_subdirectory(maca_common)
add_subdirectory(maca_lemmatizer)
add_subdirectory(maca_trans_parser)
#set(CMAKE_INSTALL_PREFIX ../)
# C source files that make up the maca_common library.
set(SOURCES src/util.c
src/hash.c
src/dico.c
src/word_emb.c
src/mcd.c
src/dico_vec.c
src/feat_types.c
)
# Build the maca_common static library from the files listed in SOURCES.
add_library(maca_common STATIC ${SOURCES})
......@@ -18,15 +18,19 @@
typedef struct {
int nb_col;
int type2col[FEAT_TYPE_NB];
int *col2type;
/* int *col2type; */
int *type;
char **type_str;
int *representation;
char **filename;
dico **dico_array;
word_emb **word_emb_array;
} mcd;
mcd *mcd_read(char *mcd_filename, char *corpus_filename, dico_vec *vocabs);
mcd *mcd_build_conll07(void);
mcd *mcd_read(char *mcd_filename);
void mcd_link_to_dico(mcd *m, dico_vec *vocabs);
void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename);
void mcd_free(mcd *m);
int mcd_get_code(mcd *m, char *str, int col);
dico_vec *mcd_build_dico_vec(mcd *mcd_struct);
......
......@@ -19,7 +19,8 @@ mcd *mcd_new(int nb_col)
m->representation = (int *)memalloc(nb_col * sizeof(int));
m->type = (int *)memalloc(nb_col * sizeof(int));
m->col2type = (int *)memalloc(nb_col * sizeof(int));
m->type_str = (char **)memalloc(nb_col * sizeof(char *));
/* m->col2type = (int *)memalloc(nb_col * sizeof(int)); */
m->filename = (char **)memalloc(nb_col * sizeof(char *));
m->dico_array = (dico **)memalloc(nb_col * sizeof(dico *));
m->word_emb_array = (word_emb **)memalloc(nb_col * sizeof(word_emb *));
......@@ -27,7 +28,8 @@ mcd *mcd_new(int nb_col)
for(i=0; i < nb_col; i++){
m->representation[i] = MCD_REPRESENTATION_NULL;
m->type[i] = -1;
m->col2type[i] = -1;
m->type_str[i] = NULL;
/* m->col2type[i] = -1; */
m->filename[i] = NULL;
m->dico_array[i] = NULL;
m->word_emb_array[i] = NULL;;
......@@ -41,12 +43,14 @@ void mcd_free(mcd *m)
for(i=0; i < m->nb_col; i++){
if(m->dico_array[i]) dico_free(m->dico_array[i]);
if(m->word_emb_array[i]) word_emb_free(m->word_emb_array[i]);
if(m->type_str[i]) free(m->type_str[i]);
}
free(m->representation);
free(m->filename);
free(m->dico_array);
free(m->word_emb_array);
free(m->type_str);
free(m->type);
free(m);
}
......@@ -86,7 +90,149 @@ int mcd_max_column_index_in_file(char *mcd_filename)
return max_col;
}
mcd *mcd_read(char *mcd_filename, char *corpus_filename, dico_vec *vocabs)
/* Fills the missing vocabularies of m from a corpus.                          */
/* Every column whose representation is VOCAB and which has no dictionary yet  */
/* gets one built by dico_extract_from_corpus() from the same column of        */
/* corpus_filename; the dictionary is named after the column's type string.    */
void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename)
{
  int col;

  for(col = 0; col < m->nb_col; col++){
    /* only vocabulary columns are concerned */
    if(m->representation[col] != MCD_REPRESENTATION_VOCAB)
      continue;
    /* a dictionary that is already present is left alone */
    if(m->dico_array[col] != NULL)
      continue;

    m->dico_array[col] = dico_extract_from_corpus(corpus_filename, col, m->type_str[col]);
    fprintf(stderr, "extracting dico %s from corpus\n", m->type_str[col]);
  }
}
/* Takes an mcd structure (m) and a dictionary vector (vocabs) and links the   */
/* vocabularies of m to the dictionaries of vocabs, matching them by name      */
/* (the column's type string).                                                 */
/* Only columns that are VOCAB, have no dictionary yet and have no dictionary  */
/* file of their own are linked. "No file" is either the "_" placeholder or a  */
/* NULL filename: mcd_new() initialises filenames to NULL and                  */
/* mcd_build_conll07() never sets them, so NULL must not reach strcmp().       */
void mcd_link_to_dico(mcd *m, dico_vec *vocabs)
{
  int column;

  for(column = 0; column < m->nb_col; column++){
    const char *fname = m->filename[column];

    if(m->representation[column] != MCD_REPRESENTATION_VOCAB)
      continue;
    if(m->dico_array[column] != NULL)
      continue;
    /* a real file name means the dictionary is loaded from that file instead */
    if((fname != NULL) && strcmp(fname, "_"))
      continue;

    m->dico_array[column] = dico_vec_get_dico(vocabs, m->type_str[column]);
    fprintf(stderr, "linking to dico %s\n", m->type_str[column]);
  }
}
/* Reads a multi column description (mcd) file and produces an mcd structure.  */
/*                                                                             */
/* Lines that are empty or start with '#' are ignored. Every other line must   */
/* hold four whitespace separated fields:                                      */
/*   <column index> <type> <representation> <filename>                         */
/* where representation is one of "_", "EMB", "VOCAB" or "INT" and filename is */
/* "_" when the column has no associated file. Lines that do not have four     */
/* fields are silently skipped; lines with an out-of-range column index or an  */
/* unknown type are reported on stderr and skipped.                            */
/*                                                                             */
/* When a real filename is given, EMB columns load their word embeddings and   */
/* VOCAB columns load their dictionary from it.                                */
/* The returned structure is meant to be released with mcd_free().             */
mcd *mcd_read(char *mcd_filename)
{
  int column;
  char type[100];
  char representation[100];
  char filename[500];
  int fields_number;
  int line_number = 0;
  char buffer[1000];
  int nb_col = mcd_max_column_index_in_file(mcd_filename);
  mcd *m = mcd_new(nb_col + 1);
  FILE *f = myfopen(mcd_filename, "r");

  /* fgets() returning NULL is the only end-of-input test: checking feof()     */
  /* after a successful read would drop a last line that has no trailing '\n'. */
  while(fgets(buffer, sizeof buffer, f)){
    line_number++;
    if((buffer[0] == '\n') || (buffer[0] == '#')) continue;

    /* field widths keep each conversion inside its destination buffer */
    fields_number = sscanf(buffer, "%d %99s %99s %499s", &column, type, representation, filename);
    if(fields_number != 4){
      /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */
      continue;
    }

    /* the arrays of m hold nb_col + 1 entries: valid indices are 0..nb_col */
    if((column < 0) || (column > nb_col)){
      fprintf(stderr, "in line %d of mcd file %s invalid column index %d, I'm skipping it\n", line_number, mcd_filename, column);
      continue;
    }

    fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename);

    m->type[column] = feat_type_string2int(type);
    m->type_str[column] = strdup(type);
    if(m->type[column] == -1){
      fprintf(stderr, "in line %d of mcd file %s invalid type, I'm skipping it\n", line_number, mcd_filename);
      continue;
    }
    m->type2col[m->type[column]] = column;

    if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL;
    else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB;
    else if(!strcmp(representation, "VOCAB")) m->representation[column] = MCD_REPRESENTATION_VOCAB;
    else if(!strcmp(representation, "INT")) m->representation[column] = MCD_REPRESENTATION_INT;
    else{
      fprintf(stderr, "in line %d of mcd file %s invalid mode of representation, I'm skipping it\n", line_number, mcd_filename);
      m->representation[column] = MCD_REPRESENTATION_NULL;
    }

    /* a column without representation keeps no file name and loads nothing; */
    /* its filename entry must not be handed to strcmp() since it is NULL     */
    if(m->representation[column] == MCD_REPRESENTATION_NULL)
      continue;

    m->filename[column] = strdup(filename);

    /* "_" stands for "no file": nothing to load for this column */
    if(!strcmp(filename, "_"))
      continue;

    if(m->representation[column] == MCD_REPRESENTATION_EMB){
      fprintf(stderr, "loading word embedding %s\n", m->filename[column]);
      m->word_emb_array[column] = word_emb_load(m->filename[column]);
    }
    else if(m->representation[column] == MCD_REPRESENTATION_VOCAB){
      fprintf(stderr, "loading dico %s\n", m->filename[column]);
      m->dico_array[column] = dico_read(m->filename[column], 0.5);
    }
  }
  fclose(f);
  return m;
}
/* Builds the fixed eight column mcd used when no mcd file is supplied:        */
/* INDEX, FORM, LEMMA, CPOS, POS, FEATS, GOV and LABEL, in that column order.  */
/* INDEX and GOV are represented as integers, the other columns as             */
/* vocabularies. No file name, dictionary or embedding is attached here.       */
mcd *mcd_build_conll07(void)
{
  /* one entry per column, listed in column order */
  const struct {
    int type;
    const char *name;
    int representation;
  } layout[] = {
    { FEAT_TYPE_INDEX, "INDEX", MCD_REPRESENTATION_INT   },
    { FEAT_TYPE_FORM,  "FORM",  MCD_REPRESENTATION_VOCAB },
    { FEAT_TYPE_LEMMA, "LEMMA", MCD_REPRESENTATION_VOCAB },
    { FEAT_TYPE_CPOS,  "CPOS",  MCD_REPRESENTATION_VOCAB },
    { FEAT_TYPE_POS,   "POS",   MCD_REPRESENTATION_VOCAB },
    { FEAT_TYPE_FEATS, "FEATS", MCD_REPRESENTATION_VOCAB },
    { FEAT_TYPE_GOV,   "GOV",   MCD_REPRESENTATION_INT   },
    { FEAT_TYPE_LABEL, "LABEL", MCD_REPRESENTATION_VOCAB }
  };
  const int nb_columns = (int)(sizeof layout / sizeof layout[0]);
  mcd *m = mcd_new(nb_columns);
  int col;

  for(col = 0; col < nb_columns; col++){
    m->type[col] = layout[col].type;
    m->type_str[col] = strdup(layout[col].name);
    m->representation[col] = layout[col].representation;
    m->type2col[layout[col].type] = col;
  }
  return m;
}
mcd *mcd_read_old(char *mcd_filename, char *corpus_filename, dico_vec *vocabs)
{
int column;
char type[100];
......@@ -116,7 +262,7 @@ mcd *mcd_read(char *mcd_filename, char *corpus_filename, dico_vec *vocabs)
continue;
}
m->type2col[m->type[column]] = column;
m->col2type[column] = m->type[column];
/* m->col2type[column] = m->type[column]; */
if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL;
else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB;
else if(!strcmp(representation, "VOCAB")) m->representation[column] = MCD_REPRESENTATION_VOCAB;
......
# Sources of the helper library shared by the maca_lemmatizer executable.
set(SOURCES src/context.c)
# Build the helper library; src is added to the include path so that its
# headers can be found.
include_directories(src)
add_library(maca_lemmatizer_lib STATIC ${SOURCES})
# Build the maca_lemmatizer executable, link it against its helper library and
# against maca_common, and install it into bin.
add_executable(maca_lemmatizer ./src/maca_lemmatizer.c)
target_link_libraries(maca_lemmatizer maca_lemmatizer_lib)
target_link_libraries(maca_lemmatizer maca_common)
install (TARGETS maca_lemmatizer DESTINATION bin)
#include<stdlib.h>
#include<stdio.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include "context.h"
#include "util.h"
void context_set_linguistic_resources_filenames(context *ctx);
/* Releases a context: every string it holds, then the structure itself.       */
/* A NULL ctx is accepted and ignored.                                         */
void context_free(context *ctx)
{
  if(ctx == NULL) return;

  /* free(NULL) is a no-op, so unset fields need no guard */
  free(ctx->program_name);
  free(ctx->conll_filename);
  free(ctx->fplm_filename);
  free(ctx->mcd_filename);   /* strdup'ed by the -m option */
  free(ctx->language);
  free(ctx->maca_data_path);

  /* NOTE(review): ctx->mcd_struct is not released here; confirm who owns it */
  /* before adding a call to mcd_free().                                      */
  free(ctx);
}
context *context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->conll_filename = NULL;
ctx->fplm_filename = NULL;
ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL;
ctx->language = strdup("fr");
ctx->maca_data_path = NULL;
return ctx;
}
/* Prints on stderr the usage line and the general options of the program.     */
/* The options listed are the general ones that context_read_options()         */
/* actually parses: -h, -v and -d.                                             */
void context_general_help_message(context *ctx)
{
  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
  fprintf(stderr, "Options:\n");
  fprintf(stderr, "\t-h --help : print this message\n");
  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
  fprintf(stderr, "\t-d --debug : activate debug mode\n");
}
/* Prints the help line of the -i / --conll option on stderr (ctx is unused). */
void context_conll_help_message(context *ctx){
  fputs("\t-i --conll <file> : conll file name\n", stderr);
}
/* Prints the help line of the -f / --fplm option on stderr (ctx is unused). */
void context_fplm_help_message(context *ctx){
  fputs("\t-f --fplm <file> : fplm (form pos lemma morpho) file\n", stderr);
}
/* Prints the help line of the -m / --mcd option on stderr (ctx is unused). */
void context_mcd_help_message(context *ctx){
  fputs("\t-m --mcd <file> : multi column description file name\n", stderr);
}
/* Prints the help line of the -C / --language option on stderr (ctx is unused). */
void context_language_help_message(context *ctx){
  fputs("\t-C --language : identifier of the language to use\n", stderr);
}
/* Prints the help line of the -M / --maca_data_path option on stderr (ctx is unused). */
void context_maca_data_path_help_message(context *ctx){
  fputs("\t-M --maca_data_path : path to maca_data directory\n", stderr);
}
/* Builds a context from the command line.                                     */
/*                                                                             */
/* Recognised options: -h/--help, -v/--verbose, -d/--debug, -i/--conll <file>, */
/* -m/--mcd <file>, -C/--language <id>, -f/--fplm <file> and                   */
/* -M/--maca_data_path <dir>. Unknown options are silently ignored (opterr is  */
/* cleared). When an option is repeated, the last occurrence wins.             */
/*                                                                             */
/* After parsing, the default resource file names are filled in, and when no   */
/* mcd file was given the column layout of mcd_build_conll07() is used.        */
/* Returns the newly allocated context.                                        */
context *context_read_options(int argc, char *argv[])
{
  int c;
  int option_index = 0;
  context *ctx = context_new();
  /* getopt_long() requires the array to end with an all-zero element */
  static struct option long_options[] =
    {
      {"help", no_argument, 0, 'h'},
      {"verbose", no_argument, 0, 'v'},
      {"debug", no_argument, 0, 'd'},
      {"conll", required_argument, 0, 'i'},
      {"mcd", required_argument, 0, 'm'},
      {"language", required_argument, 0, 'C'},
      {"fplm", required_argument, 0, 'f'},
      {"maca_data_path", required_argument, 0, 'M'},
      {0, 0, 0, 0}
    };

  ctx->program_name = strdup(argv[0]);

  optind = 0;
  opterr = 0;

  while ((c = getopt_long (argc, argv, "hvdi:f:m:C:M:", long_options, &option_index)) != -1){
    switch (c)
      {
      case 'd':
        ctx->debug_mode = 1;
        break;
      case 'h':
        ctx->help = 1;
        break;
      case 'v':
        ctx->verbose = 1;
        break;
      case 'f':
        free(ctx->fplm_filename);      /* drop the value of an earlier -f */
        ctx->fplm_filename = strdup(optarg);
        break;
      case 'i':
        free(ctx->conll_filename);
        ctx->conll_filename = strdup(optarg);
        break;
      case 'm':
        free(ctx->mcd_filename);
        ctx->mcd_filename = strdup(optarg);
        ctx->mcd_struct = mcd_read(ctx->mcd_filename);
        break;
      case 'C':
        free(ctx->language);           /* drop the default set by context_new() */
        ctx->language = strdup(optarg);
        break;
      case 'M':
        free(ctx->maca_data_path);
        ctx->maca_data_path = strdup(optarg);
        break;
      default:
        /* unknown option or missing argument: ignored */
        break;
      }
  }

  context_set_linguistic_resources_filenames(ctx);

  if(ctx->mcd_filename == NULL)
    ctx->mcd_struct = mcd_build_conll07();

  return ctx;
}
/* Fills in the resource file names that were not given on the command line.   */
/*                                                                             */
/* Currently this only concerns the fplm file, whose default location is       */
/*   <base>/<language>/bin/DEFAULT_FPLM_FILENAME                               */
/* where <base> is ctx->maca_data_path when set, otherwise the value of the    */
/* MACAON_DIR environment variable. When neither is available a warning is     */
/* printed and an empty base is used, so that the later attempt to open the    */
/* file fails on a well defined path instead of crashing here.                 */
/* Nothing is done when ctx->fplm_filename is already set.                     */
void context_set_linguistic_resources_filenames(context *ctx)
{
  const char *base_dir;
  char *path;
  size_t path_size;

  if(ctx->fplm_filename) return;

  base_dir = ctx->maca_data_path ? ctx->maca_data_path : getenv("MACAON_DIR");
  if(base_dir == NULL){
    /* getenv() returns NULL when the variable is not defined */
    fprintf(stderr, "warning: neither a maca_data path nor MACAON_DIR is set, default resource paths will not be valid\n");
    base_dir = "";
  }

  /* the path is sized from its parts rather than built in a fixed buffer */
  path_size = strlen(base_dir) + strlen("/") + strlen(ctx->language)
    + strlen("/bin/") + strlen(DEFAULT_FPLM_FILENAME) + 1;
  path = malloc(path_size);
  if(path == NULL){
    fprintf(stderr, "memory allocation error\n");
    exit(1);
  }
  snprintf(path, path_size, "%s/%s/bin/%s", base_dir, ctx->language, DEFAULT_FPLM_FILENAME);

  ctx->fplm_filename = path;
}
#ifndef __MACA_LEMMATIZER_CONTEXT__
#define __MACA_LEMMATIZER_CONTEXT__
#include "mcd.h"
#include <stdlib.h>
#define DEFAULT_FPLM_FILENAME "fplm"
/* Settings of the maca_lemmatizer program, as filled in by */
/* context_new() (defaults) and context_read_options() (command line). */
typedef struct {
int help; /* 1 when -h / --help was given, 0 otherwise */
int verbose; /* 1 when -v / --verbose was given, 0 otherwise */
int debug_mode; /* 1 when -d / --debug was given, 0 otherwise */
char *program_name; /* copy of argv[0] */
char *conll_filename; /* argument of -i / --conll, NULL when absent */
char *fplm_filename; /* argument of -f / --fplm, or the default path built from the data directory */
char *language; /* argument of -C / --language, "fr" by default */
char *maca_data_path; /* argument of -M / --maca_data_path, NULL when absent */
char *mcd_filename; /* argument of -m / --mcd, NULL when absent */
mcd *mcd_struct; /* read from mcd_filename, or built by mcd_build_conll07() when no mcd file is given */
} context;
/* Allocates a context with its default values (flags cleared, language "fr"). */
context *context_new(void);
/* Releases strings held by ctx, then ctx itself; mcd_struct is not released here. */
void context_free(context *ctx);
/* Builds a context from the command line arguments of the program. */
context *context_read_options(int argc, char *argv[]);
/* Each function below prints one part of the usage text on stderr. */
void context_general_help_message(context *ctx);
void context_conll_help_message(context *ctx);
void context_language_help_message(context *ctx);
void context_fplm_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx);
void context_mcd_help_message(context *ctx);
#endif
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<ctype.h>
#include"util.h"
#include"hash.h"
#include"mcd.h"
#include"context.h"
/* Prints the complete usage text of maca_lemmatizer on stderr: the general   */
/* options first, then, under an INPUT heading, the options that select the   */
/* input data and the linguistic resources.                                    */
void maca_lemmatizer_help_message(context *ctx)
{
  /* general part */
  context_general_help_message(ctx);

  /* input part */
  fputs("INPUT\n", stderr);
  context_conll_help_message(ctx);
  context_mcd_help_message(ctx);
  context_language_help_message(ctx);
  context_maca_data_path_help_message(ctx);
  context_fplm_help_message(ctx);
}
/* Checks that the program can run with the options held in ctx.              */
/* When help was requested or no conll file was given, the usage text is      */
/* printed and the program exits with status 1; otherwise nothing happens.    */
void maca_lemmatizer_check_options(context *ctx){
  int can_run = (ctx->conll_filename != NULL) && !ctx->help;

  if(can_run)
    return;

  maca_lemmatizer_help_message(ctx);
  exit(1);
}
/* Loads an fplm (form pos lemma morpho) file.                                 */
/*                                                                             */
/* Each entry is one line of four tab separated fields. For the n-th valid     */
/* entry (counting from 0) the key "form/pos" is added to form_pos_ht with     */
/* value n, and a copy of the lemma is stored at index n of the returned       */
/* array. Blank lines are ignored; lines that do not yield four fields are     */
/* reported on stderr and skipped.                                             */
/*                                                                             */
/* The file is read line by line so that a malformed entry cannot shift the    */
/* parsing of the entries that follow it, and every conversion is bounded by   */
/* the size of its destination. A line longer than the line buffer is read in  */
/* several pieces, which are then most likely rejected as malformed.           */
/*                                                                             */
/* Returns the array of lemmas (heap allocated, as are the lemmas themselves); */
/* the number of entries is not returned. Exits with status 1 when the array   */
/* cannot be enlarged.                                                         */
char **read_fplm_file(char *fplm_filename, hash *form_pos_ht)
{
  char form[1000];
  char pos[1000];
  char lemma[1000];
  char morpho[1000];
  char form_pos[2000];   /* form (<= 999) + '/' + pos (<= 999) + '\0' */
  char line[4096];
  int num = 0;
  char **lemma_array;
  int lemma_array_size = 10000;
  FILE *f= myfopen(fplm_filename, "r");
  int fields_nb;

  lemma_array = (char **)memalloc(lemma_array_size * sizeof(char *));

  while(fgets(line, sizeof line, f)){
    /* the leading blank of the format skips leading white space, so a line */
    /* holding nothing else makes sscanf() return EOF                        */
    fields_nb = sscanf(line, " %999[^\t]\t%999s\t%999[^\t]\t%999s", form, pos, lemma, morpho);
    if(fields_nb == EOF) continue;
    if(fields_nb != 4){
      fprintf(stderr, "incorrect fplm entry, skipping it\n");
      continue;
    }

    snprintf(form_pos, sizeof form_pos, "%s/%s", form, pos);
    hash_add(form_pos_ht, strdup(form_pos), num);

    if(num >= lemma_array_size){
      int new_size = 2 * (lemma_array_size) + 1;
      /* keep the old pointer until the reallocation is known to have worked */
      char **grown = realloc(lemma_array, new_size * sizeof(char *));
      if(grown == NULL){
        fprintf(stderr, "memory allocation error\n");
        exit(1);
      }
      lemma_array = grown;
      lemma_array_size = new_size;
    }

    lemma_array[num] = strdup(lemma);
    num++;
  }
  /* fprintf(stderr, "%d entries loaded\n", num); */
  fclose(f);
  return lemma_array;
}
/* Converts the string s to lower case in place and returns s.                 */
/* Each character is passed to tolower() as an unsigned char, as required by   */
/* <ctype.h>: a plain char holding a byte above 127 may be negative.           */
/* The string is walked once, up to its terminating '\0'.                      */
char *to_lower_string(char *s)
{
  char *p;

  for(p = s; *p != '\0'; p++)
    *p = (char)tolower((unsigned char)*p);
  return s;
}
int main(int argc, char *argv[])
{
hash *form_pos_ht = hash_new(1000000);
char buffer[10000];
char *form;
char *pos;
char *token;
int column_nb;
char form_pos[500];
char *lemma;
int index_form_pos;
char **lemma_array;
context *ctx;
ctx = context_read_options(argc, argv);
maca_lemmatizer_check_options(ctx);
FILE *f = myfopen(ctx->conll_filename, "r");
lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht);
/* look for a valid word */
while(fgets(buffer, 10000, f)){
if(feof(f)) return 0; /* no more words to read */
if((buffer[0] == '\n') || (buffer[0] == ' ')){
printf("\n");
continue;
}
buffer[strlen(buffer)-1] = '\0';
printf("%s", buffer);
token = strtok(buffer, "\t");
column_nb = 0;
form = NULL;
pos = NULL;
do{
if((column_nb < ctx->mcd_struct->nb_col) && (ctx->mcd_struct->type[column_nb] == FEAT_TYPE_FORM))
form = strdup(token);
if((column_nb < ctx->mcd_struct->nb_col) && (ctx->mcd_struct->type[column_nb] == FEAT_TYPE_POS))
pos = strdup(token);
column_nb++;
} while((token = strtok(NULL , "\t")));
strcpy(form_pos, form);
strcat(form_pos, "/");
strcat(form_pos, pos);
index_form_pos = hash_get_val(form_pos_ht, form_pos);
if(index_form_pos != HASH_INVALID_VAL){
lemma = lemma_array[index_form_pos];
}
else{
to_lower_string(form_pos);
index_form_pos = hash_get_val(form_pos_ht, form_pos);
if(index_form_pos != HASH_INVALID_VAL){
lemma = lemma_array[index_form_pos];
}