Skip to content
Snippets Groups Projects
Commit c1b334fd authored by Alexis Nasr
Browse files

implemented a simple morphological analyzer

parent da28bdb4
No related branches found
No related tags found
No related merge requests found
......@@ -28,7 +28,7 @@ add_subdirectory(maca_common)
add_subdirectory(maca_tools)
add_subdirectory(perceptron)
#add_subdirectory(maca_lemmatizer)
add_subdirectory(maca_morpho)
#add_subdirectory(maca_morpho)
add_subdirectory(maca_tokenizer)
add_subdirectory(maca_lexer)
add_subdirectory(maca_trans_parser)
......
......@@ -199,6 +199,7 @@ mcd *mcd_build_conll07(void);
mcd *mcd_build_ifpls(void);
mcd *mcd_build_wplgf(void);
mcd *mcd_build_wplgfs(void);
/* builds a seven column description: FORM, POS, FEATS, LEMMA, GOV, LABEL, SENT_SEG (columns 0 to 6) */
mcd *mcd_build_wpmlgfs(void);
mcd *mcd_read(char *mcd_filename, int verbose);
void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose);
......
......@@ -422,6 +422,63 @@ mcd *mcd_build_wplgfs(void)
return m;
}
/* Builds a seven column multi-column description (mcd) with the layout
   FORM, POS, FEATS, LEMMA, GOV, LABEL, SENT_SEG (columns 0 to 6).
   GOV and SENT_SEG use the integer representation, every other column uses
   a vocabulary. Each column gets the placeholder file name "_".
   Returns the structure obtained from mcd_new(). */
mcd *mcd_build_wpmlgfs(void)
{
  /* one entry per column, listed in column order */
  static const struct {
    int wf;
    const char *label;
    int representation;
  } layout[] = {
    { MCD_WF_FORM,     "FORM",     MCD_REPRESENTATION_VOCAB },
    { MCD_WF_POS,      "POS",      MCD_REPRESENTATION_VOCAB },
    { MCD_WF_FEATS,    "FEATS",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_LEMMA,    "LEMMA",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_GOV,      "GOV",      MCD_REPRESENTATION_INT   },
    { MCD_WF_LABEL,    "LABEL",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_SENT_SEG, "SENT_SEG", MCD_REPRESENTATION_INT   }
  };
  const int columns_nb = (int)(sizeof layout / sizeof layout[0]);
  mcd *description = mcd_new(columns_nb);
  int column;

  for(column = 0; column < columns_nb; column++){
    description->wf[column] = layout[column].wf;
    description->wf_str[column] = strdup(layout[column].label);
    description->representation[column] = layout[column].representation;
    description->filename[column] = strdup("_");
    /* reverse index: word feature -> column holding it */
    description->wf2col[layout[column].wf] = column;
  }
  return description;
}
/* returns a dico_vec containing the different dictionaries found in an mcd structure */
......
......@@ -125,6 +125,7 @@ int main(int argc, char *argv[])
char *buffer_copy;
char *form;
char *pos;
char *feats;
char *token;
int column_nb;
......@@ -136,11 +137,16 @@ int main(int argc, char *argv[])
int form_column;
int pos_column;
int lemma_column;
int feats_column;
FILE *f = NULL;
ctx = context_read_options(argc, argv);
maca_lemmatizer_check_options(ctx);
feats_column = ctx->mcd_struct->wf2col[MCD_WF_FEATS];
if(ctx->pos_column != -1)
pos_column = ctx->pos_column;
else
......@@ -177,6 +183,7 @@ int main(int argc, char *argv[])
form = NULL;
pos = NULL;
lemma = NULL;
feats = NULL;
do{
if(column_nb == lemma_column) /* lemma is present in the input file */
if(strcmp(token, "_")) /* and it is not an underscore */
......@@ -188,6 +195,9 @@ int main(int argc, char *argv[])
if(column_nb == pos_column){
pos = strdup(token);
}
if(column_nb == feats_column){
feats = strdup(token);
}
column_nb++;
} while((token = strtok(NULL , "\t")));
......@@ -215,11 +225,13 @@ int main(int argc, char *argv[])
/* print_word(buffer, ctx->mcd_struct, lemma); */
/* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma); */
printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma);
printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma);
printf("\t%s\n", lemma);
if(pos)free(pos);
if(form)free(form);
if(feats)free(feats);
}
free(buffer_copy);
free(lemma_array);
......
......@@ -40,6 +40,12 @@ target_link_libraries(maca_trans_tagger_mcf2cff transparse)
target_link_libraries(maca_trans_tagger_mcf2cff maca_common)
install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin)
add_executable(maca_trans_morpho_mcf2cff ./src/maca_trans_morpho_mcf2cff.c)
target_link_libraries(maca_trans_morpho_mcf2cff perceptron)
target_link_libraries(maca_trans_morpho_mcf2cff transparse)
target_link_libraries(maca_trans_morpho_mcf2cff maca_common)
install (TARGETS maca_trans_morpho_mcf2cff DESTINATION bin)
#add_executable(maca_trans_tagger_mcf2cff_bt ./src/maca_trans_tagger_mcf2cff_bt.c)
#target_link_libraries(maca_trans_tagger_mcf2cff_bt perceptron)
#target_link_libraries(maca_trans_tagger_mcf2cff_bt transparse)
......@@ -100,6 +106,12 @@ target_link_libraries(maca_trans_tagger transparse)
target_link_libraries(maca_trans_tagger maca_common)
install (TARGETS maca_trans_tagger DESTINATION bin)
add_executable(maca_trans_morpho ./src/maca_trans_morpho.c)
target_link_libraries(maca_trans_morpho perceptron)
target_link_libraries(maca_trans_morpho transparse)
target_link_libraries(maca_trans_morpho maca_common)
install (TARGETS maca_trans_morpho DESTINATION bin)
#add_executable(maca_trans_tagger_bt ./src/maca_trans_tagger_bt.c)
#target_link_libraries(maca_trans_tagger_bt perceptron)
#target_link_libraries(maca_trans_tagger_bt transparse)
......
......@@ -283,7 +283,8 @@ context *context_read_options(int argc, char *argv[])
if(ctx->mcd_filename)
ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
else
ctx->mcd_struct = mcd_build_wplgfs();
ctx->mcd_struct = mcd_build_wpmlgfs();
/* ctx->mcd_struct = mcd_build_wplgfs(); */
/* initialize maca_data_path field */
......
......@@ -14,6 +14,11 @@
#define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab"
#define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model"
/* default resource file names for maca_trans_morpho (morphological analyzer):
   multi-column description, features model, vocabularies and perceptron model */
#define DEFAULT_MULTI_COL_DESC_MORPHO_FILENAME "maca_trans_morpho.mcd"
#define DEFAULT_FEATURES_MODEL_MORPHO_FILENAME "maca_trans_morpho.fm"
#define DEFAULT_VOCABS_MORPHO_FILENAME "maca_trans_morpho.vocab"
#define DEFAULT_MODEL_MORPHO_FILENAME "maca_trans_morpho.model"
#define DEFAULT_MULTI_COL_DESC_TAGPARSER_FILENAME "maca_trans_tagparser.mcd"
#define DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.fm"
#define DEFAULT_VOCABS_TAGPARSER_FILENAME "maca_trans_tagparser.vocab"
......
......@@ -46,6 +46,7 @@ char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, in
{
char form[1000];
char pos[1000];
char lemma[1000];
char morpho[1000];
int num = 0;
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"context.h"
#include"feat_fct.h"
#include"feature_table.h"
#include"dico.h"
#include"config2feat_vec.h"
void decode_morpho_help_message(context *ctx);

/* Writes the usage text of the decoder by chaining the help printers of the
   individual option groups: general, beam and conll options first, then an
   "INPUT" heading (on stderr) followed by the input related options. */
void decode_morpho_help_message(context *ctx)
{
  /* general options */
  context_general_help_message(ctx);
  context_beam_help_message(ctx);
  context_conll_help_message(ctx);

  /* input related options */
  fputs("INPUT\n", stderr);
  context_input_help_message(ctx);
  context_mcd_help_message(ctx);
  context_model_help_message(ctx);
  context_vocabs_help_message(ctx);
  context_features_model_help_message(ctx);
  context_f2p_filename_help_message(ctx);
}
/* Prints the help message and exits with status 1 when help was requested.
   Help is the only condition tested: no file name option is validated here. */
void decode_morpho_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  decode_morpho_help_message(ctx);
  exit(1);
}
/* Returns a newly allocated string holding `dir` immediately followed by
   `name` (no separator is inserted, exactly like the strcpy/strcat pair it
   replaces). The buffer is sized from the actual lengths, so a long data path
   cannot overflow it. Exits with status 1 when memory is exhausted. */
static char *morpho_default_resource_path(const char *dir, const char *name)
{
  size_t dir_len = strlen(dir);
  size_t name_len = strlen(name);
  /* cast kept so that the file also builds with a C++ compiler */
  char *path = (char *)malloc(dir_len + name_len + 1);

  if(!path){
    fprintf(stderr, "out of memory while building a default resource file name\n");
    exit(1);
  }
  memcpy(path, dir, dir_len);
  memcpy(path + dir_len, name, name_len + 1); /* copies the terminating '\0' too */
  return path;
}

/* Fills in the resource file names that were not given by the user.
   For the perceptron model, the vocabularies and the features model, a missing
   name is replaced by ctx->maca_data_path followed by the corresponding
   DEFAULT_*_MORPHO_FILENAME constant. ctx->mcd_filename is left untouched.
   In verbose mode the resulting names are traced on stderr. */
void decode_morpho_set_linguistic_resources_filenames(context *ctx)
{
  if(!ctx->perc_model_filename)
    ctx->perc_model_filename = morpho_default_resource_path(ctx->maca_data_path, DEFAULT_MODEL_MORPHO_FILENAME);

  if(!ctx->vocabs_filename)
    ctx->vocabs_filename = morpho_default_resource_path(ctx->maca_data_path, DEFAULT_VOCABS_MORPHO_FILENAME);

  if(!ctx->features_model_filename)
    ctx->features_model_filename = morpho_default_resource_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_MORPHO_FILENAME);

  if(ctx->verbose){
    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
    /* mcd_filename receives no default in this function and may be unset:
       passing a null pointer to %s is undefined behaviour */
    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
  }
}
/* Prints one word on stdout, with the morphological features designated by
   the code `postag` (despite its name, the parameter carries a code that is
   looked up in dico_morph) written in the FEATS column.

   - When the mcd has no POS column or no FEATS column, the raw input line is
     printed followed by a tabulation and the features string.
     The POS test is kept for backward compatibility. The FEATS test is new:
     without it, an mcd with a POS column but no FEATS column made the loop
     below print every input column and silently drop the prediction.
   - Otherwise the input columns are printed one by one and the FEATS column
     is replaced by the features string. If the input has too few columns to
     reach the FEATS column, the string is appended after the last column. */
void print_word(word *w, mcd *mcd_struct, dico *dico_morph, int postag)
{
  char *buffer = NULL;
  char *token = NULL;
  int col_nb = 0;
  int feats_col = mcd_get_feats_col(mcd_struct); /* looked up once, not once per column */

  if(mcd_get_pos_col(mcd_struct) == -1 || feats_col == -1){
    printf("%s\t%s\n", w->input, dico_int2string(dico_morph, postag));
    return;
  }

  /* strtok() modifies its argument: work on a private copy of the input line */
  buffer = strdup(w->input);
  if(!buffer){
    fprintf(stderr, "print_word: out of memory\n");
    exit(1);
  }

  /* the tokens themselves are not printed, they only count the input columns */
  for(token = strtok(buffer, "\t"); token; token = strtok(NULL, "\t")){
    if(col_nb != 0) printf("\t");
    if(col_nb == feats_col)
      printf("%s", dico_int2string(dico_morph, postag));
    else
      word_print_col_n(stdout, w, col_nb);
    col_nb++;
  }

  /* input shorter than the FEATS column: append the features */
  if(col_nb <= feats_col)
    printf("\t%s", dico_int2string(dico_morph, postag));
  printf("\n");
  free(buffer);
}
/* Single transition of the morphological analyzer: stores the features code
   `feats` on b0 (the word returned by word_buffer_b0 for the buffer c->bf),
   then calls word_buffer_move_right() on that buffer.
   Always returns 1. */
int movement_morpho(config *c, int feats)
{
word_set_feats(word_buffer_b0(c->bf), feats);
word_buffer_move_right(c->bf);
return 1;
}
/* Decoder used when the beam width is 1.
   Reads words from ctx->input_filename (stdin when no file name is given).
   For every word b0 of the configuration, the features code found in the
   input is kept, or, when it is -1 (not specified), a code is obtained from
   the perceptron model through feature_table_argmax(). The word is printed
   with print_word() and the configuration advanced with movement_morpho().
   In debug mode the configuration and three scored classes are traced on
   stderr. */
void simple_decoder_morpho(context *ctx)
{
  feat_vec *features = feat_vec_new(feature_types_nb);
  FILE *input = stdin;
  feature_table *model;
  dico *feats_vocab;
  config *cfg;
  word *current;
  float best_score;
  int feats;

  if(ctx->input_filename)
    input = myfopen(ctx->input_filename, "r");
  model = feature_table_load(ctx->perc_model_filename, ctx->verbose);
  feats_vocab = dico_vec_get_dico(ctx->vocabs, (char *)"FEATS");
  cfg = config_new(input, ctx->mcd_struct, 5);

  for(;;){
    if(config_is_terminal(cfg))
      break;

    current = word_buffer_b0(cfg->bf);
    feats = word_get_feats(current);

    if(ctx->debug_mode){
      fprintf(stderr, "***********************************\n");
      config_print(stderr, cfg);
    }

    /* -1 means that the input gave no features for this word: predict them */
    if(feats == -1){
      config2feat_vec_cff(ctx->features_model, cfg, ctx->d_perceptron_features, features, LOOKUP_MODE);
      feats = feature_table_argmax(features, model, &best_score);

      if(ctx->debug_mode){
        /* trace the first three entries of the scored class array */
        vcode *ranking = feature_table_get_vcode_array(features, model);
        int rank;

        for(rank = 0; rank < 3; rank++)
          fprintf(stderr, "%d\t%s\t%.4f\n", rank,
                  dico_int2string(feats_vocab, ranking[rank].class_code),
                  ranking[rank].score);
        free(ranking);
      }
    }

    print_word(current, ctx->mcd_struct, feats_vocab, feats);
    movement_morpho(cfg, feats);
  }

  feat_vec_free(features);
  feature_table_free(model);
  config_free(cfg);
  /* stdin is not ours to close */
  if(ctx->input_filename)
    fclose(input);
}
/* Entry point of the morphological analyzer.
   Reads the options, completes the missing resource file names with their
   defaults, loads the features model and the vocabularies, links the mcd to
   the vocabularies and runs the decoder.
   Only a beam width of 1 is implemented. Any other value used to make the
   program exit successfully without reading its input or producing output.
   It is now reported on stderr and the program returns 1.
   Returns 0 on success. */
int main(int argc, char *argv[])
{
  int status = 0;
  context *ctx = context_read_options(argc, argv);

  decode_morpho_check_options(ctx);
  decode_morpho_set_linguistic_resources_filenames(ctx);

  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");

  if(ctx->beam_width == 1)
    simple_decoder_morpho(ctx);
  else{
    fprintf(stderr, "maca_trans_morpho: unsupported beam width, only a beam width of 1 is implemented\n");
    status = 1;
  }

  context_free(ctx);
  return status;
}
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"config2feat_vec.h"
/* Oracle used to label the training examples: returns the features code
   stored on b0, the word returned by word_buffer_b0 for the buffer of the
   configuration. */
int oracle_morpho(config *c)
{
  word *front = word_buffer_b0(config_get_buffer(c));

  return word_get_feats(front);
}
/* Single transition of the morphological analyzer: stores the features code
   `feats` on b0 (the word returned by word_buffer_b0 for the buffer c->bf),
   then calls word_buffer_move_right() on that buffer.
   Always returns 1. */
int movement_morpho(config *c, int feats)
{
word_set_feats(word_buffer_b0(c->bf), feats);
word_buffer_move_right(c->bf);
return 1;
}
/* Writes the usage text of the training file generator by chaining the help
   printers of the individual option groups. The section headings are written
   on stderr. */
void maca_trans_morpho_mcf2cff_help_message(context *ctx)
{
  /* options common to both modes */
  context_general_help_message(ctx);
  context_mode_help_message(ctx);
  context_sent_nb_help_message(ctx);
  context_mcd_help_message(ctx);

  fputs("INPUT\n", stderr);
  context_conll_help_message(ctx);

  fputs("IN TEST MODE\n", stderr);
  context_vocabs_help_message(ctx);

  fputs("OUTPUT\n", stderr);
  context_cff_help_message(ctx);

  fputs("IN TRAIN MODE\n", stderr);
  context_vocabs_help_message(ctx);
}
/* Validates the command line: prints the help message and exits with status 1
   when help was requested, when no input file was given, or when neither a
   cff nor a fann output file name was given. The mcd file name is optional. */
void maca_trans_morpho_mcf2cff_check_options(context *ctx)
{
  const int input_missing = !ctx->input_filename;
  const int output_missing = !ctx->cff_filename && !ctx->fann_filename;

  if(!input_missing && !ctx->help && !output_missing)
    return;

  maca_trans_morpho_mcf2cff_help_message(ctx);
  exit(1);
}
/* Writes one training example per input word on output_file.
   The words are read from ctx->input_filename. For each configuration, the
   feature vector is computed with config2feat_vec_cff() (in the mode given by
   ctx->mode), the features code returned by oracle_morpho() is written,
   immediately followed by the feature vector (feat_vec_print), and the
   configuration is advanced with movement_morpho().
   output_file is owned by the caller and is not closed here. The input
   file, the feature vector and the configuration are created here and are
   released before returning (they used to be leaked). */
void morpho_generate_training_file(FILE *output_file, context *ctx)
{
  feat_vec *fv = feat_vec_new(feature_types_nb);
  FILE *conll_file = myfopen(ctx->input_filename, "r");
  config *c = config_new(conll_file, ctx->mcd_struct, 5);
  int feats;

  while(!config_is_terminal(c)){
    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
    feats = oracle_morpho(c);

    fprintf(output_file, "%d", feats);
    feat_vec_print(output_file, fv);

    movement_morpho(c, feats);
  }

  /* the configuration is freed before the stream it was created on is closed */
  feat_vec_free(fv);
  config_free(c);
  fclose(conll_file);
}
/* Entry point of the training file generator.
   - TRAIN mode: the vocabularies are extracted from the input corpus and a
     new perceptron feature dictionary is created; the vocabularies are
     written to ctx->vocabs_filename once the examples have been generated.
   - TEST mode: the vocabularies are read from ctx->vocabs_filename and the
     perceptron feature dictionary is taken from them.
   The examples are written to ctx->cff_filename, or to stdout when no cff
   file name was given. Returns 0. */
int main(int argc, char *argv[])
{
  context *ctx = context_read_options(argc, argv);
  FILE *output_file;

  maca_trans_morpho_mcf2cff_check_options(ctx);

  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);

  if(ctx->mode == TRAIN_MODE){
    /* vocabularies come from the corpus, the feature dictionary starts empty */
    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
  }
  else if(ctx->mode == TEST_MODE){
    /* vocabularies and feature dictionary are read back from the vocabs file */
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
  }

  /* add the feature dictionary to the dico vector */
  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);

  /* open the output file */
  output_file = ctx->cff_filename ? myfopen(ctx->cff_filename, "w") : stdout;

  morpho_generate_training_file(output_file, ctx);

  if(ctx->mode == TRAIN_MODE)
    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);

  /* stdout is not ours to close */
  if(ctx->cff_filename)
    fclose(output_file);

  context_free(ctx);
  return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment