Skip to content
Snippets Groups Projects
Commit cee7f975 authored by Alexis Nasr
Browse files

maca_trans_chunker a new tool for chunking

parent 7fd094da
No related branches found
No related tags found
No related merge requests found
Showing
with 419 additions and 12 deletions
......@@ -93,7 +93,7 @@ int main(int argc, char *argv[])
float end_array[100];
int path_index = 0;
int next_state;
int orfeo = 1;
int orfeo = 0;
char form[1000];
float start;
float end;
......
%{
#include<stdio.h>
#include"maca_tokenizer_functions_for_lex.h"
extern int defait_amalgames;
extern int offset;
extern int token_length;
extern char *token;
%}
%option prefix="en"
%option noyywrap
%%
[0-9]+\.[0-9]+ printf("%s", yytext);
[ \t]+ printf("\n");
\. printf("\n.");
\, printf("\n,");
don't printf("do\nnot");
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
[ \t]+ {maca_tokenizer_segment((char *)"", yytext);}
[ ]*\. {maca_tokenizer_segment((char *)".", yytext);}
[ ]*\? {maca_tokenizer_segment((char *)"?", yytext);}
[ ]*\! {maca_tokenizer_segment((char *)"!", yytext);}
[ ]*, {maca_tokenizer_segment((char *)",", yytext);}
[ ]*: {maca_tokenizer_segment((char *)":", yytext);}
[ ]*; {maca_tokenizer_segment((char *)";", yytext);}
[ ]*… {maca_tokenizer_segment((char *)"…", yytext);}
[ ]*\) {maca_tokenizer_segment((char *)")", yytext);}
[ ]*» {maca_tokenizer_segment((char *)"»", yytext);}
\( {maca_tokenizer_segment((char *)"((", yytext);}
\" {maca_tokenizer_segment((char *)"\"", yytext);}
« {maca_tokenizer_segment((char *)"«", yytext);}
[0-9]+\.[0-9]+ {maca_tokenizer_segment(yytext, yytext);}
don't printf("do\nnot\n");
don’t printf("do\nnot");
doesn't printf("does\nnot");
doesn’t printf("does\nnot");
......@@ -23,5 +41,6 @@ wanna printf("want\nto");
’s printf("\n's");
\n+ printf("\n");
. {maca_tokenizer_add_char_to_token(yytext[0]);}
%%
......@@ -5,9 +5,10 @@
extern int defait_amalgames;
/*extern int print_offset;
extern int print_token_length;*/
int offset = 0;
int token_length = 0;
char token[10000];
extern int defait_amalgames;
extern int offset;
extern int token_length;
extern char *token;
%}
%option prefix="fr"
......@@ -31,11 +32,12 @@ char token[10000];
[ ]*\) {maca_tokenizer_segment((char *)")", yytext);}
[ ]*» {maca_tokenizer_segment((char *)"»", yytext);}
\( {maca_tokenizer_segment((char *)"((", yytext);}
' {maca_tokenizer_segment((char *)"'", yytext);}
’ {maca_tokenizer_segment((char *)"'", yytext);}
\" {maca_tokenizer_segment((char *)"\"", yytext);}
« {maca_tokenizer_segment((char *)"«", yytext);}
[^ ]*' {maca_tokenizer_segment((char *)yytext, yytext);}
[^ ]*’ {maca_tokenizer_segment((char *)yytext, yytext);}
[0-9]+,[0-9]+ {maca_tokenizer_segment(yytext, yytext);}
-je {maca_tokenizer_segment((char *)"-je", yytext);}
......
......@@ -10,6 +10,11 @@ int defait_amalgames = 0;
int print_offset = 0;
int print_token_length = 0;
int offset = 0;
int token_length = 0;
char token[10000];
void maca_tokenizer_help_message(context *ctx)
{
context_general_help_message(ctx);
......
......@@ -3,18 +3,21 @@ set(SOURCES src/context.c
src/movement_parser_arc_eager.c
src/movement_tagparser_arc_eager.c
src/movement_tagger.c
src/movement_chunker.c
src/feat_fct.c
# src/global_feat_vec.c
# src/oracle_parser.c
src/oracle_parser_arc_eager.c
src/oracle_tagparser_arc_eager.c
src/oracle_tagger.c
src/oracle_chunker.c
# src/simple_decoder_parser.c
src/simple_decoder_parser_arc_eager.c
src/simple_decoder_tagparser_arc_eager.c
src/simple_decoder_parser_arc_eager_error_predictor.c
# src/simple_decoder_forrest.c
src/simple_decoder_tagger.c
src/simple_decoder_chunker.c
src/simple_decoder_tagger_error_predictor.c
# src/simple_decoder_tagger_bt.c
src/stack.c
......@@ -52,6 +55,12 @@ target_link_libraries(maca_trans_tagger_mcf2cff transparse)
target_link_libraries(maca_trans_tagger_mcf2cff maca_common)
install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin)
# maca_trans_chunker_mcf2cff: executable built from src/maca_trans_chunker_mcf2cff.c,
# linked against the perceptron, transparse and maca_common libraries and installed in bin.
add_executable(maca_trans_chunker_mcf2cff ./src/maca_trans_chunker_mcf2cff.c)
target_link_libraries(maca_trans_chunker_mcf2cff perceptron)
target_link_libraries(maca_trans_chunker_mcf2cff transparse)
target_link_libraries(maca_trans_chunker_mcf2cff maca_common)
install (TARGETS maca_trans_chunker_mcf2cff DESTINATION bin)
add_executable(maca_error_predictor_tagger_mcf2cff ./src/maca_error_predictor_tagger_mcf2cff.c)
target_link_libraries(maca_error_predictor_tagger_mcf2cff perceptron)
target_link_libraries(maca_error_predictor_tagger_mcf2cff transparse)
......@@ -136,6 +145,12 @@ target_link_libraries(maca_trans_tagger transparse)
target_link_libraries(maca_trans_tagger maca_common)
install (TARGETS maca_trans_tagger DESTINATION bin)
# maca_trans_chunker: executable built from src/maca_trans_chunker.c,
# linked against the perceptron, transparse and maca_common libraries and installed in bin.
add_executable(maca_trans_chunker ./src/maca_trans_chunker.c)
target_link_libraries(maca_trans_chunker perceptron)
target_link_libraries(maca_trans_chunker transparse)
target_link_libraries(maca_trans_chunker maca_common)
install (TARGETS maca_trans_chunker DESTINATION bin)
add_executable(maca_trans_morpho ./src/maca_trans_morpho.c)
target_link_libraries(maca_trans_morpho perceptron)
target_link_libraries(maca_trans_morpho transparse)
......
......@@ -14,6 +14,11 @@
#define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab"
#define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model"
/* Default chunker resource file names. maca_trans_chunker appends the model,
   vocabulary and feature-model names to the maca data path when the
   corresponding file name was not supplied. */
#define DEFAULT_MULTI_COL_DESC_CHUNKER_FILENAME "maca_trans_chunker.mcd"
#define DEFAULT_FEATURES_MODEL_CHUNKER_FILENAME "maca_trans_chunker.fm"
#define DEFAULT_VOCABS_CHUNKER_FILENAME "maca_trans_chunker.vocab"
#define DEFAULT_MODEL_CHUNKER_FILENAME "maca_trans_chunker.model"
#define DEFAULT_MULTI_COL_DESC_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.mcd"
#define DEFAULT_FEATURES_MODEL_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.fm"
#define DEFAULT_VOCABS_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.vocab"
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"context.h"
#include"feat_fct.h"
#include"feature_table.h"
#include"dico.h"
#include"beam.h"
#include"form2pos.h"
#include"simple_decoder_chunker.h"
/*#include"dnn_decoder.h"*/
#include"config2feat_vec.h"
/* Emits the usage text of maca_trans_chunker by calling, in order, the
   context help printers for each group of options it accepts. */
void decode_chunker_help_message(context *ctx)
{
    /* general, beam and conll options */
    context_general_help_message(ctx);
    context_beam_help_message(ctx);
    context_conll_help_message(ctx);

    /* options describing the input and the linguistic resources */
    fputs("INPUT\n", stderr);
    context_input_help_message(ctx);
    context_mcd_help_message(ctx);
    context_model_help_message(ctx);
    context_vocabs_help_message(ctx);
    context_features_model_help_message(ctx);
    context_f2p_filename_help_message(ctx);
}
/* Prints the help message and exits with status 1 when help was requested.
   No resource file name is validated here: the model, vocabulary and
   feature-model names are given defaults later by
   decode_chunker_set_linguistic_resources_filenames(). */
void decode_chunker_check_options(context *ctx)
{
    if(!ctx->help)
        return;

    decode_chunker_help_message(ctx);
    exit(1);
}
/* Returns a heap-allocated copy of the concatenation dir + name.
   The path is assembled with a bounded snprintf; if it does not fit in the
   local buffer the program reports the problem and exits instead of
   overflowing the stack. */
static char *decode_chunker_default_path(const char *dir, const char *name)
{
    char absolute_filename[500];
    int len = snprintf(absolute_filename, sizeof absolute_filename, "%s%s", dir, name);

    if(len < 0 || (size_t)len >= sizeof absolute_filename){
        fprintf(stderr, "ERROR: default resource path too long: %s%s\n", dir, name);
        exit(1);
    }
    return strdup(absolute_filename);
}

/* Fills in every resource file name that was not set in ctx with its default,
   i.e. ctx->maca_data_path followed by the default chunker file name.
   When the form-to-pos file name had to be defaulted, the file is also loaded
   into ctx->f2p. In verbose mode the resulting file names are printed on
   stderr. The mcd file name is intentionally given no default. */
void decode_chunker_set_linguistic_resources_filenames(context *ctx)
{
    if(!ctx->perc_model_filename)
        ctx->perc_model_filename = decode_chunker_default_path(ctx->maca_data_path, DEFAULT_MODEL_CHUNKER_FILENAME);

    if(!ctx->vocabs_filename)
        ctx->vocabs_filename = decode_chunker_default_path(ctx->maca_data_path, DEFAULT_VOCABS_CHUNKER_FILENAME);

    if(!ctx->features_model_filename)
        ctx->features_model_filename = decode_chunker_default_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_CHUNKER_FILENAME);

    if(!ctx->f2p_filename){
        ctx->f2p_filename = decode_chunker_default_path(ctx->maca_data_path, DEFAULT_F2P_FILENAME);
        ctx->f2p = form2pos_read(ctx->f2p_filename);
    }

    if(ctx->verbose){
        fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
        fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
        /* mcd_filename has no default and may be unset: never hand NULL to %s */
        fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
        fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
        fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
    }
}
/* Entry point of maca_trans_chunker: reads the options, loads the feature
   model and the vocabularies, then runs the simple chunker decoder when the
   beam width is 1. */
int main(int argc, char *argv[])
{
    context *ctx = context_read_options(argc, argv);

    decode_chunker_check_options(ctx);
    decode_chunker_set_linguistic_resources_filenames(ctx);

    /* load the linguistic resources named in the context */
    ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");

    /* NOTE(review): nothing is decoded for any other beam width - confirm this is intended */
    if(ctx->beam_width == 1){
        simple_decoder_chunker(ctx);
    }

    context_free(ctx);
    return 0;
}
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"movement_chunker.h"
#include"oracle_chunker.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"word_emb.h"
#include"config2feat_vec.h"
/* Emits the usage text of maca_trans_chunker_mcf2cff: the context help
   printers are called group by group, with section headers written on
   stderr. */
void maca_trans_chunker_mcf2cff_help_message(context *ctx)
{
    context_general_help_message(ctx);
    context_mode_help_message(ctx);
    context_sent_nb_help_message(ctx);
    context_mcd_help_message(ctx);

    fputs("INPUT\n", stderr);
    context_conll_help_message(ctx);

    fputs("IN TEST MODE\n", stderr);
    context_vocabs_help_message(ctx);

    fputs("OUTPUT\n", stderr);
    context_cff_help_message(ctx);

    fputs("IN TRAIN MODE\n", stderr);
    context_vocabs_help_message(ctx);
}
/* Validates the command line: an input file name and at least one output
   file name (cff or fann) are required and help must not have been
   requested. Otherwise the help message is printed and the program exits
   with status 1. The mcd file name is not required. */
void maca_trans_chunker_mcf2cff_check_options(context *ctx)
{
    if(ctx->input_filename
       && !ctx->help
       && (ctx->cff_filename || ctx->fann_filename))
        return;

    maca_trans_chunker_mcf2cff_help_message(ctx);
    exit(1);
}
/* Reads the corpus named by ctx->input_filename word by word and writes one
   training example per word to output_file: the tag returned by the oracle
   followed by the printed feature vector of the current configuration.
   The oracle tag is then applied to the configuration to advance to the next
   word.
   output_file is owned by the caller and is not closed here; every resource
   acquired by this function (feature vector, configuration, input file) is
   released before returning. */
void generate_training_file(FILE *output_file, context *ctx)
{
    config *c;
    feat_vec *fv = feat_vec_new(feature_types_nb);
    FILE *conll_file = myfopen(ctx->input_filename, "r");
    int tag;

    c = config_new(conll_file, ctx->mcd_struct, 5);

    while(!config_is_terminal(c)){
        config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
        tag = oracle_chunker(c);

        fprintf(output_file, "%d", tag);
        feat_vec_print(output_file, fv);

        movement_chunker(c, tag);
    }

    /* same release order as simple_decoder_chunker(): the configuration is
       freed before the stream it reads from is closed */
    feat_vec_free(fv);
    config_free(c);
    fclose(conll_file);
}
/* Entry point of maca_trans_chunker_mcf2cff: converts the input corpus into
   a file of training examples (oracle tag + feature vector per word).
   In train mode the vocabularies are extracted from the corpus and written
   to ctx->vocabs_filename at the end; in test mode they are read from that
   file. */
int main(int argc, char *argv[])
{
    context *ctx = context_read_options(argc, argv);
    FILE *output_file;

    maca_trans_chunker_mcf2cff_check_options(ctx);

    ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);

    /* vocabularies: built from the corpus (train) or loaded from file (test) */
    if(ctx->mode == TRAIN_MODE){
        mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
        ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
    }
    else if(ctx->mode == TEST_MODE){
        ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
        mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
    }

    feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);

    /* in train mode, create the feature dictionary for the perceptron */
    if(ctx->mode == TRAIN_MODE){
        ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
    }

    /* in test mode, fetch the feature dictionary from the loaded vocabularies */
    if(ctx->mode == TEST_MODE){
        ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
    }

    /* add the feature dictionary to the dico vector
       NOTE(review): this also runs in test mode, where the dictionary was
       just obtained from that same vector - confirm it is harmless */
    dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);

    /* examples go to the cff file when one is given, to stdout otherwise */
    output_file = ctx->cff_filename ? myfopen(ctx->cff_filename, "w") : stdout;

    generate_training_file(output_file, ctx);

    if(ctx->mode == TRAIN_MODE){
        dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
    }

    /* stdout is left open; only a file opened here is closed */
    if(ctx->cff_filename){
        fclose(output_file);
    }

    context_free(ctx);
    return 0;
}
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"util.h"
#include"movement_chunker.h"
/* Applies a chunker decision to configuration c: stores postag in the pos
   field of the current buffer word (b0) and moves the buffer one word to the
   right. Always returns 1.
   NOTE(review): the oracle reads the tag with word_get_A() while this
   movement writes it with word_set_pos() - confirm this is intended. */
int movement_chunker(config *c, int postag)
{
    word *current = word_buffer_b0(c->bf);

    word_set_pos(current, postag);
    word_buffer_move_right(c->bf);

    return 1;
}
/* Interface of the chunker movement. */
#ifndef __MOVEMENT_CHUNKER__
#define __MOVEMENT_CHUNKER__
#include"config.h"
#include"feat_vec.h"
/* Sets the pos field of the current buffer word of c to postag and moves the
   buffer one word to the right; returns 1. */
int movement_chunker(config *c, int postag);
#endif
#include"oracle_chunker.h"
/* Oracle of the chunker: returns the value of the A field of the current
   buffer word (b0) of configuration c. */
int oracle_chunker(config *c)
{
    word *current = word_buffer_b0(config_get_buffer(c));

    return word_get_A(current);
}
/* Interface of the chunker oracle. */
#ifndef __ORACLE_CHUNKER__
#define __ORACLE_CHUNKER__
#include<stdio.h>
#include<stdlib.h>
#include"config.h"
/* Returns the A field of the current buffer word of configuration c. */
int oracle_chunker(config *c);
#endif
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include<ctype.h>
#include"context.h"
#include"movement_chunker.h"
#include"feat_fct.h"
#include"config2feat_vec.h"
#include"feature_table.h"
#include"dico.h"
#include"mcd.h"
/* Prints word w on stdout, one line, with its predicted chunk tag.
   - When the mcd declares no A column (mcd_get_a_col() == -1), the raw input
     line of the word is printed followed by a tab and the tag string.
   - Otherwise the columns of the word are printed tab-separated, the A column
     being replaced by the tag string; if the input has too few columns to
     contain the A column, the tag string is appended as an extra column.
   The tag string is obtained from dico_bio with dico_int2string().

   Columns are counted as the non-empty tab-separated fields of w->input
   (runs of tabs count as one separator). The input is scanned in place with
   strspn()/strcspn(): nothing is allocated and the hidden state of strtok()
   is not touched, so the column printer may tokenize internally without
   disturbing the scan. */
void print_word(word *w, mcd *mcd_struct, dico *dico_bio, int tag)
{
    const int a_col = mcd_get_a_col(mcd_struct);
    const char *cursor;
    int col_nb = 0;

    if(a_col == -1){
        printf("%s\t%s\n", w->input, dico_int2string(dico_bio, tag));
        return;
    }

    /* skip leading separators, then visit one non-empty field per iteration */
    cursor = w->input;
    cursor += strspn(cursor, "\t");
    while(*cursor != '\0'){
        if(col_nb != 0) printf("\t");
        if(col_nb == a_col)
            printf("%s", dico_int2string(dico_bio, tag));
        else
            word_print_col_n(stdout, w, col_nb);
        col_nb++;

        cursor += strcspn(cursor, "\t"); /* end of the current field */
        cursor += strspn(cursor, "\t");  /* start of the next one    */
    }

    /* the A column lies beyond the last input column: append the tag */
    if(col_nb <= a_col)
        printf("\t%s", dico_int2string(dico_bio, tag));
    printf("\n");
}
/* Greedy chunker decoder. Reads words from ctx->input_filename (stdin when
   no file name is given); for each word, builds the feature vector of the
   current configuration, takes the best-scoring class of the model loaded
   from ctx->perc_model_filename, prints the word with that tag and applies
   the chunker movement. In debug mode the configuration and the first three
   entries of the model's class/score array are traced on stderr. */
void simple_decoder_chunker(context *ctx)
{
    feat_vec *features = feat_vec_new(feature_types_nb);
    FILE *input = stdin;
    if(ctx->input_filename)
        input = myfopen(ctx->input_filename, "r");
    feature_table *model = feature_table_load(ctx->perc_model_filename, ctx->verbose);
    dico *chunk_labels = dico_vec_get_dico(ctx->vocabs, (char *)"A");
    config *c = config_new(input, ctx->mcd_struct, 5);
    float best_score;

    while(!config_is_terminal(c)){
        word *current = word_buffer_b0(c->bf);
        /* the tag is never taken from the input: -1 means "to be predicted" */
        int tag = -1;

        if(ctx->debug_mode){
            fprintf(stderr, "***********************************\n");
            config_print(stderr, c);
        }

        /* if tag is not specified in input it is predicted */
        if(tag == -1){
            config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, features, LOOKUP_MODE);
            tag = feature_table_argmax(features, model, &best_score);

            if(ctx->debug_mode){
                vcode *candidates = feature_table_get_vcode_array(features, model);
                /* NOTE(review): assumes the array holds at least 3 entries - confirm */
                for(int rank = 0; rank < 3; rank++){
                    fprintf(stderr, "%d\t", rank);
                    fprintf(stderr, "%s\t%.4f\n", dico_int2string(chunk_labels, candidates[rank].class_code), candidates[rank].score);
                }
                free(candidates);
            }
        }

        print_word(current, ctx->mcd_struct, chunk_labels, tag);
        movement_chunker(c, tag);
    }

    feat_vec_free(features);
    feature_table_free(model);
    config_free(c);
    /* stdin is left open; only a file opened here is closed */
    if(ctx->input_filename)
        fclose(input);
}
/* Interface of the greedy chunker decoder. */
#ifndef __SIMPLE_DECODER_CHUNKER__
#define __SIMPLE_DECODER_CHUNKER__
#include "context.h"
/* Tags every word read from ctx->input_filename (stdin when it is not set)
   and prints each word with its predicted tag on stdout. */
void simple_decoder_chunker(context *ctx);
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment