Skip to content
Snippets Groups Projects
Commit fa7d3b1f authored by Alexis Nasr's avatar Alexis Nasr
Browse files

Added maca_trans_attach_punct, a post-parsing program that attaches punctuation.

parent ad724099
No related branches found
No related tags found
No related merge requests found
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"word_emb.h"
#include"config2feat_vec.h"
/*
 * Writes the usage text of maca_trans_attach_punct.
 *
 * The section titles are written to stderr; the option descriptions are
 * delegated to the context_*_help_message() helpers, which all receive ctx.
 */
void maca_trans_attach_punct_help_message(context *ctx)
{
    /* options common to every mode */
    context_general_help_message(ctx);
    context_mode_help_message(ctx);
    context_sent_nb_help_message(ctx);

    /* input section */
    fputs("INPUT\n", stderr);
    context_conll_help_message(ctx);

    /* test-mode section */
    fputs("IN TEST MODE\n", stderr);
    context_vocabs_help_message(ctx);

    /* output section */
    fputs("OUTPUT\n", stderr);
    context_cff_help_message(ctx);

    /* train-mode section */
    fputs("IN TRAIN MODE\n", stderr);
    context_vocabs_help_message(ctx);
    context_root_label_help_message(ctx);
    context_punct_label_help_message(ctx);
}
/*
 * Validates the command line options stored in ctx.
 *
 * When no input file was given, or when help was requested, the usage text
 * is printed and the process terminates with exit status 1. Otherwise the
 * function simply returns.
 */
void maca_trans_attach_punct_check_options(context *ctx)
{
    /* an input file and no help request: nothing to complain about */
    if(ctx->input_filename && !ctx->help)
        return;

    /* NOTE(review): checks on mcd_filename and cff_filename/fann_filename
       were disabled in the original code and are still not performed. */
    maca_trans_attach_punct_help_message(ctx);
    exit(1);
}
/*
 * Searches for position `target` along the chain of left-pointing governors
 * that starts at the word immediately left of the current word of `wb`.
 *
 * At each step the gov value of the word at `position` is added to
 * `position`; the walk continues while that value is negative and the new
 * position is still >= 0.
 *
 * Returns the (negative) number of the step at which `target` was reached:
 * -1 if the starting position is the target, -2 for the next position on
 * the chain, and so on. Returns 0 when the target is not on the chain.
 *
 * The bounds test is performed before the word is fetched, so the buffer is
 * never queried with a negative index (which happened on the first step
 * when the current word is the first one of the buffer).
 */
int look_left_for_target(word_buffer *wb, int target)
{
    int position = word_buffer_get_current_index(wb) - 1;
    int gov;
    int step;

    for(step = -1; ; step--){
        if(position == target)
            return step;

        /* nothing to read left of the first slot */
        if(position < 0)
            break;

        gov = word_get_gov(word_buffer_get_word_n(wb, position));
        position += gov;

        /* stop when the governor is not to the left, or when following it
           leaves the buffer */
        if((gov >= 0) || (position < 0))
            break;
    }
    return 0;
}
/*
 * Searches for position `target` along the chain of right-pointing governors
 * that starts at the word immediately right of the current word of `wb`.
 *
 * At each step the gov value of the word at `position` is added to
 * `position`; the walk continues while that value is positive and the new
 * position is still below word_buffer_get_nbelem(wb).
 *
 * Returns the (positive) number of the step at which `target` was reached:
 * 1 if the starting position is the target, 2 for the next position on the
 * chain, and so on. Returns 0 when the target is not on the chain.
 *
 * The bounds test is performed before the word is fetched, so the buffer is
 * never queried with an index >= nbelem (which happened on the first step
 * when the current word is the last one of the buffer).
 */
int look_right_for_target(word_buffer *wb, int target)
{
    int position = word_buffer_get_current_index(wb) + 1;
    int gov;
    int step;

    for(step = 1; ; step++){
        if(position == target)
            return step;

        /* nothing to read right of the last slot */
        if(position >= word_buffer_get_nbelem(wb))
            break;

        gov = word_get_gov(word_buffer_get_word_n(wb, position));
        position += gov;

        /* stop when the governor is not to the right, or when following it
           leaves the buffer */
        if((gov <= 0) || (position >= word_buffer_get_nbelem(wb)))
            break;
    }
    return 0;
}
/*
 * Returns the position reached by following left-pointing governors,
 * starting from the word immediately left of the current word of `wb`.
 *
 * Starting at current_index - 1, the gov value of the word at `position`
 * is added to `position` for as long as that value is negative. The walk
 * stops at the first word whose gov value is >= 0, and that position is
 * returned.
 *
 * The returned position is negative when there is no word to the left of
 * the current one, or when a governor points before the start of the
 * buffer; callers must be prepared for that.
 *
 * The bounds test is performed before the word is fetched, so the buffer is
 * never queried with a negative index.
 */
int get_left_attachement_site(word_buffer *wb)
{
    int position = word_buffer_get_current_index(wb) - 1;
    int gov;

    while(position >= 0){
        gov = word_get_gov(word_buffer_get_word_n(wb, position));
        if(gov >= 0)
            break;          /* governor is not to the left: top of the chain */
        position += gov;
    }
    return position;
}
/*
 * Returns the position reached by following right-pointing governors,
 * starting from the word immediately right of the current word of `wb`.
 *
 * Starting at current_index + 1, the gov value of the word at `position`
 * is added to `position` for as long as that value is positive. The walk
 * stops at the first word whose gov value is <= 0, and that position is
 * returned.
 *
 * The returned position is >= word_buffer_get_nbelem(wb) when there is no
 * word to the right of the current one, or when a governor points past the
 * end of the buffer; callers must be prepared for that.
 *
 * The bounds test is performed before the word is fetched, so the buffer is
 * never queried with an index >= nbelem.
 */
int get_right_attachement_site(word_buffer *wb)
{
    int position = word_buffer_get_current_index(wb) + 1;
    int gov;

    while(position < word_buffer_get_nbelem(wb)){
        gov = word_get_gov(word_buffer_get_word_n(wb, position));
        if(gov <= 0)
            break;          /* governor is not to the right: top of the chain */
        position += gov;
    }
    return position;
}
/*
 * Writes, for every punctuation word of the input corpus, the class that
 * describes where this word is attached.
 *
 * output_file  stream that receives the "class = ..." lines; stdout is used
 *              when it is NULL.
 * ctx          provides input_filename, mcd_struct, dico_labels,
 *              punct_label and sent_nb.
 *
 * The corpus named by ctx->input_filename is loaded into a word buffer and
 * scanned from left to right. For each word whose label equals the code of
 * ctx->punct_label, one of the following lines is written:
 *   "class = HL"   its governor position is get_left_attachement_site()
 *   "class = HR"   its governor position is get_right_attachement_site()
 *   "class = <n>"  otherwise, <n> being the result of
 *                  look_left_for_target() (gov < 0) or
 *                  look_right_for_target() (gov > 0)
 * Nothing is written for a punctuation word whose gov value is 0 and whose
 * governor position matches neither attachment site.
 *
 * The lines go to output_file: the parameter used to be ignored and
 * everything was printed on stdout. The input file is no longer opened a
 * second time without being closed, and no unused feature vector is
 * allocated.
 */
void generate_training_file(FILE *output_file, context *ctx)
{
    FILE *out = (output_file != NULL) ? output_file : stdout;
    int punct_label = dico_string2int(ctx->dico_labels, (char *) ctx->punct_label);
    word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct);

    /* NOTE(review): sentence_nb is never incremented, so ctx->sent_nb only
       decides whether the scan runs at all (sent_nb > 0), it does not limit
       the number of sentences processed. Counting sentences needs a
       sentence-boundary test that is not visible here. */
    int sentence_nb = 0;

    while(!word_buffer_end(ref) && (sentence_nb < ctx->sent_nb)){
        word *b0 = word_buffer_b0(ref);

        if(word_get_label(b0) == punct_label){
            int gov = word_get_gov(b0);
            /* position of the governor of b0: gov is added to the index */
            int target = word_get_index(b0) + gov;
            int left_attachement_site = get_left_attachement_site(ref);
            int right_attachement_site = get_right_attachement_site(ref);

            if(target == left_attachement_site){
                fprintf(out, "class = HL\n");
            }
            else if(target == right_attachement_site){
                fprintf(out, "class = HR\n");
            }
            else if(gov < 0){
                fprintf(out, "class = %d\n", look_left_for_target(ref, target));
            }
            else if(gov > 0){
                fprintf(out, "class = %d\n", look_right_for_target(ref, target));
            }
        }
        word_buffer_move_right(ref);
    }
    /* NOTE(review): ref is never released; the matching release function of
       word_buffer is not visible in this file - confirm and call it here. */
}
/*
 * Entry point of maca_trans_attach_punct.
 *
 * Reads the options, prepares the vocabularies according to the mode
 * (built from the corpus in TRAIN_MODE, read from ctx->vocabs_filename in
 * TEST_MODE), looks up the "LABEL" dictionary, then runs
 * generate_training_file() on ctx->cff_filename when it is set, on stdout
 * otherwise.
 *
 * Returns 0 on success, 1 when the label dictionary cannot be found or when
 * the output file cannot be opened. Invalid options terminate the process
 * inside maca_trans_attach_punct_check_options().
 */
int main(int argc, char *argv[])
{
    context *ctx;
    FILE *output_file;

    ctx = context_read_options(argc, argv);
    maca_trans_attach_punct_check_options(ctx);

    if(ctx->mode == TRAIN_MODE){
        mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
        ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
    }
    else if(ctx->mode == TEST_MODE){
        ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
        mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
    }

    ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
    if(ctx->dico_labels == NULL){
        fprintf(stderr, "cannot find label names\n");
        return 1;
    }

    /* in train mode create feature dictionary for perceptron */
    if(ctx->mode == TRAIN_MODE)
        ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);

    /* in test mode read feature dictionary for perceptron */
    if(ctx->mode == TEST_MODE)
        ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");

    /* NOTE(review): the feature dictionary is not added to ctx->vocabs and
       the vocabularies are not written back in train mode; both steps were
       disabled in the original code. */

    /* open output file */
    output_file = stdout;
    if(ctx->cff_filename){
        output_file = myfopen_no_exit(ctx->cff_filename, "w");
        /* presumably myfopen_no_exit returns NULL on failure, like fopen -
           confirm. Without this test a failed open ended in fclose(NULL). */
        if(output_file == NULL){
            fprintf(stderr, "cannot open output file %s\n", ctx->cff_filename);
            context_free(ctx);
            return 1;
        }
    }

    generate_training_file(output_file, ctx);

    if(output_file != stdout)
        fclose(output_file);

    context_free(ctx);
    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment