diff --git a/maca_trans_parser/src/maca_check_projectivity.c b/maca_trans_parser/src/maca_check_projectivity.c new file mode 100644 index 0000000000000000000000000000000000000000..cdb899afddc37906f96feb5c1e429dda1d425d3e --- /dev/null +++ b/maca_trans_parser/src/maca_check_projectivity.c @@ -0,0 +1,116 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"context.h" +#include"feat_vec.h" +#include"dico_vec.h" +#include"word_emb.h" +#include"config2feat_vec.h" + +void maca_check_projectivity_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_mode_help_message(ctx); + context_sent_nb_help_message(ctx); + + fprintf(stderr, "INPUT\n"); + context_conll_help_message(ctx); + fprintf(stderr, "IN TEST MODE\n"); + context_vocabs_help_message(ctx); + + fprintf(stderr, "OUTPUT\n"); + context_cff_help_message(ctx); + fprintf(stderr, "IN TRAIN MODE\n"); + context_vocabs_help_message(ctx); + + + context_root_label_help_message(ctx); + +} + +void maca_check_projectivity_check_options(context *ctx) +{ + if(!ctx->input_filename + || ctx->help + /* || !ctx->mcd_filename */ + /* || !(ctx->cff_filename || ctx->fann_filename) */ + ){ + maca_check_projectivity_help_message(ctx); + exit(1); + } +} + +int main(int argc, char *argv[]) +{ + context *ctx; + word_buffer *wb = NULL; + word *w; + int dep_index, gov_index, min_index, max_index, w_index; + int word_nb = 0; + int word_non_proj = 0; + int *non_proj_array = NULL; + dico *dico_labels; + + ctx = context_read_options(argc, argv); + maca_check_projectivity_check_options(ctx); + + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); + ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); + dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + + // dico_labels = mcd_get_dico_label(ctx->mcd_struct); + non_proj_array = (int *)malloc(dico_labels->nbelem * sizeof(int)); + for(int i = 0; i < dico_labels->nbelem; i++){ + non_proj_array[i] = 0; + } + + wb = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct); + while(!word_buffer_end(wb)){ + dep_index = word_get_index(word_buffer_b0(wb)); + // printf("dep_index = %d\n", dep_index); + gov_index = word_get_gov(word_buffer_b0(wb)) + dep_index; + if(gov_index < dep_index){ + min_index = gov_index; + max_index = dep_index; + } + else{ + min_index = dep_index; + max_index = gov_index; + } + for(w_index = min_index + 1; w_index < max_index; w_index++){ + w = word_buffer_get_word_n(wb, w_index); + if(!((word_get_gov(w) + w_index <= max_index) && (word_get_gov(w) + w_index >= min_index))){ + word_non_proj++; + + // non_proj_array[word_get_label(word_buffer_b0(wb))]++; + // printf("NON PROJ label = %d\n", word_get_label(word_buffer_b0(wb))); + non_proj_array[word_get_label(word_buffer_b0(wb))]++; + break; + } + } + word_buffer_move_right(wb); + word_nb++; + } + + if(ctx->verbose){ + for(int i = 0; i < dico_labels->nbelem; i++){ + printf("%d\t%s\n", non_proj_array[i], dico_int2string(dico_labels, i)); + } + } + + + + printf("number of dependencies = %d\n", word_nb); + printf("number of non proj dependencies = %d\n", word_non_proj); + printf("non projectivity ratio = %.2f\n", (float) word_non_proj / word_nb); + + + + + + context_free(ctx); + return 0; +} +