diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 147be4588de3fe605327af49f2f9ebd80ceeabe1..a2077fa596a9e0a817978c6ff1469569193e2236 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -49,6 +49,13 @@ target_link_libraries(maca_trans_attach_punct transparse) target_link_libraries(maca_trans_attach_punct maca_common) install (TARGETS maca_trans_attach_punct DESTINATION bin) +add_executable(maca_check_projectivity ./src/maca_check_projectivity.c) +target_link_libraries(maca_check_projectivity perceptron) +target_link_libraries(maca_check_projectivity transparse) +target_link_libraries(maca_check_projectivity maca_common) +install (TARGETS maca_check_projectivity DESTINATION bin) + + add_executable(maca_trans_lemmatizer_mcf2cff ./src/maca_trans_lemmatizer_mcf2cff.c) target_link_libraries(maca_trans_lemmatizer_mcf2cff perceptron) target_link_libraries(maca_trans_lemmatizer_mcf2cff transparse)
diff --git a/maca_trans_parser/src/maca_check_projectivity.c b/maca_trans_parser/src/maca_check_projectivity.c
new file mode 100644
index 0000000000000000000000000000000000000000..cdb899afddc37906f96feb5c1e429dda1d425d3e
--- /dev/null
+++ b/maca_trans_parser/src/maca_check_projectivity.c
@@ -0,0 +1,139 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"context.h"
+#include"feat_vec.h"
+#include"dico_vec.h"
+#include"word_emb.h"
+#include"config2feat_vec.h"
+
+/*
+ * Print the usage message of maca_check_projectivity.
+ *
+ * The section headers are written to stderr; the description of each option
+ * is delegated to the context_*_help_message() helpers.
+ */
+void maca_check_projectivity_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_mode_help_message(ctx);
+  context_sent_nb_help_message(ctx);
+
+  fprintf(stderr, "INPUT\n");
+  context_conll_help_message(ctx);
+  fprintf(stderr, "IN TEST MODE\n");
+  context_vocabs_help_message(ctx);
+
+  fprintf(stderr, "OUTPUT\n");
+  context_cff_help_message(ctx);
+  fprintf(stderr, "IN TRAIN MODE\n");
+  context_vocabs_help_message(ctx);
+
+  context_root_label_help_message(ctx);
+}
+
+/*
+ * Validate the command line: print the usage message and exit with status 1
+ * when no input file was given or when help was requested.
+ */
+void maca_check_projectivity_check_options(context *ctx)
+{
+  if(!ctx->input_filename || ctx->help){
+    maca_check_projectivity_help_message(ctx);
+    exit(1);
+  }
+}
+
+/*
+ * Tell whether the dependency between the words at dep_index and gov_index
+ * is projective.
+ *
+ * Returns 0 as soon as a word lying strictly between the two ends has its
+ * governor outside of the span delimited by dep_index and gov_index (both
+ * ends included), 1 otherwise.
+ */
+static int dependency_is_projective(word_buffer *wb, int dep_index, int gov_index)
+{
+  int min_index = (gov_index < dep_index) ? gov_index : dep_index;
+  int max_index = (gov_index < dep_index) ? dep_index : gov_index;
+
+  for(int w_index = min_index + 1; w_index < max_index; w_index++){
+    word *w = word_buffer_get_word_n(wb, w_index);
+    /* NOTE(review): assumes word_buffer_get_word_n() returns NULL for an
+       index that is not in the buffer - confirm against word_buffer.c */
+    if(w == NULL) continue;
+    /* the governor is stored as an offset from the word's own index */
+    int w_gov_index = word_get_gov(w) + w_index;
+    if(w_gov_index < min_index || w_gov_index > max_index) return 0;
+  }
+  return 1;
+}
+
+/*
+ * Count the non projective dependencies of the corpus given as input file
+ * and print the totals on stdout. In verbose mode, the number of non
+ * projective dependencies is also printed for every dependency label.
+ *
+ * Returns 0 on success; exits with status 1 on a usage error, when the LABEL
+ * dictionary is missing or when the label counters cannot be allocated.
+ */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  maca_check_projectivity_check_options(ctx);
+
+  mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
+  ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
+  dico *dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
+  if(dico_labels == NULL){
+    fprintf(stderr, "maca_check_projectivity: no LABEL dictionary\n");
+    context_free(ctx);
+    exit(1);
+  }
+  int label_nb = dico_labels->nbelem;
+
+  /* one counter per label; calloc zeroes them (at least one element is
+     requested so that a NULL result always means an allocation failure) */
+  int *non_proj_array = (int *)calloc(label_nb > 0 ? (size_t)label_nb : 1, sizeof *non_proj_array);
+  if(non_proj_array == NULL){
+    fprintf(stderr, "maca_check_projectivity: cannot allocate label counters\n");
+    context_free(ctx);
+    exit(1);
+  }
+
+  word_buffer *wb = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct);
+  int word_nb = 0;
+  int word_non_proj = 0;
+  while(!word_buffer_end(wb)){
+    word *dep = word_buffer_b0(wb);
+    int dep_index = word_get_index(dep);
+    int gov_index = word_get_gov(dep) + dep_index;
+
+    if(!dependency_is_projective(wb, dep_index, gov_index)){
+      int label = word_get_label(dep);
+      word_non_proj++;
+      /* a label may be unset (negative): such a dependency is counted in
+         the total but must not be used to index the per-label counters */
+      if(label >= 0 && label < label_nb) non_proj_array[label]++;
+    }
+    word_buffer_move_right(wb);
+    word_nb++;
+  }
+
+  if(ctx->verbose){
+    for(int i = 0; i < label_nb; i++){
+      printf("%d\t%s\n", non_proj_array[i], dico_int2string(dico_labels, i));
+    }
+  }
+
+  printf("number of dependencies = %d\n", word_nb);
+  printf("number of non proj dependencies = %d\n", word_non_proj);
+  /* an empty corpus has no ratio: print 0.00 instead of dividing by zero */
+  printf("non projectivity ratio = %.2f\n", word_nb ? (float) word_non_proj / word_nb : 0.0f);
+
+  free(non_proj_array);
+  context_free(ctx);
+  return 0;
+}
diff --git a/maca_trans_parser/src/movements.c b/maca_trans_parser/src/movements.c index b45c080d7d6b860394fa2db8f6180f8226a25272..d4c6c8e0eb47e2d7ca7365b50eee5ff0e4947981 100644 --- a/maca_trans_parser/src/movements.c +++ b/maca_trans_parser/src/movements.c @@ -85,8 +85,10 @@ int movement_ignore(config *c, int movement_code) { if(word_buffer_end(config_get_buffer(c))) return 0; word *b0 = word_buffer_b0(config_get_buffer(c)); + word_set_gov(b0, WORD_INVALID_GOV); word_set_label(b0, -1); + config_push_mvt(c, movement_code, b0, NULL); word_buffer_move_right(config_get_buffer(c)); // fprintf(stderr, "IGNORE\n"); diff --git a/maca_trans_parser/src/oracle_parser_arc_eager.c b/maca_trans_parser/src/oracle_parser_arc_eager.c index 583c528367d591f1189934d60ede476011d860ee..5999cae9070917a3e4e99b9ab8e6c401167139d7 100644 --- a/maca_trans_parser/src/oracle_parser_arc_eager.c +++ b/maca_trans_parser/src/oracle_parser_arc_eager.c @@ -51,12 +51,26 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref, int root_label, int pun /* if(!stack_is_empty(config_get_stack(c)) && !word_buffer_is_empty(config_get_buffer(c))){ */ + b0 = word_buffer_b0(config_get_buffer(c)); b0_index = word_get_index(b0); b0_gov_index = word_get_gov_index(word_buffer_get_word_n(ref, b0_index)); b0_label = word_get_label(word_buffer_get_word_n(ref, b0_index)); + /* printf("s0_index = %d b0_index = %d\n", s0_index, b0_index); + printf("dans ref gov de s0 (%d) = %d\n", s0_index, s0_gov_index); + printf("dans ref gov de b0 (%d) = %d\n", b0_index, b0_gov_index);*/ + + /* s0 is the root of the sentence */ + if((s0_label == root_label) + // && (word_get_label(word_buffer_get_word_n(config_get_buffer(c), s0_index)) != root_label) + && 
check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) + ){ + return MVT_PARSER_ROOT; + } + + /* word in front of the buffer is an end of sentence marker */ if(word_get_sent_seg(word_buffer_get_word_n(ref, b0_index)) == 1) return MVT_PARSER_EOS;