Commit f50e7d93 authored by Alexis Nasr
Browse files

Fixed a few small bugs

parent 61309227
#ifndef __WORD__
#define __WORD__
#include<ctype.h>
#include "mcd.h"
#include "char16.h"
......@@ -28,6 +28,7 @@ typedef struct _word {
#define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5])
#define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? -1 : (w)->form[strlen((w)->form) - 6])
*/
/*
 * Suffix accessors: word_get_sN(w) evaluates to the N-th element of
 * w->form_char16 counted from the end (s1 is the last one), using the length
 * reported by char16_strlen(). The result is -1 when w is NULL, when
 * w->form_char16 is NULL, or when that length is smaller than N.
 * The argument is evaluated several times: pass an expression without side
 * effects.
 */
#define word_get_s1(w) \
  ((((w) != NULL) && ((w)->form_char16 != NULL) && (char16_strlen((w)->form_char16) >= 1)) \
    ? (w)->form_char16[char16_strlen((w)->form_char16) - 1] \
    : -1)
#define word_get_s2(w) \
  ((((w) != NULL) && ((w)->form_char16 != NULL) && (char16_strlen((w)->form_char16) >= 2)) \
    ? (w)->form_char16[char16_strlen((w)->form_char16) - 2] \
    : -1)
#define word_get_s3(w) \
  ((((w) != NULL) && ((w)->form_char16 != NULL) && (char16_strlen((w)->form_char16) >= 3)) \
    ? (w)->form_char16[char16_strlen((w)->form_char16) - 3] \
    : -1)
......@@ -51,8 +52,8 @@ typedef struct _word {
/*
 * word_get_p6(w): element at index 5 (the 6th one) of w->form_char16, or -1
 * when w is NULL, when w->form_char16 is NULL, or when the length reported by
 * char16_strlen() is smaller than 6.
 * The guard must be "< 6": with "< 5" a form of exactly 5 elements passed the
 * test and the macro returned whatever sits at index 5 instead of -1.
 * The argument is evaluated several times: pass an expression without side
 * effects.
 */
#define word_get_p6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 6))? -1 : (w)->form_char16[5])
/*
 * Field accessors: each macro evaluates to the entry of w->wf_array selected
 * by the corresponding MCD_WF_* index, or to -1 when w is NULL.
 * The argument is evaluated twice: pass an expression without side effects.
 */
#define word_get_id(w)     (((w) != NULL) ? (w)->wf_array[MCD_WF_ID]     : -1)
#define word_get_offset(w) (((w) != NULL) ? (w)->wf_array[MCD_WF_OFFSET] : -1)
#define word_get_length(w) (((w) != NULL) ? (w)->wf_array[MCD_WF_LENGTH] : -1)
#define word_get_form(w)   (((w) != NULL) ? (w)->wf_array[MCD_WF_FORM]   : -1)
#define word_get_lemma(w)  (((w) != NULL) ? (w)->wf_array[MCD_WF_LEMMA]  : -1)
#define word_get_cpos(w)   (((w) != NULL) ? (w)->wf_array[MCD_WF_CPOS]   : -1)
......
......@@ -32,13 +32,13 @@
/* Evaluates to 1 when wb->nbelem is 0, and to 0 otherwise. wb must not be NULL. */
#define word_buffer_is_empty(wb) ((wb)->nbelem == 0)
/*
 * Buffer of words read from an input file.
 * Each member is declared exactly once: repeating a member name inside a
 * struct is rejected by the compiler.
 */
typedef struct {
  int size;           /* size of the array used to store words */
  int nbelem;         /* number of words in the buffer */
  int lookahead;      /* number of words between the current word and the last word of the buffer */
  int current_index;  /* position of the current word */
  word **array;       /* array to store words */
  FILE *input_file;   /* file to read the words from */
  mcd *mcd_struct;    /* mcd describing the format of input_file */
} word_buffer;
......
......@@ -210,9 +210,10 @@ install (TARGETS maca_trans_lemmatizer DESTINATION bin)
#target_link_libraries(test_w2v transparse)
#install (TARGETS test_w2v DESTINATION bin)
#add_executable(w2v_filter ./src/w2v_filter.c)
#target_link_libraries(w2v_filter transparse)
#install (TARGETS w2v_filter DESTINATION bin)
# w2v_filter: built from src/w2v_filter.c, linked against transparse then
# maca_common, and installed into bin.
add_executable(w2v_filter ./src/w2v_filter.c)
target_link_libraries(w2v_filter transparse maca_common)
install(TARGETS w2v_filter DESTINATION bin)
#add_executable(test_word_emb ./src/test_word_emb.c)
#target_link_libraries(test_word_emb transparse)
......
......@@ -52,6 +52,20 @@ void one_hot_print(FILE *f, int val, int dim)
fprintf(f, "%d ", (i == val)? 1 : 0);
}
/*
 * Check that every feature of the model fm is a simple feature, i.e. that its
 * descriptor holds exactly one element.
 *
 * print_header() reads fd->array[0] for every feature without any test, so a
 * descriptor that is NULL or holds no element is rejected here as well as a
 * complex one (more than one element).
 *
 * On any violation, including a NULL model, a message is written to stderr
 * and the program terminates with exit(1). The function returns normally
 * only when the whole model passed the check.
 */
void check_feature_model(feat_model *fm)
{
  int i;
  feat_desc *fd;

  if(fm == NULL){
    fprintf(stderr, "feature model is missing, aborting\n");
    exit(1);
  }

  for(i = 0; i < fm->nbelem; i++){
    fd = fm->array[i];
    /* nothing to read at array[0]: later code would access invalid memory */
    if((fd == NULL) || (fd->nbelem < 1)){
      fprintf(stderr, "feature %d is empty, aborting\n", i);
      exit(1);
    }
    if(fd->nbelem > 1){
      fprintf(stderr, "feature %d is a complex feature, aborting\n", i);
      exit(1);
    }
  }
}
void print_header(mcd *m, feat_model *fm)
{
int i;
......@@ -62,33 +76,24 @@ void print_header(mcd *m, feat_model *fm)
for(i=0; i <fm->nbelem; i++){
fd = fm->array[i];
if(fd->nbelem > 1){
printf("feature %d is a complex feature, skipping it\n", i);
}
else{
sfd = fd->array[0];
printf("\t%s", sfd->name);
}
sfd = fd->array[0];
printf("\t%s", sfd->name);
}
printf("\n");
printf("OUT");
for(i=0; i <fm->nbelem; i++){
fd = fm->array[i];
if(fd->nbelem > 1){
printf("feature %d is a complex feature, skipping it\n", i);
}
else{
sfd = fd->array[0];
if(sfd->type == FEAT_TYPE_FORM){printf("\tFORM");continue;}
if(sfd->type == FEAT_TYPE_LEMMA){printf("\tLEMMA");continue;}
if(sfd->type == FEAT_TYPE_CPOS){printf("\tCPOS");continue;}
if(sfd->type == FEAT_TYPE_POS){printf("\tPOS");continue;}
if(sfd->type == FEAT_TYPE_LABEL){printf("\tLABEL");continue;}
if(sfd->type == FEAT_TYPE_INT){printf("\tINT");continue;}
printf("\tUNK");
}
sfd = fd->array[0];
if(sfd->type == FEAT_TYPE_FORM){printf("\tFORM");continue;}
if(sfd->type == FEAT_TYPE_LEMMA){printf("\tLEMMA");continue;}
if(sfd->type == FEAT_TYPE_CPOS){printf("\tCPOS");continue;}
if(sfd->type == FEAT_TYPE_POS){printf("\tPOS");continue;}
if(sfd->type == FEAT_TYPE_LABEL){printf("\tLABEL");continue;}
if(sfd->type == FEAT_TYPE_INT){printf("\tINT");continue;}
printf("\tUNK");
}
printf("\n");
/*
for(i=0; i < m->nb_col; i++){
......@@ -127,6 +132,7 @@ void cff2fann(context *ctx)
char feature_type[64];
int feature_valindex;
int count = 0;
char *feat_str = NULL;
vocab = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
......@@ -142,34 +148,46 @@ void cff2fann(context *ctx)
if (count % 100 == 0)
fprintf(stderr, "%d\r", count);
while(token){
/* printf("col = %d token = %s max = %d\n", col_nb, token, max_array[col_nb]); */
/* printf("col = %d token = %s\n", col_nb, token); */
val = atoi(token);
if(col_nb == 0){
/* one_hot_print(stdout, val, ctx->mvt_nb); */
/* printf("\n"); */
printf("%d", val);
} else {
sscanf(dico_int2string(vocab, val), "%[^==]==%d", feature_type, &feature_valindex);
/* printf("feature_type = %s\n", feature_type); */
feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1);
/* printf("feat_type = %d\n", feat_type); */
/* printf("%d: ", col_nb); */
int mcd_col = m->wf2col[feat_type];
/* printf("representation = %d\n", m->representation[mcd_col]); */
if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB){
/* printf("it is an embedding val = %d, file = %s\n", val, m->filename[mcd_col]); */
/* word_emb_print(stdout, m->word_emb_array[mcd_col], feature_valindex); */
/* printf("\n"); */
printf("\t%d", feature_valindex);
} else if(m->representation[mcd_col] == MCD_REPRESENTATION_VOCAB){
/* printf("it is a vocab\n"); */
/* one_hot_print(stdout, feature_valindex, m->dico_array[mcd_col]->nbelem); */
/* printf("\n"); */
feat_str = dico_int2string(vocab, val);
if(feat_str){
/* printf("feat str = %s\n", feat_str); */
sscanf(feat_str, "%[^==]==%d", feature_type, &feature_valindex);
/* printf("feature_type = %s\n", feature_type); */
feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1);
/* printf("feat_type = %d\n", feat_type); */
/* printf("%d: ", col_nb); */
int mcd_col = m->wf2col[feat_type];
/* printf("representation = %d\n", m->representation[mcd_col]); */
if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB){
/* printf("it is an embedding val = %d, file = %s\n", val, m->filename[mcd_col]); */
/* word_emb_print(stdout, m->word_emb_array[mcd_col], feature_valindex); */
/* printf("\n"); */
printf("\t%d", feature_valindex);
} else if(m->representation[mcd_col] == MCD_REPRESENTATION_VOCAB){
/* printf("it is a vocab\n"); */
/* one_hot_print(stdout, feature_valindex, m->dico_array[mcd_col]->nbelem); */
/* printf("\n"); */
printf("\t%d", feature_valindex);
} else {
printf("\t%d", feature_valindex);
}
}
else{
fprintf(stderr, "WARNING cannot find the description of feature : %d\n", val);
feature_valindex = -1;
printf("\t%d", feature_valindex);
} else {
printf("\t%d", feature_valindex);
}
}
}
col_nb++;
token = strtok(NULL , "\t");
......@@ -193,6 +211,9 @@ int main(int argc, char *argv[])
ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
check_feature_model(ctx->features_model);
look_for_number_of_features_and_classes(ctx->cff_filename, &nb_feat, &nb_class);
ctx->mvt_nb = nb_class;
......
......@@ -158,7 +158,8 @@ int main(int argc, char *argv[])
lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose);
if(lemma_from_fplm){
// printf("lemma %s found in exceptions file\n", lemma_from_fplm);
print_word(b0, ctx->mcd_struct, lemma_from_fplm);
// print_word(b0, ctx->mcd_struct, to_lower_string(lemma_from_fplm));
print_word(b0, ctx->mcd_struct, lemma_from_fplm);
}
// if lemma is not found in exception file, predict an l_rule
else{
......@@ -185,15 +186,16 @@ int main(int argc, char *argv[])
if(l_rule_is_applicable(form, l_rule)){
char *transformed_lemma = apply_l_rule(form, l_rule);
// printf("transformed_lemma = %s\n", transformed_lemma);
// print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma));
print_word(b0, ctx->mcd_struct, transformed_lemma);
// print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma));
print_word(b0, ctx->mcd_struct, transformed_lemma);
free(transformed_lemma);
break;
}
}
/* no rule applied */
if(i == 10){
print_word(b0, ctx->mcd_struct, form);
// print_word(b0, ctx->mcd_struct, to_lower_string(form));
print_word(b0, ctx->mcd_struct, form);
}
free(vcode_array);
}
......
......@@ -7,9 +7,9 @@
/*
 * Codes of the parser movements.
 * Each code is defined once: defining the same macro again with a different
 * value is a constraint violation in C, and the resulting value would depend
 * on definition order.
 */
#define MVT_PARSER_SHIFT 0
#define MVT_PARSER_REDUCE 1
#define MVT_PARSER_ROOT 2
#define MVT_PARSER_EOS 3
#define MVT_PARSER_LEFT 4
#define MVT_PARSER_RIGHT 5
/* even movements are left movements (except 0, which is shift and 2 which is root) */
/* label 0 maps to MVT_PARSER_LEFT (4); each following label adds 2 */
#define movement_parser_left_code(label) (2 * (label) + 4)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment