/* cff2fann.c -- rewrites a cff feature file as tab-separated integer columns preceded by two header lines. */
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"context.h"
#include"util.h"
#include"cf_file.h"
#include"feat_lib.h"
#include"feat_types.h"
/*
 * Print the usage text of the program on stdout.
 *
 * The text is assembled by the context_*_help_message helpers: the general
 * part first, then an "Input:" heading followed by the cff, features model
 * and vocabularies parts.
 */
void cff2fann_help_message(context *ctx)
{
    context_general_help_message(ctx);

    fputs("\nInput:\n", stdout);

    context_cff_help_message(ctx);
    context_features_model_help_message(ctx);
    context_vocabs_help_message(ctx);
}
/*
 * Echo the file names stored in ctx on stderr, then validate the options.
 *
 * The help message is printed and the process exits with status 1 when help
 * was requested, or when the cff file name, the vocabularies file name or the
 * features model file name is missing. Otherwise the function simply returns.
 */
void cff2fann_check_options(context *ctx)
{
    int options_ok;

    if (ctx->cff_filename) {
        fprintf(stderr, "cff filename = %s\n", ctx->cff_filename);
    }
    if (ctx->mcd_filename) {
        fprintf(stderr, "mcd filename = %s\n", ctx->mcd_filename);
    }
    if (ctx->features_model_filename) {
        fprintf(stderr, "fm filename = %s\n", ctx->features_model_filename);
    }

    /* All mandatory file names are set and help was not asked for. */
    options_ok = ctx->cff_filename
        && !ctx->help
        && ctx->vocabs_filename
        && ctx->features_model_filename;

    if (options_ok) {
        return;
    }

    cff2fann_help_message(ctx);
    exit(1);
}
/*
 * Write a one-hot vector of length dim to f.
 *
 * Each component is written as a digit followed by a single space: "1 " at
 * position val, "0 " everywhere else. When val is outside [0, dim) every
 * component is 0; when dim <= 0 nothing is written.
 */
void one_hot_print(FILE *f, int val, int dim)
{
    int pos = 0;

    while (pos < dim) {
        fputs(pos == val ? "1 " : "0 ", f);
        pos++;
    }
}
/*
 * Make sure every feature of the model has at most one element.
 *
 * The features are scanned in order; at the first one whose nbelem is greater
 * than 1, a message naming its index is written to stderr and the process
 * exits with status 1.
 */
void check_feature_model(feat_model *fm)
{
    int idx = 0;

    while (idx < fm->nbelem) {
        feat_desc *desc = fm->array[idx];

        if (desc->nbelem > 1) {
            fprintf(stderr, "feature %d is a complex feature, aborting\n", idx);
            exit(1);
        }
        idx++;
    }
}
/*
 * Print the two tab-separated header lines on stdout.
 *
 * Line 1: "OUT" followed by the name of the first element of each feature.
 * Line 2: "OUT" followed by a label for the type of that same element
 *         (FORM, LEMMA, CPOS, POS, LABEL, INT, or UNK for any other type).
 *
 * Only element 0 of each feature is read. The mcd argument m is currently
 * not used.
 */
void print_header(mcd *m, feat_model *fm)
{
    int col;

    (void)m;

    /* First line: feature names. */
    fputs("OUT", stdout);
    for (col = 0; col < fm->nbelem; col++) {
        feat_desc *desc = fm->array[col];
        simple_feat_desc *simple = desc->array[0];

        printf("\t%s", simple->name);
    }
    putchar('\n');

    /* Second line: feature type labels. */
    fputs("OUT", stdout);
    for (col = 0; col < fm->nbelem; col++) {
        feat_desc *desc = fm->array[col];
        simple_feat_desc *simple = desc->array[0];
        const char *label;

        if (simple->type == FEAT_TYPE_FORM) {
            label = "\tFORM";
        } else if (simple->type == FEAT_TYPE_LEMMA) {
            label = "\tLEMMA";
        } else if (simple->type == FEAT_TYPE_CPOS) {
            label = "\tCPOS";
        } else if (simple->type == FEAT_TYPE_POS) {
            label = "\tPOS";
        } else if (simple->type == FEAT_TYPE_LABEL) {
            label = "\tLABEL";
        } else if (simple->type == FEAT_TYPE_INT) {
            label = "\tINT";
        } else {
            label = "\tUNK";
        }
        fputs(label, stdout);
    }
    putchar('\n');
}
/*
 * Convert the cff file named by ctx->cff_filename to tab-separated text on
 * stdout.
 *
 * The two header lines produced by print_header() come first. Then, for each
 * line read from the cff file (split on tabs):
 *   - column 0 is parsed as an integer and printed as is;
 *   - every other column is parsed as an integer code and looked up in the
 *     "d_perceptron_features" vocabulary. The resulting description is
 *     expected to look like "<name>==<integer>"; the integer is printed,
 *     preceded by a tab.
 * When a code has no description, or the description does not have the
 * expected form, a warning is written to stderr and -1 is printed instead.
 *
 * A line counter is written to stderr every 100 lines as a progress
 * indicator.
 */
void cff2fann(context *ctx)
{
    char buffer[10000];
    char feature_type[64];   /* receives the "<name>" part; only parsed, not used */
    mcd *m = ctx->mcd_struct;
    FILE *f = myfopen(ctx->cff_filename, "r");
    dico *vocab = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
    int count = 0;

    print_header(m, ctx->features_model);

    while (fgets(buffer, sizeof buffer, f)) {
        char *token = strtok(buffer, "\t");
        int col_nb = 0;

        if (count % 100 == 0)
            fprintf(stderr, "%d\r", count);

        for (; token; token = strtok(NULL, "\t"), col_nb++) {
            int val = atoi(token);
            int feature_valindex = -1;
            char *feat_str;

            if (col_nb == 0) {
                printf("%d", val);
                continue;
            }

            feat_str = dico_int2string(vocab, val);
            if (!feat_str) {
                fprintf(stderr, "WARNING cannot find the description of feature : %d\n", val);
            } else if (sscanf(feat_str, "%63[^=]==%d", feature_type, &feature_valindex) != 2) {
                /* The width keeps the name inside feature_type; requiring two
                 * conversions keeps an unparsed value from being printed. */
                fprintf(stderr, "WARNING cannot parse the description of feature %d : %s\n", val, feat_str);
                feature_valindex = -1;
            } else {
                /* NOTE(review): feat_type and mcd_col are used as array
                 * indices without a range check -- confirm that the helpers
                 * always return valid indices for every column of the file. */
                int feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1);
                int mcd_col = m->wf2col[feat_type];

                if (m->representation[mcd_col] == MCD_REPRESENTATION_EMB)
                    fprintf(stderr, "it is an embedding val = %d, file = %s\n", feature_valindex, m->filename[mcd_col]);
            }

            /* Every representation is written the same way: the bare index. */
            printf("\t%d", feature_valindex);
        }
        printf("\n");
        count++;
    }
    fclose(f);
}
/*
 * Program entry point.
 *
 * Reads and checks the command-line options, loads the vocabularies and the
 * features model, verifies the model with check_feature_model(), records the
 * number of classes found in the cff file in ctx->mvt_nb, links the mcd
 * structure to the vocabularies and finally runs the conversion.
 */
int main(int argc, char *argv[])
{
    int feature_count;
    int class_count;
    context *ctx = context_read_options(argc, argv);

    cff2fann_check_options(ctx);

    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    ctx->features_model = feat_model_read(ctx->features_model_filename,
                                          feat_lib_build(),
                                          ctx->verbose);
    check_feature_model(ctx->features_model);

    /* Only the class count is kept; the feature count is not used here. */
    look_for_number_of_features_and_classes(ctx->cff_filename, &feature_count, &class_count);
    ctx->mvt_nb = class_count;

    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, 1);

    cff2fann(ctx);

    return EXIT_SUCCESS;
}