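/* mcd.c */
/* Handling of multi-column descriptions (mcd): creation, copy, release, reading from a file, */
/* predefined column layouts and conversions between strings and codes. */
/* Authored by Franck Dary. */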
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include "mcd.h"
#include "util.h"
#include "dico.h"
#include "word_emb.h"
/* Unbinds word feature wf_code from mcd m and puts the column it occupied back in the empty state. */
/* The wf2col entry becomes -1; representation becomes MCD_REPRESENTATION_NULL, wf becomes -1 and */
/* the name, filename, dico and word embedding slots of the column become NULL. */
/* Nothing is freed here: the pointers are only dropped. */
/* The call is a no-op when wf_code is outside [0, MCD_WF_NB) or is not bound to a valid column, */
/* which avoids writing at index -1 of the per-column arrays. */
void mcd_remove_wf_column(mcd *m, int wf_code)
{
  int col;

  if((wf_code < 0) || (wf_code >= MCD_WF_NB)) return;

  col = m->wf2col[wf_code];
  if((col < 0) || (col >= m->nb_col)) return; /* feature not present in this mcd */

  m->wf2col[wf_code] = -1;
  m->representation[col] = MCD_REPRESENTATION_NULL;
  m->wf[col] = -1;
  m->wf_str[col] = NULL;
  m->filename[col] = NULL;
  m->dico_array[col] = NULL;
  m->word_emb_array[col] = NULL;
}
/* Allocates an mcd made of nb_col columns and returns it. */
/* No word feature is bound to a column (every wf2col entry is -1) and every column is empty: */
/* representation MCD_REPRESENTATION_NULL, wf -1, no name, no filename, no dico, no word embedding. */
mcd *mcd_new(int nb_col)
{
  mcd *fresh = (mcd *)memalloc(sizeof(mcd));
  int k;

  fresh->nb_col = nb_col;

  k = 0;
  while(k < MCD_WF_NB){
    fresh->wf2col[k] = -1;
    k++;
  }

  /* one array per kind of information, each holding nb_col entries */
  fresh->representation = (int *) memalloc(nb_col * sizeof(int));
  fresh->wf             = (int *) memalloc(nb_col * sizeof(int));
  fresh->wf_str         = (char **) memalloc(nb_col * sizeof(char *));
  fresh->filename       = (char **) memalloc(nb_col * sizeof(char *));
  fresh->dico_array     = (dico **) memalloc(nb_col * sizeof(dico *));
  fresh->word_emb_array = (word_emb **) memalloc(nb_col * sizeof(word_emb *));

  for(k = 0; k < nb_col; k++){
    fresh->representation[k] = MCD_REPRESENTATION_NULL;
    fresh->wf[k] = -1;
    fresh->wf_str[k] = NULL;
    fresh->filename[k] = NULL;
    fresh->dico_array[k] = NULL;
    fresh->word_emb_array[k] = NULL;
  }

  return fresh;
}
/* Returns a newly allocated mcd holding the same description as m. */
/* The feature names and filenames are duplicated with strdup. */
/* The dico and word embedding pointers are copied as they are, so both mcd refer to the same objects. */
/* NOTE(review): mcd_free calls dico_free / word_emb_free on these pointers, so freeing both the */
/* original and the copy hands the same pointer to those functions twice. */
mcd *mcd_copy(mcd *m)
{
  mcd *dup = mcd_new(m->nb_col);
  int col;
  int code;

  for(code = 0; code < MCD_WF_NB; code++)
    dup->wf2col[code] = m->wf2col[code];

  col = 0;
  while(col < m->nb_col){
    dup->representation[col] = m->representation[col];
    dup->wf[col] = m->wf[col];

    /* owned strings: duplicate them when present */
    if(m->wf_str[col])
      dup->wf_str[col] = strdup(m->wf_str[col]);
    else
      dup->wf_str[col] = NULL;

    if(m->filename[col])
      dup->filename[col] = strdup(m->filename[col]);
    else
      dup->filename[col] = NULL;

    /* shared objects: plain pointer copy (a NULL stays NULL) */
    dup->dico_array[col] = m->dico_array[col];
    dup->word_emb_array[col] = m->word_emb_array[col];

    col++;
  }

  return dup;
}
/* Releases mcd m: for every column its dico (dico_free), its word embedding (word_emb_free), */
/* its feature name and its filename, then the per-column arrays and the structure itself. */
/* A NULL m is accepted and ignored. */
/* NOTE(review): mcd_copy copies the dico and word embedding pointers without duplicating the */
/* objects, so freeing an mcd and its copy passes the same pointers to dico_free / word_emb_free twice. */
void mcd_free(mcd *m)
{
  int i;

  if(m == NULL) return;

  for(i=0; i < m->nb_col; i++){
    if(m->dico_array[i]) dico_free(m->dico_array[i]);
    if(m->word_emb_array[i]) word_emb_free(m->word_emb_array[i]);
    /* free(NULL) is a no-op, no need to test the strings first */
    free(m->wf_str[i]);
    free(m->filename[i]);
  }

  free(m->representation);
  free(m->filename);
  free(m->dico_array);
  free(m->word_emb_array);
  free(m->wf_str);
  free(m->wf);
  free(m);
}
/* This function is used when reading a corpus file whose structure is described in mcd m. */
/* It returns the code associated to string str found in column col. */
/* The code depends on the way the column is represented: */
/*   VOCAB : value returned by dico_string2int on the column dico, -1 when the column has no dico */
/*   EMB   : value returned by word_emb_get_code on the column word embedding */
/*   INT   : atoi(str) */
/* MCD_INVALID_VALUE is returned for any other representation and when col is not a column of m. */
/* NOTE(review): in the EMB case the word embedding pointer is passed on without a NULL test; */
/* confirm that word_emb_get_code accepts NULL or that EMB columns always have an embedding loaded. */
int mcd_get_code(mcd *m, char *str, int col){
  /* same range test as mcd_get_str: never index the column arrays out of bounds */
  if((col < 0) || (col >= m->nb_col)) return MCD_INVALID_VALUE;

  if(m->representation[col] == MCD_REPRESENTATION_VOCAB)
    return (m->dico_array[col])? dico_string2int(m->dico_array[col], str) : -1;
  if(m->representation[col] == MCD_REPRESENTATION_EMB)
    return word_emb_get_code(m->word_emb_array[col], str);
  if(m->representation[col] == MCD_REPRESENTATION_INT)
    return atoi(str);
  return MCD_INVALID_VALUE;
}
/* Looks for the highest column number declared in an mcd file. */
/* Each useful line holds four whitespace-separated fields: column number, word feature, representation, filename. */
/* Empty lines and lines starting with # are ignored; a line that does not have four fields is reported on stderr and skipped. */
/* Returns the highest column number found (as written in the file), or -1 when no valid line was found */
/* or when the file could not be opened. */
/* Lines are read in chunks of at most 999 characters. */
int mcd_max_column_index_in_file(char *mcd_filename)
{
  int max_col = -1;
  int column;
  int fields_number;
  int line_number = 0;
  char buffer[1000];
  char wf[100];
  char representation[100];
  char filename[500];
  FILE *f = myfopen(mcd_filename, "r");

  if(f == NULL) return max_col;

  /* fgets returning non-NULL means a line was read, even when it hit end of file: */
  /* testing feof here would drop a last line that has no trailing newline */
  while(fgets(buffer, (int)sizeof buffer, f)){
    line_number++;
    if((buffer[0] == '\n') || (buffer[0] == '#')) continue;
    /* field widths keep sscanf inside wf, representation and filename */
    fields_number = sscanf(buffer, "%d %99s %99s %499s", &column, wf, representation, filename);
    if(fields_number != 4){
      fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename);
      continue;
    }
    if(column > max_col) max_col = column;
  }

  fclose(f);
  return max_col;
}
/* Takes as arguments an mcd structure (m) and the name of a corpus file (corpus_filename). */
/* Every column represented as a vocabulary that has no dico yet gets one, built by */
/* dico_extract_from_corpus from that column of the corpus and named after the column word feature. */
/* A trace is written on stderr for each dico extracted. */
void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename)
{
  int col;

  for(col = 0; col < m->nb_col; col++){
    if(m->representation[col] != MCD_REPRESENTATION_VOCAB) continue;
    if(m->dico_array[col] != NULL) continue; /* already has a dico */

    m->dico_array[col] = dico_extract_from_corpus(corpus_filename, col, m->wf_str[col]);
    fprintf(stderr, "extracting dico %s \tfrom corpus\n", m->wf_str[col]);
  }
}
/* Takes as arguments an mcd structure (m) and a dictionary vector (vocabs). */
/* Every column represented as a vocabulary, whose filename is "_" and that has no dico yet */
/* is given the dico that dico_vec_get_dico returns for the column word feature name. */
/* When verbose is non zero, a trace is written on stderr for each column linked. */
void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose)
{
  int col = 0;

  while(col < m->nb_col){
    /* tests are kept in this order: filename is only looked at for vocabulary columns */
    if(m->representation[col] == MCD_REPRESENTATION_VOCAB){
      if(strcmp(m->filename[col], "_") == 0){
        if(m->dico_array[col] == NULL){
          m->dico_array[col] = dico_vec_get_dico(vocabs, m->wf_str[col]);
          if(verbose) fprintf(stderr, "linking to dico %s\n", m->wf_str[col]);
        }
      }
    }
    col++;
  }
}
/* read an multi column description file and produces an mcd structure */
mcd *mcd_read(char *mcd_filename, int verbose)
{
int column;
char wf[100];
char representation[100];
char filename[500]; /* ugly */
int fields_number;
int line_number = 0;
char buffer[1000]; /* ugly */
int nb_col = mcd_max_column_index_in_file(mcd_filename);
mcd *m = mcd_new(nb_col + 1);
FILE *f = myfopen(mcd_filename, "r");
/* int first = 1; */
while(fgets(buffer, 1000, f)){
line_number++;
if(feof(f)) break;
if((buffer[0] == '\n') || (buffer[0] == '#')) continue;
fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename);
if(fields_number != 4){
/* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */
continue;
}
if(verbose) fprintf(stderr, "column = %d\tword feature = %s\trepresentation = %s\tfilename = %s\n", column, wf, representation, filename);
column--; /* in the mcd file, columns begin at index 1, in internal representation, columns begin at index 0 */
m->wf[column] = mcd_wf_code(wf);
m->wf_str[column] = strdup(wf);
if(m->wf[column] == -1){
fprintf(stderr, "in line %d of mcd file %s invalid wf, I'm skipping it\n", line_number, mcd_filename);
continue;
}
m->wf2col[m->wf[column]] = column;
if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL;
else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB;
else if(!strcmp(representation, "VOCAB")) m->representation[column] = MCD_REPRESENTATION_VOCAB;
else if(!strcmp(representation, "INT")) m->representation[column] = MCD_REPRESENTATION_INT;
else{
fprintf(stderr, "in line %d of mcd file %s invalid mode of representation, I'm skipping it\n", line_number, mcd_filename);
m->representation[column] = MCD_REPRESENTATION_NULL;
}
if(m->representation[column] != MCD_REPRESENTATION_NULL)
m->filename[column] = strdup(filename);
if(strcmp(m->filename[column], "_")){
if(m->representation[column] == MCD_REPRESENTATION_EMB){
if(verbose) fprintf(stderr, "loading word embedding %s\n", m->filename[column]);
m->word_emb_array[column] = word_emb_load(m->filename[column]);
}
else if(m->representation[column] == MCD_REPRESENTATION_VOCAB){
if(verbose) fprintf(stderr, "loading dico %s\n", m->filename[column]);
m->dico_array[column] = dico_read(m->filename[column], 0.5);
}
}
}
fclose(f);
return m;
}
/* Builds an mcd corresponding to the conll07 format. */
/* Eight columns, in this order: INDEX, FORM, LEMMA, CPOS, POS, FEATS, GOV, LABEL. */
/* INDEX and GOV are represented as integers, the other columns as vocabularies. */
/* Every column has "_" as filename and no dico attached. */
mcd *mcd_build_conll07(void)
{
  /* one entry per column, in column order */
  static const struct { int wf; const char *label; int repr; } layout[] = {
    { MCD_WF_ID,    "INDEX", MCD_REPRESENTATION_INT   },
    { MCD_WF_FORM,  "FORM",  MCD_REPRESENTATION_VOCAB },
    { MCD_WF_LEMMA, "LEMMA", MCD_REPRESENTATION_VOCAB },
    { MCD_WF_CPOS,  "CPOS",  MCD_REPRESENTATION_VOCAB },
    { MCD_WF_POS,   "POS",   MCD_REPRESENTATION_VOCAB },
    { MCD_WF_FEATS, "FEATS", MCD_REPRESENTATION_VOCAB },
    { MCD_WF_GOV,   "GOV",   MCD_REPRESENTATION_INT   },
    { MCD_WF_LABEL, "LABEL", MCD_REPRESENTATION_VOCAB }
  };
  const int nb_col = (int)(sizeof layout / sizeof layout[0]);
  mcd *m = mcd_new(nb_col);
  int col;

  for(col = 0; col < nb_col; col++){
    m->wf[col] = layout[col].wf;
    m->wf_str[col] = strdup(layout[col].label);
    m->representation[col] = layout[col].repr;
    m->filename[col] = strdup("_");
    m->wf2col[layout[col].wf] = col;
  }
  return m;
}
/* Builds an mcd made of five columns, in this order: FORM, POS, LEMMA, GOV, LABEL. */
/* GOV is represented as an integer, the other columns as vocabularies. */
/* Every column has "_" as filename and no dico attached. */
mcd *mcd_build_wplgf(void)
{
  /* one entry per column, in column order */
  static const struct { int wf; const char *label; int repr; } layout[] = {
    { MCD_WF_FORM,  "FORM",  MCD_REPRESENTATION_VOCAB },
    { MCD_WF_POS,   "POS",   MCD_REPRESENTATION_VOCAB },
    { MCD_WF_LEMMA, "LEMMA", MCD_REPRESENTATION_VOCAB },
    { MCD_WF_GOV,   "GOV",   MCD_REPRESENTATION_INT   },
    { MCD_WF_LABEL, "LABEL", MCD_REPRESENTATION_VOCAB }
  };
  const int nb_col = (int)(sizeof layout / sizeof layout[0]);
  mcd *m = mcd_new(nb_col);
  int col;

  for(col = 0; col < nb_col; col++){
    m->wf[col] = layout[col].wf;
    m->wf_str[col] = strdup(layout[col].label);
    m->representation[col] = layout[col].repr;
    m->filename[col] = strdup("_");
    m->wf2col[layout[col].wf] = col;
  }
  return m;
}
/* Builds an mcd corresponding to the ifpls (index, form, pos, lemma, syntax) format. */
/* Six columns, in this order: INDEX, FORM, POS, LEMMA, GOV, LABEL. */
/* INDEX and GOV are represented as integers, the other columns as vocabularies. */
/* Every column has "_" as filename and no dico attached. */
mcd *mcd_build_ifpls(void)
{
  /* one entry per column, in column order */
  static const struct { int wf; const char *label; int repr; } layout[] = {
    { MCD_WF_ID,    "INDEX", MCD_REPRESENTATION_INT   },
    { MCD_WF_FORM,  "FORM",  MCD_REPRESENTATION_VOCAB },
    { MCD_WF_POS,   "POS",   MCD_REPRESENTATION_VOCAB },
    { MCD_WF_LEMMA, "LEMMA", MCD_REPRESENTATION_VOCAB },
    { MCD_WF_GOV,   "GOV",   MCD_REPRESENTATION_INT   },
    { MCD_WF_LABEL, "LABEL", MCD_REPRESENTATION_VOCAB }
  };
  const int nb_col = (int)(sizeof layout / sizeof layout[0]);
  mcd *m = mcd_new(nb_col);
  int col;

  for(col = 0; col < nb_col; col++){
    m->wf[col] = layout[col].wf;
    m->wf_str[col] = strdup(layout[col].label);
    m->representation[col] = layout[col].repr;
    m->filename[col] = strdup("_");
    m->wf2col[layout[col].wf] = col;
  }
  return m;
}
/* Builds an mcd made of six columns, in this order: FORM, POS, LEMMA, GOV, LABEL, SENT_SEG. */
/* GOV and SENT_SEG are represented as integers, the other columns as vocabularies. */
/* Every column has "_" as filename and no dico attached. */
mcd *mcd_build_wplgfs(void)
{
  /* one entry per column, in column order */
  static const struct { int wf; const char *label; int repr; } layout[] = {
    { MCD_WF_FORM,     "FORM",     MCD_REPRESENTATION_VOCAB },
    { MCD_WF_POS,      "POS",      MCD_REPRESENTATION_VOCAB },
    { MCD_WF_LEMMA,    "LEMMA",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_GOV,      "GOV",      MCD_REPRESENTATION_INT   },
    { MCD_WF_LABEL,    "LABEL",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_SENT_SEG, "SENT_SEG", MCD_REPRESENTATION_INT   }
  };
  const int nb_col = (int)(sizeof layout / sizeof layout[0]);
  mcd *m = mcd_new(nb_col);
  int col;

  for(col = 0; col < nb_col; col++){
    m->wf[col] = layout[col].wf;
    m->wf_str[col] = strdup(layout[col].label);
    m->representation[col] = layout[col].repr;
    m->filename[col] = strdup("_");
    m->wf2col[layout[col].wf] = col;
  }
  return m;
}
/* Builds an mcd made of seven columns, in this order: FORM, POS, FEATS, LEMMA, GOV, LABEL, SENT_SEG. */
/* GOV and SENT_SEG are represented as integers, the other columns as vocabularies. */
/* Every column has "_" as filename and no dico attached. */
mcd *mcd_build_wpmlgfs(void)
{
  /* one entry per column, in column order */
  static const struct { int wf; const char *label; int repr; } layout[] = {
    { MCD_WF_FORM,     "FORM",     MCD_REPRESENTATION_VOCAB },
    { MCD_WF_POS,      "POS",      MCD_REPRESENTATION_VOCAB },
    { MCD_WF_FEATS,    "FEATS",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_LEMMA,    "LEMMA",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_GOV,      "GOV",      MCD_REPRESENTATION_INT   },
    { MCD_WF_LABEL,    "LABEL",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_SENT_SEG, "SENT_SEG", MCD_REPRESENTATION_INT   }
  };
  const int nb_col = (int)(sizeof layout / sizeof layout[0]);
  mcd *m = mcd_new(nb_col);
  int col;

  for(col = 0; col < nb_col; col++){
    m->wf[col] = layout[col].wf;
    m->wf_str[col] = strdup(layout[col].label);
    m->representation[col] = layout[col].repr;
    m->filename[col] = strdup("_");
    m->wf2col[layout[col].wf] = col;
  }
  return m;
}
/* Returns a new dico_vec containing the dictionaries found in an mcd structure. */
/* Columns are visited in order; each non-NULL dico is added with dico_vec_add. */
/* The dico pointers are added as they are: the vector and the mcd refer to the same objects. */
dico_vec *mcd_build_dico_vec(mcd *mcd_struct)
{
  dico_vec *collected = dico_vec_new();
  int col = 0;

  while(col < mcd_struct->nb_col){
    dico *d = mcd_struct->dico_array[col];
    if(d != NULL) dico_vec_add(collected, d);
    col++;
  }
  return collected;
}
/* Returns the word feature code (MCD_WF_...) associated to the name wf, or -1 when the name is unknown. */
/* Recognized names: INDEX, FORM, LEMMA, CPOS, POS, FEATS, LABEL, STAG, GOV, SENT_SEG and the single letters A to Z. */
/* The comparison is exact (strcmp). */
int mcd_wf_code(char *wf)
{
  static const struct { const char *name; int code; } known[] = {
    { "INDEX",    MCD_WF_ID },
    { "FORM",     MCD_WF_FORM },
    { "LEMMA",    MCD_WF_LEMMA },
    { "CPOS",     MCD_WF_CPOS },
    { "POS",      MCD_WF_POS },
    { "FEATS",    MCD_WF_FEATS },
    { "LABEL",    MCD_WF_LABEL },
    { "STAG",     MCD_WF_STAG },
    { "GOV",      MCD_WF_GOV },
    { "SENT_SEG", MCD_WF_SENT_SEG },
    { "A", MCD_WF_A }, { "B", MCD_WF_B }, { "C", MCD_WF_C }, { "D", MCD_WF_D },
    { "E", MCD_WF_E }, { "F", MCD_WF_F }, { "G", MCD_WF_G }, { "H", MCD_WF_H },
    { "I", MCD_WF_I }, { "J", MCD_WF_J }, { "K", MCD_WF_K }, { "L", MCD_WF_L },
    { "M", MCD_WF_M }, { "N", MCD_WF_N }, { "O", MCD_WF_O }, { "P", MCD_WF_P },
    { "Q", MCD_WF_Q }, { "R", MCD_WF_R }, { "S", MCD_WF_S }, { "T", MCD_WF_T },
    { "U", MCD_WF_U }, { "V", MCD_WF_V }, { "W", MCD_WF_W }, { "X", MCD_WF_X },
    { "Y", MCD_WF_Y }, { "Z", MCD_WF_Z }
  };
  const int nb_known = (int)(sizeof known / sizeof known[0]);
  int k;

  for(k = 0; k < nb_known; k++){
    if(strcmp(wf, known[k].name) == 0) return known[k].code;
  }
  return -1;
}
/* Returns the string associated to code in column col of mcd m, as given by dico_int2string on the column dico. */
/* Returns NULL when col is not a column of m, when the column is not represented as a vocabulary */
/* or when the column has no dico. */
char *mcd_get_str(mcd *m, int code, int col)
{
  dico *d;

  if(col < 0) return NULL;
  if(col >= m->nb_col) return NULL;
  if(m->representation[col] != MCD_REPRESENTATION_VOCAB) return NULL;

  d = m->dico_array[col];
  if(d == NULL) return NULL;
  return dico_int2string(d, code);
}