Skip to content
Snippets Groups Projects
Commit badf0ce1 authored by Alexis Nasr
Browse files

added maca_tools directory and mcf2conll tool in it

parent 775995eb
No related branches found
No related tags found
No related merge requests found
......@@ -7,6 +7,7 @@ include_directories(maca_common/include)
include_directories(perceptron/lib/include)
add_subdirectory(maca_common)
add_subdirectory(maca_tools)
add_subdirectory(perceptron)
add_subdirectory(maca_lemmatizer)
add_subdirectory(maca_trans_parser)
......
......@@ -44,17 +44,19 @@ char **read_fplm_file(char *fplm_filename, hash *form_pos_ht)
int num = 0;
char **lemma_array;
int lemma_array_size = 10000;
char buffer[10000];
FILE *f= myfopen(fplm_filename, "r");
int fields_nb;
lemma_array = (char **)memalloc(lemma_array_size * sizeof(char *));
while(!feof(f)){
fields_nb = fscanf(f, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho);
while(fgets(buffer, 10000, f)){
fields_nb = sscanf(buffer, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho);
/* if(!strcmp(form, "d")) */
/* fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); */
if(fields_nb != 4){
fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma);
fprintf(stderr, "incorrect fplm entry, skipping it\n");
continue;
}
......@@ -161,6 +163,9 @@ int main(int argc, char *argv[])
lemma = lemma_array[index_form_pos];
}
else
if(ctx->verbose){
fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
}
lemma = form;
}
......
# Build the mcf2conll executable from its single source file, link it
# against the project libraries, and install it into the bin directory.
add_executable(mcf2conll ./src/mcf2conll.c)
target_link_libraries(mcf2conll
  perceptron
  transparse
  maca_common)
install(TARGETS mcf2conll DESTINATION bin)
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<getopt.h>
#include"mcd.h"
#include"util.h"
#include"word_buffer.h"
/* Run-time settings of the mcf2conll tool.
 * Allocated by context_new() and filled in by context_read_options(). */
typedef struct {
/* set to 1 by -h / --help; makes mcf2conll_check_options() print usage and exit */
int help;
/* set to 1 by -v / --verbose; forwarded to mcd_read() */
int verbose;
/* set to 1 by -d / --debug */
int debug_mode;
/* copy of argv[0], used in the usage message */
char *program_name;
/* argument of -o / --conll; NULL means write to stdout */
char *conll_filename;
/* argument of -i / --mcf; NULL means read from stdin */
char *mcf_filename;
/* argument of -C / --mcd; NULL means use the layout built by mcd_build_wplgfs() */
char *mcd_filename;
/* column description, either read from mcd_filename or built by mcd_build_wplgfs() */
mcd *mcd_struct;
} context;
context *context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->conll_filename = NULL;
ctx->mcf_filename = NULL;
ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL;
return ctx;
}
/*
 * Print the usage banner and the list of command line options on stderr.
 * ctx->program_name supplies the program name shown in the banner.
 */
void context_general_help_message(context *ctx)
{
  static const char *const help_lines[] = {
    "Options:\n",
    "\t-h --help : print this message\n",
    "\t-v --verbose : activate verbose mode\n",
    "\t-C --mcd : mcd filename\n",
    "\t-i --mcf : mcf filename (read from stdin if absent)\n",
    "\t-o --conll : conll filename (write to stdout if absent)\n"
  };
  size_t k;

  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
  for(k = 0; k < sizeof help_lines / sizeof help_lines[0]; k++)
    fputs(help_lines[k], stderr);
}
/*
 * Build the default six-column mcd used when no mcd file is supplied:
 * FORM, POS, LEMMA, GOV, LABEL, SENT_SEG, in that column order.
 * Every column gets the placeholder filename "_"; GOV and SENT_SEG use
 * the INT representation, the other columns the VOCAB representation.
 * Returns the mcd created by mcd_new().
 */
mcd *mcd_build_wplgfs(void)
{
  /* One row per column, listed in column order. */
  const struct {
    int feature;
    const char *label;
    int repr;
  } layout[] = {
    { MCD_WF_FORM,     "FORM",     MCD_REPRESENTATION_VOCAB },
    { MCD_WF_POS,      "POS",      MCD_REPRESENTATION_VOCAB },
    { MCD_WF_LEMMA,    "LEMMA",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_GOV,      "GOV",      MCD_REPRESENTATION_INT   },
    { MCD_WF_LABEL,    "LABEL",    MCD_REPRESENTATION_VOCAB },
    { MCD_WF_SENT_SEG, "SENT_SEG", MCD_REPRESENTATION_INT   }
  };
  const int nb_cols = (int)(sizeof layout / sizeof layout[0]);
  mcd *m = mcd_new(nb_cols);
  int c;

  for(c = 0; c < nb_cols; c++){
    m->wf[c] = layout[c].feature;
    m->wf_str[c] = strdup(layout[c].label);
    m->representation[c] = layout[c].repr;
    m->filename[c] = strdup("_");
    m->wf2col[layout[c].feature] = c;
  }
  return m;
}
/*
 * Act on the parsed options: when help was requested, print the usage
 * message and terminate the program with exit status 1.
 * Otherwise return without doing anything.
 */
void mcf2conll_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  context_general_help_message(ctx);
  exit(1);
}
/*
 * Write to f the characters of the n-th (0-based) tab-separated field
 * of buffer.  Tabs themselves are never written.  Nothing is written
 * when buffer has no field number n.
 */
void str_print_col_n(FILE *f, char *buffer, int n)
{
  int field = 0;
  const char *cursor;

  for(cursor = buffer; *cursor != '\0'; cursor++){
    if(*cursor == '\t')
      field++;                 /* a tab closes the current field */
    else if(field == n)
      fputc(*cursor, f);
  }
}
/*
 * Parse the command line into a freshly allocated context.
 *
 * Recognised options: -h/--help, -v/--verbose, -d/--debug,
 * -o/--conll FILE, -C/--mcd FILE, -i/--mcf FILE.
 * Unrecognised options are silently ignored (opterr is set to 0 and
 * the switch has no case for '?').
 *
 * After parsing, ctx->mcd_struct is read from the mcd file when one was
 * given, and otherwise built by mcd_build_wplgfs().
 *
 * Returns the context; string fields are strdup'ed copies.
 */
context *context_read_options(int argc, char *argv[])
{
  int c;
  int option_index = 0;
  context *ctx = context_new();

  ctx->program_name = strdup(argv[0]);

  /* getopt_long() scans this table until it reaches an all-zero entry,
   * so the terminator is mandatory: without it an unknown or abbreviated
   * long option makes getopt_long() read past the end of the array. */
  static struct option long_options[] =
    {
      {"help",    no_argument,       0, 'h'},
      {"verbose", no_argument,       0, 'v'},
      {"debug",   no_argument,       0, 'd'},
      {"conll",   required_argument, 0, 'o'},
      {"mcd",     required_argument, 0, 'C'},
      {"mcf",     required_argument, 0, 'i'},
      {0, 0, 0, 0}
    };

  optind = 0;  /* restart option scanning from the beginning */
  opterr = 0;  /* no getopt diagnostics on stderr */

  while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){
    switch (c)
      {
      case 'd':
        ctx->debug_mode = 1;
        break;
      case 'h':
        ctx->help = 1;
        break;
      case 'v':
        ctx->verbose = 1;
        break;
      case 'o':
        ctx->conll_filename = strdup(optarg);
        break;
      case 'i':
        ctx->mcf_filename = strdup(optarg);
        break;
      case 'C':
        ctx->mcd_filename = strdup(optarg);
        break;
      default:
        /* unknown option or missing argument: ignored, as before */
        break;
      }
  }

  if(ctx->mcd_filename){
    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
  }
  else{
    ctx->mcd_struct = mcd_build_wplgfs();
  }
  return ctx;
}
/*
 * Create a word buffer over an mcf stream and read words into it until
 * word_buffer_read_next_word() returns -1.
 *
 * mcf_filename : file to read, or NULL to read from stdin
 * mcd_struct   : column description handed to word_buffer_new()
 *
 * The file is closed before returning when it was opened here; stdin
 * is left open.  Returns the word buffer.
 */
word_buffer *word_buffer_load_mcf2(char *mcf_filename, mcd *mcd_struct)
{
  const int from_stdin = (mcf_filename == NULL);
  FILE *in = from_stdin ? stdin : myfopen(mcf_filename, "r");
  word_buffer *wb = word_buffer_new(in, mcd_struct, 0);

  for(;;){
    if(word_buffer_read_next_word(wb) == -1)
      break;
  }

  if(!from_stdin)
    fclose(in);
  return wb;
}
/*
 * Look up the string associated with `code` in the dictionary of column
 * `col` of mcd `m`.
 *
 * Returns NULL when col is outside [0, m->nb_col), when the column does
 * not use the VOCAB representation, or when the column has no
 * dictionary; otherwise returns the result of dico_int2string().
 */
char *mcd_get_str(mcd *m, int code, int col)
{
  if(col < 0 || col >= m->nb_col)
    return NULL;
  if(m->representation[col] != MCD_REPRESENTATION_VOCAB)
    return NULL;
  if(!m->dico_array[col])
    return NULL;

  return dico_int2string(m->dico_array[col], code);
}
/* Write column `col` of the tab-separated line `input` to f, or the "_"
 * placeholder when the mcd has no such column (col == -1). */
static void mcf2conll_print_col_or_blank(FILE *f, char *input, int col)
{
  if(col != -1)
    str_print_col_n(f, input, col);
  else
    fprintf(f, "_");
}

/*
 * mcf2conll entry point: load an mcf file (or stdin) and write one
 * 10-column, tab-separated line per word to the conll file (or stdout):
 *
 *   index form lemma cpos pos feats gov label _ _
 *
 * Columns missing from the mcd are written as "_".  The governor is
 * written as word_get_gov(w) + index.  After a word flagged as a
 * sentence end, an empty line is written and the index restarts at 1.
 *
 * Returns 0 on success; exits with status 1 when the output file cannot
 * be opened, returns 1 when closing it fails.
 */
int main(int argc, char *argv[])
{
  FILE *output_file;
  context *ctx = context_read_options(argc, argv);
  mcf2conll_check_options(ctx);

  word_buffer *wb = word_buffer_load_mcf2(ctx->mcf_filename, ctx->mcd_struct);
  word *w = NULL;

  /* Each getter is compared with -1 below to detect a missing column. */
  int form_col = mcd_get_form_col(ctx->mcd_struct);
  int pos_col = mcd_get_pos_col(ctx->mcd_struct);
  int cpos_col = mcd_get_cpos_col(ctx->mcd_struct);
  int lemma_col = mcd_get_lemma_col(ctx->mcd_struct);
  int gov_col = mcd_get_gov_col(ctx->mcd_struct);
  int label_col = mcd_get_label_col(ctx->mcd_struct);
  int feats_col = mcd_get_feats_col(ctx->mcd_struct);
  int sent_seg_col = mcd_get_sent_seg_col(ctx->mcd_struct);
  int index = 1;

  output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
  /* NOTE(review): myfopen_no_exit presumably returns NULL on failure
   * instead of exiting; guard against it before any write. */
  if(output_file == NULL){
    fprintf(stderr, "cannot open file %s for writing\n", ctx->conll_filename);
    exit(1);
  }

  do{
    w = word_buffer_b0(wb);
    /* NOTE(review): guards against an empty buffer, assuming
     * word_buffer_b0 yields NULL in that case; confirm. */
    if(w == NULL)
      break;

    /* The index belongs to the same stream as the rest of the line
     * (it used to go to stdout even when -o was given). */
    fprintf(output_file, "%d\t", index);

    mcf2conll_print_col_or_blank(output_file, w->input, form_col);
    fprintf(output_file, "\t");
    mcf2conll_print_col_or_blank(output_file, w->input, lemma_col);
    fprintf(output_file, "\t");
    mcf2conll_print_col_or_blank(output_file, w->input, cpos_col);
    fprintf(output_file, "\t");
    mcf2conll_print_col_or_blank(output_file, w->input, pos_col);
    fprintf(output_file, "\t");
    mcf2conll_print_col_or_blank(output_file, w->input, feats_col);
    fprintf(output_file, "\t");

    /* "absent" is -1, not 0: column 0 is a valid position */
    if(gov_col != -1)
      fprintf(output_file, "%d\t", word_get_gov(w) + index);
    else
      fprintf(output_file, "_\t");

    mcf2conll_print_col_or_blank(output_file, w->input, label_col);

    /* Two trailing placeholder columns, no empty field and no trailing
     * tab, so that every line has exactly 10 columns. */
    fprintf(output_file, "\t_\t_\n");

    if((sent_seg_col != -1) && (word_get_sent_seg(w))){
      fprintf(output_file, "\n");
      index = 0;  /* becomes 1 after the increment below */
    }
    index ++;
  } while(word_buffer_move_right(wb));

  if(ctx->conll_filename && fclose(output_file) != 0){
    fprintf(stderr, "error while closing file %s\n", ctx->conll_filename);
    return 1;
  }
  return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment