Skip to content
Snippets Groups Projects
Commit c03efa83 authored by Johannes Heinecke's avatar Johannes Heinecke
Browse files

Merge branch 'master' of https://gitlab.lif.univ-mrs.fr/alexis.nasr/macaon2 into johannes

parents 651cd59b 36ddf619
No related branches found
No related tags found
1 merge request!9Johannes
Showing
with 604 additions and 20 deletions
......@@ -4,6 +4,13 @@ project(macaon2)
find_package(FLEX)
add_definitions("-Wall" )
SET(CMAKE_C_COMPILER g++)
SET(CMAKE_CXX_COMPILER g++)
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -DUSE_CBLAS")
SET( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lm -lopenblas" )
if (${CMAKE_C_COMPILER_VERSION} VERSION_LESS 5.3)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11")
......@@ -28,11 +35,12 @@ add_subdirectory(maca_common)
add_subdirectory(maca_tools)
add_subdirectory(perceptron)
#add_subdirectory(maca_lemmatizer)
#add_subdirectory(maca_morpho)
add_subdirectory(maca_tokenizer)
add_subdirectory(maca_lexer)
add_subdirectory(maca_trans_parser)
add_subdirectory(maca_crf_tagger)
add_subdirectory(maca_graph_parser)
#add_subdirectory(maca_graph_parser)
if(MACA_EXPORT)
add_subdirectory(maca_export)
......
......@@ -12,6 +12,7 @@ set(SOURCES src/util.c
src/feat_desc.c
src/feat_lib.c
src/feat_model.c
src/char16.c
)
......
/* Declarations for the char16 string helpers implemented in char16.c. */
#ifndef __CHAR16__
#define __CHAR16__
/* One element of a converted string, stored in a short; a 0 element ends the string. */
typedef short char16;
/* Character count of a NUL-terminated UTF-8 string, as computed by char16.c
   (NOTE(review): behaviour for sequences longer than two bytes should be confirmed). */
int utf8_strlen(char *utf8_string);
/* Reverse conversion; NOTE(review): the implementation in char16.c currently just returns NULL. */
char *char16toutf8(char16 *char16_string);
/* Number of elements before the terminating 0 element. */
int char16_strlen(char16 *string);
/* Converts a UTF-8 string into a freshly malloc'ed, 0-terminated char16 array. */
char16 *utf8tochar16(char *utf8_string);
#endif
......@@ -24,5 +24,5 @@ feat_desc *feat_model_add(feat_model *fm, feat_desc *fd);
feat_model *feat_model_read(char *filename, feat_lib *fl, int verbose);
void feat_model_compute_ranges(feat_model *fm, mcd *m, int mvt_nb);
int feat_model_get_type_feat_n(feat_model *fm, int n);
void catenate_int(char *string, int val);
#endif
......@@ -8,7 +8,7 @@
#define MCD_INVALID_VALUE -1
#define MCD_WF_NB 36
#define MCD_WF_NB 47
#define MCD_WF_ID 0
#define MCD_WF_FORM 1
......@@ -47,6 +47,81 @@
#define MCD_WF_Y 34
#define MCD_WF_Z 35
#define MCD_WF_Aspect 36
#define MCD_WF_Case 37
#define MCD_WF_Clitic 38
#define MCD_WF_Definite 39
#define MCD_WF_Gender 40
#define MCD_WF_Mood 41
#define MCD_WF_NameType 42
#define MCD_WF_NounType 43
#define MCD_WF_Number 44
#define MCD_WF_Person 45
#define MCD_WF_Tense 46
/*Abbr
AdpType
AdvType
Animacy
Animacy[gram]
ConjType
Connegative
Degree
Derivation
Dialect
Echo
Evident
Foreign
Form
Gender[dat]
Gender[erg]
Gender[psor]
HebBinyan
HebExistential
HebSource
Hyph
InfForm
Number[abs]
Number[dat]
Number[erg]
Number[psed]
Number[psor]
NumForm
NumType
NumValue
PartForm
PartType
Person[abs]
Person[dat]
Person[erg]
Person[psor]
Polarity
Polite
Polite[abs]
Polite[dat]
Polite[erg]
Position
Poss
Prefix
PrepCase
PrepForm
PronType
PunctSide
PunctType
Reflex
Strength
Style
Subcat
Typo
Variant
VerbForm
VerbType
Voice
Xtra*/
#include "dico.h"
#include "word_emb.h"
#include "dico_vec.h"
......@@ -90,6 +165,9 @@
#define mcd_get_y_col(m) (m)->wf2col[MCD_WF_Y]
#define mcd_get_z_col(m) (m)->wf2col[MCD_WF_Z]
#define mcd_set_form_col(m, v) (m)->wf[MCD_WF_FORM] = (v)
......@@ -121,6 +199,7 @@ mcd *mcd_build_conll07(void);
mcd *mcd_build_ifpls(void);
mcd *mcd_build_wplgf(void);
mcd *mcd_build_wplgfs(void);
mcd *mcd_build_wpmlgfs(void);
mcd *mcd_read(char *mcd_filename, int verbose);
void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose);
......
......@@ -2,6 +2,7 @@
#define __WORD__
#include "mcd.h"
#include "char16.h"
#define WORD_INVALID_GOV 10000
......@@ -12,10 +13,41 @@ typedef struct _word {
int signature; /* pos tags that this form can have (represented as a boolean string) */
int label;
char *form;
char16 *form_char16;
int index;
int is_root;
} word;
/*
#define word_get_s1(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[strlen((w)->form) - 1])
#define word_get_s2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[strlen((w)->form) - 2])
#define word_get_s3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[strlen((w)->form) - 3])
#define word_get_s4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[strlen((w)->form) - 4])
#define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5])
#define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? -1 : (w)->form[strlen((w)->form) - 6])
*/
/* Suffix accessors: word_get_sN(w) is the N-th element counted from the end of
   w->form_char16 (s1 is the last element). Each one evaluates to -1 when w is
   NULL, when w->form_char16 is NULL, or when the form has fewer than N elements.
   The argument is evaluated several times and char16_strlen() is called twice,
   so do not pass an expression with side effects. */
#define word_get_s1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 1])
#define word_get_s2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 2])
#define word_get_s3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 3])
#define word_get_s4(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 4))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 4])
#define word_get_s5(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 5])
#define word_get_s6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 6))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 6])
/*#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0])
#define word_get_p2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[1])
#define word_get_p3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[2])
#define word_get_p4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[3])
#define word_get_p5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[4])
#define word_get_p6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[5])
*/
/* Prefix accessors: word_get_pN(w) is the element at index N-1 of
   w->form_char16. Each one evaluates to -1 when w or w->form_char16 is NULL.
   NOTE(review): the length guard of pN is "< N-1" (and p1 has none), so for a
   form of exactly N-1 elements pN yields the terminating 0 instead of -1,
   whereas the word_get_sN macros guard with "< N". Confirm whether this
   difference is intended before changing it, since it alters feature values. */
#define word_get_p1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) )? -1 : (w)->form_char16[0])
#define word_get_p2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[1])
#define word_get_p3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[2])
#define word_get_p4(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[3])
#define word_get_p5(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 4))? -1 : (w)->form_char16[4])
#define word_get_p6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[5])
#define word_get_id(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_ID])
#define word_get_form(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FORM])
#define word_get_lemma(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LEMMA])
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
typedef short char16;
/* char_bitN(c): value (0 or 1) of the N-th bit of c, bit 1 being the least
   significant one. */
#define char_bit1(c) ((c) & 0x01)
#define char_bit2(c) (((c) & 0x02) >> 1)
#define char_bit3(c) (((c) & 0x04) >> 2)
#define char_bit4(c) (((c) & 0x08) >> 3)
#define char_bit5(c) (((c) & 0x10) >> 4)
#define char_bit6(c) (((c) & 0x20) >> 5)
#define char_bit7(c) (((c) & 0x40) >> 6)
#define char_bit8(c) (((c) & 0x80) >> 7)
/* length(c): 2 when the two top bits of the byte c are both set (a UTF-8 lead
   byte), 1 for every other byte (ASCII bytes and continuation bytes). */
#define length(c) ((((c) & 0xC0) == 0xC0) ? 2 : 1)
/*
int length(char c)
{
if(!char_bit8(c)) return 1;
if(char_bit8(c) && !char_bit7(c)) return 1;
if(char_bit7(c)) return 2;
if(char_bit6(c)) return 3;
if(char_bit5(4)) return 4;
}
*/
/* Returns the number of characters (code points) in the NUL-terminated UTF-8
   string utf8_string, or 0 when utf8_string is NULL.
   Every byte that is not a continuation byte (10xxxxxx) starts a character, so
   counting those bytes gives one per character whatever the sequence length.
   Counting the continuation bytes instead is only correct for 1- and 2-byte
   sequences. */
int utf8_strlen(char *utf8_string)
{
  const unsigned char *p = (const unsigned char *)utf8_string;
  int l = 0;

  if(utf8_string == NULL) return 0;
  for(; *p; p++){
    if((*p & 0xC0) != 0x80) l++;
  }
  return l;
}
/* Placeholder for the char16 -> UTF-8 conversion: the argument is ignored and
   NULL is always returned. TODO: implement as the inverse of utf8tochar16(). */
char *char16toutf8(char16 *char16_string)
{
return NULL;
}
/* Returns the number of elements of string that precede its terminating 0
   element. string must not be NULL. */
int char16_strlen(char16 *string)
{
  const char16 *end = string;

  while(*end != 0)
    end++;
  return (int)(end - string);
}
char16 *utf8tochar16(char *utf8_string)
{
int i,j;
int utf8_length = strlen(utf8_string);
int char16_length = 0;
char16 *char16_string;
for(i=0; i < utf8_length; i++)
char16_length += length(utf8_string[i]);
char16_string = (char16*) malloc((char16_length + 1)* sizeof(char16));
for(i=0, j=0; i < utf8_length; i++, j++){
if(length(utf8_string[i]) == 1){
char16_string[j] = (char16)utf8_string[i];
}
if(length(utf8_string[i]) == 2){
char16_string[j] = utf8_string[i];
char16_string[j] = char16_string[j] << 8;
char16_string[j] += utf8_string[++i];
}
}
char16_string[j] = 0;
return char16_string;
}
/*
int main(void)
{
int i;
char string[200];
char16 *char16_string;
strcpy(string, "élémentaire");
printf("string = %s\n", string);
printf("length = %d\n", (int)strlen(string));
printf("utf8 length = %d\n", (int)utf8_strlen(string));
for(i=0; i < strlen(string); i++){
printf("%d\t%c\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\tl=%d\n", i, string[i], (int)string[i], char_bit1(string[i]), char_bit2(string[i]), char_bit3(string[i]), char_bit4(string[i]), char_bit5(string[i]), char_bit6(string[i]), char_bit7(string[i]), char_bit8(string[i]), length(string[i]));
}
char16_string = utf8tochar16(string);
printf("char16_strlen = %d\n", char16_strlen(char16_string));
}
*/
......@@ -131,7 +131,6 @@ void catenate_int(char *string, int val)
}
feat_model *feat_model_new(char *name)
{
feat_model *fm = (feat_model *)memalloc(sizeof(feat_model));
......
......@@ -6,13 +6,13 @@
form2pos *form2pos_new(int nbelem, int pos_nb, char *pos_list)
{
form2pos *f2p = memalloc(sizeof(form2pos));
form2pos *f2p = (form2pos *)memalloc(sizeof(form2pos));
char *token;
f2p->nbelem = nbelem;
f2p->pos_nb = pos_nb;
f2p->d_pos = dico_new("d_pos", pos_nb * 10);
f2p->d_signature = dico_new("d_signature", pos_nb * 10);
f2p->d_pos = dico_new((char *)"d_pos", pos_nb * 10);
f2p->d_signature = dico_new((char *)"d_signature", pos_nb * 10);
f2p->h_form2signature = hash_new(nbelem * 4);
token = strtok(pos_list, "\t");
do{
......
......@@ -422,6 +422,63 @@ mcd *mcd_build_wplgfs(void)
return m;
}
/* Builds the 7-column "wpmlgfs" mcd: FORM, POS, FEATS, LEMMA, GOV, LABEL and
   SENT_SEG, in that column order. GOV and SENT_SEG use the integer
   representation, every other column uses the vocabulary representation, and
   every column gets the placeholder filename "_". */
mcd *mcd_build_wpmlgfs(void)
{
  const int column_wf[7] = {
    MCD_WF_FORM, MCD_WF_POS, MCD_WF_FEATS, MCD_WF_LEMMA,
    MCD_WF_GOV, MCD_WF_LABEL, MCD_WF_SENT_SEG
  };
  const char *column_name[7] = {
    "FORM", "POS", "FEATS", "LEMMA", "GOV", "LABEL", "SENT_SEG"
  };
  mcd *m = mcd_new(7);
  int c;

  for(c = 0; c < 7; c++){
    m->wf[c] = column_wf[c];
    m->wf_str[c] = strdup(column_name[c]);
    if((c == 4) || (c == 6)) /* GOV and SENT_SEG hold integers */
      m->representation[c] = MCD_REPRESENTATION_INT;
    else
      m->representation[c] = MCD_REPRESENTATION_VOCAB;
    m->filename[c] = strdup("_");
    m->wf2col[column_wf[c]] = c;
  }
  return m;
}
/* returns a dico_vec containing the different dictionnaries found in an mcd structure */
......
......@@ -7,7 +7,7 @@
trie_state *trie_state_new(trie_trans *transitions, int is_accept)
{
trie_state *state = memalloc(sizeof(trie_state));
trie_state *state = (trie_state *) memalloc(sizeof(trie_state));
state->transitions = transitions;
state->is_accept = is_accept;
state->fail = 0;
......@@ -24,7 +24,7 @@ void trie_state_free(trie_state *state)
trie *trie_new(void)
{
trie *t = memalloc(sizeof(trie));
trie *t = (trie *) memalloc(sizeof(trie));
t->states = NULL;
t->size = 0;
t->states_nb = 0;
......@@ -45,7 +45,7 @@ void trie_free(trie *t)
trie_trans *trie_trans_new(int destination, int symbol, trie_trans *next)
{
trie_trans *trans = memalloc(sizeof(trie_trans));
trie_trans *trans = (trie_trans *)memalloc(sizeof(trie_trans));
trans->destination = destination;
trans->symbol = symbol;
trans->next = next;
......
......@@ -19,6 +19,7 @@ word *word_new(char *input)
w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV;
w->form = NULL;
w->form_char16 = NULL;
w->index = -1;
w->signature = -1;
......@@ -59,11 +60,13 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct)
w = word_new(buffer);
token = strtok(buffer, "\t");
do{
if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1) && (strcmp(token, "_"))){
/* if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1) && (strcmp(token, "_"))){ */
if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1)){
w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col);
}
if(mcd_struct->wf[col] == MCD_WF_FORM){
w->form = strdup(token);
w->form_char16 = utf8tochar16(w->form);
w->U1 = isupper(token[0]) ? 1 : 0;
}
col++;
......@@ -95,6 +98,7 @@ void word_free(word *w)
if(w == NULL) return;
if(w->input) free(w->input);
if(w->form) free(w->form);
if(w->form_char16) free(w->form_char16);
free(w);
}
......
......@@ -2,7 +2,7 @@
#include "array.h"
array_t* array_new() {
array_t* array = malloc(sizeof(array_t));
array_t* array = (array_t *)malloc(sizeof(array_t));
array->num_elements = 0;
array->data = NULL;
return array;
......@@ -18,7 +18,7 @@ ARRAY_TYPE array_get(array_t* array, int element) {
}
void array_push(array_t* array, ARRAY_TYPE value) {
array->data = realloc(array->data, sizeof(ARRAY_TYPE) * (array->num_elements + 1));
array->data = (ARRAY_TYPE *)realloc(array->data, sizeof(ARRAY_TYPE) * (array->num_elements + 1));
array->data[array->num_elements] = value;
array->num_elements++;
}
......
......@@ -75,7 +75,7 @@ void maca_graph_parser_print_ctx(maca_graph_parser_ctx *ctx)
maca_graph_parser_ctx * maca_graph_parser_InitCTX()
{
maca_graph_parser_ctx * ctx = calloc(sizeof(maca_graph_parser_ctx), 1);
maca_graph_parser_ctx * ctx = (maca_graph_parser_ctx *)calloc(sizeof(maca_graph_parser_ctx), 1);
ctx->cfg=MACA_DEFAULT_CFG;
ctx->verbose_flag = maca_verbose;
......
......@@ -36,7 +36,7 @@ void maca_graph_parser_alphabet_free(maca_graph_parser_alphabet *a)
maca_graph_parser_alphabet *maca_graph_parser_alphabet_new(char *name)
{
maca_graph_parser_alphabet *a = malloc(sizeof(maca_graph_parser_alphabet));
maca_graph_parser_alphabet *a = (maca_graph_parser_alphabet *)malloc(sizeof(maca_graph_parser_alphabet));
if(a == NULL){
fprintf(stderr, "memory allocation error\n");
exit(1);
......@@ -153,7 +153,7 @@ maca_graph_parser_alphabet **maca_graph_parser_alphabet_load4(char *filename)
int i = 0;
char symbol[1000];
maca_graph_parser_alphabet *a = NULL;
maca_graph_parser_alphabet **alpha_array = malloc(4 * sizeof(maca_graph_parser_alphabet*));
maca_graph_parser_alphabet **alpha_array = (maca_graph_parser_alphabet **)malloc(4 * sizeof(maca_graph_parser_alphabet*));
for(i=0; i < 4; i++)
alpha_array[i] = NULL;
......@@ -182,7 +182,7 @@ maca_graph_parser_alphabet **maca_graph_parser_alphabet_load5(char *filename)
int i = 0;
char symbol[1000];
maca_graph_parser_alphabet *a = NULL;
maca_graph_parser_alphabet **alpha_array = malloc(5 * sizeof(maca_graph_parser_alphabet*));
maca_graph_parser_alphabet **alpha_array = (maca_graph_parser_alphabet **)malloc(5 * sizeof(maca_graph_parser_alphabet*));
for(i=0; i < 5; i++)
alpha_array[i] = NULL;
......
......@@ -125,6 +125,7 @@ int main(int argc, char *argv[])
char *buffer_copy;
char *form;
char *pos;
char *feats;
char *token;
int column_nb;
......@@ -136,11 +137,16 @@ int main(int argc, char *argv[])
int form_column;
int pos_column;
int lemma_column;
int feats_column;
FILE *f = NULL;
ctx = context_read_options(argc, argv);
maca_lemmatizer_check_options(ctx);
feats_column = ctx->mcd_struct->wf2col[MCD_WF_FEATS];
if(ctx->pos_column != -1)
pos_column = ctx->pos_column;
else
......@@ -177,6 +183,7 @@ int main(int argc, char *argv[])
form = NULL;
pos = NULL;
lemma = NULL;
feats = NULL;
do{
if(column_nb == lemma_column) /* lemma is present in the input file */
if(strcmp(token, "_")) /* and it is not an underscore */
......@@ -188,6 +195,9 @@ int main(int argc, char *argv[])
if(column_nb == pos_column){
pos = strdup(token);
}
if(column_nb == feats_column){
feats = strdup(token);
}
column_nb++;
} while((token = strtok(NULL , "\t")));
......@@ -215,11 +225,13 @@ int main(int argc, char *argv[])
/* print_word(buffer, ctx->mcd_struct, lemma); */
/* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma); */
printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma);
printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma);
printf("\t%s\n", lemma);
if(pos)free(pos);
if(form)free(form);
if(feats)free(feats);
}
free(buffer_copy);
free(lemma_array);
......
......@@ -29,7 +29,7 @@ dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int deb
char token[1000];
int l;
int i, j;
dico *d_tokens = dico_new("TOKENS", 100000);
dico *d_tokens = dico_new((char *)"TOKENS", 100000);
int token_code;
while(fgets(buffer, 10000, f)){
fields_nb = sscanf(buffer, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho);
......@@ -71,6 +71,6 @@ int main(int argc, char *argv[])
dico *d_tokens;
d_tokens = decompose_mwe_in_fplm_file(argv[1], stdout, 1);
dico_print("d_tokens.dico", d_tokens);
dico_print((char *)"d_tokens.dico", d_tokens);
dico_free(d_tokens);
}
# Sources of the maca_morpho static library.
set(SOURCES
  src/maca_morpho_feat_fct.c
  src/maca_morpho_context.c
  src/vectorize.c
)

# Compiling the library. The include path and the libraries it depends on are
# attached to the target (PUBLIC) so that everything linking maca_morpho
# inherits them, instead of relying on a directory-wide include_directories().
add_library(maca_morpho STATIC ${SOURCES})
target_include_directories(maca_morpho PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/src")
target_link_libraries(maca_morpho PUBLIC perceptron maca_common)

# Compiling, linking and installing executables.
add_executable(fplm2cff src/fplm2cff.c)
target_link_libraries(fplm2cff PRIVATE perceptron maca_common maca_morpho)
install(TARGETS fplm2cff DESTINATION bin)

add_executable(predict src/predict.c)
target_link_libraries(predict PRIVATE perceptron maca_common maca_morpho)
install(TARGETS predict DESTINATION bin)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "maca_morpho_context.h"
#include "feat_model.h"
#include "feat_vec.h"
#include "dico.h"
#include "util.h"
#include "vectorize.h"
/* Splits a "feature=value" string at its first '='.
   feature receives the text before the '=', value the text after it.
   When there is no '=', feature receives the whole string and value is empty.
   When feature_value is NULL, both outputs are set to the empty string.
   feature and value must each be large enough for strlen(feature_value) + 1
   bytes and must not overlap feature_value. */
void decompose_feature_value(char *feature_value, char *feature, char *value)
{
  const char *equal_sign;
  size_t feature_length;

  if(feature_value == NULL){
    feature[0] = '\0';
    value[0] = '\0';
    return;
  }

  equal_sign = strchr(feature_value, '=');
  if(equal_sign == NULL){
    strcpy(feature, feature_value);
    value[0] = '\0';
    return;
  }

  feature_length = (size_t)(equal_sign - feature_value);
  memcpy(feature, feature_value, feature_length);
  feature[feature_length] = '\0';
  strcpy(value, equal_sign + 1);
}
/* Reads the fplm file named in the context, one "form<TAB>pos<TAB>lemma<TAB>morpho"
   entry per line, and splits every '|'-separated item of the morpho field into
   a feature name and a value with decompose_feature_value().
   With the help option set, prints the option summary and exits with status 1.

   Each field is read with a width limit so an over-long field cannot overflow
   its 100-byte buffer. Lines that do not provide all four fields are skipped,
   and a final line without a trailing newline is processed like any other. */
int main(int argc, char *argv[])
{
  context *ctx = context_read_options(argc, argv);
  if(ctx->help){
    context_general_help_message(ctx);
    context_language_help_message(ctx);
    context_fplm_help_message(ctx);
    context_maca_data_path_help_message(ctx);
    context_features_filename_help_message(ctx);
    context_features_model_help_message(ctx);
    exit(1);
  }
  /* fv and dico_features are not used yet: they are kept for the
     vectorisation code that is still commented out after this function */
  feat_vec *fv = feat_vec_new(10);
  dico *dico_features = dico_new((char *)"dico_features", 1000);
  /* feat_model *fm = feat_model_read(ctx->fm_filename, feat_lib_build(), ctx->verbose); */
  char form[100];
  char pos[100];
  char lemma[100];
  char morpho[100];
  FILE *F_fplm = NULL;
  char buffer[1000];
  char feature[100];
  char value[100];
  char *token;

  F_fplm = myfopen(ctx->fplm_filename, "r");
  if(F_fplm == NULL){
    /* NOTE(review): myfopen may already abort on failure - confirm in util.c */
    fprintf(stderr, "cannot open file %s\n", ctx->fplm_filename);
    exit(1);
  }

  while(fgets(buffer, sizeof(buffer), F_fplm)){
    /* drop the end of line marker, if there is one */
    buffer[strcspn(buffer, "\n")] = '\0';

    if(sscanf(buffer, "%99[^\t]\t%99[^\t]\t%99[^\t]\t%99[^\n]", form, pos, lemma, morpho) != 4)
      continue;
    //printf("form = %s pos = %s lemma = %s morpho = %s\n", form, pos, lemma, morpho);

    /* strtok returns NULL at once when morpho holds only separators */
    for(token = strtok(morpho, "|"); token != NULL; token = strtok(NULL, "|")){
      //printf("token = %s\n", token);
      decompose_feature_value(token, feature, value);
      //printf("feature = %s value = %s\n", feature, value);
    }
  }
  fclose(F_fplm);
  return 0;
}
/*
while(strcmp(form, "end")){
fscanf(stdin, "%s", form);
printf("form = %s\n", form);
form2fv(form, fv, fm, dico_features, ADD_MODE);
//void feat_vec_print_string(feat_vec *fv, dico *dico_features);
feat_vec_print(stdout, fv);
}
//dico_print_fh(stdout, dico_features);
if(ctx->features_filename)
dico_print(ctx->features_filename, dico_features);
*/
#include<stdlib.h>
#include<stdio.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include "maca_morpho_context.h"
#include "util.h"
void context_set_linguistic_resources_filenames(context *ctx);
void context_free(context *ctx)
{
if(ctx->program_name) free(ctx->program_name);
if(ctx->fplm_filename) free(ctx->fplm_filename);
if(ctx->cfw_filename) free(ctx->cfw_filename);
if(ctx->language) free(ctx->language);
if(ctx->maca_data_path) free(ctx->maca_data_path);
free(ctx);
}
context *context_new(void)
{
context *ctx = (context *)memalloc(sizeof(context));
ctx->help = 0;
ctx->verbose = 0;
ctx->debug_mode = 0;
ctx->program_name = NULL;
ctx->fplm_filename = NULL;
ctx->language = strdup("fr");
ctx->maca_data_path = NULL;
ctx->features_filename = NULL;
ctx->cfw_filename = NULL;
return ctx;
}
/* Prints the usage line and the options shared by all programs to stderr.
   Only options that context_read_options() really parses are listed: the
   former "-r --hratio" line described an option the parser does not accept,
   while "-d --debug" was accepted but not documented. */
void context_general_help_message(context *ctx)
{
  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
  fprintf(stderr, "Options:\n");
  fprintf(stderr, "\t-h --help : print this message\n");
  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
  fprintf(stderr, "\t-d --debug : activate debug mode\n");
}
/* Prints the help line of the fplm file option to stderr; ctx is not used. */
void context_fplm_help_message(context *ctx)
{
  (void)ctx;
  fputs("\t-f --fplm <file> : fplm (form pos lemma morpho) file\n", stderr);
}
/* Prints the help line of the language option to stderr; ctx is not used. */
void context_language_help_message(context *ctx)
{
  (void)ctx;
  fputs("\t-L --language : identifier of the language to use\n", stderr);
}
/* Prints the help line of the maca_data path option to stderr; ctx is not used. */
void context_maca_data_path_help_message(context *ctx)
{
  (void)ctx;
  fputs("\t-M --maca_data_path : path to maca_data directory\n", stderr);
}
/* Prints the help line of the feature model option (--fm) to stderr; ctx is not used. */
void context_fm_help_message(context *ctx)
{
  (void)ctx;
  fputs("\t-F --fm <file> : feature model file name\n", stderr);
}
/* Prints the help line of the features dictionary option to stderr; ctx is not used. */
void context_features_filename_help_message(context *ctx)
{
  (void)ctx;
  fputs("\t-x --feat <file> : features dictionary file name\n", stderr);
}
/* Prints the help line of the weight matrix option to stderr; ctx is not used. */
void context_weights_matrix_filename_help_message(context *ctx)
{
  (void)ctx;
  fputs("\t-w --weights <file> : weight matrix (cfw) filename\n", stderr);
}
/* Prints the help line of the feature model option (--feat_model spelling) to
   stderr; ctx is not used. */
void context_features_model_help_message(context *ctx)
{
  (void)ctx;
  fputs("\t-F --feat_model <file> : feature model file name\n", stderr);
}
/* Builds a context from the command line.
   Starts from the defaults of context_new(), records argv[0] as the program
   name, applies every recognised option and finally fills in the default
   resource filenames. Unknown options are silently ignored (opterr is 0).
   Returns the new context; the caller owns it.

   - The long option table ends with an all-zero entry, which getopt_long()
     requires in order to find the end of the table.
   - -M is handled like -D / --maca_data_path: it is present in the option
     string and is the letter shown in the help text.
   - A value given for an option replaces (and frees) the previous one, which
     includes the default language. */
context *context_read_options(int argc, char *argv[])
{
  int c;
  int option_index = 0;
  context *ctx = context_new();
  static struct option long_options[] =
    {
      {"help", no_argument, 0, 'h'},
      {"verbose", no_argument, 0, 'v'},
      {"debug", no_argument, 0, 'd'},
      {"mcd", required_argument, 0, 'C'}, /* accepted, but not used by this program */
      {"language", required_argument, 0, 'L'},
      {"fplm", required_argument, 0, 'f'},
      {"maca_data_path", required_argument, 0, 'D'},
      {"fm", required_argument, 0, 'F'},
      {"feat", required_argument, 0, 'x'},
      {"weights", required_argument, 0, 'w'},
      {0, 0, 0, 0}
    };

  ctx->program_name = strdup(argv[0]);
  /* make sure the field is defined even when -F is not given */
  ctx->fm_filename = NULL;

  optind = 0;
  opterr = 0;

  while ((c = getopt_long (argc, argv, "hvdf:L:M:D:F:x:w:", long_options, &option_index)) != -1){
    switch (c)
      {
      case 'd':
	ctx->debug_mode = 1;
	break;
      case 'h':
	ctx->help = 1;
	break;
      case 'v':
	ctx->verbose = 1;
	break;
      case 'f':
	free(ctx->fplm_filename);
	ctx->fplm_filename = strdup(optarg);
	break;
      case 'L':
	free(ctx->language);
	ctx->language = strdup(optarg);
	break;
      case 'M':
      case 'D':
	free(ctx->maca_data_path);
	ctx->maca_data_path = strdup(optarg);
	break;
      case 'F':
	free(ctx->fm_filename);
	ctx->fm_filename = strdup(optarg);
	break;
      case 'x':
	free(ctx->features_filename);
	ctx->features_filename = strdup(optarg);
	break;
      case 'w':
	free(ctx->cfw_filename);
	ctx->cfw_filename = strdup(optarg);
	break;
      }
  }

  context_set_linguistic_resources_filenames(ctx);

  return ctx;
}
/* Gives ctx->fplm_filename its default value when no fplm file was specified:
   <root>/<language>/bin/DEFAULT_FPLM_FILENAME, where <root> is
   ctx->maca_data_path when set and otherwise the MACAON_DIR environment
   variable. When neither is available a warning is printed and <root> is
   empty.
   The name is built in a buffer sized from its parts, so a long path cannot
   overflow a fixed-size array. Exits with status 1 if that buffer cannot be
   allocated. */
void context_set_linguistic_resources_filenames(context *ctx)
{
  const char *root = ctx->maca_data_path;
  char *filename;
  size_t size;

  if(root == NULL){
    root = getenv("MACAON_DIR");
    if(root == NULL){
      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
      root = "";
    }
  }

  if(ctx->fplm_filename) return;

  /* root + "/" + language + "/bin/" + default name + terminating NUL */
  size = strlen(root) + 1 + strlen(ctx->language) + strlen("/bin/") + strlen(DEFAULT_FPLM_FILENAME) + 1;
  filename = (char *)malloc(size);
  if(filename == NULL){
    fprintf(stderr, "memory allocation error\n");
    exit(1);
  }
  snprintf(filename, size, "%s/%s/bin/%s", root, ctx->language, DEFAULT_FPLM_FILENAME);
  ctx->fplm_filename = filename;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment