diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt index ea4625b2b13c5452932cd1cef17f3590794d6304..f54ad51c84a92ad918ecc7d9e855605fa83e5238 100644 --- a/maca_common/CMakeLists.txt +++ b/maca_common/CMakeLists.txt @@ -9,12 +9,12 @@ set(SOURCES src/util.c src/sentence.c src/word_buffer.c src/trie.c - src/feat_desc.c - src/feat_lib.c - src/feat_model.c - src/char16.c - - + src/feat_desc.c + src/feat_lib.c + src/feat_model.c + src/char16.c + src/l_rule.c + src/fplm.c ) #compiling library diff --git a/maca_common/include/dico.h b/maca_common/include/dico.h index 03f6a7862af0378fafc130fa4001061d98efcd5d..b10b9e3af475b4abcbe169f5be68e81ebf48ba69 100644 --- a/maca_common/include/dico.h +++ b/maca_common/include/dico.h @@ -18,12 +18,12 @@ typedef struct } dico; dico *dico_new(char *name, int size); -void dico_free(dico *d); -int dico_add(dico *d, char *key); +void dico_free(dico *d); +int dico_add(dico *d, char *key); char *dico_int2string(dico *d, int val); -int dico_string2int(dico *d, char *string); -void dico_print(char *filename, dico *d); -void dico_print_fh(FILE *f, dico *d); +int dico_string2int(dico *d, char *string); +void dico_print(char *filename, dico *d); +void dico_print_fh(FILE *f, dico *d); dico *dico_read(char *filename, float ratio); dico *dico_read_fh(FILE *f, float ratio); dico *dico_extract_from_corpus(char *filename, int column, char *dico_name); diff --git a/maca_common/include/fplm.h b/maca_common/include/fplm.h new file mode 100644 index 0000000000000000000000000000000000000000..d39ed84d489d5a21ce76a5e3eb5ba90048b4bfb3 --- /dev/null +++ b/maca_common/include/fplm.h @@ -0,0 +1,21 @@ +#ifndef __FPLM__ +#define __FPLM__ + +#include<stdio.h> +#include<stdlib.h> +#include"hash.h" + +typedef struct +{ + char **lemma_array; + int lemma_array_size; + hash *form_pos_ht; + int nbelem; +} fplm_struct; + +fplm_struct *fplm_new(); +void fplm_free(fplm_struct *fplm); +fplm_struct *fplm_load_file(char *fplm_filename, int debug_mode); +char 
*fplm_lookup_lemma(fplm_struct *fplm, char *form, char *pos, int verbose); +void fplm_add(fplm_struct *fplm, char *form, char *pos, char *lemma); +#endif diff --git a/maca_common/include/hash.h b/maca_common/include/hash.h index a62d26ac75dead2ce6b098e271036a92b98652f7..1f558619d2fe82d07c89f9e1d8acc557dbca1020 100644 --- a/maca_common/include/hash.h +++ b/maca_common/include/hash.h @@ -19,14 +19,15 @@ typedef struct cell *cell_new(char *key, int val, cell *next); -void cell_free(cell *c); +void cell_free(cell *c); hash *hash_new(int size); -void hash_free(hash *h); +void hash_free(hash *h); cell *hash_lookup(hash *h, char *key); -int hash_get_val(hash *h, char *key); -void hash_add(hash *h, char *key, int val); -void hash_stats(hash *h); +int hash_get_val(hash *h, char *key); +cell *hash_add(hash *h, char *key, int val); +void hash_stats(hash *h); +void hash_inc_val(hash *h, char *key, int inc); #endif diff --git a/maca_common/include/l_rule.h b/maca_common/include/l_rule.h new file mode 100644 index 0000000000000000000000000000000000000000..6a74b082c5b45f7597ab223d4a201e23bd2ff5e9 --- /dev/null +++ b/maca_common/include/l_rule.h @@ -0,0 +1,7 @@ +#ifndef __L_RULE__ +#define __L_RULE__ + +char *apply_l_rule(char *form, char *l_rule); +char *compute_l_rule(char *lemma, char *form, int strict); + +#endif diff --git a/maca_common/src/char16.c b/maca_common/src/char16.c index 0631fdb0bb8190ed8a160c492ee57e231b215464..e4dd2c954522597c0ee1facfe8ee95d134b2c9f3 100644 --- a/maca_common/src/char16.c +++ b/maca_common/src/char16.c @@ -2,8 +2,8 @@ #include<stdlib.h> #include<string.h> -//#include"char16.h" -typedef unsigned short char16; +#include"char16.h" +//typedef unsigned short char16; #define char_bit1(c) ((c) & 1) #define char_bit2(c) (((c) & 2) >> 1) #define char_bit3(c) (((c) & 4) >> 2) @@ -12,9 +12,11 @@ typedef unsigned short char16; #define char_bit6(c) (((c) & 32) >> 5) #define char_bit7(c) (((c) & 64) >> 6) #define char_bit8(c) (((c) & 128) >> 7) -#define 
length(c) ((!char_bit8((c)) || (char_bit8(c) && !char_bit7(c)))? 1 : 2) + +/* return 1 if it is a character that is stored on a single byte, return 2 otherwise */ +#define utf8_span(c) ((!char_bit8((c)) || (char_bit8(c) && !char_bit7(c)))? 1 : 2) /* -int length(char c) +int utf8_span(char c) { if(!char_bit8(c)) return 1; if(char_bit8(c) && !char_bit7(c)) return 1; @@ -28,7 +30,7 @@ int utf8_strlen(char *utf8_string) { int l = 0; while(*utf8_string){ - l += (length(*utf8_string) == 1) ? 1 : 0; + l += (utf8_span(*utf8_string) == 1) ? 1 : 0; utf8_string++; } return l; @@ -58,15 +60,16 @@ char *char16toutf8(char16 *char16_string) else length_utf8 += 1; } - utf8_string = (char *)malloc(length_utf8 * sizeof(char)); + + utf8_string = (char *) malloc(length_utf8 * sizeof(char)); j = 0; for(i=0; i < length_char16; i++){ c = char16_string[i]; lo = c & 255; hi = c >> 8; - printf("c = %d hi = %d lo = %d\n", c, hi, lo); + /* printf("c = %d hi = %d lo = %d (%c)\n", c, hi, lo, (char) lo); */ if(hi != 0) - utf8_string[j++] = (char)hi; + utf8_string[j++] = (char)hi + 1; /* something wrong, we should not have to add one */ utf8_string[j++] = (char)lo; } utf8_string[j] = 0; @@ -78,17 +81,13 @@ char16 *utf8tochar16(char *utf8_string) { int i,j; int utf8_length = strlen(utf8_string); - int char16_length = 0; - char16 *char16_string; - for(i=0; i < utf8_length; i++) - char16_length += length(utf8_string[i]); + int char16_length = utf8_strlen(utf8_string); + char16 *char16_string = (char16*) malloc((char16_length + 1) * sizeof(char16)); - char16_string = (char16*) malloc((char16_length + 1)* sizeof(char16)); for(i=0, j=0; i < utf8_length; i++, j++){ - if(length(utf8_string[i]) == 1){ + if(utf8_span(utf8_string[i]) == 1) char16_string[j] = (char16)utf8_string[i]; - } - if(length(utf8_string[i]) == 2){ + if(utf8_span(utf8_string[i]) == 2){ char16_string[j] = utf8_string[i]; char16_string[j] = char16_string[j] << 8; char16_string[j] += utf8_string[++i]; @@ -97,8 +96,8 @@ char16 
*utf8tochar16(char *utf8_string) char16_string[j] = 0; return char16_string; } -/* -int main(void) + +/*int main(void) { int i; char string[200]; @@ -110,7 +109,7 @@ int main(void) printf("length = %d\n", (int)strlen(string)); printf("utf8 length = %d\n", (int)utf8_strlen(string)); for(i=0; i < strlen(string); i++){ - printf("%d\t%c\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\tl=%d\n", i, string[i], (int)string[i], char_bit1(string[i]), char_bit2(string[i]), char_bit3(string[i]), char_bit4(string[i]), char_bit5(string[i]), char_bit6(string[i]), char_bit7(string[i]), char_bit8(string[i]), length(string[i])); + printf("%d\t%c\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\tl=%d\n", i, string[i], (int)string[i], char_bit1(string[i]), char_bit2(string[i]), char_bit3(string[i]), char_bit4(string[i]), char_bit5(string[i]), char_bit6(string[i]), char_bit7(string[i]), char_bit8(string[i]), utf8_span(string[i])); } @@ -118,10 +117,11 @@ int main(void) printf("char16_strlen = %d\n", char16_strlen(char16_string)); utf8_string = char16toutf8(char16_string); + printf("string after conversion = %s\n", utf8_string); for(i=0; i < strlen(utf8_string); i++){ - printf("%d\t%c\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\tl=%d\n", i, utf8_string[i], (int)utf8_string[i], char_bit1(utf8_string[i]), char_bit2(utf8_string[i]), char_bit3(utf8_string[i]), char_bit4(utf8_string[i]), char_bit5(utf8_string[i]), char_bit6(utf8_string[i]), char_bit7(utf8_string[i]), char_bit8(utf8_string[i]), length(utf8_string[i])); + printf("%d\t%c\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\tl=%d\n", i, utf8_string[i], (int)utf8_string[i], char_bit1(utf8_string[i]), char_bit2(utf8_string[i]), char_bit3(utf8_string[i]), char_bit4(utf8_string[i]), char_bit5(utf8_string[i]), char_bit6(utf8_string[i]), char_bit7(utf8_string[i]), char_bit8(utf8_string[i]), utf8_span(utf8_string[i])); } -*/ + } - +*/ diff --git a/maca_common/src/dico.c b/maca_common/src/dico.c index 04701e7e87fcc5ef0b01e67295f9d497dd522bc0..eb0a1cd3a8555d67bbe47f4405807d7d0cc47ba0 
100644 --- a/maca_common/src/dico.c +++ b/maca_common/src/dico.c @@ -107,7 +107,7 @@ void dico_print(char *filename, dico *d) int dico_add(dico *d, char *key) { int val = d->nbelem; - char *key_copy; + // char *key_copy; cell *c; if((c = hash_lookup(d->htable, key)) != NULL){ @@ -115,16 +115,15 @@ int dico_add(dico *d, char *key) return c->val; } - key_copy = strdup(key); + // key_copy = strdup(key); d->nbelem++; /* printf("adding it nbelem = %d\n", d->nbelem); */ - hash_add(d->htable, key_copy, val); + c = hash_add(d->htable, key, val); if(d->nbelem > d->array_size){ - d->array_size = 2 * (d->array_size +1); d->array = (char **)realloc(d->array, d->array_size * sizeof(char*)); } - d->array[val] = key_copy; + d->array[val] = c->key; return val; } diff --git a/maca_common/src/dico_vec.c b/maca_common/src/dico_vec.c index a0642888d331831adb5ec84d8c5381235c0f1cfd..1fda6012398f212472fb4aec7a857ba60ee75769 100644 --- a/maca_common/src/dico_vec.c +++ b/maca_common/src/dico_vec.c @@ -36,14 +36,14 @@ dico *dico_vec_get_dico(dico_vec *dv, char *dico_name) int dico_vec_add(dico_vec *dv, dico *d) { - char *dico_name = NULL; + // char *dico_name = NULL; int dico_nb = dv->nb; dv->nb++; dv->t = (dico **)realloc(dv->t, dv->nb * sizeof(dico *)); dv->t[dico_nb] = d; - if(d->name) dico_name = strdup(d->name); + // if(d->name) dico_name = strdup(d->name); - hash_add(dv->ht, dico_name, dico_nb); + hash_add(dv->ht, d->name, dico_nb); return dv->nb; } diff --git a/maca_common/src/form2pos.c b/maca_common/src/form2pos.c index 853d06108167523713cd1f8a16d0b5d55b50b78b..13af28a60f1cb017d98085b6f2f7a0b86fc2d767 100644 --- a/maca_common/src/form2pos.c +++ b/maca_common/src/form2pos.c @@ -82,7 +82,7 @@ form2pos *form2pos_read(char *filename) while(!feof(f)){ fscanf(f, "%[^\t]\t%s\n", form, signature); /* printf("form = %s signature = %s code = %d\n", form, signature, signature_code); */ - hash_add(f2p->h_form2signature, strdup(form), dico_add(f2p->d_signature, signature)); + 
hash_add(f2p->h_form2signature, form, dico_add(f2p->d_signature, signature)); } fclose(f); return f2p; diff --git a/maca_common/src/fplm.c b/maca_common/src/fplm.c new file mode 100644 index 0000000000000000000000000000000000000000..a64af843e65d372c730276dd4543ced99085f4e5 --- /dev/null +++ b/maca_common/src/fplm.c @@ -0,0 +1,112 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include<ctype.h> + +#include"fplm.h" +#include"dico.h" +#include"util.h" + + +fplm_struct *fplm_new(void){ + fplm_struct *fplm = (fplm_struct *)memalloc(sizeof(fplm_struct)); + fplm->form_pos_ht = hash_new(1000000); + fplm->lemma_array = NULL; + fplm->lemma_array_size = 0; + fplm->nbelem = 0; + return fplm; +} + +void fplm_free(fplm_struct *fplm){ + hash_free(fplm->form_pos_ht); + + for(int i=0; i< fplm->lemma_array_size; ++i) + if(fplm->lemma_array[i]) + free(fplm->lemma_array[i]); + + free(fplm->lemma_array); + free(fplm); +} + +void fplm_add(fplm_struct *fplm, char *form, char *pos, char *lemma) +{ + char form_pos[1000]; + + strcpy(form_pos, form); + strcat(form_pos, "/"); + strcat(form_pos, pos); + + hash_add(fplm->form_pos_ht, form_pos, fplm->nbelem); + + if(fplm->nbelem >= fplm->lemma_array_size){ + fplm->lemma_array_size = 2 * fplm->lemma_array_size + 1; + fplm->lemma_array = (char **)realloc(fplm->lemma_array, fplm->lemma_array_size * sizeof(char *)); + // initialize in order to be able to free correctly and the end + for(int i = fplm->nbelem; i < fplm->lemma_array_size; ++i) + fplm->lemma_array[i] = NULL; + } + + /* if(fplm->lemma_array[fplm->nbelem] == NULL) */ + fplm->lemma_array[fplm->nbelem] = strdup(lemma); + fplm->nbelem++; +} + +fplm_struct *fplm_load_file(char *fplm_filename, int debug_mode) +{ + char form[1000]; + char pos[1000]; + + char lemma[1000]; + char morpho[1000]; + char buffer[10000]; + int fields_nb; + FILE *f= myfopen(fplm_filename, "r"); + fplm_struct *fplm = fplm_new(); + + while(fgets(buffer, 10000, f)){ + 
fields_nb = sscanf(buffer, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho); + /* if(!strcmp(form, "d")) */ + // fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); + if(fields_nb != 4){ + if(debug_mode){ + fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); + fprintf(stderr, "incorrect fplm entry, skipping it\n"); + } + continue; + } + fplm_add(fplm, form, pos, lemma); + } + /* fprintf(stderr, "%d entries loaded\n", num); */ + fclose(f); + return fplm; +} + +char *fplm_lookup_lemma(fplm_struct *fplm, char *form, char *pos, int verbose) +{ + char form_pos[1000]; + int index_form_pos; + + strcpy(form_pos, form); + strcat(form_pos, "/"); + strcat(form_pos, pos); + index_form_pos = hash_get_val(fplm->form_pos_ht, form_pos); + + if(index_form_pos != HASH_INVALID_VAL) /* couple form/pos found in the hash table */ + return fplm->lemma_array[index_form_pos]; + + strcpy(form_pos, form); + to_lower_string(form_pos); /* change form to lower case and look it up again */ + strcat(form_pos, "/"); + strcat(form_pos, pos); + index_form_pos = hash_get_val(fplm->form_pos_ht, form_pos); + if(index_form_pos != HASH_INVALID_VAL) + return fplm->lemma_array[index_form_pos]; + + /* even in lower case couple form/pos is not found, return the form as lemma */ + if(verbose) + fprintf(stderr, "cannot find an entry for %s %s\n", form, pos); + + return form; +} diff --git a/maca_common/src/hash.c b/maca_common/src/hash.c index 724bcceaa1211c2d0cc7bd969c6633df3c474fd5..9b8ccf899fa853258c4a4c368727ce7fa2b02c92 100644 --- a/maca_common/src/hash.c +++ b/maca_common/src/hash.c @@ -73,13 +73,16 @@ int hash_get_val(hash *h, char *key) return HASH_INVALID_VAL; } -void hash_add(hash *h, char *key, int val) +cell *hash_add(hash *h, char *key, int val) { int index; - if(hash_lookup(h, key)) return; + cell *c = hash_lookup(h, key); + if(c != NULL) return c; index = hash_func(key, h->size); - h->array[index] = cell_new(key, val, h->array[index]); + c = 
cell_new(strdup(key), val, h->array[index]); + h->array[index] = c; h->nbelem++; + return c; } int cell_nb(cell *c) @@ -88,6 +91,15 @@ int cell_nb(cell *c) return 1 + cell_nb(c->next); } +void hash_inc_val(hash *h, char *key, int inc) +{ + cell *c = hash_lookup(h, key); + if(c == NULL) + hash_add(h, key, 0); + else + c->val += inc; +} + void hash_stats(hash *h) { int max = 0; @@ -107,6 +119,4 @@ void hash_stats(hash *h) for(i=0; i < nb; i++) printf("%d %d\n", i, table[i]); - - } diff --git a/maca_common/src/l_rule.c b/maca_common/src/l_rule.c new file mode 100644 index 0000000000000000000000000000000000000000..e851f786d116ee9406cc176e08f75cde704f37a0 --- /dev/null +++ b/maca_common/src/l_rule.c @@ -0,0 +1,69 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> + +#include "util.h" + + +char *apply_l_rule(char *form, char *l_rule) +{ + int i,j; + int sep_index; + int lemma_suffix_length; + int form_suffix_length; + char *lemma = NULL; + int form_length = strlen(form); + int l_rule_length = strlen(l_rule); + int lemma_length; + + for(sep_index=1; sep_index < l_rule_length; sep_index++) + if(l_rule[sep_index] == '@') + break; + + lemma_suffix_length = l_rule_length - 1 - sep_index; + form_suffix_length = sep_index - 1; + + lemma_length = form_length + lemma_suffix_length - form_suffix_length; + if(lemma_length < form_length) + lemma_length = form_length; + lemma = (char *) memalloc((lemma_length + 1) * sizeof(char)); + strcpy(lemma, form); + + for(j=0, i=form_length - form_suffix_length; j < lemma_suffix_length; i++, j++){ + lemma[i] = l_rule[sep_index + j + 1]; + } + lemma[i] = 0; + return lemma; +} + + +char *compute_l_rule(char *lemma, char *form, int strict) +{ + int breakpoint,j,k; + int lemma_suffix_length; + int form_suffix_length; + int lemma_length = strlen(lemma); + int form_length = strlen(form); + char *l_rule; + + for(breakpoint=0; (breakpoint < lemma_length) && (breakpoint < form_length); breakpoint++) + if(form[breakpoint] != lemma[breakpoint]) 
/*
 * Convert the string s to lower case, in place.
 *
 * s : NUL terminated string, modified by the call
 *
 * Returns s. The conversion is done byte by byte with tolower(), so only the
 * characters handled by the current locale are changed; the length of the
 * string is never modified.
 */
char *to_lower_string(char *s)
{
  /* tolower() is only defined for values representable as unsigned char
     (or EOF): bytes >= 0x80 must not be passed as negative char values */
  for(char *p = s; *p != '\0'; p++)
    *p = (char)tolower((unsigned char)*p);
  return s;
}
"\r%d", word_nb - 1); for (a = 0; a < size; a++){ fread(&(we->array[k++]), sizeof(float), 1, f); @@ -214,7 +214,7 @@ word_emb *word_emb_load(char *filename) line_nb++; res = fscanf(f, "%s", word); if(res == 0) fprintf(stderr, "word embdedding file %s ill formed\n", filename); - hash_add(we->htable, strdup(word), word_nb); + hash_add(we->htable, word, word_nb); /* printf("word = %s word_nb = %d k = %d\n", word, word_nb, k); */ for(i=0; i < dim; i++){ res = fscanf(f, "%f", &(we->array[k++])); diff --git a/maca_tools/CMakeLists.txt b/maca_tools/CMakeLists.txt index 7ba50b065165c74f28aaa019f8cb2e895846fb56..e92ea9ab8d4d842952c6c7909b11c7d17c667e8a 100644 --- a/maca_tools/CMakeLists.txt +++ b/maca_tools/CMakeLists.txt @@ -6,7 +6,7 @@ target_link_libraries(mcf2conll transparse) target_link_libraries(mcf2conll maca_common) install (TARGETS mcf2conll DESTINATION bin) -add_executable(fplm_suff ./src/fplm_suff.c) -target_link_libraries(fplm_suff maca_common) -install (TARGETS fplm_suff DESTINATION bin) +add_executable(maca_compute_l_rules ./src/maca_compute_l_rules.c) +target_link_libraries(maca_compute_l_rules maca_common) +install (TARGETS maca_compute_l_rules DESTINATION bin) diff --git a/maca_tools/src/fplm_suff.c b/maca_tools/src/fplm_suff.c deleted file mode 100644 index 941949f6b7a35f7f5827daf708a134e5c7e670f3..0000000000000000000000000000000000000000 --- a/maca_tools/src/fplm_suff.c +++ /dev/null @@ -1,186 +0,0 @@ -#include<stdio.h> -#include<stdlib.h> -#include<string.h> -#include<getopt.h> - -#include"util.h" -#include"char16.h" - - -typedef struct { - int help; - int verbose; - int debug_mode; - char *program_name; - char *fplm_filename; -} context; - -void context_free(context *ctx) -{ - if(ctx){ - if(ctx->program_name) - free(ctx->program_name); - if(ctx->fplm_filename) - free(ctx->fplm_filename); - free(ctx); - } -} - -context *context_new(void) -{ - context *ctx = (context *)memalloc(sizeof(context)); - - ctx->help = 0; - ctx->verbose = 0; - ctx->debug_mode = 
0; - ctx->program_name = NULL; - ctx->fplm_filename = NULL; - return ctx; -} - -void context_general_help_message(context *ctx) -{ - fprintf(stderr, "usage: %s [options]\n", ctx->program_name); - fprintf(stderr, "Options:\n"); - fprintf(stderr, "\t-h --help : print this message\n"); - fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); - fprintf(stderr, "\t-f --fplm : fplm filename (read from stdin if absent)\n"); -} - -void fplm_suff_check_options(context *ctx){ - if(ctx->help){ - context_general_help_message(ctx); - exit(1); - } -} - -context *context_read_options(int argc, char *argv[]) -{ - int c; - int option_index = 0; - context *ctx = context_new(); - - ctx->program_name = strdup(argv[0]); - - static struct option long_options[4] = - { - {"help", no_argument, 0, 'h'}, - {"verbose", no_argument, 0, 'v'}, - {"debug", no_argument, 0, 'd'}, - {"fplm", required_argument, 0, 'f'}, - }; - optind = 0; - opterr = 0; - - while ((c = getopt_long (argc, argv, "hvdf:", long_options, &option_index)) != -1){ - switch (c) - { - case 'h': - ctx->help = 1; - break; - case 'v': - ctx->verbose = 1; - break; - case 'd': - ctx->debug_mode = 1; - break; - case 'f': - ctx->fplm_filename = strdup(optarg); - break; - } - } - return ctx; -} - - -int compute_classe(char16 *lemma_char16, char16 *form_char16) -{ - int i,j,k; - int lemma_suffix_length; - int form_suffix_length; - int lemma_length = char16_strlen(lemma_char16); - int form_length = char16_strlen(form_char16); - int *classe; - - for(i=0; (i < lemma_length) && (i < form_length); i++) - if(form_char16[i] != lemma_char16[i]) - break; - - lemma_suffix_length = lemma_length - i; - form_suffix_length = form_length - i; - - // printf("lemma suffix length = %d form suffix length = %d\n", lemma_suffix_length, form_suffix_length); - classe = (int *)memalloc((lemma_suffix_length + form_suffix_length + 2) * sizeof(int)); - - j = 0; - classe[j++] = form_suffix_length; - for(k=0; k < form_suffix_length; k++) - classe[j++] = 
form_char16[form_length - k - 1]; - classe[j++] = lemma_suffix_length; - for(k=0; k < lemma_suffix_length; k++) - classe[j++] = lemma_char16[i + k]; - - printf("%d ", classe[0]); - for(k=0; k < classe[0]; k++) - printf("%d ", classe[k+1]); - printf("%d ", classe[classe[0] + 1]); - for(k=0; k < classe[classe[0] + 1]; k++) - printf("%d ", classe[classe[0] + 1 + k+1]); - printf("\n"); - - return 0; -} - - - -int main(int argc, char *argv[]) -{ - context *ctx = context_read_options(argc, argv); - - /*if(ctx->help){ - context_general_help_message(ctx); - context_language_help_message(ctx); - context_fplm_help_message(ctx); - context_maca_data_path_help_message(ctx); - context_features_filename_help_message(ctx); - context_features_model_help_message(ctx); - exit(1); - }*/ - - char form_utf8[100]; - char *form_utf8_2; - char16 *form_char16; - char pos[100]; - char lemma_utf8[100]; - char *lemma_utf8_2; - char16 *lemma_char16; - char morpho[100]; - FILE *F_fplm = stdin; - - char buffer[1000]; - - if(ctx->fplm_filename) - F_fplm = myfopen(ctx->fplm_filename, "r"); - - while(fgets(buffer, 1000, F_fplm)){ - if(feof(F_fplm)) - break; - // printf("%s", buffer); - buffer[strlen(buffer) - 1] = '\0'; - sscanf(buffer, "%[^\t]\t%[^\t]\t%[^\t]\t%[^\n]\n", form_utf8, pos, lemma_utf8, morpho); - // printf("form = %s pos = %s lemma = %s morpho = %s\n", form_utf8, pos, lemma_utf8, morpho); - // printf("%s -> %s ", form_utf8, lemma_utf8); - lemma_char16 = utf8tochar16(lemma_utf8); - form_char16 = utf8tochar16(form_utf8); - - form_utf8_2 = char16toutf8(form_char16); - lemma_utf8_2 = char16toutf8(lemma_char16); - - printf("lemma avant = %s lemme après = %s\n", lemma_utf8, lemma_utf8_2); - - compute_classe(lemma_char16, form_char16); - - } - if(ctx->fplm_filename) - fclose(F_fplm); -} diff --git a/maca_tools/src/maca_compute_l_rules.c b/maca_tools/src/maca_compute_l_rules.c new file mode 100644 index 0000000000000000000000000000000000000000..3d3248cfa04d4e3a76d8d35aaf6ab9074889955e --- 
/dev/null +++ b/maca_tools/src/maca_compute_l_rules.c @@ -0,0 +1,197 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<getopt.h> + +#include"l_rule.h" +#include"util.h" +#include"dico.h" + +typedef struct { + int help; + int verbose; + int debug_mode; + int strict_mode; + int threshold; + char *program_name; + char *fplm_filename; + char *l_rules_filename; + char *exceptions_filename; +} context; + +void context_free(context *ctx) +{ + if(ctx){ + if(ctx->program_name) + free(ctx->program_name); + if(ctx->fplm_filename) + free(ctx->fplm_filename); + free(ctx); + } +} + +context *context_new(void) +{ + context *ctx = (context *)memalloc(sizeof(context)); + + ctx->help = 0; + ctx->verbose = 0; + ctx->debug_mode = 0; + ctx->threshold = 100; + ctx->strict_mode = 0; + ctx->program_name = NULL; + ctx->fplm_filename = NULL; + ctx->l_rules_filename = NULL; + ctx->exceptions_filename = NULL; + return ctx; +} + +void help_message(context *ctx) +{ + fprintf(stderr, "usage: %s [options]\n", ctx->program_name); + fprintf(stderr, "Options:\n"); + fprintf(stderr, "\t-h --help : print this message\n"); + fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); + fprintf(stderr, "\t-f --fplm <filename> : fplm filename\n"); + fprintf(stderr, "\t-s --strict : generate strict l_rules\n"); + fprintf(stderr, "\t-t --threshold <int> : threshold\n"); + fprintf(stderr, "\t-r --l_rules <filename> : file to stock l_rules\n"); + fprintf(stderr, "\t-e --exceptions <filename> : exceptions filename (fplm format)\n"); +} + +void check_options(context *ctx){ + if((ctx->help) + || !ctx->fplm_filename + || !ctx->l_rules_filename + || !ctx->exceptions_filename + ) + { + help_message(ctx); + exit(1); + } +} + +context *context_read_options(int argc, char *argv[]) +{ + int c; + int option_index = 0; + context *ctx = context_new(); + + ctx->program_name = strdup(argv[0]); + + static struct option long_options[8] = + { + {"help", no_argument, 0, 'h'}, + {"verbose", no_argument, 
0, 'v'}, + {"debug", no_argument, 0, 'd'}, + {"strict", no_argument, 0, 's'}, + {"fplm", required_argument, 0, 'f'}, + {"threshold", required_argument, 0, 't'}, + {"l_rules", required_argument, 0, 'r'}, + {"exceptions", required_argument, 0, 'e'}, + }; + optind = 0; + opterr = 0; + + while ((c = getopt_long (argc, argv, "hvdsf:t:r:e:", long_options, &option_index)) != -1){ + switch (c) + { + case 'h': + ctx->help = 1; + break; + case 'v': + ctx->verbose = 1; + break; + case 'd': + ctx->debug_mode = 1; + break; + case 's': + ctx->strict_mode = 1; + break; + case 'f': + ctx->fplm_filename = strdup(optarg); + break; + case 'r': + ctx->l_rules_filename = strdup(optarg); + break; + case 'e': + ctx->exceptions_filename = strdup(optarg); + break; + case 't': + ctx->threshold = atoi(optarg); + break; + } + } + return ctx; +} + + + +int main(int argc, char *argv[]) +{ + context *ctx = context_read_options(argc, argv); + int i; + cell *c; + char form[100]; + char pos[100]; + char lemma[100]; + char morpho[100]; + FILE *F_fplm = NULL; + FILE *F_exceptions = NULL; + char *l_rule; + char buffer[1000]; + dico *d_rules = dico_new((char *)"d_rules", 100); + hash *h_rules = hash_new(10000); + check_options(ctx); + + F_fplm = myfopen(ctx->fplm_filename, "r"); + + while(fgets(buffer, 1000, F_fplm)){ + if(feof(F_fplm)) + break; + /* printf("%s", buffer); */ + buffer[strlen(buffer) - 1] = '\0'; + sscanf(buffer, "%[^\t]\t%[^\t]\t%[^\t]\t%[^\n]\n", form, pos, lemma, morpho); + + l_rule = compute_l_rule(lemma, form, ctx->strict_mode); + // char *new_lemma = apply_l_rule(form, l_rule); + + // printf("%s\t%s\t%s=%s\t%s\t%s\n", form, pos, lemma, new_lemma, morpho, l_rule); + // printf("%s\t%s\t%s\t%s\t%s\n", form, pos, lemma, morpho, l_rule); + hash_inc_val(h_rules, l_rule, 1); + + //free(new_lemma); + free(l_rule); + } + fclose(F_fplm); + + for(i=0; i < h_rules->size; i++){ + for(c = h_rules->array[i]; c; c = c->next) + if(c->val >= ctx->threshold){ + dico_add(d_rules, c->key); + } + + } + 
+ F_fplm = myfopen(ctx->fplm_filename, "r"); + F_exceptions = myfopen(ctx->exceptions_filename, "w"); + while(fgets(buffer, 1000, F_fplm)){ + if(feof(F_fplm)) + break; + /* printf("%s", buffer); */ + buffer[strlen(buffer) - 1] = '\0'; + sscanf(buffer, "%[^\t]\t%[^\t]\t%[^\t]\t%[^\n]\n", form, pos, lemma, morpho); + + l_rule = compute_l_rule(lemma, form, ctx->strict_mode); + // if((dico_string2int(d_rules, l_rule) == -1) && (strcmp(form, lemma))) + if((dico_string2int(d_rules, l_rule) == -1)) + fprintf(F_exceptions, "%s\t%s\t%s\t%s\t%s\n", form, pos, lemma, morpho, l_rule); + // fprintf(F_exceptions, "%s\t%s\t%s\t%s\n", form, pos, lemma, morpho); + free(l_rule); + } + fclose(F_fplm); + fclose(F_exceptions); + + dico_print(ctx->l_rules_filename, d_rules); + +} diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c index f0c86e5e8eedc7fb7f208e497c7fe199cb8cd800..a6c732d37fa0d975dbb1a113f1c70873a2534fd5 100644 --- a/maca_trans_parser/src/maca_trans_lemmatizer.c +++ b/maca_trans_parser/src/maca_trans_lemmatizer.c @@ -1,4 +1,3 @@ - #include<stdio.h> #include<stdlib.h> #include<string.h> @@ -9,6 +8,7 @@ #include"context.h" #include"dico.h" #include"config.h" +#include"fplm.h" void maca_lemmatizer_help_message(context *ctx) { @@ -43,89 +43,6 @@ void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx) } } -char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, int *lemma_array_size) -{ - char form[1000]; - char pos[1000]; - - char lemma[1000]; - char morpho[1000]; - int num = 0; - char **lemma_array; - //int lemma_array_size = 10000; - *lemma_array_size = 10000; - char buffer[10000]; - int fields_nb; - FILE *f= myfopen(fplm_filename, "r"); - - lemma_array = (char **)memalloc((*lemma_array_size) * sizeof(char *)); - - while(fgets(buffer, 10000, f)){ - fields_nb = sscanf(buffer, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho); - /* if(!strcmp(form, "d")) */ - /* fprintf(stderr, "form 
= %s pos = %s lemma = %s\n", form, pos, lemma); */ - if(fields_nb != 4){ - if(debug_mode){ - fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); - fprintf(stderr, "incorrect fplm entry, skipping it\n"); - } - continue; - } - strcat(form, "/"); - strcat(form, pos); - // TODO: memory leak: if form is already in the hash, it is not added and the memory - // allocated by strdup() is leaked - // solutions: hash_add does the strdup() if necessary (check else where !) - // or return code to indicate whether form has been added or not - hash_add(form_pos_ht, strdup(form), num); - - if(num >= *lemma_array_size){ - *lemma_array_size = 2 * (*lemma_array_size) + 1; - lemma_array = (char **)realloc(lemma_array, (*lemma_array_size) * sizeof(char *)); - // initialize in order to be able to free correctly and the end - for(int i=num; i<*lemma_array_size; ++i) { - lemma_array[i] = NULL; - } - } - - /* if(lemma_array[num] == NULL) */ - lemma_array[num] = strdup(lemma); - num++; - } - /* fprintf(stderr, "%d entries loaded\n", num); */ - fclose(f); - return lemma_array; -} - - -char *lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose) -{ - char form_pos[1000]; - int index_form_pos; - - strcpy(form_pos, form); - strcat(form_pos, "/"); - strcat(form_pos, pos); - index_form_pos = hash_get_val(form_pos_ht, form_pos); - - - if(index_form_pos != HASH_INVALID_VAL) /* couple form/pos found in the hash table */ - return lemma_array[index_form_pos]; - - strcpy(form_pos, form); - to_lower_string(form_pos); /* change form to lower case and look it up again */ - strcat(form_pos, "/"); - strcat(form_pos, pos); - index_form_pos = hash_get_val(form_pos_ht, form_pos); - if(index_form_pos != HASH_INVALID_VAL) - return lemma_array[index_form_pos]; - - /* even in lower case couple form/pos is not found, return the form as lemma */ - if(verbose) - fprintf(stderr, "cannot find an entry for %s %s\n", form, pos); - - return form; -} /* a bit messy */ void 
print_word(word *w, mcd *mcd_struct, char *lemma) @@ -161,22 +78,18 @@ void print_word(word *w, mcd *mcd_struct, char *lemma) int main(int argc, char *argv[]) { context *ctx = context_read_options(argc, argv); - hash *form_pos_ht = hash_new(1000000); - char **lemma_array = NULL; word *b0; char lemma[200]; char form[200]; char pos[200]; config *c; + fplm_struct *fplm; + FILE *f; maca_lemmatizer_check_options(ctx); maca_lemmatizer_set_linguistic_resources_filenames(ctx); - - int lemma_array_size; - lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode, &lemma_array_size); - - FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; - + fplm = fplm_load_file(ctx->fplm_filename, ctx->debug_mode); + f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; c = config_new(f, ctx->mcd_struct, 5); while(!config_is_terminal(c)){ @@ -189,23 +102,13 @@ int main(int argc, char *argv[]) if(strlen(lemma) && strcmp(lemma, "_")) print_word(b0, ctx->mcd_struct, lemma); else - print_word(b0, ctx->mcd_struct, lookup_lemma(form, pos, form_pos_ht, lemma_array, ctx->verbose)); - + print_word(b0, ctx->mcd_struct, fplm_lookup_lemma(fplm, form, pos, ctx->verbose)); word_buffer_move_right(c->bf); } - - hash_free(form_pos_ht); - - for(int i=0; i<lemma_array_size; ++i) { - if (lemma_array[i]) free(lemma_array[i]); - } - free(lemma_array); - - config_free(c); if (ctx->input_filename) fclose(f); context_free(ctx); - + fplm_free(fplm); return 0; } diff --git a/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c b/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c new file mode 100644 index 0000000000000000000000000000000000000000..91aa5a57efd02bbae6c45673e3a61feaa3c4b7ee --- /dev/null +++ b/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c @@ -0,0 +1,144 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"movement_tagger.h" +#include"oracle_tagger.h" +#include"feat_fct.h" 
+#include"context.h"
+#include"feat_vec.h"
+#include"dico_vec.h"
+#include"word_emb.h"
+#include"config2feat_vec.h"
+#include"l_rule.h"
+
+/* oracle: computes, with compute_l_rule() in strict mode, the l_rule of the */
+/* (lemma, form) pair of word b0 and returns the code of this rule in d_l_rules */
+int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules)
+{
+  char lemma[200];
+  char form[200];
+  char *l_rule;
+  int l_rule_code;
+
+  word_sprint_col_n(lemma, word_buffer_b0(c->bf), mcd_get_lemma_col(ctx->mcd_struct));
+  word_sprint_col_n(form, word_buffer_b0(c->bf), mcd_get_form_col(ctx->mcd_struct));
+
+  l_rule = compute_l_rule(lemma, form, 1);
+  l_rule_code = dico_string2int(d_l_rules, l_rule);
+  free(l_rule);
+  return l_rule_code;
+}
+
+/* movement: moves the word buffer one step to the right, feats is not used */
+int movement_lemmatizer(config *c, int feats)
+{
+  word_buffer_move_right(c->bf);
+  return 1;
+}
+
+/* prints the help message of the program, section titles are written on stderr */
+void maca_trans_lemmatizer_mcf2cff_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_mode_help_message(ctx);
+  context_sent_nb_help_message(ctx);
+  context_mcd_help_message(ctx);
+
+  fprintf(stderr, "INPUT\n");
+  context_conll_help_message(ctx);
+  fprintf(stderr, "IN TEST MODE\n");
+  context_vocabs_help_message(ctx);
+
+  fprintf(stderr, "OUTPUT\n");
+  context_cff_help_message(ctx);
+  fprintf(stderr, "IN TRAIN MODE\n");
+  context_vocabs_help_message(ctx);
+}
+
+/* prints the help message and exits with status 1 when help is requested, */
+/* when there is no input file or when there is neither a cff nor a fann file */
+void maca_trans_lemmatizer_mcf2cff_check_options(context *ctx)
+{
+  if(!ctx->input_filename
+     || ctx->help
+     || !(ctx->cff_filename || ctx->fann_filename)
+     ){
+    maca_trans_lemmatizer_mcf2cff_help_message(ctx);
+    exit(1);
+  }
+}
+
+/* until the configuration is terminal, writes in output_file the code of the */
+/* l_rule given by the oracle followed by the feature vector of the configuration */
+void generate_training_file(FILE *output_file, context *ctx)
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *conll_file = myfopen(ctx->input_filename, "r");
+  /* second argument of dico_read() is a float ratio, not an fopen() mode */
+  dico *d_l_rules = dico_read((char *)"l_rules", ctx->hash_ratio);
+  int l_rule;
+
+  c = config_new(conll_file, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(c)){
+    l_rule = oracle_lemmatizer(c, ctx, d_l_rules);
+    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
+
+    fprintf(output_file, "%d", l_rule);
+    feat_vec_print(output_file, fv);
+    movement_lemmatizer(c, l_rule);
+  }
+
+  /* release what was acquired here (TODO: fv is still not freed) */
+  config_free(c);
+  fclose(conll_file);
+  dico_free(d_l_rules);
+}
+
+/* reads the options, builds vocabularies and feature model, generates the cff file */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  FILE *output_file;
+
+  maca_trans_lemmatizer_mcf2cff_check_options(ctx);
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+
+  if(ctx->mode == TRAIN_MODE){
+    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
+    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
+  }
+  else if(ctx->mode == TEST_MODE){
+    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+  }
+
+  /* in train mode create feature dictionary for perceptron */
+  if(ctx->mode == TRAIN_MODE)
+    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
+
+  /* in test mode read feature dictionary for perceptron */
+  if(ctx->mode == TEST_MODE)
+    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+
+  /* add the feature dictionary to the dico vector */
+  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);
+
+  /* open output file */
+  output_file = (ctx->cff_filename)? myfopen(ctx->cff_filename, "w") : stdout;
+
+  generate_training_file(output_file, ctx);
+
+  /* in train mode save the vocabularies, feature dictionary included */
+  if(ctx->mode == TRAIN_MODE)
+    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
+
+  if(ctx->cff_filename)
+    fclose(output_file);
+  context_free(ctx);
+  return 0;
+}
+