From 1c19c11d9f61beb73fbff12aadde05b1b85d552a Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Thu, 12 Apr 2018 12:22:59 +0200 Subject: [PATCH] added several tools to manipulate corpora in maca_corpora as well as conll2mcf --- CMakeLists.txt | 2 + maca_common/CMakeLists.txt | 5 +- maca_common/include/conll_lib.h | 102 ++++ maca_common/include/hash_str.h | 32 + maca_common/src/conll_lib.c | 561 ++++++++++++++++++ maca_common/src/hash_str.c | 118 ++++ maca_corpora/CMakeLists.txt | 2 + maca_corpora/exec/CMakeLists.txt | 12 + maca_corpora/exec/ftb2fr.c | 237 ++++++++ maca_corpora/exec/ptb2en.c | 275 +++++++++ maca_corpora/lib/CMakeLists.txt | 11 + maca_corpora/lib/include/ftb_lib.h | 38 ++ maca_corpora/lib/include/orfeo_lib.h | 17 + maca_corpora/lib/src/ftb_lib.c | 170 ++++++ maca_corpora/lib/src/orfeo_lib.c | 428 +++++++++++++ maca_tools/CMakeLists.txt | 12 +- maca_tools/src/conll2mcf.c | 158 +++++ maca_tools/src/conllu2mcf.c | 159 +++++ .../src/simple_decoder_parser_arc_eager.c | 3 + 19 files changed, 2337 insertions(+), 5 deletions(-) create mode 100644 maca_common/include/conll_lib.h create mode 100644 maca_common/include/hash_str.h create mode 100644 maca_common/src/conll_lib.c create mode 100644 maca_common/src/hash_str.c create mode 100644 maca_corpora/CMakeLists.txt create mode 100644 maca_corpora/exec/CMakeLists.txt create mode 100644 maca_corpora/exec/ftb2fr.c create mode 100644 maca_corpora/exec/ptb2en.c create mode 100644 maca_corpora/lib/CMakeLists.txt create mode 100644 maca_corpora/lib/include/ftb_lib.h create mode 100644 maca_corpora/lib/include/orfeo_lib.h create mode 100644 maca_corpora/lib/src/ftb_lib.c create mode 100644 maca_corpora/lib/src/orfeo_lib.c create mode 100644 maca_tools/src/conll2mcf.c create mode 100644 maca_tools/src/conllu2mcf.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 80f3d0f..bbac66d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ endif() 
include_directories(maca_common/include) include_directories(perceptron/lib/include) +include_directories(maca_corpora/lib/include) add_subdirectory(maca_common) add_subdirectory(maca_tools) @@ -42,6 +43,7 @@ add_subdirectory(maca_tokenizer) add_subdirectory(maca_lexer) add_subdirectory(maca_trans_parser) add_subdirectory(maca_crf_tagger) +add_subdirectory(maca_corpora) #add_subdirectory(maca_graph_parser) if(MACA_EXPORT) diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt index e389629..de1eb4c 100644 --- a/maca_common/CMakeLists.txt +++ b/maca_common/CMakeLists.txt @@ -1,5 +1,7 @@ -set(SOURCES src/util.c +set(SOURCES + src/util.c src/hash.c + src/hash_str.c src/dico.c src/word_emb.c src/mcd.c @@ -17,6 +19,7 @@ set(SOURCES src/util.c src/fplm.c src/json_parser.c src/json_tree.c + src/conll_lib.c ) #compiling library diff --git a/maca_common/include/conll_lib.h b/maca_common/include/conll_lib.h new file mode 100644 index 0000000..077ca95 --- /dev/null +++ b/maca_common/include/conll_lib.h @@ -0,0 +1,102 @@ +/******************************************************************************* + Copyright (C) 2010 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + and Joseph Le Roux <joseph.le.roux@gmail.com> + conll_lib is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + conll_lib is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with conll_lib. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#ifndef __CONLL_LIB__ +#define __CONLL_LIB__ + +#include <stdio.h> +#include "hash_str.h" + +#define MAX_WORDS_IN_SENTENCE 1000 +#define MAX_STR 10000 +#define MAX_LINE_LENGTH 50000 + +#define INCORRECT_SENTENCE_NUM_VALUE -1 +#define INCORRECT_PARSE_NUM_VALUE -1 +#define INCORRECT_LOGPROB_VALUE 10 +#define INCORRECT_ORACLE_VALUE -1 +#define INCORRECT_CONF_MEAS -1 +#define INCORRECT_LEX_AFF -1 + +typedef struct w +{ + unsigned id; /* Token counter, starting at 1 for each new sentence.*/ + char form[MAX_STR]; /* Word form or punctuation symbol.*/ + char lemma[MAX_STR]; /* Lemma or stem (depending on particular data set) of word form,*/ + /* or an underscore if not available.*/ + char cpostag[MAX_STR];/* Coarse-grained part-of-speech tag, where tagset depends on the language.*/ + char postag[MAX_STR]; /* Fine-grained part-of-speech tag, where the tagset depends on the language,*/ + /* or identical to the coarse-grained part-of-speech tag if not available.*/ + char feats[MAX_STR]; /* Unordered set of syntactic and/or morphological features (depending on the particular language)*/ + /*, separated by a vertical bar (|), or an underscore if not available.*/ + int head; /* Head of the current token, which is either a value of ID or zero ('0').*/ + char deprel[MAX_STR]; /* Dependency relation to the HEAD. The set of dependency relations depends on the particular language.*/ + /* Note that depending on the original treebank annotation, the dependency relation may be meaningful or simply 'ROOT'.*/ + unsigned phead;/* Projective head of current token, which is either a value of ID or zero ('0'), or an underscore if not available. 
*/ + /* The dependency structure resulting from the PHEAD column is guaranteed to be projective */ + /* whereas the structures resulting from the HEAD column will be non-projective for some sentences */ + char pdeprel[MAX_STR]; /* Dependency relation to the PHEAD, or an underscore if not available. */ + /* Note that depending on the original treebank annotation, the dependency relation may be meaningful or simply 'ROOT'.*/ + char language[MAX_STR]; /* Language identifier */ + + double score; /* score of the dependency, not in the 2007 conll format */ + double lex_aff; /* lexical affinity of the dependent and the governor, not in the 2007 conll format */ + + struct w * mother; + struct w * daughters[MAX_WORDS_IN_SENTENCE]; + unsigned daughters_nb; + double conf_meas; +} conll_word; + + +typedef struct +{ + conll_word * root; + conll_word * words[MAX_WORDS_IN_SENTENCE]; + unsigned l; /* sentence length */ + unsigned num; /* sentence number */ +} conll_sentence; + + +conll_sentence *conll_allocate_sentence(void); +void conll_renumber_sentence(conll_sentence *s); +void conll_reset_sentence(conll_sentence *s); +void conll_free_sentence(conll_sentence *s); +int conll_load_sentence(FILE *f, conll_sentence *s); +void conll_print_sentence(conll_sentence *s); +void conll_print_sentence_mcf(conll_sentence *s, int coarse_pos); +void conll_print_sentence_mcf2(conll_sentence *s, int print_id, int print_form, int print_lemma, int print_cpostag, int print_postag, int print_feats, int print_head, int print_deprel); +void conll_print_sentence_mcf3(conll_sentence *s, char *columns, int nb_col); +void conll_compact_sentence(conll_sentence *s); +conll_word *conll_allocate_word(unsigned id, char *form, char *lemma, char *cpostag, char *postag, char *feats, unsigned head, char *deprel); +conll_word *conll_copy_word(conll_word *w); +void conll_add_daughter(conll_word *daughter, conll_word *mother); +void conll_remove_daughter(conll_sentence *s, int i); +void 
conll_remove_word_rec(conll_sentence *s, int i); +void conll_remove_subtree(conll_sentence *s, int root); +void conll_add_word(conll_sentence *s, conll_word *w, int pos, conll_word *gov); +void conll_split_node_in_two(conll_sentence *s, int pos, conll_word *gov, conll_word *dep, int pos_gov, int pos_dep); +void conll_change_pos(conll_sentence *s, hash_str *h_pos); +void conll_change_cpos(conll_sentence *s, hash_str *h_cpos); +void conll_change_fct(conll_sentence *s, hash_str *h_fct); +int conll_is_num(char *s); +void conll_renumber_sentence_offset(conll_sentence *s, int offset); +void conll_compute_relative_index_of_heads(conll_sentence *s); + + + +#endif diff --git a/maca_common/include/hash_str.h b/maca_common/include/hash_str.h new file mode 100644 index 0000000..1429a11 --- /dev/null +++ b/maca_common/include/hash_str.h @@ -0,0 +1,32 @@ +#ifndef __HASH_STR__ +#define __HASH_STR__ + +#define HASH_STR_INVALID_VAL NULL + +typedef struct _hash_str_cell +{ + char *key; + char *val; + struct _hash_str_cell *next; +} hash_str_cell; + +typedef struct +{ + int size; + int nbelem; + hash_str_cell **array; +} hash_str; + + +hash_str_cell *hash_str_cell_new(char *key, char *val, hash_str_cell *next); +void hash_str_cell_free(hash_str_cell *c); + +hash_str *hash_str_new(int size); +void hash_str_free(hash_str *h); +hash_str_cell *hash_str_lookup(hash_str *h, char *key); +char *hash_str_get_val(hash_str *h, char *key); +void hash_str_add(hash_str *h, char *key, char *val); +void hash_str_stats(hash_str *h); + + +#endif diff --git a/maca_common/src/conll_lib.c b/maca_common/src/conll_lib.c new file mode 100644 index 0000000..f5e2722 --- /dev/null +++ b/maca_common/src/conll_lib.c @@ -0,0 +1,561 @@ +/******************************************************************************* + Copyright (C) 2010 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + and Joseph Le Roux <joseph.le.roux@gmail.com> + conll_lib is free software: you can redistribute it and/or modify + it under the 
terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    conll_lib is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with conll_lib. If not, see <http://www.gnu.org/licenses/>.
+*******************************************************************************/
+
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"conll_lib.h"
+
+int conll_parse_line(FILE *f, conll_sentence *s);
+
+/* Replace the absolute head index of every word (positions 1 and up) by an
+   offset relative to the position of the word itself. Words whose head is 0
+   are left untouched, so roots keep 0. Empty slots are skipped. */
+void conll_compute_relative_index_of_heads(conll_sentence *s)
+{
+  int i;
+  conll_word *w;
+
+  for(i=1; i < (int)s->l; i++){
+    w = s->words[i];
+    if(w == NULL) continue;
+    /* roots keep 0 as index of head */
+    if(w->head != 0)
+      w->head = w->head - i;
+  }
+}
+
+
+/* Give every word an id equal to its position in s->words, then recompute
+   each head field from the mother pointer (0 when the word has no mother).
+   Empty slots are skipped. */
+void conll_renumber_sentence(conll_sentence *s)
+{
+  unsigned i;
+  conll_word *w;
+
+  for(i=0; i < s->l; i++){
+    if(s->words[i])
+      s->words[i]->id = i;
+  }
+  for(i=0; i < s->l; i++){
+    w = s->words[i];
+    if(w == NULL) continue;
+    w->head = w->mother ? (int)w->mother->id : 0;
+  }
+}
+
+
+/* Free all the words of the sentence and restart it with a single artificial
+   ROOT word stored at position 0 (length becomes 1). */
+void conll_reset_sentence(conll_sentence *s)
+{
+  unsigned i;
+
+  for(i=0; i < s->l; i++){
+    free(s->words[i]); /* free(NULL) is a no-op */
+    s->words[i] = NULL;
+  }
+  /* the previous root, if any, was one of the words freed above */
+  s->root = NULL;
+  s->words[0] = conll_allocate_word(0, "ROOT", "ROOT", "ROOT", "ROOT", "ROOT", -1, "ROOT");
+  s->l = 1;
+}
+
+/* Release a sentence together with the words it still holds in
+   positions 0 .. l-1. Accepts NULL. */
+void conll_free_sentence(conll_sentence *s)
+{
+  unsigned i;
+
+  if(s == NULL) return;
+  for(i=0; i < s->l; i++)
+    free(s->words[i]);
+  free(s);
+}
+
+/* Allocate a new word carrying the id, form, lemma, cpostag, postag, feats,
+   head and deprel of w. The remaining fields of w are not copied. */
+conll_word *conll_copy_word(conll_word *w){
+  return conll_allocate_word(w->id, w->form, w->lemma, w->cpostag, w->postag, w->feats, w->head, w->deprel);
+}
+
+conll_word *conll_allocate_word(unsigned id, char *form, char *lemma, char *cpostag, char *postag, char *feats, unsigned
head, char *deprel)
+
+{
+  /* Allocates and fills a word. Exits when memory is exhausted.
+     calloc gives a defined value to the fields that are not set below
+     (language, pdeprel, phead, scores, daughters). */
+  conll_word *w = (conll_word *)calloc(1, sizeof(conll_word));
+
+  if(w == NULL){
+    fprintf(stderr, "cannot allocate word\n");
+    exit(1);
+  }
+
+  w->id = id;
+  /* bounded copies: an over-long string is truncated instead of
+     overflowing the MAX_STR bytes of the field */
+  snprintf(w->form, sizeof(w->form), "%s", form);
+  snprintf(w->lemma, sizeof(w->lemma), "%s", lemma);
+  snprintf(w->cpostag, sizeof(w->cpostag), "%s", cpostag);
+  snprintf(w->postag, sizeof(w->postag), "%s", postag);
+  snprintf(w->feats, sizeof(w->feats), "%s", feats);
+  w->head = head;
+  snprintf(w->deprel, sizeof(w->deprel), "%s", deprel);
+  w->mother = NULL;
+  w->daughters_nb = 0;
+  return w;
+}
+
+/* Allocate an empty sentence: length 0, no root, every word slot NULL.
+   Exits when memory is exhausted. */
+conll_sentence *conll_allocate_sentence(void)
+{
+  conll_sentence *s;
+  int i;
+
+  s = (conll_sentence *)malloc(sizeof(conll_sentence));
+  if(s == NULL){
+    fprintf(stderr, "cannot allocate sentence\n");
+    exit(1);
+  }
+
+  s->num = INCORRECT_SENTENCE_NUM_VALUE;
+  s->l = 0;
+  s->root = NULL;
+  for(i=0; i < MAX_WORDS_IN_SENTENCE; i++){
+    s->words[i] = NULL;
+  }
+  return s;
+}
+
+/* Read the next sentence of f into s (s is reset first) and link every word
+   to its governor.
+   Returns 0 when f is already at end of file, 1 otherwise. */
+int conll_load_sentence(FILE *f, conll_sentence *s)
+{
+  int res;
+  unsigned i;
+  conll_word *w;
+
+  if(feof(f)) return 0;
+
+  conll_reset_sentence(s);
+
+  /* conll_parse_line returns 0 on the line that ends the sentence */
+  for(res = conll_parse_line(f, s); res; res = conll_parse_line(f, s));
+
+  /* read an 'empty' parse (two succeeding cr) */
+  if(s->l == 0) return 0;
+
+  /* build the tree structure */
+  s->words[0]->mother = NULL;
+  for(i=1; i < s->l; ++i){
+    w = s->words[i];
+    if(w == NULL) continue;
+    /* valid governors are positions 0 .. l-1; position l is outside the
+       sentence and must not be dereferenced */
+    if((w->head >= 0) && ((unsigned)w->head < s->l)){
+      conll_add_daughter(w, s->words[w->head]);
+    }
+    else{
+      /* head out of range: leave the word unattached, with a defined mother */
+      w->mother = NULL;
+    }
+  }
+
+  return 1;
+}
+
+
+
+/*----------------------------------------------------------------------------*/
+/* Read one line of f and, when it describes a word, append that word to s.
+   Returns 0 at end of file, on a read failure, on an empty line and on a
+   comment line (first character '#'); returns 1 otherwise, including for
+   the lines that are skipped. */
+int conll_parse_line(FILE *f, conll_sentence *s)
+{
+  char buff[MAX_LINE_LENGTH];
+  conll_word *w;
+  char head_str[100];
+  char C9[100];
+  char C10[100];
+
+  if(feof(f)) return 0;
+
+  if (fgets(buff, MAX_LINE_LENGTH, f) == NULL) {
+    return 0;
+  }
+
+  /* ignore empty line */
+  if(buff[0] == '\n'){
+    return 0;
+  }
+
+  /* specific to conll_u */
+
+  /* ignore comments */
+  if(buff[0] == '#'){
+    return 0;
+  }
+
+  {
+    /* ignore amalgams: lines whose first column contains a '-' */
+    int i;
+    /* fgets always terminates buff, so stop at the NUL as well as at the
+       first tab instead of scanning stale bytes of the buffer */
+    for(i=0; (buff[i] != '\0') && (buff[i] != '\t'); i++)
+      if(buff[i] == '-') return 1;
+
+  }
+
+  {
+
/* ignore ellipsis: lines whose first column contains a '.' */
+    int i;
+    /* fgets always terminates buff, so stop at the NUL as well as at the
+       first tab instead of scanning stale bytes of the buffer */
+    for(i=0; (buff[i] != '\0') && (buff[i] != '\t'); i++)
+      if(buff[i] == '.') return 1;
+
+  }
+
+  /* end of specific to conll_u */
+
+  /* check the capacity before touching s->words: storing at position
+     MAX_WORDS_IN_SENTENCE would write past the end of the array */
+  if(s->l >= MAX_WORDS_IN_SENTENCE){
+    fprintf(stderr, "sentence too long (more than %d words): word ignored\n", MAX_WORDS_IN_SENTENCE);
+    return 1;
+  }
+
+  /* calloc: every field that the line does not provide stays defined
+     (empty strings, NULL mother, no daughters) */
+  w = (conll_word *)calloc(1, sizeof(conll_word));
+  if(w == NULL){
+    fprintf(stderr, "cannot allocate word\n");
+    exit(1);
+  }
+  w->head = -1;           /* no head until the line provides one */
+  strcpy(head_str, "_");  /* value kept when the head column is missing */
+
+  /* read a dependency description */
+
+  /* 1 A a _ DT _ 3 det _ _ */
+  /* 2 severe severe _ JJ _ 3 amod _ _ */
+  /* 3 storm storm _ NN _ 4 nsubj _ _ */
+  /* 4 swept sweep _ VBD _ 26 ccomp _ _ */
+  /* 5 through through _ IN _ 4 prep _ _ */
+
+  /* field widths are the size of the destination minus one: 9999 for the
+     MAX_STR fields of the word, 99 for head_str, C9 and C10. An over-long
+     column is cut (and the line mis-parsed) instead of overflowing. */
+  sscanf(buff, "%u\t%9999[^\t]\t%9999[^\t]\t%9999[^\t]\t%9999[^\t]\t%9999[^\t]\t%99[^\t]\t%9999[^\t]\t%99[^\t]\t%99[^\t]\t%9999s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel, C9, C10, w->language);
+
+  if(strcmp(head_str, "_")){
+    w->head = atoi(head_str);
+    if(w->head == 0) s->root = w;
+  }
+
+  s->words[s->l] = w;
+  s->l++;
+
+  return 1;
+}
+
+/* Print the words of s (positions 1 and up), one per line, on stdout.
+   Each print_* flag selects one tab-terminated column. */
+void conll_print_sentence_mcf2(conll_sentence *s, int print_id, int print_form, int print_lemma, int print_cpostag, int print_postag, int print_feats, int print_head, int print_deprel)
+{
+  int i;
+  conll_word *w;
+
+  if((s->l == 1) || (s->l == 0)) return;
+  for(i=1; i<s->l; i++){
+    w = s->words[i];
+    if(print_id)
+      printf("%d\t", w->id);
+    if(print_form)
+      printf("%s\t", w->form);
+    if(print_lemma)
+      printf("%s\t", w->lemma);
+    if(print_cpostag)
+      printf("%s\t", w->cpostag);
+    if(print_postag)
+      printf("%s\t", w->postag);
+    if(print_feats)
+      printf("%s\t", w->feats);
+
if(print_head) + printf("%d\t", w->head); + if(print_deprel) + printf("%s\t", w->deprel); + if(i == s->l - 1) + fprintf(stdout, "1\n"); + else + fprintf(stdout, "0\n"); + } +} + +void conll_print_sentence_mcf3(conll_sentence *s, char *columns, int nb_col) +{ + int i,j; + conll_word *w; + + if((s->l == 1) || (s->l == 0)) return; + for(i=1; i<s->l; i++){ + w = s->words[i]; + for(j=0; j < nb_col; j++) + switch(columns[j]){ + case 'I': + printf("%d\t", w->id); + break; + case 'W': + printf("%s\t", w->form); + break; + case 'L': + printf("%s\t", w->lemma); + break; + case 'C': + printf("%s\t", w->cpostag); + break; + case 'P': + printf("%s\t", w->postag); + break; + case 'F': + printf("%s\t", w->feats); + break; + case 'H': + printf("%d\t", w->head); + break; + case 'D': + printf("%s\t", w->deprel); + break; + case 'G': + printf("%s\t", w->language); + break; + } + if(i == s->l - 1) + fprintf(stdout, "1\n"); + else + fprintf(stdout, "0\n"); + } +} + +void conll_print_sentence_mcf(conll_sentence *s, int coarse_pos) +{ + int i; + conll_word *w; + + if((s->l == 1) || (s->l == 0)) return; + + for(i=1; i<s->l; i++){ + w = s->words[i]; + /* fprintf(stdout, "%d", w->id); */ + fprintf(stdout, "%s", w->form); + if(coarse_pos) + fprintf(stdout, "\t%s", w->cpostag); + else + fprintf(stdout, "\t%s", w->postag); + fprintf(stdout, "\t%s", w->lemma); + if(w->mother == NULL) + fprintf(stdout, "\t0"); + else + fprintf(stdout, "\t%d", w->mother->id - w->id); + fprintf(stdout, "\t%s", w->deprel); + if(i == s->l - 1) + fprintf(stdout, "\t1"); + else + fprintf(stdout, "\t0"); + fprintf(stdout, "\n"); + } +} + + +void conll_print_sentence(conll_sentence *s) +{ + int i; + conll_word *w; + + if((s->l == 1) || (s->l == 0)) return; + + for(i=1; i<s->l; i++){ + w = s->words[i]; + fprintf(stdout, "%d", w->id); + fprintf(stdout, "\t%s", w->form); + fprintf(stdout, "\t%s", w->lemma); + fprintf(stdout, "\t%s", w->cpostag); + fprintf(stdout, "\t%s", w->postag); + fprintf(stdout, "\t%s", w->feats); + 
if(w->mother == NULL)
+      fprintf(stdout, "\t0");
+    else
+      fprintf(stdout, "\t%d", w->mother->id);
+    fprintf(stdout, "\t%s", w->deprel);
+    fprintf(stdout, "\t_\t_\n");
+
+  }
+  printf("\n");
+
+}
+
+/* Remove the empty (NULL) slots of the sentence: the remaining words keep
+   their order and the length is reduced accordingly. Single pass. */
+void conll_compact_sentence(conll_sentence *s)
+{
+  unsigned src, dst = 0;
+
+  for(src=0; src < s->l; src++){
+    if(s->words[src] != NULL)
+      s->words[dst++] = s->words[src];
+  }
+  /* clear the vacated tail: a duplicate pointer left beyond the new length
+     would be mistaken for a live word by conll_add_word */
+  for(src=dst; src < s->l; src++)
+    s->words[src] = NULL;
+  s->l = dst;
+}
+
+/* Make mother the governor of daughter and record daughter in the daughters
+   of mother. A NULL mother only clears the mother link of daughter.
+   Does nothing when daughter is NULL. */
+void conll_add_daughter(conll_word *daughter, conll_word *mother)
+{
+  if(daughter == NULL) return;
+
+  daughter->mother = mother;
+  if(mother == NULL) return;
+
+  if(mother->daughters_nb >= MAX_WORDS_IN_SENTENCE){
+    /* the daughters array is full: do not write past its end */
+    fprintf(stderr, "conll_add_daughter: too many daughters, daughter not recorded\n");
+    return;
+  }
+  mother->daughters[mother->daughters_nb] = daughter;
+  mother->daughters_nb++;
+}
+
+/* Remove the word at position i from the daughters of its mother.
+   The mother link of the word itself is left unchanged. */
+void conll_remove_daughter(conll_sentence *s, int i)
+{
+  unsigned j,k;
+  conll_word *dep = s->words[i];
+  conll_word *gov;
+
+  if(dep == NULL) return;
+  gov = dep->mother;
+  if(gov == NULL) return;
+
+  j = 0;
+  while(j < gov->daughters_nb){
+    if(gov->daughters[j] != dep){
+      j++;
+      continue;
+    }
+    for(k=j; k + 1 < gov->daughters_nb; k++){
+      gov->daughters[k] = gov->daughters[k+1];
+    }
+    gov->daughters_nb--;
+    /* j is not advanced: the slot now holds the next daughter, which must
+       be examined too */
+  }
+}
+
+/* Free the word at position i and, recursively, every word that has it as
+   mother. The freed slots are set to NULL; the sentence is not compacted. */
+void conll_remove_word_rec(conll_sentence *s, int i)
+{
+  int j;
+  conll_word *w = s->words[i];
+
+  /* an empty slot would match every word that has no mother */
+  if(w == NULL) return;
+
+  for(j=1; j < (int)s->l; j++){
+    /* j != i: a word that is its own mother must not recurse for ever */
+    if((j != i) && (s->words[j]) && (s->words[j]->mother == w))
+      conll_remove_word_rec(s, j);
+  }
+  conll_remove_daughter(s, i);
+  free(w);
+  s->words[i] = NULL;
+}
+
+/* Delete the word at position root with all its descendants, then close the
+   gaps left in the sentence. */
+void conll_remove_subtree(conll_sentence *s, int root)
+{
+  conll_remove_word_rec(s, root);
+  conll_compact_sentence(s);
+}
+
+/* Store w at position index. When the slot is occupied, the words from
+   index on are first shifted one position to the right. The length is
+   extended when index lies beyond it. When gov is not NULL, w becomes one
+   of its daughters. Exits when the insertion does not fit in the sentence. */
+void conll_add_word(conll_sentence *s, conll_word *w, int index, conll_word *gov)
+{
+  int i;
+
+  if((index < 0) || (index >= MAX_WORDS_IN_SENTENCE)){
+    fprintf(stderr, "conll_add_word: position %d out of range\n", index);
+    exit(1);
+  }
+  if(s->words[index] != NULL){
+    if(s->l >= MAX_WORDS_IN_SENTENCE){
+      /* shifting would write past the end of s->words */
+      fprintf(stderr, "conll_add_word: sentence is full\n");
+      exit(1);
+    }
+    for(i=(int)s->l; i>index; i--){
+      s->words[i] = s->words[i-1];
+    }
+    s->l++;
+  }
+  s->words[index] = w;
+  if((unsigned)index >= s->l) s->l = (unsigned)index + 1;
+  if(gov != NULL)
+    conll_add_daughter(w, gov);
+}
+
+void conll_split_node_in_two(conll_sentence *s, int index, conll_word *gov, conll_word *dep, int index_gov, int index_dep)
+{
+  int i;
+  conll_word *w = s->words[index];
+  conll_word *mother = w->mother;
+
+  strcpy(gov->deprel,
w->deprel);
+  /* the dependents of the word being replaced are handed over to gov */
+  for(i=1; i < (int)s->l; i++){
+    if((s->words[i] != NULL) && (s->words[i]->mother == w))
+      conll_add_daughter(s->words[i], gov);
+  }
+  /* unlink w from its mother before freeing it, otherwise the mother keeps
+     a pointer to freed memory among its daughters */
+  conll_remove_daughter(s, index);
+  free(w);
+  s->words[index] = NULL;
+  conll_add_word(s, gov, index_gov, mother);
+  conll_add_word(s, dep, index_dep, gov);
+}
+
+
+/*---------------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------------*/
+
+/* Replace the cpostag of every word (positions 1 and up) by the value that
+   h_cpos associates with it. A tag missing from the table is reported on
+   stderr and left unchanged. */
+void conll_change_cpos(conll_sentence *s, hash_str *h_cpos)
+{
+  int i;
+  conll_word *w;
+  char *val;
+
+  for(i=1; i < (int)s->l; i++){
+    w = s->words[i];
+
+    val = hash_str_get_val (h_cpos, w->cpostag);
+    if(val){
+      snprintf(w->cpostag, sizeof(w->cpostag), "%s", val);
+    }
+    else{
+      fprintf(stderr, "ATTENTION: cpos %s inconnue\n", w->cpostag);
+    }
+  }
+
+}
+
+/*---------------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------------*/
+/* Replace the postag of every word (positions 1 and up) by the value that
+   h_pos associates with it. A tag missing from the table is reported on
+   stderr and left unchanged. */
+void conll_change_pos(conll_sentence *s, hash_str *h_pos)
+{
+  int i;
+  conll_word *w;
+  char *val;
+
+  for(i=1; i < (int)s->l; i++){
+    w = s->words[i];
+
+    val = hash_str_get_val (h_pos, w->postag);
+    if(val){
+      snprintf(w->postag, sizeof(w->postag), "%s", val);
+    }
+    else{
+      /* report the tag that was looked up: postag, not cpostag */
+      fprintf(stderr, "ATTENTION: pos %s inconnue\n", w->postag);
+    }
+  }
+
+}
+
+/*---------------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------------*/
+
+
+
+/* Replace the deprel of every word (positions 1 and up) by the value that
+   h_fct associates with it. A label missing from the table is reported on
+   stderr and left unchanged. */
+void conll_change_fct(conll_sentence *s, hash_str *h_fct)
+{
+  int i;
+  conll_word *w;
+  char *val;
+
+  for(i=1; i < (int)s->l; i++){
+    w = s->words[i];
+
+    val = hash_str_get_val (h_fct, w->deprel);
+    if(val){
+      snprintf(w->deprel, sizeof(w->deprel), "%s", val);
+    }
+    else{
+      fprintf(stderr, "ATTENTION: fct %s inconnue\n", w->deprel);
+    }
+  }
+
+}
+
+/* Return 1 when s is made of decimal digits and commas only and contains at
+   least one digit, 0 otherwise (NULL, "" and "," are not numbers). */
+int conll_is_num(char *s)
+{
+  int digits = 0;
+
+  if(s == NULL) return 0;
+  for(; *s != '\0'; s++){
+    if((*s >= '0') && (*s <= '9'))
+      digits++;
+    else if(*s != ',')
+      return 0;
+  }
+  return digits > 0;
+}
+
+
+void
conll_renumber_sentence_offset(conll_sentence *s, int offset)
+{
+  /* gives every word the id (position + offset); empty slots are skipped */
+  unsigned i;
+  for(i=0 ; i < s->l; i++){
+    if(s->words[i])
+      s->words[i]->id = i + offset;
+  }
+}
diff --git a/maca_common/src/hash_str.c b/maca_common/src/hash_str.c
new file mode 100644
index 0000000..4bc2752
--- /dev/null
+++ b/maca_common/src/hash_str.c
@@ -0,0 +1,118 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"hash_str.h"
+#include"util.h"
+
+/* Build a cell holding the (key, val) pair in front of the chain next.
+   The two strings are stored as given, not copied; hash_str_cell_free
+   releases them with free(). */
+hash_str_cell *hash_str_cell_new(char *key, char *val, hash_str_cell *next)
+{
+  hash_str_cell *c = (hash_str_cell *)memalloc(sizeof(hash_str_cell));
+  c->val = val;
+  c->key = key;
+  c->next = next;
+  return c;
+}
+
+/* Free a whole chain of cells together with their keys and values.
+   Iterative, so that a long chain cannot exhaust the stack. */
+void hash_str_cell_free(hash_str_cell *c)
+{
+  hash_str_cell *next;
+
+  while(c != NULL){
+    next = c->next;
+    free(c->key);
+    free(c->val);
+    free(c);
+    c = next;
+  }
+}
+
+
+/* Create an empty table of size buckets (at least one). */
+hash_str *hash_str_new(int size)
+{
+  int i;
+  hash_str *h = (hash_str *)memalloc(sizeof(hash_str));
+
+  /* a table without bucket would make hash_str_func divide by zero */
+  if(size < 1) size = 1;
+  h->size = size;
+  h->nbelem = 0;
+  h->array = (hash_str_cell **)memalloc(size * sizeof(hash_str_cell *));
+  for(i=0; i < size; i++)
+    h->array[i] = NULL;
+  return h;
+}
+
+/* Free the table: every chain, the bucket array and the table itself.
+   Accepts NULL. */
+void hash_str_free(hash_str *h)
+{
+  int i;
+
+  if(h == NULL) return;
+  for(i=0; i < h->size; i++)
+    hash_str_cell_free(h->array[i]);
+  free(h->array);
+  free(h);
+}
+
+/* Bucket of key in a table of size buckets: first byte plus the sum of
+   i*i*byte for the following bytes, modulo size. Computed on unsigned
+   values so that the result always lies in 0 .. size-1, whatever the
+   signedness of char and even when the sum wraps around. */
+int hash_str_func(char *key, int size)
+{
+  size_t i;
+  size_t l = strlen(key);
+  unsigned val = (unsigned char)key[0];
+
+  for(i=1; i < l; i++)
+    val = val + (unsigned)(i * i) * (unsigned char)key[i];
+  return (int)(val % (unsigned)size);
+}
+
+/* Return the cell whose key equals key, or NULL when there is none. */
+hash_str_cell *hash_str_lookup(hash_str *h, char *key)
+{
+  int index = hash_str_func(key, h->size);
+  hash_str_cell *c;
+
+  for(c=h->array[index]; c; c = c->next){
+    if(!strcmp(key, c->key))
+      return c;
+  }
+  return NULL;
+}
+
+/* Return the value stored for key, or HASH_STR_INVALID_VAL when the key is
+   not in the table. */
+char *hash_str_get_val(hash_str *h, char *key)
+{
+  hash_str_cell *c = hash_str_lookup(h, key);
+
+  return c ? c->val : HASH_STR_INVALID_VAL;
+}
+
+void hash_str_add(hash_str *h, char *key, char
*val)
+{
+  /* Adds the (key, val) pair at the head of its bucket. Nothing is stored
+     when the key is already present: in that case the table does not keep
+     key and val, and will not free them. */
+  int index;
+
+  if(hash_str_lookup(h, key)) return;
+  index = hash_str_func(key, h->size);
+  h->array[index] = hash_str_cell_new(key, val, h->array[index]);
+  h->nbelem++;
+}
+
+/* Number of cells in the chain starting at c. */
+int hash_str_cell_nb(hash_str_cell *c)
+{
+  int nb = 0;
+
+  for(; c != NULL; c = c->next)
+    nb++;
+  return nb;
+}
+
+/* Print on stdout, for every chain length from 0 to the longest one, a line
+   "<length> <number of buckets having that length>". */
+void hash_str_stats(hash_str *h)
+{
+  int max = 0;
+  int i,l;
+  int *table;
+  int nb;
+
+  for(i=0; i < h->size; i++)
+    if((l = hash_str_cell_nb(h->array[i])) > max)
+      max = l;
+  nb = max + 1;
+  table = (int *)memalloc(nb * sizeof(int));
+  for(i=0; i < nb; i++)
+    table[i] = 0;
+  for(i=0; i < h->size; i++)
+    table[hash_str_cell_nb(h->array[i])]++;
+
+  for(i=0; i < nb; i++)
+    printf("%d %d\n", i, table[i]);
+
+  /* the histogram is only needed for the printing above */
+  free(table);
+}
diff --git a/maca_corpora/CMakeLists.txt b/maca_corpora/CMakeLists.txt
new file mode 100644
index 0000000..8d88845
--- /dev/null
+++ b/maca_corpora/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(lib)
+add_subdirectory(exec)
diff --git a/maca_corpora/exec/CMakeLists.txt b/maca_corpora/exec/CMakeLists.txt
new file mode 100644
index 0000000..d07dd6b
--- /dev/null
+++ b/maca_corpora/exec/CMakeLists.txt
@@ -0,0 +1,12 @@
+#compiling, linking and installing executables
+
+add_executable(ptb2en ptb2en.c)
+target_link_libraries(ptb2en maca_common)
+target_link_libraries(ptb2en maca_corpora)
+install (TARGETS ptb2en DESTINATION bin)
+
+add_executable(ftb2fr ftb2fr.c)
+target_link_libraries(ftb2fr maca_common)
+target_link_libraries(ftb2fr maca_corpora)
+install (TARGETS ftb2fr DESTINATION bin)
+
diff --git a/maca_corpora/exec/ftb2fr.c b/maca_corpora/exec/ftb2fr.c
new file mode 100644
index 0000000..6d422ba
--- /dev/null
+++ b/maca_corpora/exec/ftb2fr.c
@@ -0,0 +1,237 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<strings.h>
+#include<math.h>
+#include<getopt.h>
+#include"conll_lib.h"
+#include"hash_str.h"
+#include"ftb_lib.h"
+
+typedef struct options
+{
+  FILE * fd_parses; // parser output
+  int
verbose_level; + int snum; + char *filename; + hash_str *h_pos; + hash_str *h_fct; +} options; + +void change_pos_fr(conll_sentence *s, hash_str *h_pos) +{ + int i; + conll_word *w; + char *val; + + for(i=1; i<s->l; i++){ + w = s->words[i]; + + if(!strcmp(w->postag, "ADV")){ + if(!strcmp(w->feats, "s=neg")) + strcpy(w->postag, "advneg"); + else + strcpy(w->postag, "adv"); + continue; + } + + if(!strcmp(w->postag, "PONCT")){ + if(!strcmp(w->feats, "s=s")) + strcpy(w->postag, "poncts"); + else + strcpy(w->postag, "ponctw"); + continue; + } + + val = hash_str_get_val (h_pos, w->postag); + if(val){ + strcpy(w->postag, val); + } + else{ + fprintf(stderr, "ATTENTION: pos %s inconnue\n", w->postag); + } + } +} + + +void change_pos_and_cpos_of_dot(conll_sentence *s, options *op) +{ + int i; + conll_word *w; + + for(i=1; i < s->l; i++){ + w = s->words[i]; + if(w){ + if(!strcmp(s->words[i]->form, ".")){ + strcpy(s->words[i]->postag, "poncts"); + strcpy(s->words[i]->cpostag, "poncts"); + } + } + } +} + +/*---------------------------------------------------------------------------------*/ + +options op; + +void print_options(options *op) +{ + fprintf(stderr, "file name = %s\n", op->filename); + fprintf(stderr, "verbose level = %d\n", op->verbose_level); + fprintf(stderr, "maximum number of sentences to process = %d\n", op->snum); +} + +void reset_options(options * op) +{ + op->filename = NULL; + op->fd_parses = NULL; + op->verbose_level = 0; + op->snum = 100000000; + op->h_pos = hash_str_new(100); + + hash_str_add(op->h_pos, strdup("ADJ"), strdup("adj")); + hash_str_add(op->h_pos, strdup("ADJWH"), strdup("adj")); + hash_str_add(op->h_pos, strdup("ADV"), strdup("adv")); + hash_str_add(op->h_pos, strdup("ADVWH"), strdup("adv")); + hash_str_add(op->h_pos, strdup("CC"), strdup("coo")); + hash_str_add(op->h_pos, strdup("CLO"), strdup("clo")); + hash_str_add(op->h_pos, strdup("CLR"), strdup("clr")); + hash_str_add(op->h_pos, strdup("CLS"), strdup("cln")); + hash_str_add(op->h_pos, 
strdup("CS"), strdup("csu")); + hash_str_add(op->h_pos, strdup("DET"), strdup("det")); + hash_str_add(op->h_pos, strdup("DETWH"), strdup("det")); + hash_str_add(op->h_pos, strdup("ET"), strdup("etr")); + hash_str_add(op->h_pos, strdup("I"), strdup("pres")); + hash_str_add(op->h_pos, strdup("NC"), strdup("nc")); + hash_str_add(op->h_pos, strdup("NPP"), strdup("np")); + hash_str_add(op->h_pos, strdup("P"), strdup("prep")); + hash_str_add(op->h_pos, strdup("P+D"), strdup("prep")); + hash_str_add(op->h_pos, strdup("PONCT"), strdup("")); + hash_str_add(op->h_pos, strdup("P+PRO"), strdup("prep")); + hash_str_add(op->h_pos, strdup("PREF"), strdup("pref")); + hash_str_add(op->h_pos, strdup("PRO"), strdup("pro")); + hash_str_add(op->h_pos, strdup("PROREL"), strdup("prorel")); + hash_str_add(op->h_pos, strdup("PROWH"), strdup("pri")); + hash_str_add(op->h_pos, strdup("V"), strdup("v")); + hash_str_add(op->h_pos, strdup("VIMP"), strdup("v")); + hash_str_add(op->h_pos, strdup("VINF"), strdup("vinf")); + hash_str_add(op->h_pos, strdup("VPP"), strdup("vppart")); + hash_str_add(op->h_pos, strdup("VPR"), strdup("vprespart")); + hash_str_add(op->h_pos, strdup("VS"), strdup("v")); + + op->h_fct = hash_str_new(100); + + hash_str_add(op->h_fct, strdup("aff"), strdup("aff")); + hash_str_add(op->h_fct, strdup("a_obj"), strdup("a_obj")); + hash_str_add(op->h_fct, strdup("arg"), strdup("arg")); + hash_str_add(op->h_fct, strdup("ato"), strdup("ato")); + hash_str_add(op->h_fct, strdup("ats"), strdup("ats")); + hash_str_add(op->h_fct, strdup("aux_caus"), strdup("aux_caus")); + hash_str_add(op->h_fct, strdup("aux_pass"), strdup("aux_pass")); + hash_str_add(op->h_fct, strdup("aux_tps"), strdup("aux_tps")); + hash_str_add(op->h_fct, strdup("comp"), strdup("comp")); + hash_str_add(op->h_fct, strdup("coord"), strdup("coord")); + hash_str_add(op->h_fct, strdup("de_obj"), strdup("de_obj")); + hash_str_add(op->h_fct, strdup("dep"), strdup("dep")); + hash_str_add(op->h_fct, strdup("dep_coord"), 
strdup("dep_coord"));
+  hash_str_add(op->h_fct, strdup("det"), strdup("det"));
+  hash_str_add(op->h_fct, strdup("missinghead"), strdup("missinghead"));
+  hash_str_add(op->h_fct, strdup("mod"), strdup("mod"));
+  hash_str_add(op->h_fct, strdup("mod_rel"), strdup("mod_rel"));
+  hash_str_add(op->h_fct, strdup("obj"), strdup("obj"));
+  hash_str_add(op->h_fct, strdup("obj1"), strdup("obj"));
+  hash_str_add(op->h_fct, strdup("p_obj"), strdup("p_obj"));
+  hash_str_add(op->h_fct, strdup("ponct"), strdup("ponct"));
+  hash_str_add(op->h_fct, strdup("root"), strdup("root"));
+  hash_str_add(op->h_fct, strdup("suj"), strdup("suj"));
+}
+
+/*---------------------------------------------------------------------------------*/
+/* Print the usage of the program on stderr. */
+void print_help_message(char *program_name)
+{
+  fprintf(stderr, "%s usage: %s [options]\n", program_name, program_name);
+  fprintf(stderr, "OPTIONS :\n");
+  fprintf(stderr, " -f <file> : hypothesis conll file\n");
+  fprintf(stderr, " -n <int> : process n sentences (default is 100 000 000)\n");
+  fprintf(stderr, " -v 1|2|3 : verbosity level\n");
+  fprintf(stderr, " -h : print this message\n");
+}
+
+/*---------------------------------------------------------------------------------*/
+
+/* Reset op to its default values, then fill it from the command line.
+   Exits with the usage message when no argument is given, on -h and on an
+   option that is not recognised; exits with an error when the file given
+   with -f cannot be opened or when no -f option is given. */
+void parse_options(int argc, char *argv[], options * op)
+{
+  /* getopt returns an int: with a char the comparison with -1 never
+     succeeds on the platforms where char is unsigned */
+  int c;
+
+  reset_options(op);
+
+  if(argc ==1){
+    print_help_message(argv[0]);
+    exit(1);
+  }
+
+  while ((c = getopt (argc, argv, "hf:n:v:")) != -1)
+    switch (c)
+      {
+      case 'h':
+        print_help_message(argv[0]);
+        exit(0);
+      case 'f':
+        op->filename = strdup(optarg);
+        if((op->fd_parses = fopen(op->filename, "r")) == NULL){
+          fprintf(stderr, "cannot open hypothesis file %s : aborting\n", op->filename);
+          exit(1);
+        }
+        break;
+      case 'n':
+        op->snum = atoi(optarg);
+        break;
+      case 'v':
+        op->verbose_level = atoi(optarg);
+        break;
+      default:
+        /* unknown option or missing argument, already reported by getopt */
+        print_help_message(argv[0]);
+        exit(1);
+      }
+
+  if (op->fd_parses == NULL){
+    fprintf(stderr, "error : cannot open parse file: aborting\n");
+    exit(1);
+  }
+}
+
+
+/*---------------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------------*/ +int main(int argc, char *argv[]) +{ + conll_sentence *s = conll_allocate_sentence(); + int snum = 0; + int res; + + parse_options(argc, argv, &op); + print_options(&op); + for(res = conll_load_sentence(op.fd_parses, s); res && (snum < op.snum); res = conll_load_sentence(op.fd_parses, s)){ + s->num = snum; + + /* if(s->l > 200) continue; */ + // if(!sentence_ends_with_poncts(s)) continue; + if(ftb_number_of_roots_in_sentence(s) != 1) continue; + if(ftb_sentence_contains_missinghead(s)) continue; + + snum++; + + /* change_pos_and_cpos_of_dot(s, &op); */ + ftb_change_form_and_lemma_of_numbers(s); + change_pos_fr(s, op.h_pos); + /* change_label_of_last_dep(s); */ + ftb_retokenize_three_dots(s); + ftb_tokenize_dot(s, "titre", "poncts", "abbrev"); + conll_renumber_sentence(s); + // conll_compute_relative_index_of_heads(s); + conll_print_sentence(s); + // print_sentence_no_newline(s); + + } + fprintf(stderr, "\n"); + fclose(op.fd_parses); + conll_free_sentence(s); + return 0; +} diff --git a/maca_corpora/exec/ptb2en.c b/maca_corpora/exec/ptb2en.c new file mode 100644 index 0000000..d5545d7 --- /dev/null +++ b/maca_corpora/exec/ptb2en.c @@ -0,0 +1,275 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<strings.h> +#include<math.h> +#include<getopt.h> +#include"conll_lib.h" +#include"hash_str.h" + +typedef struct options +{ + FILE * fd_parses; // parser output + int verbose_level; + int snum; + char *filename; + hash_str *h_pos; + hash_str *h_fct; +} options; + +void tokenize_dot_ptb(conll_sentence *s, char *dep_postag, char *label) +{ + int i; + conll_word *w; + + for(i=1; i<s->l; i++){ + w = s->words[i]; + /* printf("form = %s\n", w->form); */ + if((strlen(w->form) > 1) + && (strcmp(w->form, "...")) + && (w->form[strlen(w->form) - 1] == '.')){ + conll_word *abbrev = 
conll_copy_word(w);
+      abbrev->form[strlen(abbrev->form) - 1] = '\0'; /* drop the trailing '.' from the copy */
+      // strcpy(abbrev->postag, gov_postag);
+      // strcpy(abbrev->cpostag, gov_postag);
+      if((w->lemma[0] != '\0') && (w->lemma[strlen(w->lemma) - 1] == '.')) abbrev->lemma[strlen(abbrev->lemma) - 1] = '\0'; /* empty lemma: strlen - 1 would index out of bounds */
+      /* conll_word *dot = conll_allocate_word(i, ".", ".", "poncts", "poncts", "NULL", -1, "abbrev"); */
+      conll_word *dot = conll_allocate_word(i, ".", ".", dep_postag, dep_postag, "_", -1, label);
+
+      conll_split_node_in_two(s, i, abbrev, dot, i, i+1);
+
+    }
+
+  }
+}
+/* Set postag and cpostag to "." for every non-NULL word whose form is ".". op is not used. */
+void change_pos_and_cpos_of_dot(conll_sentence *s, options *op)
+{
+  int i;
+  conll_word *w;
+
+  for(i=1; i < s->l; i++){
+    w = s->words[i];
+    if(w){
+      if(!strcmp(s->words[i]->form, ".")){
+        strcpy(s->words[i]->postag, ".");
+        strcpy(s->words[i]->cpostag, ".");
+      }
+    }
+  }
+}
+/* Print words 1..l-1 on stdout, one per line: form, postag (twice), lemma, head, deprel, then 1 for the last word and 0 otherwise. */
+void print_sentence_no_newline_en(conll_sentence *s)
+{
+  int i;
+  conll_word *w;
+
+  if((s->l == 1) || (s->l == 0)) return; /* nothing to print */
+
+  for(i=1; i<s->l; i++){
+    w = s->words[i];
+    /* fprintf(stdout, "%d", w->id); */
+    /* fprintf(stdout, "\t%s", w->form); */
+    fprintf(stdout, "%s", w->form);
+    fprintf(stdout, "\t%s", w->postag);
+    fprintf(stdout, "\t%s", w->postag); /* NOTE(review): postag is printed twice; the cpostag line below is commented out - confirm this is intended */
+    fprintf(stdout, "\t%s", w->lemma);
+    /* fprintf(stdout, "\t%s", w->cpostag); */
+
+    fprintf(stdout, "\t%d", w->head);
+
+    /* if(w->mother == NULL)
+       fprintf(stdout, "\t0");
+       else
+       fprintf(stdout, "\t%d", w->mother->id - w->id);*/
+    /*
+    else{
+      if(strcmp(w->deprel, "root"))
+	fprintf(stdout, "\t%d", w->mother->id - w->id);
+      else
+	fprintf(stdout, "\t%d", w->mother->id - w->id);
+	fprintf(stdout, "\t%d", 0);
+	}*/
+    fprintf(stdout, "\t%s", w->deprel);
+    /* if(!strcmp(w->deprel, "eos")) */
+    /* if(!strcmp(w->deprel, "ponct") && !strcmp(w->postag, "poncts")) */
+    if(i == s->l - 1)
+      fprintf(stdout, "\t1");
+    else
+      fprintf(stdout, "\t0");
+
+
+    fprintf(stdout, "\n");
+
+    /* fprintf(stdout, "\t_\t_\n"); */
+
+  }
+
+}
+
+
+/*---------------------------------------------------------------------------------*/
+
+options op;
+
+void 
print_options(options *op) /* dump the current option values on stderr */
+{
+  fprintf(stderr, "file name = %s\n", op->filename);
+  fprintf(stderr, "verbose level = %d\n", op->verbose_level);
+  fprintf(stderr, "maximum number of sentences to process = %d\n", op->snum);
+}
+/* Set *op to its defaults and build the two string-to-string mapping tables (h_pos, h_fct). */
+void reset_options(options * op)
+{
+  op->filename = NULL;
+  op->fd_parses = NULL;
+  op->verbose_level = 0;
+  op->snum = 100000000;
+  op->h_pos = hash_str_new(100);
+  /* NOTE(review): main() in this file only mentions h_pos in a commented-out call - confirm these tables are still needed here */
+  hash_str_add(op->h_pos, strdup("ADJ"), strdup("adj"));
+  hash_str_add(op->h_pos, strdup("ADJWH"), strdup("adj"));
+  hash_str_add(op->h_pos, strdup("ADV"), strdup("adv"));
+  hash_str_add(op->h_pos, strdup("ADVWH"), strdup("adv"));
+  hash_str_add(op->h_pos, strdup("CC"), strdup("coo"));
+  hash_str_add(op->h_pos, strdup("CLO"), strdup("clo"));
+  hash_str_add(op->h_pos, strdup("CLR"), strdup("clr"));
+  hash_str_add(op->h_pos, strdup("CLS"), strdup("cln"));
+  hash_str_add(op->h_pos, strdup("CS"), strdup("csu"));
+  hash_str_add(op->h_pos, strdup("DET"), strdup("det"));
+  hash_str_add(op->h_pos, strdup("DETWH"), strdup("det"));
+  hash_str_add(op->h_pos, strdup("ET"), strdup("etr"));
+  hash_str_add(op->h_pos, strdup("I"), strdup("pres"));
+  hash_str_add(op->h_pos, strdup("NC"), strdup("nc"));
+  hash_str_add(op->h_pos, strdup("NPP"), strdup("np"));
+  hash_str_add(op->h_pos, strdup("P"), strdup("prep"));
+  hash_str_add(op->h_pos, strdup("P+D"), strdup("prep"));
+  hash_str_add(op->h_pos, strdup("PONCT"), strdup("")); /* PONCT maps to the empty string */
+  hash_str_add(op->h_pos, strdup("P+PRO"), strdup("prep"));
+  hash_str_add(op->h_pos, strdup("PREF"), strdup("pref"));
+  hash_str_add(op->h_pos, strdup("PRO"), strdup("pro"));
+  hash_str_add(op->h_pos, strdup("PROREL"), strdup("prorel"));
+  hash_str_add(op->h_pos, strdup("PROWH"), strdup("pri"));
+  hash_str_add(op->h_pos, strdup("V"), strdup("v"));
+  hash_str_add(op->h_pos, strdup("VIMP"), strdup("v"));
+  hash_str_add(op->h_pos, strdup("VINF"), strdup("vinf"));
+  hash_str_add(op->h_pos, strdup("VPP"), strdup("vppart"));
+  hash_str_add(op->h_pos, strdup("VPR"), 
strdup("vprespart"));
+  hash_str_add(op->h_pos, strdup("VS"), strdup("v"));
+
+  op->h_fct = hash_str_new(100);
+  /* dependency labels: every label maps to itself, except "obj1" which maps to "obj" */
+  hash_str_add(op->h_fct, strdup("aff"), strdup("aff"));
+  hash_str_add(op->h_fct, strdup("a_obj"), strdup("a_obj"));
+  hash_str_add(op->h_fct, strdup("arg"), strdup("arg"));
+  hash_str_add(op->h_fct, strdup("ato"), strdup("ato"));
+  hash_str_add(op->h_fct, strdup("ats"), strdup("ats"));
+  hash_str_add(op->h_fct, strdup("aux_caus"), strdup("aux_caus"));
+  hash_str_add(op->h_fct, strdup("aux_pass"), strdup("aux_pass"));
+  hash_str_add(op->h_fct, strdup("aux_tps"), strdup("aux_tps"));
+  hash_str_add(op->h_fct, strdup("comp"), strdup("comp"));
+  hash_str_add(op->h_fct, strdup("coord"), strdup("coord"));
+  hash_str_add(op->h_fct, strdup("de_obj"), strdup("de_obj"));
+  hash_str_add(op->h_fct, strdup("dep"), strdup("dep"));
+  hash_str_add(op->h_fct, strdup("dep_coord"), strdup("dep_coord"));
+  hash_str_add(op->h_fct, strdup("det"), strdup("det"));
+  hash_str_add(op->h_fct, strdup("missinghead"), strdup("missinghead"));
+  hash_str_add(op->h_fct, strdup("mod"), strdup("mod"));
+  hash_str_add(op->h_fct, strdup("mod_rel"), strdup("mod_rel"));
+  hash_str_add(op->h_fct, strdup("obj"), strdup("obj"));
+  hash_str_add(op->h_fct, strdup("obj1"), strdup("obj"));
+  hash_str_add(op->h_fct, strdup("p_obj"), strdup("p_obj"));
+  hash_str_add(op->h_fct, strdup("ponct"), strdup("ponct"));
+  hash_str_add(op->h_fct, strdup("root"), strdup("root"));
+  hash_str_add(op->h_fct, strdup("suj"), strdup("suj"));
+}
+/* Print the command-line usage summary for program_name on stderr. */
+/*---------------------------------------------------------------------------------*/
+void print_help_message(char *program_name)
+{
+  fprintf(stderr, "%s usage: %s [options]\n", program_name, program_name);
+  fprintf(stderr, "OPTIONS :\n");
+  fprintf(stderr, " -f <file> : hypothesis conll file\n");
+  fprintf(stderr, " -n <int> : process n sentences (default is 100 000 000)\n");
+  fprintf(stderr, " -v 1|2|3 : verbosity level\n");
+  fprintf(stderr, " -h : print this 
message\n"); +} + +/*---------------------------------------------------------------------------------*/ + +void parse_options(int argc, char *argv[], options * op) +{ + char c; + + reset_options(op); + + if(argc ==1){ + print_help_message(argv[0]); + exit(1); + } + + while ((c = getopt (argc, argv, "hf:n:v:")) != -1) + switch (c) + { + case 'h': + print_help_message(argv[0]); + exit(0); + case 'f': + op->filename = strdup(optarg); + if((op->fd_parses = fopen(op->filename, "r")) == NULL){ + fprintf(stderr, "cannot open hypothesis file %s : aborting\n", op->filename); + exit(1); + } + break; + case 'n': + op->snum = atoi(optarg); + break; + case 'v': + op->verbose_level = atoi(optarg); + break; + } + + if (op->fd_parses == NULL){ + fprintf(stderr, "error : cannot open parse file: aborting\n"); + exit(1); + } +} + + +/*---------------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------------*/ +int main(int argc, char *argv[]) +{ + conll_sentence *s = conll_allocate_sentence(); + int snum = 0; + int res; + + parse_options(argc, argv, &op); + print_options(&op); + for(res = conll_load_sentence(op.fd_parses, s); res && (snum < op.snum); res = conll_load_sentence(op.fd_parses, s)){ + s->num = snum; + + /* if(s->l > 200) continue; */ + // if(!sentence_ends_with_poncts(s)) continue; + // if(number_of_roots_in_sentence(s) != 1) continue; + //if(sentence_contains_missinghead(s)) continue; + + snum++; + + change_pos_and_cpos_of_dot(s, &op); + //change_form_and_lemma_of_numbers(s); + //change_pos_fr(s, op.h_pos); + /* change_label_of_last_dep(s); */ + //retokenize_three_dots(s); + tokenize_dot_ptb(s, ".", "ABBREV"); + conll_renumber_sentence(s); + // conll_compute_relative_index_of_heads(s); + // print_sentence_no_newline_en(s); + conll_print_sentence(s); + + } + fprintf(stderr, "\n"); + fclose(op.fd_parses); + conll_free_sentence(s); + return 0; +} diff --git 
a/maca_corpora/lib/CMakeLists.txt b/maca_corpora/lib/CMakeLists.txt new file mode 100644 index 0000000..a0ca33d --- /dev/null +++ b/maca_corpora/lib/CMakeLists.txt @@ -0,0 +1,11 @@ +set(SOURCES + src/ftb_lib.c + src/orfeo_lib.c +) + +#compiling library +include_directories(src) +add_library(maca_corpora STATIC ${SOURCES}) + +find_library(M_LIB m) +target_link_libraries(maca_corpora ${M_LIB}) diff --git a/maca_corpora/lib/include/ftb_lib.h b/maca_corpora/lib/include/ftb_lib.h new file mode 100644 index 0000000..a8f238d --- /dev/null +++ b/maca_corpora/lib/include/ftb_lib.h @@ -0,0 +1,38 @@ +/******************************************************************************* + Copyright (C) 2010 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + and Joseph Le Roux <joseph.le.roux@gmail.com> + conll_lib is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + conll_lib is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with conll_lib. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#ifndef __FTB_LIB__ +#define __FTB_LIB__ + +#include <stdio.h> +#include <stdlib.h> + +#include "conll_lib.h" +#include "ftb_lib.h" + +int ftb_sentence_ends_with_poncts(conll_sentence *s); +int ftb_number_of_roots_in_sentence(conll_sentence *s); +int ftb_sentence_contains_missinghead(conll_sentence *s); +void ftb_change_form_and_lemma_of_numbers(conll_sentence *s); +void ftb_change_label_of_last_dep(conll_sentence *s); +void ftb_retokenize_three_dots(conll_sentence *s); +void ftb_tokenize_dot(conll_sentence *s, char *gov_postag, char *dep_postag, char *label); +void ftb_print_sentence_no_newline(conll_sentence *s); +int ftb_get_root_index(conll_sentence *s); +void ftb_change_root_head(conll_sentence *s, int new_head_index); +void ftb_compute_relative_index_of_heads(conll_sentence *s); +#endif diff --git a/maca_corpora/lib/include/orfeo_lib.h b/maca_corpora/lib/include/orfeo_lib.h new file mode 100644 index 0000000..5f32182 --- /dev/null +++ b/maca_corpora/lib/include/orfeo_lib.h @@ -0,0 +1,17 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<strings.h> +#include"conll_lib.h" + +void orfeo_traite_mots_composes(conll_sentence *s); +void orfeo_traite_nombres(conll_sentence *s); +void orfeo_traite_amalgames(conll_sentence *s); + + + + + + + + diff --git a/maca_corpora/lib/src/ftb_lib.c b/maca_corpora/lib/src/ftb_lib.c new file mode 100644 index 0000000..fbf6630 --- /dev/null +++ b/maca_corpora/lib/src/ftb_lib.c @@ -0,0 +1,170 @@ +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +#include "conll_lib.h" +#include "ftb_lib.h" + +void ftb_change_root_head(conll_sentence *s, int new_head_index) +{ + int i; + conll_word *w; + + for(i=1; i<s->l; i++){ + w = s->words[i]; + if(!strcmp(w->deprel, "root")){ + w->head = new_head_index; + break; + } + } +} + +int ftb_get_root_index(conll_sentence *s) +{ + int i; + conll_word *w; + + for(i=1; i<s->l; i++){ + w = 
s->words[i];
+    if(!strcmp(w->deprel, "root"))
+      return i;
+  }
+  return -1;
+}
+
+/* Return 1 when the last word has postag "PONCT" and feats "s=s", else 0. Assumes s->l >= 1. */
+int ftb_sentence_ends_with_poncts(conll_sentence *s)
+{
+  conll_word *w;
+
+  w = s->words[s->l-1];
+  if(!strcmp(w->postag, "PONCT") && !strcmp(w->feats, "s=s"))
+    return 1;
+  return 0;
+}
+/* Count the words (indices 1..l-1) whose deprel is "root". */
+int ftb_number_of_roots_in_sentence(conll_sentence *s)
+{
+  int i;
+  conll_word *w;
+  int root_nb = 0;
+
+  for(i=1; i<s->l; i++){
+    w = s->words[i];
+    if(!strcmp(w->deprel, "root"))
+      root_nb++;
+  }
+  return root_nb;
+}
+/* Return 1 if any word has deprel "missinghead", else 0. */
+int ftb_sentence_contains_missinghead(conll_sentence *s)
+{
+  int i;
+  conll_word *w;
+
+  for(i=1; i<s->l; i++){
+    w = s->words[i];
+    if(!strcmp(w->deprel, "missinghead"))
+      return 1;
+  }
+  return 0;
+}
+
+/* Replace form and lemma by "_NUM_" for every word whose form satisfies conll_is_num(). */
+void ftb_change_form_and_lemma_of_numbers(conll_sentence *s)
+{
+  int i;
+  conll_word *w;
+
+  for(i=1; i<s->l; i++){
+    w = s->words[i];
+    if(conll_is_num(w->form)){
+      strcpy(w->form, "_NUM_"); /* NOTE(review): assumes form and lemma can hold 6 bytes - confirm in conll_lib */
+      strcpy(w->lemma, "_NUM_");
+    }
+
+  }
+}
+/* Relabel the last word's deprel as "eos" unless it is "root". Assumes s->l >= 1. */
+void ftb_change_label_of_last_dep(conll_sentence *s)
+{
+  if(strcmp(s->words[s->l - 1]->deprel, "root"))
+    strcpy(s->words[s->l - 1]->deprel, "eos");
+}
+
+/*---------------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------------*/
+/* Merge each run of three consecutive "." words into one "..." word, then compact the sentence. */
+void ftb_retokenize_three_dots(conll_sentence *s)
+{
+  int i;
+  conll_word *w;
+  int l = s->l; /* snapshot of the length before any removal */
+  for(i=1; i < l-2; i++){
+    w = s->words[i];
+    if(w && s->words[i+1] && s->words[i+2]){ /* the two following slots are dereferenced below, so they get the same NULL check as words[i] */
+      if(!strcmp(s->words[i]->form, ".") && !strcmp(s->words[i+1]->form, ".") && !strcmp(s->words[i+2]->form, ".")){
+        strcpy(s->words[i]->form, "...");
+        strcpy(s->words[i]->lemma, "...");
+        conll_remove_word_rec(s, i+1);
+        conll_remove_word_rec(s, i+2);
+        /* fprintf(stderr, "retokenize ...\n"); */
+      }
+    }
+  }
+  conll_compact_sentence(s);
+}
+
+/*---------------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------------*/
+/* Split every token longer than one character that ends with '.' (and is not "...") into the token without its dot, retagged gov_postag, plus a "." word built with dep_postag and label. */
+void ftb_tokenize_dot(conll_sentence *s, char 
*gov_postag, char *dep_postag, char *label) +{ + int i; + conll_word *w; + + for(i=1; i<s->l; i++){ + w = s->words[i]; + /* printf("form = %s\n", w->form); */ + if((strlen(w->form) > 1) + && (strcmp(w->form, "...")) + && (w->form[strlen(w->form) - 1] == '.')){ + conll_word *abbrev = conll_copy_word(w); + abbrev->form[strlen(abbrev->form) - 1] = '\0'; + strcpy(abbrev->postag, gov_postag); /*titre*/ + strcpy(abbrev->cpostag, gov_postag); + if(w->lemma[strlen(w->lemma) - 1] == '.') abbrev->lemma[strlen(abbrev->lemma) - 1] = '\0'; + /* conll_word *dot = allocate_word(i, ".", ".", "poncts", "poncts", "NULL", -1, "abbrev"); */ + conll_word *dot = conll_allocate_word(i, ".", ".", dep_postag, dep_postag, "NULL", -1, label); + + conll_split_node_in_two(s, i, abbrev, dot, i, i+1); + + } + + } +} + + +void ftb_print_sentence_no_newline(conll_sentence *s) +{ + int i; + conll_word *w; + + if((s->l == 1) || (s->l == 0)) return; + + for(i=1; i<s->l; i++){ + w = s->words[i]; + fprintf(stdout, "%s", w->form); + fprintf(stdout, "\t%s", w->postag); + fprintf(stdout, "\t%s", w->feats); + fprintf(stdout, "\t%s", w->lemma); + fprintf(stdout, "\t%d", w->head); + fprintf(stdout, "\t%s", w->deprel); + if(i == s->l - 1) + fprintf(stdout, "\t1"); + else + fprintf(stdout, "\t0"); + fprintf(stdout, "\n"); + } + +} diff --git a/maca_corpora/lib/src/orfeo_lib.c b/maca_corpora/lib/src/orfeo_lib.c new file mode 100644 index 0000000..18d8906 --- /dev/null +++ b/maca_corpora/lib/src/orfeo_lib.c @@ -0,0 +1,428 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<strings.h> +#include<math.h> +#include<getopt.h> +#include"conll_lib.h" +#include"hash_str.h" + + +void traite_au_revoir(conll_sentence *s, int pos) +{ + conll_word *au = conll_allocate_word(-1, "au", "au", "prep", "prep", "NULL", -1, "NULL"); + conll_word *revoir = conll_allocate_word(-1, "revoir", "revoir", "nc", "nc", "NULL", -1, "OBJ"); + conll_split_node_in_two(s, pos, au, revoir, pos, pos+1); + +} + +void 
traite_ADV_que(conll_sentence *s, char *form, int pos) /* split an "<adv>_<que>" form at its first '_' into two words replacing the word at pos */
+{
+  int i,j;
+  char form_adv[30];
+  char form_que[30];
+  int l = strlen(form);
+
+  if(l >= (int)sizeof(form_adv)) return; /* a longer form would overflow the 30-byte buffers below; both halves fit once l < 30 */
+  for(i=0; (i<l) && (form[i] != '_'); i++){ /* part before the first '_' */
+    form_adv[i] = form[i];
+  }
+  form_adv[i] = '\0';
+  i++; /* skip the '_' itself */
+
+  for(j=0; (i<l); i++, j++){ /* everything after the '_' */
+    form_que[j] = form[i];
+  }
+  form_que[j] = '\0';
+
+  conll_word *que = conll_allocate_word(-1, form_que, "que", "CSU", "CSU", "NULL", -1, "NULL");
+  conll_word *adv = conll_allocate_word(-1, form_adv, form_adv, "ADV", "ADV", "NULL", -1, "MORPH");
+
+  conll_split_node_in_two(s, pos, que, adv, pos+1, pos);
+
+}
+/* Return 1 if m contains '_', else 0. */
+int chaine_possede_un_underscore(char *m)
+{
+  int i;
+  int l = strlen(m);
+  for(i=0; i < l; i++){
+    if(m[i] == '_'){
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* Return 1 if m contains '+', else 0. */
+int chaine_possede_un_plus(char *m)
+{
+  int i;
+  int l = strlen(m);
+  for(i=0; i < l; i++){
+    if(m[i] == '+'){
+      return 1;
+    }
+  }
+  return 0;
+}
+
+
+/* Return 1 if m contains '-', else 0. */
+int chaine_possede_un_moins(char *m)
+{
+  int i;
+  int l = strlen(m);
+  for(i=0; i < l; i++){
+    if(m[i] == '-'){
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* Return 1 if m contains '-' or '+', else 0. */
+int chaine_possede_un_plus_ou_un_moins(char *m)
+{
+  int i;
+  int l = strlen(m);
+  for(i=0; i < l; i++){
+    if((m[i] == '-') || (m[i] == '+')){
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* Split the compound forms listed below: "au+revoir" and a fixed set of "<adverb>_que" / "<adverb>_qu'" forms. */
+void orfeo_traite_mots_composes(conll_sentence *s)
+{
+  int i;
+  conll_word *w;
+
+  for(i=1; i<s->l; i++){
+    w = s->words[i];
+    if(chaine_possede_un_plus(w->form)){
+      if(!strcmp(w->form, "au+revoir")) traite_au_revoir(s, i);
+    }
+    else if(chaine_possede_un_underscore(w->form)){
+      if(!strcmp(w->form, "bien_que") || !strcmp(w->form, "bien_qu'") || !strcmp(w->form, "Bien_que") || !strcmp(w->form, "Bien_qu'")
+	 || !strcmp(w->form, "ainsi_que") || !strcmp(w->form, "ainsi_qu'") || !strcmp(w->form, "Ainsi_que") || !strcmp(w->form, "Ainsi_qu'")
+	 || !strcmp(w->form, "autant_que") || !strcmp(w->form, "autant_qu'") || !strcmp(w->form, "Autant_que") || !strcmp(w->form, "Autant_qu'")
+	 || !strcmp(w->form, "alors_que") || !strcmp(w->form, "alors_qu'") || 
!strcmp(w->form, "Alors_que") || !strcmp(w->form, "Alors_qu'")
+	 || !strcmp(w->form, "maintenant_que") || !strcmp(w->form, "maintenant_qu'") || !strcmp(w->form, "Maintenant_que") || !strcmp(w->form, "Maintenant_qu'")
+	 || !strcmp(w->form, "encore_que") || !strcmp(w->form, "encore_qu'") || !strcmp(w->form, "Encore_que") || !strcmp(w->form, "Encore_qu'")
+	 || !strcmp(w->form, "plus_que") || !strcmp(w->form, "plus_qu'") || !strcmp(w->form, "Plus_que") || !strcmp(w->form, "Plus_qu'")
+	 || !strcmp(w->form, "tant_que") || !strcmp(w->form, "tant_qu'") || !strcmp(w->form, "Tant_que") || !strcmp(w->form, "Tant_qu'"))
+	traite_ADV_que(s, w->form, i); /* the elided "maintenant_qu'" forms were misspelled "mainenant_qu'" and could never match */
+    }
+  }
+}
+/*---------------------------------------------------------------------------------*/
+
+
+/*---------------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------------*/
+/* Return 1 if s is exactly one of the French number words listed below (or the connective "et"), else 0. */
+int chaine_est_un_chiffre(char *s)
+{
+
+  if(!strcmp(s, "et")) return 1;
+  if(!strcmp(s, "zéro")) return 1;
+  if(!strcmp(s, "un")) return 1;
+  if(!strcmp(s, "deux")) return 1;
+  if(!strcmp(s, "trois")) return 1;
+  if(!strcmp(s, "quatre")) return 1;
+  if(!strcmp(s, "cinq")) return 1;
+  if(!strcmp(s, "six")) return 1;
+  if(!strcmp(s, "sept")) return 1;
+  if(!strcmp(s, "huit")) return 1;
+  if(!strcmp(s, "neuf")) return 1;
+  if(!strcmp(s, "dix")) return 1;
+  if(!strcmp(s, "onze")) return 1;
+  if(!strcmp(s, "douze")) return 1;
+  if(!strcmp(s, "treize")) return 1;
+  if(!strcmp(s, "quatorze")) return 1;
+  if(!strcmp(s, "quinze")) return 1;
+  if(!strcmp(s, "seize")) return 1;
+  if(!strcmp(s, "vingt")) return 1;
+  if(!strcmp(s, "vingts")) return 1;
+  if(!strcmp(s, "trente")) return 1;
+  if(!strcmp(s, "quarante")) return 1;
+  if(!strcmp(s, "cinquante")) return 1;
+  if(!strcmp(s, "soixante")) 
return 1;
+  if(!strcmp(s, "cent")) return 1;
+  if(!strcmp(s, "cents")) return 1;
+  if(!strcmp(s, "mille")) return 1;
+  if(!strcmp(s, "milles")) return 1;
+  if(!strcmp(s, "million")) return 1;
+  if(!strcmp(s, "millions")) return 1;
+  if(!strcmp(s, "milliard")) return 1;
+  if(!strcmp(s, "milliards")) return 1;
+  return 0;
+}
+
+/* Same list as chaine_est_un_chiffre() without "et", "un" and "vingts": return 1 on an exact match, else 0. */
+int chaine_est_un_chiffre_sauf_un(char *s)
+{
+
+  if(!strcmp(s, "zéro")) return 1;
+  if(!strcmp(s, "deux")) return 1;
+  if(!strcmp(s, "trois")) return 1;
+  if(!strcmp(s, "quatre")) return 1;
+  if(!strcmp(s, "cinq")) return 1;
+  if(!strcmp(s, "six")) return 1;
+  if(!strcmp(s, "sept")) return 1;
+  if(!strcmp(s, "huit")) return 1;
+  if(!strcmp(s, "neuf")) return 1;
+  if(!strcmp(s, "dix")) return 1;
+  if(!strcmp(s, "onze")) return 1;
+  if(!strcmp(s, "douze")) return 1;
+  if(!strcmp(s, "treize")) return 1;
+  if(!strcmp(s, "quatorze")) return 1;
+  if(!strcmp(s, "quinze")) return 1;
+  if(!strcmp(s, "seize")) return 1;
+  if(!strcmp(s, "vingt")) return 1;
+  if(!strcmp(s, "trente")) return 1;
+  if(!strcmp(s, "quarante")) return 1;
+  if(!strcmp(s, "cinquante")) return 1;
+  if(!strcmp(s, "soixante")) return 1;
+  if(!strcmp(s, "cent")) return 1;
+  if(!strcmp(s, "cents")) return 1;
+  if(!strcmp(s, "mille")) return 1;
+  if(!strcmp(s, "milles")) return 1;
+  if(!strcmp(s, "million")) return 1;
+  if(!strcmp(s, "millions")) return 1;
+  if(!strcmp(s, "milliard")) return 1;
+  if(!strcmp(s, "milliards")) return 1;
+  return 0;
+}
+
+
+
+/*---------------------------------------------------------------------------------*/
+/* Return 1 if orig is non-empty and made only of ASCII digits and commas; the lone string "," returns 0. */
+int chaine_composee_de_digits(char *orig)
+{
+  int i;
+  int l = strlen(orig);
+
+  if((l == 0) || !strcmp(orig, ",")) return 0; /* the empty string used to fall through the loop and count as digits */
+
+  for(i=0; i<l; i++)
+    if(((orig[i] > '9') || (orig[i] < '0')) && (orig[i] != ','))
+      return 0;
+
+  return 1;
+}
+
+/* Return 1 if orig looks like a number: digits and commas, one number word other than "un", or number words joined by '+' or '-'. */
+int chaine_est_un_nombre(char *orig)
+{
+  char *c, *s;
+  /* printf("w = %s\n", s); */
+
+
+  if(chaine_composee_de_digits(orig)) return 1;
+  if(chaine_est_un_chiffre_sauf_un(orig)) return 1;
+  
if(!chaine_possede_un_plus_ou_un_moins(orig)) return 0; + if(!strcmp(orig, "-")) return 0; + s = strdup(orig); + for(c = strtok(s, "+-"); c; c = strtok(NULL, "+-")){ + if(!chaine_est_un_chiffre(c)){ + free(s); + return 0; + } + } + + free(s); + return 1; +} + +/*---------------------------------------------------------------------------------*/ + +void orfeo_traite_nombres(conll_sentence *s) +{ + int i; + conll_word *w; + + for(i=1; i<s->l; i++){ + w = s->words[i]; + if(chaine_est_un_nombre(w->form)){ + /* printf("word = %s lemma = %s\n", w->form, w->lemma); */ + strcpy(w->lemma, "NUM"); + } + } +} + + +/*---------------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------------*/ + +void traite_amalgame_du(conll_sentence *s, int i) +{ + conll_word *w, *w2, *dep1; + w = s->words[i]; + if((w->mother) && (w->daughters_nb > 0)){ + dep1 = w->daughters[0]; + strcpy(w->form, "de"); + strcpy(w->lemma, "de"); + w2 = conll_allocate_word(i, "le", "le", "DET", "DET", "NULL", 0, "SPEC"); + conll_add_word(s, w2, i+1, dep1); + } +} + +void traite_amalgame_des(conll_sentence *s, int i) +{ + conll_word *w, *w2, *dep1; + w = s->words[i]; + if((w->mother) && (w->daughters_nb > 0)){ + dep1 = w->daughters[0]; + strcpy(w->form, "de"); + strcpy(w->lemma, "de"); + w2 = conll_allocate_word(i, "les", "le", "DET", "DET", "NULL", 0, "SPEC"); + conll_add_word(s, w2, i+1, dep1); + } +} + +void traite_amalgame_au(conll_sentence *s, int i) +{ + conll_word *w, *w2, *dep1; + + w = s->words[i]; + if((w->mother) && (w->daughters_nb > 0)){ + dep1 = w->daughters[0]; + strcpy(w->form, "à"); + strcpy(w->lemma, "à"); + w2 = conll_allocate_word(i, "le", "le", "DET", "DET", "NULL", 0, "SPEC"); + conll_add_word(s, w2, i+1, dep1); + } +} + +void traite_amalgame_aux(conll_sentence *s, int i) +{ + conll_word *w, *w2, *dep1; + w = s->words[i]; + if((w->mother) && (w->daughters_nb > 0)){ + dep1 = w->daughters[0]; + 
strcpy(w->form, "à");
+    strcpy(w->lemma, "à");
+    w2 = conll_allocate_word(i, "les", "le", "DET", "DET", "NULL", 0, "SPEC");
+    conll_add_word(s, w2, i+1, dep1);
+  }
+}
+/* "auquel": if word i has a mother and at least one daughter, rewrite it as "à" and add a "lequel" PRQ word at i+1. */
+void traite_amalgame_auquel(conll_sentence *s, int i)
+{
+  conll_word *w, *w2, *dep1;
+  w = s->words[i];
+  if((w->mother) && (w->daughters_nb > 0)){
+    dep1 = w->daughters[0]; /* passed to conll_add_word - presumably the new word's governor, confirm in conll_lib */
+    strcpy(w->form, "à");
+    strcpy(w->lemma, "à");
+    w2 = conll_allocate_word(i, "lequel", "lequel", "PRQ", "PRQ", "NULL", 0, "PRQ");
+    conll_add_word(s, w2, i+1, dep1);
+  }
+}
+/* "auxquels": rewrite as "à" and add "lesquels" (lemma "lequel"). */
+void traite_amalgame_auxquels(conll_sentence *s, int i)
+{
+  conll_word *w, *w2, *dep1;
+  w = s->words[i];
+  if((w->mother) && (w->daughters_nb > 0)){
+    dep1 = w->daughters[0];
+    strcpy(w->form, "à");
+    strcpy(w->lemma, "à");
+    w2 = conll_allocate_word(i, "lesquels", "lequel", "PRQ", "PRQ", "NULL", 0, "PRQ");
+    conll_add_word(s, w2, i+1, dep1);
+  }
+}
+/* "auxquelles": rewrite as "à" and add "lesquelles" (lemma "lequel"). */
+void traite_amalgame_auxquelles(conll_sentence *s, int i)
+{
+  conll_word *w, *w2, *dep1;
+  w = s->words[i];
+  if((w->mother) && (w->daughters_nb > 0)){
+    dep1 = w->daughters[0];
+    strcpy(w->form, "à");
+    strcpy(w->lemma, "à");
+    w2 = conll_allocate_word(i, "lesquelles", "lequel", "PRQ", "PRQ", "NULL", 0, "PRQ");
+    conll_add_word(s, w2, i+1, dep1);
+  }
+}
+/* "duquel": rewrite as "de" and add "lequel". */
+void traite_amalgame_duquel(conll_sentence *s, int i)
+{
+  conll_word *w, *w2, *dep1;
+  w = s->words[i];
+  if((w->mother) && (w->daughters_nb > 0)){
+    dep1 = w->daughters[0];
+    strcpy(w->form, "de");
+    strcpy(w->lemma, "de");
+    w2 = conll_allocate_word(i, "lequel", "lequel", "PRQ", "PRQ", "NULL", 0, "PRQ");
+    conll_add_word(s, w2, i+1, dep1);
+  }
+}
+/* "desquels": rewrite as "de" and add "lesquels" (lemma "lequel"). */
+void traite_amalgame_desquels(conll_sentence *s, int i)
+{
+  conll_word *w, *w2, *dep1;
+  w = s->words[i];
+  if((w->mother) && (w->daughters_nb > 0)){
+    dep1 = w->daughters[0];
+    strcpy(w->form, "de");
+    strcpy(w->lemma, "de");
+    w2 = conll_allocate_word(i, "lesquels", "lequel", "PRQ", "PRQ", "NULL", 0, "PRQ");
+    conll_add_word(s, w2, i+1, dep1);
+  }
+}
+
+void 
traite_amalgame_desquelles(conll_sentence *s, int i) /* "desquelles": rewrite as "de" and add "lesquelles" (lemma "lequel") */
+{
+  conll_word *w, *w2, *dep1;
+  w = s->words[i];
+  if((w->mother) && (w->daughters_nb > 0)){
+    dep1 = w->daughters[0];
+    strcpy(w->form, "de");
+    strcpy(w->lemma, "de");
+    w2 = conll_allocate_word(i, "lesquelles", "lequel", "PRQ", "PRQ", "NULL", 0, "PRQ");
+    conll_add_word(s, w2, i+1, dep1);
+  }
+}
+/* Dispatch every word whose form is exactly one of the contracted forms below to its traite_amalgame_* handler. */
+void orfeo_traite_amalgames(conll_sentence *s)
+{
+  int i;
+  conll_word *w;
+
+  for(i=1; i<s->l; i++){ /* s->l is re-read on every iteration - presumably it grows when a handler adds a word, confirm in conll_add_word */
+    /*    printf("************ l = %d\n", s->l);
+	  printf("************ i = %d form = %s\n", i, w->form);*/
+    w = s->words[i];
+    if(!strcmp(w->form, "du")) traite_amalgame_du(s, i);
+    else if(!strcmp(w->form, "des")) traite_amalgame_des(s, i);
+    else if(!strcmp(w->form, "au")) traite_amalgame_au(s, i);
+    else if(!strcmp(w->form, "aux")) traite_amalgame_aux(s, i);
+    else if(!strcmp(w->form, "auquel")) traite_amalgame_auquel(s, i);
+    else if(!strcmp(w->form, "auxquels")) traite_amalgame_auxquels(s, i);
+    else if(!strcmp(w->form, "auxquelles")) traite_amalgame_auxquelles(s, i);
+    else if(!strcmp(w->form, "duquel")) traite_amalgame_duquel(s, i);
+    else if(!strcmp(w->form, "desquels")) traite_amalgame_desquels(s, i);
+    else if(!strcmp(w->form, "desquelles")) traite_amalgame_desquelles(s, i);
+
+
+  }
+}
+
+
+
+
+
+
+
+
+
+
diff --git a/maca_tools/CMakeLists.txt b/maca_tools/CMakeLists.txt index bbf7680..df0ee2e 100644 --- a/maca_tools/CMakeLists.txt +++ b/maca_tools/CMakeLists.txt @@ -5,14 +5,18 @@ target_link_libraries(scenes_roots2fann maca_common) install (TARGETS scenes_roots2fann DESTINATION bin) add_executable(mcf2conll ./src/mcf2conll.c) -target_link_libraries(mcf2conll perceptron) -target_link_libraries(mcf2conll transparse) target_link_libraries(mcf2conll maca_common) install (TARGETS mcf2conll DESTINATION bin) +add_executable(conll2mcf ./src/conll2mcf.c) +target_link_libraries(conll2mcf maca_common) +install (TARGETS conll2mcf DESTINATION bin) + +add_executable(conllu2mcf ./src/conllu2mcf.c) 
+target_link_libraries(conllu2mcf maca_common) +install (TARGETS conllu2mcf DESTINATION bin) + add_executable(mcf2orfeo ./src/mcf2orfeo.c) -target_link_libraries(mcf2orfeo perceptron) -target_link_libraries(mcf2orfeo transparse) target_link_libraries(mcf2orfeo maca_common) install (TARGETS mcf2orfeo DESTINATION bin) diff --git a/maca_tools/src/conll2mcf.c b/maca_tools/src/conll2mcf.c new file mode 100644 index 0000000..17c13a5 --- /dev/null +++ b/maca_tools/src/conll2mcf.c @@ -0,0 +1,158 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<strings.h> +#include<math.h> +#include<getopt.h> +#include"conll_lib.h" +#include"hash_str.h" + +#define NB_COL 7 + +typedef struct options +{ + FILE * fd_parses; // parser output + int verbose_level; + int snum; + char *filename; + char columns[NB_COL]; +} options; + +/*---------------------------------------------------------------------------------*/ + +options op; + +void print_options(options *op) +{ + fprintf(stderr, "file name = %s\n", op->filename); + fprintf(stderr, "verbose level = %d\n", op->verbose_level); + fprintf(stderr, "maximum number of sentences to process = %d\n", op->snum); +} + +void reset_options(options * op) +{ + int i; + op->filename = NULL; + op->fd_parses = stdin; + op->verbose_level = 0; + op->snum = 100000000; + for(i=0; i < NB_COL; i++) + op->columns[i] = '0'; +} + +/*---------------------------------------------------------------------------------*/ +void print_help_message(char *program_name) +{ + fprintf(stderr, "%s usage: %s [options]\n", program_name, program_name); + fprintf(stderr, "OPTIONS :\n"); + fprintf(stderr, " -f <file> : hypothesis conll file\n"); + fprintf(stderr, " -n <int> : process n sentences (default is 100 000 000)\n"); + fprintf(stderr, " -v 1|2|3 : verbosity level\n"); + fprintf(stderr, " -h : print this message\n"); + + fprintf(stderr, " -1 : content of column 1 in the mcf file produced\n"); + fprintf(stderr, " -2 : content of column 2 in the mcf file 
produced\n");
+  fprintf(stderr, " -3 : content of column 3 in the mcf file produced\n");
+  fprintf(stderr, " -4 : content of column 4 in the mcf file produced\n");
+  fprintf(stderr, " -5 : content of column 5 in the mcf file produced\n");
+  fprintf(stderr, " -6 : content of column 6 in the mcf file produced\n");
+  fprintf(stderr, " -7 : content of column 7 in the mcf file produced\n");
+  fprintf(stderr, " : values of options -1 to -7 must be one of\n");
+  fprintf(stderr, " : I for id\n");
+  fprintf(stderr, " : W for form\n");
+  fprintf(stderr, " : L for lemma\n");
+  fprintf(stderr, " : C for coarse part of speech\n");
+  fprintf(stderr, " : P for part of speech\n");
+  fprintf(stderr, " : F for features\n");
+  fprintf(stderr, " : H for head\n");
+  fprintf(stderr, " : D for deprel\n");
+
+}
+
+/* Fill *op from the command line. reset_options() is called first; then
+ * -f opens the input file (exit(1) on failure), -1 to -7 store the first
+ * character of their argument in op->columns[0..6], -n and -v are converted
+ * with atoi(), -h prints the help and exits. The other options accepted by
+ * the getopt string (IWLCPFHD, -8, -9) have no case and are ignored. */
+void parse_options(int argc, char *argv[], options * op)
+{
+  int c; /* int, not char: getopt() returns an int, and -1 never compares equal to an unsigned char */
+
+  reset_options(op);
+  /*
+  if(argc ==1){
+    print_help_message(argv[0]);
+    exit(1);
+  }*/
+
+  while ((c = getopt (argc, argv, "hIWLCPFHDf:n:v:1:2:3:4:5:6:7:8:9:")) != -1)
+    switch (c)
+      {
+      case 'h':
+        print_help_message(argv[0]);
+        exit(0);
+      case 'f':
+        op->filename = strdup(optarg);
+        if((op->fd_parses = fopen(op->filename, "r")) == NULL){
+          fprintf(stderr, "cannot open hypothesis file %s : aborting\n", op->filename);
+          exit(1);
+        }
+        break;
+      case '1':
+        op->columns[0] = optarg[0];
+        break;
+      case '2':
+        op->columns[1] = optarg[0];
+        break;
+      case '3':
+        op->columns[2] = optarg[0];
+        break;
+      case '4':
+        op->columns[3] = optarg[0];
+        break;
+      case '5':
+        op->columns[4] = optarg[0];
+        break;
+      case '6':
+        op->columns[5] = optarg[0];
+        break;
+      case '7':
+        op->columns[6] = optarg[0];
+        break;
+      case 'n':
+        op->snum = atoi(optarg);
+        break;
+      case 'v':
+        op->verbose_level = atoi(optarg);
+        break;
+      }
+
+  /*  if (op->fd_parses == NULL){
+    fprintf(stderr, "error : cannot open parse file: aborting\n");
+    exit(1);
+  }*/
+}
+
+/* Read sentences from op.fd_parses and print each one with
+ * conll_print_sentence_mcf3(), stopping after op.snum sentences. */
+int main(int argc, char *argv[])
+{
+  conll_sentence *s = conll_allocate_sentence();
+  int snum = 0;
+  int res;
+  parse_options(argc, argv, &op);
+
+  print_options(&op);
+
+  /* the loop ends when conll_load_sentence() returns 0 or once op.snum sentences were printed */
+  for(res = conll_load_sentence(op.fd_parses, s); res && (snum < op.snum); res = conll_load_sentence(op.fd_parses, s)){
+    s->num = snum;
+    snum++;
+    conll_compute_relative_index_of_heads(s);
+    conll_print_sentence_mcf3(s, op.columns, NB_COL);
+  }
+  if(op.filename) /* a file name is only set by -f, so stdin is never closed here */
+    fclose(op.fd_parses);
+  conll_free_sentence(s);
+  return 0;
+}
diff --git a/maca_tools/src/conllu2mcf.c b/maca_tools/src/conllu2mcf.c
new file mode 100644
index 0000000..faabdb0
--- /dev/null
+++ b/maca_tools/src/conllu2mcf.c
@@ -0,0 +1,159 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<strings.h>
+#include<math.h>
+#include<getopt.h>
+#include"conll_lib.h"
+#include"hash_str.h"
+
+#define NB_COL 7
+/* Command-line options of the program; NB_COL is the number of mcf columns that can be selected. */
+typedef struct options
+{
+  FILE * fd_parses;     // input stream: file opened by -f, stdin by default
+  int verbose_level;    // value given with -v
+  int snum;             // maximum number of sentences to process (-n)
+  char *filename;       // name given with -f, NULL when reading stdin
+  char columns[NB_COL]; // one character per output column, set by -1 to -7
+} options;
+
+/*---------------------------------------------------------------------------------*/
+/* Options of the running program, filled by parse_options() from main(). */
+options op;
+/* Print the file name, verbosity level and sentence limit on stderr. */
+void print_options(options *op)
+{
+  fprintf(stderr, "file name = %s\n", op->filename ? op->filename : "stdin"); /* filename is NULL without -f; NULL must not reach %s */
+  fprintf(stderr, "verbose level = %d\n", op->verbose_level);
+  fprintf(stderr, "maximum number of sentences to process = %d\n", op->snum);
+}
+/* Set the defaults: no file name, read stdin, verbosity 0, 100000000 sentences, every column '0'. */
+void reset_options(options * op)
+{
+  int i;
+  op->filename = NULL;
+  op->fd_parses = stdin;
+  op->verbose_level = 0;
+  op->snum = 100000000;
+  for(i=0; i < NB_COL; i++)
+    op->columns[i] = '0';
+}
+
+/* Print the usage message on stderr; program_name is inserted in the first line. */
+void print_help_message(char *program_name)
+{
+  fprintf(stderr, "%s usage: %s [options]\n", program_name, program_name);
+  fprintf(stderr, "OPTIONS :\n");
+  fprintf(stderr, " -f <file> : hypothesis conll file\n");
+  fprintf(stderr, " -n <int> : process n sentences (default is 100 000 000)\n");
+  fprintf(stderr, " -v 1|2|3 : verbosity level\n");
+  fprintf(stderr, " -h : print this message\n");
+
+  fprintf(stderr, " -1 : content of column 1 in the mcf file produced\n");
+  fprintf(stderr, " -2 : content of column 2 in the mcf file produced\n");
+  fprintf(stderr, " -3 : content of column 3 in the mcf file produced\n");
+  fprintf(stderr, " -4 : content of column 4 in the mcf file produced\n");
+  fprintf(stderr, " -5 : content of column 5 in the mcf file produced\n");
+  fprintf(stderr, " -6 : content of column 6 in the mcf file produced\n");
+  fprintf(stderr, " -7 : content of column 7 in the mcf file produced\n");
+  fprintf(stderr, " : values of options -1 to -7 must be one of\n");
+  fprintf(stderr, " : I for id\n");
+  fprintf(stderr, " : W for form\n");
+  fprintf(stderr, " : L for lemma\n");
+  fprintf(stderr, " : C for coarse part of speech\n");
+  fprintf(stderr, " : P for part of speech\n");
+  fprintf(stderr, " : F for features\n");
+  fprintf(stderr, " : H for head\n");
+  fprintf(stderr, " : D for deprel\n");
+  fprintf(stderr, " : G for language\n");
+
+}
+
+/* Fill *op from the command line. reset_options() is called first; then
+ * -f opens the input file (exit(1) on failure), -1 to -7 store the first
+ * character of their argument in op->columns[0..6], -n and -v are converted
+ * with atoi(), -h prints the help and exits. The other options accepted by
+ * the getopt string (IWLCPFHD, -8, -9) have no case and are ignored. */
+void parse_options(int argc, char *argv[], options * op)
+{
+  int c; /* int, not char: getopt() returns an int, and -1 never compares equal to an unsigned char */
+
+  reset_options(op);
+  /*
+  if(argc ==1){
+    print_help_message(argv[0]);
+    exit(1);
+  }*/
+
+  while ((c = getopt (argc, argv, "hIWLCPFHDf:n:v:1:2:3:4:5:6:7:8:9:")) != -1)
+    switch (c)
+      {
+      case 'h':
+        print_help_message(argv[0]);
+        exit(0);
+      case 'f':
+        op->filename = strdup(optarg);
+        if((op->fd_parses = fopen(op->filename, "r")) == NULL){
+          fprintf(stderr, "cannot open hypothesis file %s : aborting\n", op->filename);
+          exit(1);
+        }
+        break;
+      case '1':
+        op->columns[0] = optarg[0];
+        break;
+      case '2':
+        op->columns[1] = optarg[0];
+        break;
+      case '3':
+        op->columns[2] = optarg[0];
+        break;
+      case '4':
+        op->columns[3] = optarg[0];
+        break;
+      case '5':
+        op->columns[4] = optarg[0];
+        break;
+      case '6':
+        op->columns[5] = optarg[0];
+        break;
+      case '7':
+        op->columns[6] = optarg[0];
+        break;
+      case 'n':
+        op->snum = atoi(optarg);
+        break;
+      case 'v':
+        op->verbose_level = atoi(optarg);
+        break;
+      }
+
+  /*  if (op->fd_parses == NULL){
+    fprintf(stderr, "error : cannot open parse file: aborting\n");
+    exit(1);
+  }*/
+}
+
+/* Read sentences from op.fd_parses and print each one with
+ * conll_print_sentence_mcf3(), stopping after op.snum sentences. */
+int main(int argc, char *argv[])
+{
+  conll_sentence *s = conll_allocate_sentence();
+  int snum = 0;
+  int res;
+  parse_options(argc, argv, &op);
+
+  print_options(&op);
+
+  /* the loop ends when conll_load_sentence() returns 0 or once op.snum sentences were printed */
+  for(res = conll_load_sentence(op.fd_parses, s); res && (snum < op.snum); res = conll_load_sentence(op.fd_parses, s)){
+    s->num = snum;
+    snum++;
+    conll_compute_relative_index_of_heads(s);
+    conll_print_sentence_mcf3(s, op.columns, NB_COL);
+  }
+  if(op.filename) /* a file name is only set by -f, so stdin is never closed here */
+    fclose(op.fd_parses);
+  conll_free_sentence(s);
+  return 0;
+}
diff --git a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c
index f4646dd..a36bc54 100644
--- a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c
+++ b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c
@@ -9,6 +9,7 @@ #include"config2feat_vec.h" #include"feature_table.h" #include"dico.h" +#include"word.h" void print_word_buffer_old(config *c, dico *dico_labels, mcd *mcd_struct) {
@@ -150,7 +151,9 @@ void simple_decoder_parser_arc_eager(context *ctx) if((word_get_sent_seg(stack_top(config_get_stack(c))) == 1) && (mvt_get_type(mvt_stack_top(config_get_history(c))) != MVT_PARSER_EOS)){ word_set_sent_seg(stack_top(config_get_stack(c)), -1); + word_set_gov(stack_top(config_get_stack(c)), 0); movement_parser_eos(c); + while(movement_parser_reduce(c)); while(movement_parser_root(c, root_label));
if(ctx->debug_mode) printf("force EOS\n"); -- GitLab