Commit 1c19c11d authored by Alexis Nasr's avatar Alexis Nasr
Browse files

added several tools to manipulate corpora in maca_corpora as well as conll2mcf

parent 4b6343c8
......@@ -32,6 +32,7 @@ endif()
include_directories(maca_common/include)
include_directories(perceptron/lib/include)
include_directories(maca_corpora/lib/include)
add_subdirectory(maca_common)
add_subdirectory(maca_tools)
......@@ -42,6 +43,7 @@ add_subdirectory(maca_tokenizer)
add_subdirectory(maca_lexer)
add_subdirectory(maca_trans_parser)
add_subdirectory(maca_crf_tagger)
add_subdirectory(maca_corpora)
#add_subdirectory(maca_graph_parser)
if(MACA_EXPORT)
......
set(SOURCES src/util.c
set(SOURCES
src/util.c
src/hash.c
src/hash_str.c
src/dico.c
src/word_emb.c
src/mcd.c
......@@ -17,6 +19,7 @@ set(SOURCES src/util.c
src/fplm.c
src/json_parser.c
src/json_tree.c
src/conll_lib.c
)
#compiling library
......
/*******************************************************************************
Copyright (C) 2010 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
and Joseph Le Roux <joseph.le.roux@gmail.com>
conll_lib is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
conll_lib is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with conll_lib. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
#ifndef __CONLL_LIB__
#define __CONLL_LIB__
#include <stdio.h>
#include "hash_str.h"
/* Capacity limits shared by the CoNLL structures and the reader. */
#define MAX_WORDS_IN_SENTENCE 1000 /* number of slots in the words[] and daughters[] arrays */
#define MAX_STR 10000              /* size in bytes of every string field of a conll_word */
#define MAX_LINE_LENGTH 50000      /* size in bytes of the buffer holding one input line */

/* Sentinel values. The negative ones are parenthesized so that they expand
   safely inside any expression. */
#define INCORRECT_SENTENCE_NUM_VALUE (-1) /* stored in the sentence number when a sentence is allocated */
#define INCORRECT_PARSE_NUM_VALUE (-1)
#define INCORRECT_LOGPROB_VALUE 10 /* NOTE(review): presumably chosen because a log probability is never positive -- confirm */
#define INCORRECT_ORACLE_VALUE (-1)
#define INCORRECT_CONF_MEAS (-1)
#define INCORRECT_LEX_AFF (-1)
/* One token of a CoNLL sentence, together with its links in the dependency tree. */
typedef struct w
{
unsigned id; /* Token counter, starting at 1 for each new sentence. */
char form[MAX_STR]; /* Word form or punctuation symbol. */
char lemma[MAX_STR]; /* Lemma or stem (depending on the data set) of the word form, */
/* or an underscore if not available. */
char cpostag[MAX_STR];/* Coarse-grained part-of-speech tag; the tagset depends on the language. */
char postag[MAX_STR]; /* Fine-grained part-of-speech tag; the tagset depends on the language, */
/* or identical to the coarse-grained tag if not available. */
char feats[MAX_STR]; /* Unordered set of syntactic and/or morphological features (language dependent), */
/* separated by a vertical bar (|), or an underscore if not available. */
int head; /* Head of the current token: a value of ID, or zero for a token attached to the root. */
/* conll_compute_relative_index_of_heads() rewrites non-zero values as an offset from the token's position. */
char deprel[MAX_STR]; /* Dependency relation to the HEAD; the set of relations depends on the language. */
/* Depending on the original treebank annotation, the relation may be meaningful or simply 'ROOT'. */
unsigned phead;/* Projective head of the current token: a value of ID or zero. */
/* (In the CoNLL column this may be an underscore when not available.) */
/* The structure given by the PHEAD column is guaranteed to be projective, */
/* whereas the structure given by the HEAD column is non-projective for some sentences. */
char pdeprel[MAX_STR]; /* Dependency relation to the PHEAD, or an underscore if not available. */
/* Depending on the original treebank annotation, the relation may be meaningful or simply 'ROOT'. */
char language[MAX_STR]; /* Language identifier (read from an 11th column by the line reader). */
double score; /* Score of the dependency; not part of the 2007 CoNLL format. */
double lex_aff; /* Lexical affinity of the dependent and the governor; not part of the 2007 CoNLL format. */
struct w * mother; /* Governor of this token in the tree, or NULL; set by conll_add_daughter(). */
struct w * daughters[MAX_WORDS_IN_SENTENCE]; /* Dependents of this token, in the order they were attached. */
unsigned daughters_nb; /* Number of used entries in daughters[]. */
double conf_meas; /* NOTE(review): presumably a confidence measure -- not set anywhere in the reader; confirm. */
} conll_word;
/* A sentence: a fixed-capacity array of words plus a pointer to its root word. */
typedef struct
{
conll_word * root; /* set by the line reader to a word whose head is 0 */
conll_word * words[MAX_WORDS_IN_SENTENCE]; /* words[0] is the "ROOT" placeholder created by conll_reset_sentence() */
unsigned l; /* sentence length: number of used entries in words[], placeholder included */
unsigned num; /* sentence number; INCORRECT_SENTENCE_NUM_VALUE right after allocation */
} conll_sentence;
/* Allocates an empty sentence (length 0, all word slots NULL); exits the program on allocation failure. */
conll_sentence *conll_allocate_sentence(void);
/* Sets every word id to its index in the sentence and every head to the id of its mother (0 when it has none). */
void conll_renumber_sentence(conll_sentence *s);
/* Frees the words of s and leaves it holding only a fresh "ROOT" placeholder at index 0. */
void conll_reset_sentence(conll_sentence *s);
/* Frees the sentence structure itself; the words it references are not freed. */
void conll_free_sentence(conll_sentence *s);
/* Resets s and reads lines from f until a line ends the sentence, then links words to their heads. */
/* Returns 0 when f is already at end of file, 1 otherwise. */
int conll_load_sentence(FILE *f, conll_sentence *s);
/* Prints s on stdout, one word per line in 10 tab-separated columns, followed by an empty line. */
void conll_print_sentence(conll_sentence *s);
/* Prints form, tag (coarse if coarse_pos is non-zero, fine otherwise), lemma, relative head, deprel and an end-of-sentence flag. */
void conll_print_sentence_mcf(conll_sentence *s, int coarse_pos);
/* Prints the columns whose print_* flag is non-zero, followed by an end-of-sentence flag. */
void conll_print_sentence_mcf2(conll_sentence *s, int print_id, int print_form, int print_lemma, int print_cpostag, int print_postag, int print_feats, int print_head, int print_deprel);
/* Prints the columns named by the nb_col letters of `columns` (I W L C P F H D G), followed by an end-of-sentence flag. */
void conll_print_sentence_mcf3(conll_sentence *s, char *columns, int nb_col);
/* Removes the NULL slots from the word array and shrinks the sentence length accordingly. */
void conll_compact_sentence(conll_sentence *s);
/* Allocates a word holding copies of the given fields, with no mother and no daughters. */
conll_word *conll_allocate_word(unsigned id, char *form, char *lemma, char *cpostag, char *postag, char *feats, unsigned head, char *deprel);
/* Allocates a new word from the id, form, lemma, tags, feats, head and deprel of w; tree links are not copied. */
conll_word *conll_copy_word(conll_word *w);
/* Makes `mother` the mother of `daughter` and appends `daughter` to its daughters; a NULL mother only clears the link. */
void conll_add_daughter(conll_word *daughter, conll_word *mother);
/* Removes word i of s from the daughters list of its mother. */
void conll_remove_daughter(conll_sentence *s, int i);
/* Frees word i of s and, recursively, every word whose mother it is; freed slots become NULL. */
void conll_remove_word_rec(conll_sentence *s, int i);
/* conll_remove_word_rec() on `root`, followed by conll_compact_sentence(). */
void conll_remove_subtree(conll_sentence *s, int root);
/* NOTE(review): the definitions of the functions below are not visible in this excerpt; */
/* see conll_lib.c for their exact behavior. */
void conll_add_word(conll_sentence *s, conll_word *w, int pos, conll_word *gov);
void conll_split_node_in_two(conll_sentence *s, int pos, conll_word *gov, conll_word *dep, int pos_gov, int pos_dep);
void conll_change_pos(conll_sentence *s, hash_str *h_pos);
void conll_change_cpos(conll_sentence *s, hash_str *h_cpos);
void conll_change_fct(conll_sentence *s, hash_str *h_fct);
int conll_is_num(char *s);
void conll_renumber_sentence_offset(conll_sentence *s, int offset);
/* Replaces every non-zero head of words 1..l-1 by head minus the word's index. */
void conll_compute_relative_index_of_heads(conll_sentence *s);
#endif
#ifndef __HASH_STR__
#define __HASH_STR__
/* NOTE(review): presumably the value returned by hash_str_get_val() for a missing key -- confirm in hash_str.c. */
#define HASH_STR_INVALID_VAL NULL
/* One key/value pair of strings; cells are chained through `next`. */
typedef struct _hash_str_cell
{
char *key;
char *val;
struct _hash_str_cell *next; /* next cell of the chain */
} hash_str_cell;
/* String-to-string table built from hash_str_cell chains. */
/* NOTE(review): field meanings below are inferred from their names -- confirm in hash_str.c. */
typedef struct
{
int size; /* presumably the number of entries of `array` */
int nbelem; /* presumably the number of stored key/value pairs */
hash_str_cell **array; /* array of pointers to cells (presumably one chain per slot) */
} hash_str;
/* Interface of the string hash table. */
/* NOTE(review): the implementation is not visible in this excerpt; ownership of key/val */
/* strings and the behavior on duplicate or missing keys must be checked in hash_str.c. */
hash_str_cell *hash_str_cell_new(char *key, char *val, hash_str_cell *next);
void hash_str_cell_free(hash_str_cell *c);
hash_str *hash_str_new(int size);
void hash_str_free(hash_str *h);
hash_str_cell *hash_str_lookup(hash_str *h, char *key);
char *hash_str_get_val(hash_str *h, char *key);
void hash_str_add(hash_str *h, char *key, char *val);
void hash_str_stats(hash_str *h);
#endif
/*******************************************************************************
Copyright (C) 2010 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
and Joseph Le Roux <joseph.le.roux@gmail.com>
conll_lib is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
conll_lib is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with conll_lib. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"conll_lib.h"
/* Forward declaration: conll_parse_line() is defined further down in this file and is not declared in conll_lib.h. */
int conll_parse_line(FILE *f, conll_sentence *s);
/* Turn every absolute head index into an offset relative to the position of
   the word itself. Words whose head is 0 are left untouched, and the
   placeholder stored at index 0 is never visited. */
void conll_compute_relative_index_of_heads(conll_sentence *s)
{
  unsigned pos;

  for (pos = 1; pos < s->l; pos++) {
    conll_word *tok = s->words[pos];

    if (tok->head == 0)
      continue; /* roots keep 0 as index of head */
    tok->head -= (int)pos;
  }
}
/* Make ids and heads consistent with the current word order: each word gets
   its index as id, then each head becomes the id of the word's mother
   (0 for a word without mother). */
void conll_renumber_sentence(conll_sentence *s)
{
  unsigned k;

  /* first pass: ids follow positions */
  for (k = 0; k < s->l; k++)
    s->words[k]->id = k;

  /* second pass: heads follow the (new) ids of the mothers */
  for (k = 0; k < s->l; k++) {
    conll_word *cur = s->words[k];

    cur->head = cur->mother ? cur->mother->id : 0;
  }
}
/* Empty a sentence so that it can be filled again.
   Every word currently stored in s is freed and its slot set to NULL, the
   root pointer is cleared (it would otherwise keep pointing at a freed word),
   and a fresh "ROOT" placeholder with head -1 is installed at index 0.
   On return the sentence length is 1. */
void conll_reset_sentence(conll_sentence *s)
{
  unsigned slot;

  for (slot = 0; slot < s->l; slot++) {
    free(s->words[slot]); /* free(NULL) is a no-op */
    s->words[slot] = NULL;
  }
  s->root = NULL;
  s->words[0] = conll_allocate_word(0, "ROOT", "ROOT", "ROOT", "ROOT", "ROOT", -1, "ROOT");
  s->l = 1;
}
/* Release the sentence structure itself. A NULL argument is accepted.
   The words referenced by s->words[] are NOT freed: the per-word free was
   disabled in the original code and that behavior is kept.
   NOTE(review): unless the caller releases the words some other way they
   leak -- confirm the intended ownership before freeing them here. */
void conll_free_sentence(conll_sentence *s)
{
  free(s); /* free(NULL) is a no-op */
}
/* Build a new word from the id, form, lemma, both tags, feats, head and
   deprel of w. Nothing else is copied: the new word is created by
   conll_allocate_word() and therefore has no mother and no daughters. */
conll_word *conll_copy_word(conll_word *w)
{
  conll_word *dup;

  dup = conll_allocate_word(w->id,
                            w->form,
                            w->lemma,
                            w->cpostag,
                            w->postag,
                            w->feats,
                            w->head,
                            w->deprel);
  return dup;
}
conll_word *conll_allocate_word(unsigned id, char *form, char *lemma, char *cpostag, char *postag, char *feats, unsigned head, char *deprel)
{
conll_word *w = (conll_word *)malloc(sizeof(conll_word));
w->id = id;
strcpy(w->form, form);
strcpy(w->lemma, lemma);
strcpy(w->cpostag, cpostag);
strcpy(w->postag, postag);
strcpy(w->feats, feats);
w->head = head;
strcpy(w->deprel, deprel);
w->mother = NULL;
w->daughters_nb = 0;
return w;
}
conll_sentence *conll_allocate_sentence(void)
{
conll_sentence *s;
int i;
s = (conll_sentence *)malloc(sizeof(conll_sentence));
if(s == NULL){
fprintf(stderr, "cannot allocate sentence\n");
exit(1);
}
s->num = INCORRECT_SENTENCE_NUM_VALUE;
s->l = 0;
for(i=0; i < MAX_WORDS_IN_SENTENCE; i++){
s->words[i] = NULL;
}
return s;
}
/* Read the next sentence of f into s and build its tree structure.
   s is reset first, then lines are consumed until conll_parse_line()
   returns 0. Every word whose head is a valid index of the sentence is
   attached to that word; any other word ends up without mother.
   Returns 0 when f is already at end of file, 1 otherwise. Because the reset
   always leaves the "ROOT" placeholder in s, a sentence that contains only
   the placeholder is still reported with 1. */
int conll_load_sentence(FILE *f, conll_sentence *s)
{
  unsigned i;

  if (feof(f))
    return 0;

  conll_reset_sentence(s);
  while (conll_parse_line(f, s))
    ; /* one word (or one skipped line) per iteration */

  /* safeguard kept from the original code: the reset above leaves length >= 1 */
  if (s->l == 0)
    return 0;

  /* build the tree structure */
  s->words[0]->mother = NULL;
  for (i = 1; i < s->l; ++i) {
    conll_word *w = s->words[i];

    /* valid heads are 0 .. l-1; index l itself is outside the sentence */
    if (w->head >= 0 && (unsigned)w->head < s->l)
      conll_add_daughter(w, s->words[w->head]);
    else
      w->mother = NULL; /* never leave the link uninitialized */
  }
  return 1;
}
/*----------------------------------------------------------------------------*/
/* The field widths written in the sscanf format of conll_parse_line() are
   MAX_STR - 1 for the word fields; refuse to build if MAX_STR shrinks. */
#if defined(MAX_STR) && (MAX_STR < 10000)
#error "conll_parse_line: sscanf field widths assume MAX_STR >= 10000"
#endif
/* Read one line of f and, if it describes a word, append that word to s.
   Expected line layout (tab separated), e.g.
     3  storm  storm  _  NN  _  4  nsubj  _  _
   i.e. id, form, lemma, cpostag, postag, feats, head, deprel, two ignored
   columns and an optional language column.
   Returns 0 when the sentence is over (end of file, read error, empty line
   or comment line) and 1 when reading should continue.
   Lines whose id field contains '-' (amalgams) or '.' (ellipsis) are skipped,
   as are words beyond MAX_WORDS_IN_SENTENCE.
   Fields missing from a malformed line keep their defaults: "_" for strings,
   the position in the sentence for id, -1 for head. A head column equal to
   "_" also leaves head at -1. A word whose head is 0 becomes s->root.
   Exits the program with a message if memory cannot be obtained. */
int conll_parse_line(FILE *f, conll_sentence *s)
{
  char buff[MAX_LINE_LENGTH];
  char head_str[100] = "_";
  char C9[100];
  char C10[100];
  conll_word *w;
  int i;

  if (feof(f))
    return 0;
  if (fgets(buff, MAX_LINE_LENGTH, f) == NULL)
    return 0;

  /* an empty line ends the sentence */
  if (buff[0] == '\n')
    return 0;

  /* specific to conll_u: comment lines */
  if (buff[0] == '#')
    return 0;

  /* specific to conll_u: ignore amalgams (1-2) and ellipsis (8.1);
     only the id field is inspected and the scan never leaves the string */
  for (i = 0; buff[i] != '\0' && buff[i] != '\t'; i++)
    if (buff[i] == '-' || buff[i] == '.')
      return 1;

  /* sentence full: drop the word but keep consuming the sentence */
  if (s->l >= MAX_WORDS_IN_SENTENCE)
    return 1;

  w = malloc(sizeof *w);
  if (w == NULL) {
    fprintf(stderr, "cannot allocate word\n");
    exit(1);
  }

  /* defaults, so that no field is left uninitialized on a short line */
  w->id = s->l;
  strcpy(w->form, "_");
  strcpy(w->lemma, "_");
  strcpy(w->cpostag, "_");
  strcpy(w->postag, "_");
  strcpy(w->feats, "_");
  strcpy(w->deprel, "_");
  strcpy(w->pdeprel, "_");
  strcpy(w->language, "_");
  w->head = -1;
  w->mother = NULL;
  w->daughters_nb = 0;

  /* every conversion carries a width one less than the size of its buffer */
  sscanf(buff,
         "%u\t%9999[^\t]\t%9999[^\t]\t%9999[^\t]\t%9999[^\t]\t%9999[^\t]\t%99[^\t]\t%9999[^\t]\t%99[^\t]\t%99[^\t]\t%9999s",
         &(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats,
         head_str, w->deprel, C9, C10, w->language);

  if (strcmp(head_str, "_") != 0) {
    w->head = atoi(head_str);
    if (w->head == 0)
      s->root = w;
  }

  s->words[s->l] = w;
  s->l++;
  return 1;
}
/* Print the words of s (placeholder at index 0 excluded) on stdout, one per
   line. Each print_* flag that is non-zero selects the matching column; the
   selected columns are tab terminated and followed by 1 on the last word of
   the sentence and 0 on every other word. Nothing is printed for a sentence
   of length 0 or 1. */
void conll_print_sentence_mcf2(conll_sentence *s, int print_id, int print_form, int print_lemma, int print_cpostag, int print_postag, int print_feats, int print_head, int print_deprel)
{
  unsigned pos;

  if (s->l < 2)
    return;

  for (pos = 1; pos < s->l; pos++) {
    const conll_word *tok = s->words[pos];

    if (print_id)      printf("%d\t", tok->id);
    if (print_form)    printf("%s\t", tok->form);
    if (print_lemma)   printf("%s\t", tok->lemma);
    if (print_cpostag) printf("%s\t", tok->cpostag);
    if (print_postag)  printf("%s\t", tok->postag);
    if (print_feats)   printf("%s\t", tok->feats);
    if (print_head)    printf("%d\t", tok->head);
    if (print_deprel)  printf("%s\t", tok->deprel);

    /* end-of-sentence flag */
    fputs((pos + 1 == s->l) ? "1\n" : "0\n", stdout);
  }
}
/* Print the words of s (placeholder at index 0 excluded) on stdout, one per
   line, with the columns requested by the first nb_col letters of `columns`:
     I id, W form, L lemma, C cpostag, P postag, F feats, H head, D deprel,
     G language.
   Any other letter prints nothing. Each column is tab terminated and the
   line ends with 1 on the last word of the sentence, 0 otherwise. Nothing is
   printed for a sentence of length 0 or 1. */
void conll_print_sentence_mcf3(conll_sentence *s, char *columns, int nb_col)
{
  unsigned pos;
  int col;

  if (s->l < 2)
    return;

  for (pos = 1; pos < s->l; pos++) {
    const conll_word *tok = s->words[pos];

    for (col = 0; col < nb_col; col++) {
      const char *text = NULL; /* stays NULL for numeric and unknown columns */

      switch (columns[col]) {
      case 'I': printf("%d\t", tok->id);   break;
      case 'H': printf("%d\t", tok->head); break;
      case 'W': text = tok->form;     break;
      case 'L': text = tok->lemma;    break;
      case 'C': text = tok->cpostag;  break;
      case 'P': text = tok->postag;   break;
      case 'F': text = tok->feats;    break;
      case 'D': text = tok->deprel;   break;
      case 'G': text = tok->language; break;
      }
      if (text != NULL)
        printf("%s\t", text);
    }

    /* end-of-sentence flag */
    fputs((pos + 1 == s->l) ? "1\n" : "0\n", stdout);
  }
}
/* Print the words of s (placeholder at index 0 excluded) on stdout, one per
   line, as six tab-separated columns:
     form, tag, lemma, relative head, deprel, end-of-sentence flag.
   The tag is cpostag when coarse_pos is non-zero and postag otherwise. The
   relative head is the id of the mother minus the id of the word, or 0 for a
   word without mother; it is computed with signed arithmetic so that a
   mother located to the left gives a proper negative int. The flag is 1 on
   the last word of the sentence and 0 elsewhere. Nothing is printed for a
   sentence of length 0 or 1. */
void conll_print_sentence_mcf(conll_sentence *s, int coarse_pos)
{
  unsigned i;

  if (s->l < 2)
    return;

  for (i = 1; i < s->l; i++) {
    const conll_word *w = s->words[i];
    const char *tag = coarse_pos ? w->cpostag : w->postag;
    int rel_head = 0;

    if (w->mother != NULL)
      rel_head = (int)w->mother->id - (int)w->id;

    fprintf(stdout, "%s\t%s\t%s\t%d\t%s\t%d\n",
            w->form, tag, w->lemma, rel_head, w->deprel,
            (i == s->l - 1) ? 1 : 0);
  }
}
/* Print the words of s (placeholder at index 0 excluded) on stdout in ten
   tab-separated columns: id, form, lemma, cpostag, postag, feats, id of the
   mother (0 without mother), deprel, and two underscores. An empty line is
   written after the last word. Nothing is printed for a sentence of length
   0 or 1. */
void conll_print_sentence(conll_sentence *s)
{
  unsigned pos;

  if (s->l < 2)
    return;

  for (pos = 1; pos < s->l; pos++) {
    const conll_word *tok = s->words[pos];
    const conll_word *gov = tok->mother;

    fprintf(stdout, "%d", tok->id);
    fprintf(stdout, "\t%s", tok->form);
    fprintf(stdout, "\t%s", tok->lemma);
    fprintf(stdout, "\t%s", tok->cpostag);
    fprintf(stdout, "\t%s", tok->postag);
    fprintf(stdout, "\t%s", tok->feats);
    if (gov != NULL)
      fprintf(stdout, "\t%d", gov->id);
    else
      fprintf(stdout, "\t0");
    fprintf(stdout, "\t%s", tok->deprel);
    fprintf(stdout, "\t_\t_\n");
  }
  printf("\n");
}
/* Squeeze the NULL slots out of the word array of s.
   The remaining words keep their relative order and the sentence length is
   reduced to their number. The compaction is done in a single pass, and the
   slots vacated at the end of the array are set to NULL so that no stale
   duplicate pointer survives beyond the new length. Ids and heads are not
   updated here. */
void conll_compact_sentence(conll_sentence *s)
{
  unsigned src;
  unsigned dst = 0;

  for (src = 0; src < s->l; src++) {
    if (s->words[src] != NULL)
      s->words[dst++] = s->words[src];
  }

  for (src = dst; src < s->l; src++)
    s->words[src] = NULL;

  s->l = dst;
}
/* Attach `daughter` below `mother`: the mother link of the daughter is set
   and the daughter is appended to the daughters of the mother. With a NULL
   mother only the link of the daughter is cleared. A NULL daughter is
   ignored. The daughter is not removed from any previous mother. */
void conll_add_daughter(conll_word *daughter, conll_word *mother)
{
  if (daughter == NULL)
    return;

  daughter->mother = mother; /* NULL simply clears the link */
  if (mother == NULL)
    return;

  mother->daughters[mother->daughters_nb] = daughter;
  mother->daughters_nb += 1;
}
/* Remove word i of s from the daughters list of its mother, closing the gap
   in that list. Nothing happens when slot i is empty or the word has no
   mother. The mother link of the word itself is left unchanged. */
void conll_remove_daughter(conll_sentence *s, int i)
{
  conll_word *child = s->words[i];
  conll_word *parent;
  unsigned at;
  unsigned shift;

  if (child == NULL)
    return;
  parent = child->mother;
  if (parent == NULL)
    return;

  for (at = 0; at < parent->daughters_nb; at++) {
    if (parent->daughters[at] != child)
      continue;
    /* slide the following daughters one slot to the left */
    for (shift = at; shift + 1 < parent->daughters_nb; shift++)
      parent->daughters[shift] = parent->daughters[shift + 1];
    parent->daughters_nb--;
  }
}
/* Free word i of s together with every word that has it as mother
   (recursively). The freed slots are set to NULL; call
   conll_compact_sentence() afterwards to close the gaps.
   An index outside the sentence or an already empty slot is ignored (an
   empty slot used to make the function delete every word without mother).
   The word is detached and its slot cleared before its daughters are
   visited, so a malformed structure in which a word is its own ancestor
   cannot cause endless recursion. */
void conll_remove_word_rec(conll_sentence *s, int i)
{
  conll_word *w;
  unsigned j;

  if (i < 0 || (unsigned)i >= s->l)
    return;
  w = s->words[i];
  if (w == NULL)
    return;

  /* detach from the mother while the slot still refers to the word */
  conll_remove_daughter(s, i);
  s->words[i] = NULL;

  /* w is only used as an address here; it is freed after its daughters
     because removing them updates its daughters list */
  for (j = 1; j < s->l; j++) {
    if ((s->words[j] != NULL) && (s->words[j]->mother == w))
      conll_remove_word_rec(s, (int)j);
  }

  free(w);
}
/* Delete the word at index `root` together with all the words below it,
   then close the gaps left in the word array. Ids and heads of the
   remaining words are not renumbered here. */
void conll_remove_subtree(conll_sentence *s, int root)
{
  /* step 1: free the words, leaving NULL slots behind */
  conll_remove_word_rec(s, root);

  /* step 2: squeeze the NULL slots out and shrink the sentence length */
  conll_compact_sentence(s);
}
void conll_add_word(conll_sentence *s, conll_word *w, int index, conll_word *gov)
{
int i;