Select Git revision
-
thomas.blanc.2@etu.univ-amu.fr authoredthomas.blanc.2@etu.univ-amu.fr authored
conll_lib.c 10.97 KiB
/*******************************************************************************
Copyright (C) 2010 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
and Joseph Le Roux <joseph.le.roux@gmail.com>
conll_lib is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
conll_lib is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with conll_lib. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"conll_lib.h"
/* Forward declaration: parse_line() is defined further down in this file but
   is called by load_sentence() before its definition. */
int parse_line(FILE *f, sentence *s);
/* Renumber the words of s so that each id equals the word's position in
   s->words, then recompute every head from the (new) id of the word's mother.
   A word without a mother gets head 0. */
void renumber_sentence(sentence *s)
{
  int pos;

  /* All ids must be final before any head is derived from them. */
  for (pos = 0; pos < s->l; pos++)
    s->words[pos]->id = pos;

  for (pos = 0; pos < s->l; pos++) {
    word *cur = s->words[pos];

    if (cur->mother == NULL)
      cur->head = 0;
    else
      cur->head = cur->mother->id;
  }
}
/* Empty the sentence: free every word stored in slots [0, s->l), then install
   a fresh "ROOT" pseudo-word in slot 0 and set the length to 1. */
void reset_sentence(sentence *s)
{
  int k = 0;

  while (k < s->l) {
    if (s->words[k] != NULL) {
      free(s->words[k]);
      s->words[k] = NULL;
    }
    k++;
  }

  /* Slot 0 always holds the artificial root; its head is passed as -1. */
  s->words[0] = allocate_word(0, "ROOT", "ROOT", "ROOT", "ROOT", "ROOT", -1, "ROOT");
  s->l = 1;
}
/* Release the sentence container.
   Only the sentence structure itself is freed; the words it points to are
   NOT released here (the per-word free was already disabled in the previous
   version of this function).
   NOTE(review): unless the caller releases the words some other way they are
   leaked -- confirm who owns them before re-enabling a per-word free.
   Passing NULL is allowed and does nothing. */
void free_sentence(sentence *s)
{
  /* The former loop over s->words had an empty body; it only dereferenced s,
     which crashed on a NULL sentence. free(NULL) is a no-op. */
  free(s);
}
/* Return a newly allocated word carrying the same id, strings and head as w.
   Tree links are not copied: allocate_word() gives the copy no mother and no
   daughters. */
word *copy_word(word *w)
{
  word *dup = allocate_word(w->id,
                            w->form, w->lemma, w->cpostag, w->postag, w->feats,
                            w->head, w->deprel);

  return dup;
}
/* Allocate a word and fill it from the given values.
   id, head : numeric identifier and head index stored as-is.
   form, lemma, cpostag, postag, feats, deprel : NUL-terminated strings copied
     into the word's own buffers; a string longer than its buffer is truncated
     instead of overflowing it.
   The new word has no mother and no daughters.
   Returns the new word; prints a message and exits if memory is exhausted
   (same policy as allocate_sentence()). */
word *allocate_word(unsigned id, char *form, char *lemma, char *cpostag, char *postag, char *feats, unsigned head, char *deprel)
{
  word *w = malloc(sizeof *w);

  if (w == NULL) {
    fprintf(stderr, "cannot allocate word\n");
    exit(1);
  }

  w->id = id;
  /* The string members are buffers embedded in the word (they are written to
     right after malloc with no separate allocation), so sizeof gives their
     capacity and snprintf always NUL-terminates within it. */
  snprintf(w->form, sizeof w->form, "%s", form);
  snprintf(w->lemma, sizeof w->lemma, "%s", lemma);
  snprintf(w->cpostag, sizeof w->cpostag, "%s", cpostag);
  snprintf(w->postag, sizeof w->postag, "%s", postag);
  snprintf(w->feats, sizeof w->feats, "%s", feats);
  w->head = head;
  snprintf(w->deprel, sizeof w->deprel, "%s", deprel);
  w->mother = NULL;
  w->daughters_nb = 0;
  return w;
}
/* Allocate an empty sentence: length 0, every word slot NULL and the sentence
   number set to INCORRECT_SENTENCE_NUM_VALUE.
   Prints a message and exits if the allocation fails. */
sentence *allocate_sentence(void)
{
  int slot;
  sentence *fresh = malloc(sizeof(sentence));

  if (!fresh) {
    fprintf(stderr, "cannot allocate sentence\n");
    exit(1);
  }

  fresh->num = INCORRECT_SENTENCE_NUM_VALUE;
  fresh->l = 0;
  slot = 0;
  while (slot < MAX_WORDS_IN_SENTENCE) {
    fresh->words[slot] = NULL;
    slot++;
  }
  return fresh;
}
/* Read the next sentence from f into s.
   s is reset first, then lines are consumed with parse_line() until it
   returns 0 (empty line, read failure or end of file). Finally every word is
   attached to the word designated by its head index.
   Returns 0 when f is already at end of file, 1 otherwise. */
int load_sentence(FILE *f, sentence *s)
{
  int i;

  if(feof(f)) return 0;
  reset_sentence(s);

  while(parse_line(f, s))
    ;

  /* Kept from the previous version; reset_sentence() leaves the length at 1,
     so as written this cannot trigger. */
  if(s->l == 0) return 0;

  /* build the tree structure */
  s->words[0]->mother = NULL;
  for(i=1; i < s->l; ++i){
    word *w = s->words[i];

    /* A word whose head is out of range stays explicitly motherless rather
       than keeping whatever value the pointer happened to hold. */
    w->mother = NULL;
    /* Valid heads index words[0 .. l-1]; the previous test used <= and could
       therefore read the slot just past the last word. */
    if((w->head >= 0) && (w->head < s->l)){
      add_daughter(w, s->words[w->head]);
    }
  }
  return 1;
}
/*----------------------------------------------------------------------------*/
/* Read one line from f and, if it describes a word, append that word to s.
   Expected layout (tab separated):
     id  form  lemma  cpostag  postag  feats  head  deprel ...
     1   A     a      _        DT      _      3     det    _ _
   A head of "_" leaves the word's head at 0 without marking it as root; a
   numeric head of 0 additionally records the word in s->root.
   Returns 0 on end of file, read failure or an empty line (end of sentence),
   and 1 when a line was consumed.
   A line that would not fit in the sentence is consumed and ignored with a
   message on stderr. Exits if memory is exhausted. */
int parse_line(FILE *f, sentence *s)
{
  char buff[MAX_LINE_LENGTH];
  char head_str[100] = "_";   /* defined value even if sscanf stops early */
  word *w;

  if(feof(f)) return 0;
  if(fgets(buff, MAX_LINE_LENGTH, f) == NULL)
    return 0;

  /* read an empty line */
  if(buff[0] == '\n')
    return 0;

  /* Check the capacity BEFORE storing anything. The limit is the same as
     before (lines were only parsed while the new length stayed below
     MAX_WORDS_IN_SENTENCE), but nothing is written past the array any more
     and no half-initialised word is left in the sentence. */
  if(s->l + 1 >= MAX_WORDS_IN_SENTENCE){
    fprintf(stderr, "sentence too long, line ignored\n");
    return 1;
  }

  /* calloc: fields that the line does not provide stay zero / empty / NULL
     instead of holding indeterminate values. */
  w = calloc(1, sizeof *w);
  if(w == NULL){
    fprintf(stderr, "cannot allocate word\n");
    exit(1);
  }
  s->words[s->l] = w;
  s->l++;

  /* NOTE(review): only head_str is width-limited here; the sizes of the
     word's string fields are declared outside this file, so those
     conversions are still unbounded. */
  sscanf(buff, "%d\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%99[^\t]\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel);

  if(strcmp(head_str, "_")){
    w->head = atoi(head_str);
    if(w->head == 0) s->root = w;
  }
  return 1;
}
/* Print the words of s (slot 0 is skipped) on stdout, one per line.
   Each print_* flag enables the corresponding tab-terminated column, in the
   order id, form, lemma, cpostag, postag, feats, head, deprel. Every line
   ends with "1" for the last word of the sentence and "0" otherwise.
   Nothing is printed for a sentence of length 0 or 1. */
void print_sentence_mcf2(sentence *s, int print_id, int print_form, int print_lemma, int print_cpostag, int print_postag, int print_feats, int print_head, int print_deprel)
{
  int pos;

  if((s->l == 0) || (s->l == 1))
    return;

  for(pos = 1; pos < s->l; pos++){
    word *cur = s->words[pos];

    if(print_id)      { fprintf(stdout, "%d\t", cur->id); }
    if(print_form)    { fprintf(stdout, "%s\t", cur->form); }
    if(print_lemma)   { fprintf(stdout, "%s\t", cur->lemma); }
    if(print_cpostag) { fprintf(stdout, "%s\t", cur->cpostag); }
    if(print_postag)  { fprintf(stdout, "%s\t", cur->postag); }
    if(print_feats)   { fprintf(stdout, "%s\t", cur->feats); }
    if(print_head)    { fprintf(stdout, "%d\t", cur->head); }
    if(print_deprel)  { fprintf(stdout, "%s\t", cur->deprel); }

    /* end-of-sentence marker */
    fputs((pos == s->l - 1) ? "1\n" : "0\n", stdout);
  }
}
/* Print the words of s (slot 0 is skipped) on stdout, one per line, with the
   columns selected by the first nb_col characters of columns:
     I id, W form, L lemma, C cpostag, P postag, F feats, H head, D deprel.
   Unknown letters are ignored. Each column is followed by a tab and every
   line ends with "1" for the last word of the sentence, "0" otherwise.
   Nothing is printed for a sentence of length 0 or 1. */
void print_sentence_mcf3(sentence *s, char *columns, int nb_col)
{
  int pos;
  int col;

  if((s->l == 0) || (s->l == 1))
    return;

  for(pos = 1; pos < s->l; pos++){
    word *cur = s->words[pos];

    for(col = 0; col < nb_col; col++){
      char code = columns[col];

      if(code == 'I')
        fprintf(stdout, "%d\t", cur->id);
      else if(code == 'W')
        fprintf(stdout, "%s\t", cur->form);
      else if(code == 'L')
        fprintf(stdout, "%s\t", cur->lemma);
      else if(code == 'C')
        fprintf(stdout, "%s\t", cur->cpostag);
      else if(code == 'P')
        fprintf(stdout, "%s\t", cur->postag);
      else if(code == 'F')
        fprintf(stdout, "%s\t", cur->feats);
      else if(code == 'H')
        fprintf(stdout, "%d\t", cur->head);
      else if(code == 'D')
        fprintf(stdout, "%s\t", cur->deprel);
    }

    /* end-of-sentence marker */
    fputs((pos == s->l - 1) ? "1\n" : "0\n", stdout);
  }
}
/* Print the words of s (slot 0 is skipped) on stdout, one per line, as
     form <tab> pos <tab> lemma <tab> offset <tab> deprel <tab> eos
   where pos is cpostag when coarse_pos is non-zero and postag otherwise,
   offset is the mother's id minus the word's id (0 when the word has no
   mother) and eos is 1 for the last word of the sentence, 0 otherwise.
   Nothing is printed for a sentence of length 0 or 1. */
void print_sentence_mcf(sentence *s, int coarse_pos)
{
  int pos;

  if((s->l == 0) || (s->l == 1))
    return;

  for(pos = 1; pos < s->l; pos++){
    word *cur = s->words[pos];

    printf("%s", cur->form);
    printf("\t%s", coarse_pos ? cur->cpostag : cur->postag);
    printf("\t%s", cur->lemma);

    if(cur->mother != NULL)
      printf("\t%d", cur->mother->id - cur->id);
    else
      fputs("\t0", stdout);

    printf("\t%s", cur->deprel);
    fputs((pos == s->l - 1) ? "\t1" : "\t0", stdout);
    fputs("\n", stdout);
  }
}
/* Print the words of s (slot 0 is skipped) on stdout as ten tab-separated
   columns per line: id, form, lemma, cpostag, postag, feats, id of the mother
   (0 when there is none), deprel, and two "_" placeholders. An empty line is
   printed after the last word.
   Nothing is printed for a sentence of length 0 or 1. */
void print_sentence(sentence *s)
{
  int pos;

  if((s->l == 0) || (s->l == 1))
    return;

  pos = 1;
  while(pos < s->l){
    word *cur = s->words[pos];

    printf("%d", cur->id);
    printf("\t%s", cur->form);
    printf("\t%s", cur->lemma);
    printf("\t%s", cur->cpostag);
    printf("\t%s", cur->postag);
    printf("\t%s", cur->feats);

    /* The head column comes from the tree link, not from cur->head. */
    if(cur->mother != NULL)
      printf("\t%d", cur->mother->id);
    else
      fputs("\t0", stdout);

    printf("\t%s", cur->deprel);
    fputs("\t_\t_\n", stdout);
    pos++;
  }
  fputs("\n", stdout);
}
/* Remove the NULL slots from s->words, keeping the remaining words in their
   original order, and shrink s->l accordingly.
   The slots vacated at the end are set to NULL: the previous version left
   copies of the last pointers there, which turned into dangling pointers as
   soon as the words were freed. */
void compact_sentence(sentence *s)
{
  int src;
  int dst = 0;

  /* Single pass: move every live word down to the next free position. */
  for(src = 0; src < s->l; src++){
    if(s->words[src] != NULL){
      s->words[dst] = s->words[src];
      dst++;
    }
  }

  /* Clear what is now past the end of the sentence. */
  for(src = dst; src < s->l; src++){
    s->words[src] = NULL;
  }
  s->l = dst;
}
/* Attach daughter under mother: the daughter's mother link is set and the
   daughter is appended to the mother's daughters list.
   With a NULL mother the daughter simply becomes motherless; with a NULL
   daughter nothing happens. */
void add_daughter(word *daughter, word *mother)
{
  if(daughter == NULL)
    return;

  if(mother == NULL){
    daughter->mother = NULL;
    return;
  }

  daughter->mother = mother;
  mother->daughters[mother->daughters_nb] = daughter;
  mother->daughters_nb += 1;
}
/* Remove the word stored at index i of s from the daughters list of its
   mother, closing the gap in that list. The word's own mother link is left
   untouched. Nothing happens when the slot is empty or the word has no
   mother. */
void remove_daughter(sentence *s, int i)
{
  word *child = s->words[i];
  word *parent;
  int slot;
  int shift;

  if(child == NULL)
    return;
  parent = child->mother;
  if(parent == NULL)
    return;

  slot = 0;
  while(slot < parent->daughters_nb){
    if(parent->daughters[slot] == child){
      /* slide the following entries one position to the left */
      for(shift = slot + 1; shift < parent->daughters_nb; shift++){
        parent->daughters[shift - 1] = parent->daughters[shift];
      }
      parent->daughters_nb--;
    }
    slot++;
  }
}
/* Delete the word at index i of s together with every word below it in the
   tree. Each deleted word is detached from its mother, freed, and its slot in
   s->words is set to NULL (the array is not compacted here).
   An index outside [0, s->l) or an empty slot is ignored. */
void remove_word_rec(sentence *s, int i)
{
  int j;
  word *w;

  if((i < 0) || (i >= s->l))
    return;
  w = s->words[i];
  /* Without this guard a NULL w matched every word whose mother is NULL and
     deleted all of them. */
  if(w == NULL)
    return;

  for(j=1; j < s->l; j++){
    /* j != i: a word recorded as its own mother must not recurse on itself */
    if((j != i) && (s->words[j]) && (s->words[j]->mother == w))
      remove_word_rec(s, j);
  }
  remove_daughter(s, i);
  free(w);
  s->words[i] = NULL;
}
/* Delete the word at index root and everything it dominates, then close the
   holes left in s->words. */
void remove_subtree(sentence *s, int root)
{
  /* step 1: free the words, leaving NULL slots behind */
  remove_word_rec(s, root);

  /* step 2: squeeze the NULL slots out of the array */
  compact_sentence(s);
}
/* Store w at position index of s.
   If that position is already occupied, the words from index onwards are
   first shifted one slot to the right and the length grows by one. If index
   lies at or beyond the current length, the length becomes index + 1.
   When gov is not NULL, w is also attached as a daughter of gov. */
void add_word(sentence *s, word *w, int index, word *gov)
{
  if(s->words[index]){
    int pos = s->l;

    /* open a gap at index, moving from the end to avoid overwriting */
    while(pos > index){
      s->words[pos] = s->words[pos - 1];
      pos--;
    }
    s->l++;
  }

  s->words[index] = w;
  if(s->l <= index)
    s->l = index + 1;

  if(gov)
    add_daughter(w, gov);
}
/* Replace the word at position index by the pair (gov, dep).
   gov inherits the deprel, the mother and the daughters of the replaced word;
   dep becomes a daughter of gov. The replaced word is freed, then gov and dep
   are inserted at index_gov and index_dep with add_word().
   Nothing is done when the slot at index is empty. */
void split_node_in_two(sentence *s, int index, word *gov, word *dep, int index_gov, int index_dep)
{
  int i;
  word *w = s->words[index];
  word *mother;

  if(w == NULL)
    return;
  mother = w->mother;

  strcpy(gov->deprel, w->deprel);

  /* hand the daughters of w over to gov (empty slots are skipped) */
  for(i=1; i < s->l; i++){
    if((s->words[i] != NULL) && (s->words[i]->mother == w))
      add_daughter(s->words[i], gov);
  }

  /* Unlink w from its mother before freeing it, otherwise the mother's
     daughters list keeps a pointer to freed memory. */
  remove_daughter(s, index);
  free(w);
  s->words[index] = NULL;

  add_word(s, gov, index_gov, mother);
  add_word(s, dep, index_dep, gov);
}
/*---------------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------------*/
/* Rewrite the cpostag of every word of s (slot 0 is skipped) with the value
   that h_cpos associates to it. A cpostag with no entry is left unchanged and
   reported on stderr. */
void change_cpos(sentence *s, hash_str *h_cpos)
{
  int pos;

  for(pos = 1; pos < s->l; pos++){
    word *cur = s->words[pos];
    char *mapped = hash_str_get_val(h_cpos, cur->cpostag);

    if(!mapped){
      fprintf(stderr, "ATTENTION: cpos %s inconnue\n", cur->cpostag);
      continue;
    }
    strcpy(cur->cpostag, mapped);
  }
}
/*---------------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------------*/
/* Rewrite the postag of every word of s (slot 0 is skipped) with the value
   that h_pos associates to it. A postag with no entry is left unchanged and
   reported on stderr. */
void change_pos(sentence *s, hash_str *h_pos)
{
  int i;
  word *w;
  char *val;

  for(i=1; i<s->l; i++){
    w = s->words[i];
    val = hash_str_get_val (h_pos, w->postag);
    if(val){
      strcpy(w->postag, val);
    }
    else{
      /* report the tag that was actually looked up (postag); the previous
         version printed cpostag here */
      fprintf(stderr, "ATTENTION: pos %s inconnue\n", w->postag);
    }
  }
}
/*---------------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------------*/
/* Rewrite the deprel of every word of s (slot 0 is skipped) with the value
   that h_fct associates to it. A deprel with no entry is left unchanged and
   reported on stderr. */
void change_fct(sentence *s, hash_str *h_fct)
{
  int pos;

  for(pos = 1; pos < s->l; pos++){
    word *cur = s->words[pos];
    char *mapped = hash_str_get_val(h_fct, cur->deprel);

    if(!mapped){
      fprintf(stderr, "ATTENTION: fct %s inconnue\n", cur->deprel);
      continue;
    }
    strcpy(cur->deprel, mapped);
  }
}
/* Tell whether s looks like a number: only the characters '0'-'9' and ','
   are allowed, and at least one digit is required.
   Returns 1 for e.g. "42" or "1,5"; returns 0 for NULL, for the empty string,
   for strings made only of commas and for anything containing another
   character. */
int is_num(char *s)
{
  char *p;
  int seen_digit = 0;

  if(s == NULL) return 0;

  for(p = s; *p != '\0'; p++){
    if((*p >= '0') && (*p <= '9'))
      seen_digit = 1;
    else if(*p != ',')
      return 0;
  }
  /* The previous version only rejected the single string ","; "" and ",,"
     were reported as numbers. */
  return seen_digit;
}
/* Set the id of every word of s to its position in s->words plus offset.
   Heads are not modified. */
void renumber_sentence_offset(sentence *s, int offset)
{
  int pos = 0;

  while(pos < s->l){
    s->words[pos]->id = pos + offset;
    pos++;
  }
}