/******************************************************************************* Copyright (C) 2010 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> and Joseph Le Roux <joseph.le.roux@gmail.com> conll_lib is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. conll_lib is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with conll_lib. If not, see <http://www.gnu.org/licenses/>. *******************************************************************************/ #include<stdio.h> #include<stdlib.h> #include<string.h> #include"conll_lib.h" int parse_line(FILE *f, sentence *s); void renumber_sentence(sentence *s) { int i; word *w; for(i=0 ; i < s->l; i++){ s->words[i]->id = i; } for(i=0 ; i < s->l; i++){ w = s->words[i]; if(w->mother) w->head = w->mother->id; else w->head = 0; } } void reset_sentence(sentence *s) { int i; for(i=0 ; i < s->l; i++){ if(s->words[i]){ free(s->words[i]); s->words[i] = NULL; } } s->words[0] = allocate_word(0, "ROOT", "ROOT", "ROOT", "ROOT", "ROOT", -1, "ROOT"); s->l = 1; } void free_sentence(sentence *s) { int i; for(i=0 ; i < s->l; i++){ if(s->words[i]){ /* free(s->words[i]); */ } } free(s); } word *copy_word(word *w){ return allocate_word(w->id, w->form, w->lemma, w->cpostag, w->postag, w->feats, w->head, w->deprel); } word *allocate_word(unsigned id, char *form, char *lemma, char *cpostag, char *postag, char *feats, unsigned head, char *deprel) { word *w = malloc(sizeof(word)); w->id = id; strcpy(w->form, form); strcpy(w->lemma, lemma); strcpy(w->cpostag, cpostag); strcpy(w->postag, postag); strcpy(w->feats, feats); w->head = head; strcpy(w->deprel, deprel); w->mother = NULL; w->daughters_nb = 0; return w; } sentence *allocate_sentence(void) { sentence *s; int i; s = malloc(sizeof(sentence)); if(s == NULL){ fprintf(stderr, "cannot allocate sentence\n"); exit(1); } s->num = INCORRECT_SENTENCE_NUM_VALUE; s->l = 0; for(i=0; i < MAX_WORDS_IN_SENTENCE; i++){ s->words[i] = NULL; } return s; } int load_sentence(FILE *f, sentence *s) { int res; int i; if(feof(f)) return 0; reset_sentence(s); for(res = parse_line(f, s); res; res = parse_line(f, s)); /* read an 'empty' parse (two succeding cr) */ if(s->l == 0) return 0; /* build the tree structure */ s->words[0]->mother = NULL; for(i=1; i < s->l; ++i){ if((s->words[i]->head >= 0) && (s->words[i]->head <= s->l)){ /* check that head attribute is not out of range */ add_daughter(s->words[i], s->words[s->words[i]->head]); } } return 1; } /*----------------------------------------------------------------------------*/ int parse_line(FILE *f, sentence *s) { char buff[MAX_LINE_LENGTH]; word *w; char head_str[100]; if(feof(f)) return 0; if (fgets(buff, MAX_LINE_LENGTH, f) == NULL) { // fprintf(stderr, "cannot read file: empty ?"); return 0; } /* read an empty line */ if(buff[0] == '\n'){ /* printf("\n"); */ return 0; } s->words[s->l] = w = malloc(sizeof(word)); w->daughters_nb = 0; s->l++; if(s->l < MAX_WORDS_IN_SENTENCE){ /* read a dependency description */ /* 1 A a _ DT _ 3 det _ _ */ /* 2 severe severe _ JJ _ 3 amod _ _ */ /* 3 storm storm _ NN _ 4 nsubj _ _ */ /* 4 swept sweep _ VBD _ 26 ccomp _ _ */ /* 5 through through _ IN _ 4 prep _ _ */ /* sscanf(buff, "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel); */ sscanf(buff, "%d\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel); /* printf("form = %s\n", w->form); printf("lemma = %s\n", w->lemma); printf("cpostag = %s\n", w->cpostag); printf("postag = %s\n", w->postag); printf("feats = %s\n", w->feats); printf("head_str = %s\n", head_str);*/ /* w->cpostag[0] = w->postag[0]; */ if(strcmp(head_str, "_")){ w->head = atoi(head_str); if(w->head == 0) s->root = w; } } return 1; } void print_sentence_mcf2(sentence *s, int print_id, int print_form, int print_lemma, int print_cpostag, int print_postag, int print_feats, int print_head, int print_deprel) { int i; word *w; if((s->l == 1) || (s->l == 0)) return; for(i=1; i<s->l; i++){ w = s->words[i]; if(print_id) printf("%d\t", w->id); if(print_form) printf("%s\t", w->form); if(print_lemma) printf("%s\t", w->lemma); if(print_cpostag) printf("%s\t", w->cpostag); if(print_postag) printf("%s\t", w->postag); if(print_feats) printf("%s\t", w->feats); if(print_head) printf("%d\t", w->head); if(print_deprel) printf("%s\t", w->deprel); if(i == s->l - 1) fprintf(stdout, "1\n"); else fprintf(stdout, "0\n"); } } void print_sentence_mcf3(sentence *s, char *columns, int nb_col) { int i,j; word *w; if((s->l == 1) || (s->l == 0)) return; for(i=1; i<s->l; i++){ w = s->words[i]; for(j=0; j < nb_col; j++) switch(columns[j]){ case 'I': printf("%d\t", w->id); break; case 'W': printf("%s\t", w->form); break; case 'L': printf("%s\t", w->lemma); break; case 'C': printf("%s\t", w->cpostag); break; case 'P': printf("%s\t", w->postag); break; case 'F': printf("%s\t", w->feats); break; case 'H': printf("%d\t", w->head); break; case 'D': printf("%s\t", w->deprel); break; } if(i == s->l - 1) fprintf(stdout, "1\n"); else fprintf(stdout, "0\n"); } } void print_sentence_mcf(sentence *s, int coarse_pos) { int i; word *w; if((s->l == 1) || (s->l == 0)) return; for(i=1; i<s->l; i++){ w = s->words[i]; /* fprintf(stdout, "%d", w->id); */ fprintf(stdout, "%s", w->form); if(coarse_pos) fprintf(stdout, "\t%s", w->cpostag); else fprintf(stdout, "\t%s", w->postag); fprintf(stdout, "\t%s", w->lemma); if(w->mother == NULL) fprintf(stdout, "\t0"); else fprintf(stdout, "\t%d", w->mother->id - w->id); fprintf(stdout, "\t%s", w->deprel); if(i == s->l - 1) fprintf(stdout, "\t1"); else fprintf(stdout, "\t0"); fprintf(stdout, "\n"); } } void print_sentence(sentence *s) { int i; word *w; if((s->l == 1) || (s->l == 0)) return; for(i=1; i<s->l; i++){ w = s->words[i]; fprintf(stdout, "%d", w->id); fprintf(stdout, "\t%s", w->form); fprintf(stdout, "\t%s", w->lemma); fprintf(stdout, "\t%s", w->cpostag); fprintf(stdout, "\t%s", w->postag); fprintf(stdout, "\t%s", w->feats); if(w->mother == NULL) fprintf(stdout, "\t0"); else fprintf(stdout, "\t%d", w->mother->id); fprintf(stdout, "\t%s", w->deprel); fprintf(stdout, "\t_\t_\n"); } printf("\n"); } void compact_sentence(sentence *s) { int i,j; for(i=0; i < s->l; i++){ if(s->words[i] == NULL){ for(j = i; j < s->l - 1; j++){ s->words[j] = s->words[j+1]; } i--; s->l--; } } } void add_daughter(word *daughter, word *mother) { if(daughter){ if(mother){ daughter->mother = mother; mother->daughters[mother->daughters_nb] = daughter; mother->daughters_nb++; } else{ daughter->mother = NULL; } } } void remove_daughter(sentence *s, int i) { int j,k; word *dep = s->words[i]; word *gov; if(dep){ gov = dep->mother; if(gov){ for(j=0; j < gov->daughters_nb; j++){ if(gov->daughters[j] == dep){ for(k=j; k < gov->daughters_nb - 1; k++){ gov->daughters[k] = gov->daughters[k+1]; } gov->daughters_nb--; } } } } } void remove_word_rec(sentence *s, int i) { int j; word *w = s->words[i]; for(j=1; j < s->l; j++){ if((s->words[j]) && (s->words[j]->mother == w)) remove_word_rec(s, j); } remove_daughter(s, i); free(w); s->words[i] = NULL; } void remove_subtree(sentence *s, int root) { remove_word_rec(s, root); compact_sentence(s); } void add_word(sentence *s, word *w, int index, word *gov) { int i; if(s->words[index] != NULL){ for(i=s->l; i>index; i--){ s->words[i] = s->words[i-1]; } s->l++; } s->words[index] = w; if(index >= s->l) s->l = index+1; if(gov != NULL) add_daughter(w, gov); } void split_node_in_two(sentence *s, int index, word *gov, word *dep, int index_gov, int index_dep) { int i; word *w = s->words[index]; word *mother = w->mother; strcpy(gov->deprel, w->deprel); for(i=1; i < s->l; i++){ if(s->words[i]->mother == w) add_daughter(s->words[i], gov); } free(w); s->words[index] = NULL; add_word(s, gov, index_gov, mother); add_word(s, dep, index_dep, gov); } /*---------------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------------*/ void change_cpos(sentence *s, hash_str *h_cpos) { int i; word *w; char *val; for(i=1; i<s->l; i++){ w = s->words[i]; val = hash_str_get_val (h_cpos, w->cpostag); if(val){ strcpy(w->cpostag, val); } else{ fprintf(stderr, "ATTENTION: cpos %s inconnue\n", w->cpostag); } } } /*---------------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------------*/ void change_pos(sentence *s, hash_str *h_pos) { int i; word *w; char *val; for(i=1; i<s->l; i++){ w = s->words[i]; val = hash_str_get_val (h_pos, w->postag); if(val){ strcpy(w->postag, val); } else{ fprintf(stderr, "ATTENTION: pos %s inconnue\n", w->cpostag); } } } /*---------------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------------*/ void change_fct(sentence *s, hash_str *h_fct) { int i; word *w; char *val; for(i=1; i<s->l; i++){ w = s->words[i]; val = hash_str_get_val (h_fct, w->deprel); if(val){ strcpy(w->deprel, val); } else{ fprintf(stderr, "ATTENTION: fct %s inconnue\n", w->deprel); } } } int is_num(char *s) { int i; int l; if(s == NULL) return 0; l = strlen(s); if((l == 1) && (s[0] == ',')) return 0; for(i=0; i <l; i++) if(((s[i] < '0') || (s[i] > '9')) && (s[i] != ',')) return 0; return 1; } void renumber_sentence_offset(sentence *s, int offset) { int i; for(i=0 ; i < s->l; i++){ s->words[i]->id = i + offset; } }