Commit 61309227 authored by Alexis Nasr
Browse files

Merge branch 'master' of gitlab.lif.univ-mrs.fr:alexis.nasr/macaon2

parents b662e2f8 1dd223d1
......@@ -8,9 +8,11 @@
#define MCD_INVALID_VALUE -1
/* NOTE(review): MCD_WF_NB is defined twice below with different values (47, then 48).
   This looks like the before/after pair of a diff view; a real header can keep only
   one of them - presumably 48, since MCD_WF_LENGTH below takes the value 47. Confirm. */
#define MCD_WF_NB 47
#define MCD_WF_NB 48
/* Word-feature codes. */
#define MCD_WF_ID 0
#define MCD_WF_OFFSET 0 /* ID and OFFSET are synonymous */
#define MCD_WF_LENGTH 47
#define MCD_WF_FORM 1
#define MCD_WF_LEMMA 2
#define MCD_WF_CPOS 3
......@@ -129,6 +131,8 @@ Xtra*/
/* Accessors on an mcd pointer m. Each macro expands to a bare array element
   (the expansion has no outer parentheses) indexed by a word-feature code. */
#define mcd_get_dico_label(m) (m)->dico_array[MCD_WF_LABEL]
/* Lookups through wf2col[]. mcd_get_index_col and mcd_get_offset_col read the
   same entry whenever MCD_WF_ID and MCD_WF_OFFSET have the same value. */
#define mcd_get_index_col(m) (m)->wf2col[MCD_WF_ID]
#define mcd_get_offset_col(m) (m)->wf2col[MCD_WF_OFFSET]
#define mcd_get_length_col(m) (m)->wf2col[MCD_WF_LENGTH]
#define mcd_get_form_col(m) (m)->wf2col[MCD_WF_FORM]
#define mcd_get_lemma_col(m) (m)->wf2col[MCD_WF_LEMMA]
#define mcd_get_cpos_col(m) (m)->wf2col[MCD_WF_CPOS]
......
......@@ -51,6 +51,8 @@ typedef struct _word {
/* Yields -1 for a NULL word, a NULL form_char16, or a form that fails the length guard;
   otherwise the element at index 5 of form_char16.
   NOTE(review): the guard rejects forms shorter than 5 characters, but the element read is
   index 5; for a form of exactly 5 characters that element lies past the last character
   (the terminator, if form_char16 is terminated like a C string). `< 6` may have been
   intended - confirm against the other word_get_pN macros. */
#define word_get_p6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[5])
/* Word-feature getters: each yields -1 for a NULL word, otherwise the wf_array entry
   selected by the feature code. The argument w is evaluated more than once, so it
   should not have side effects. */
#define word_get_id(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_ID])
#define word_get_offset(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_OFFSET])
#define word_get_length(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LENGTH])
#define word_get_form(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FORM])
#define word_get_lemma(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LEMMA])
#define word_get_cpos(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_CPOS])
......
......@@ -54,7 +54,7 @@ void word_buffer_print(FILE *f, word_buffer *wb);
void word_buffer_print_compact(FILE *f, word_buffer *wb);
int word_buffer_read_sentence(word_buffer *bw);
word_buffer *word_buffer_load_mcf(char *mcf_filename, mcd *mcd_struct);
/* Returns the index of a word whose offset equals `offset`, or -1 if there is none. */
int word_buffer_locate_token(word_buffer *wb, int offset);
/*
int word_buffer_is_empty(word_buffer *wb);
int word_buffer_is_last(word_buffer *wb);
......
......@@ -500,6 +500,8 @@ dico_vec *mcd_build_dico_vec(mcd *mcd_struct)
int mcd_wf_code(char *wf)
{
if(!strcmp(wf, "INDEX")) return MCD_WF_ID;
if(!strcmp(wf, "OFFSET")) return MCD_WF_OFFSET;
if(!strcmp(wf, "LENGTH")) return MCD_WF_LENGTH;
if(!strcmp(wf, "FORM")) return MCD_WF_FORM;
if(!strcmp(wf, "LEMMA")) return MCD_WF_LEMMA;
if(!strcmp(wf, "CPOS")) return MCD_WF_CPOS;
......
......@@ -156,6 +156,37 @@ int word_buffer_read_sentence(word_buffer *wb)
return wb->nbelem ;
}
/*
 * Binary-search the word buffer for a word whose offset equals `offset`.
 *
 * The search compares word_get_offset() of the probed word with `offset` and
 * halves the range accordingly, so it can only succeed reliably when the words
 * are stored in ascending offset order.
 * NOTE(review): that ordering is assumed from the algorithm - confirm it holds
 * for every way the buffer gets filled.
 *
 * wb     : buffer to search; a NULL buffer is treated as empty
 * offset : offset value to look for
 *
 * Returns the index (as accepted by word_buffer_get_word_n) of a matching
 * word, or -1 when no word has that offset.
 */
int word_buffer_locate_token(word_buffer *wb, int offset)
{
  int first, last;

  if (!wb)
    return -1;

  first = 0;
  last = wb->nbelem - 1;

  while (first <= last) {
    /* first + (last - first) / 2 cannot overflow, unlike (first + last) / 2 */
    int middle = first + (last - first) / 2;
    /* keep the word in a local: word_get_offset is a macro that evaluates
       its argument more than once */
    word *w_middle = word_buffer_get_word_n(wb, middle);
    int middle_offset = word_get_offset(w_middle);

    if (middle_offset == offset)
      return middle;

    if (middle_offset < offset)
      first = middle + 1;
    else
      last = middle - 1;
  }

  /* range exhausted without a match */
  return -1;
}
/*int word_buffer_end(word_buffer *wb)
{
return (wb->current_index >= wb->nbelem)? 1 : 0;
......
......@@ -10,27 +10,39 @@ extern int print_offset;
extern int print_token_length;
/*
 * Prints the pending token (when token_length is non-zero) and then the separator
 * (when it is a non-empty string), each on its own line, with tab-separated offset
 * and length columns enabled by print_offset and print_token_length. The running
 * offset is advanced by the length of the token and then of text_matched, and the
 * pending token is reset.
 *
 * NOTE(review): in both halves the offset/length columns are printed once before
 * the token/separator (the lines using `first`) and once more after it. This looks
 * like two revisions of the function shown together by a diff view - confirm which
 * column order is the intended one before relying on this text.
 */
void maca_tokenizer_segment(char *separator, char *text_matched){
/* `first` tracks whether a column has already been written on the current line,
   so that a tab is emitted only between columns */
int first = 1;
if(token_length != 0){
if(print_offset){
if(first == 1) first = 0; else printf("\t");
printf("%d", offset);
}
if(print_token_length){
if(first == 1) first = 0; else printf("\t");
/* NOTE(review): no (int) cast here, unlike the separator branch below; if
   utf8_strlen does not return int the %d conversion does not match - confirm */
printf("%d", utf8_strlen(token));
}
if(first == 1) first = 0; else printf("\t");
printf("%s", token);
/* second copy of the offset/length columns, see the NOTE(review) in the header */
if(print_offset)
printf("\t%d", offset);
if(print_token_length)
printf("\t%d", utf8_strlen(token));
printf("\n");
}
/* advance past the token before clearing it */
offset += utf8_strlen(token);
token_length = 0;
token[0] = 0;
first = 1;
if(strlen(separator) != 0){
if(print_offset){
if(first == 1) first = 0; else printf("\t");
printf("%d", offset);
}
if(print_token_length){
if(first == 1) first = 0; else printf("\t");
printf("%d", (int) utf8_strlen(separator));
}
if(first == 1) first = 0; else printf("\t");
printf("%s", separator);
/* second copy of the offset/length columns, see the NOTE(review) in the header */
if(print_offset)
printf("\t%d", offset);
if(print_token_length)
printf("\t%d", (int) utf8_strlen(separator));
printf("\n");
}
/* the offset moves by the whole matched text, not only by the separator */
offset += utf8_strlen(text_matched);
}
......
......@@ -129,11 +129,130 @@ context *json2mcf_context_read_options(int argc, char *argv[])
return ctx;
}
/*
 * Write every word of the buffer to f, one word per line.
 *
 * The columns follow the layout of the buffer's mcd and are separated by tabs.
 * A column with the INT representation is written as a decimal number; a
 * column with the VOCAB representation is written as the string returned by
 * mcd_get_str, or "_" when that string is NULL. Columns with any other
 * representation produce no text (only their separating tab).
 *
 * f  : output stream
 * wb : word buffer to print
 */
void json2mcf_print_word_buffer(FILE *f, word_buffer *wb)
{
  mcd *layout = word_buffer_get_mcd(wb);
  int row;

  for (row = 0; row < wb->nbelem; row++) {
    word *current = word_buffer_get_word_n(wb, row);
    int column;

    for (column = 0; column < layout->nb_col; column++) {
      int feature = layout->wf[column];

      /* tab between columns, none before the first one */
      if (column != 0)
        fputs("\t", f);

      if (layout->representation[column] == MCD_REPRESENTATION_INT)
        fprintf(f, "%d", current->wf_array[feature]);

      if (layout->representation[column] == MCD_REPRESENTATION_VOCAB) {
        char *text = mcd_get_str(layout, current->wf_array[feature], column);

        fputs(text ? text : "_", f);
      }
    }
    fputs("\n", f);
  }
}
/* if((mcd_get_gov_col(mcd_struct) == -1)
&& (mcd_get_label_col(mcd_struct) == -1)
&& (mcd_get_sent_seg_col(mcd_struct) == -1)){
printf("%s\t", word_get_input(w));
printf("%d\t", word_get_gov(w));
label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w));
if(label != NULL)
printf("%s\t", label) ;
else
printf("_\t");
if(word_get_sent_seg(w) == 1)
printf("1\n") ;
else
printf("0\n");
}
else{
buffer = strdup(w->input);
token = strtok(buffer, "\t");
col_nb = 0;
while(token){
if(col_nb != 0) printf("\t");
if(col_nb == mcd_get_gov_col(mcd_struct)){
printf("%d", word_get_gov(w));
}
else
if(col_nb == mcd_get_label_col(mcd_struct)){
label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w));
if(label != NULL)
printf("%s", label) ;
else
printf("_");
}
else
if(col_nb == mcd_get_sent_seg_col(mcd_struct)){
if(word_get_sent_seg(w) == 1)
printf("1") ;
else
printf("0");
}
else{
word_print_col_n(stdout, w, col_nb);
}
col_nb++;
token = strtok(NULL, "\t");
}
if((col_nb <= mcd_get_gov_col(mcd_struct)) || (mcd_get_gov_col(mcd_struct) == -1)){
printf("\t%d", word_get_gov(w));
}
if((col_nb <= mcd_get_label_col(mcd_struct)) || (mcd_get_label_col(mcd_struct) == -1)){
label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w));
if(label != NULL)
printf("\t%s", label) ;
else
printf("\t_");
}
if((col_nb <= mcd_get_sent_seg_col(mcd_struct)) || (mcd_get_sent_seg_col(mcd_struct) == -1)){
if(word_get_sent_seg(w) == 1)
printf("\t1") ;
else
printf("\t0");
}
printf("\n");
free(buffer);
}
}
}
*/
/*
 * Apply one annotated segment to the word buffer.
 *
 * Only segments whose status_lab is "GOLD" are applied: the word whose offset
 * equals `start` is located, `label` is converted to its code in the
 * dictionary stored at dico_array[MCD_WF_CPOS], and that code is stored in the
 * word with word_set_pos. Every problem (missing label, no word at `start`,
 * unknown label, missing dictionary) is reported on stderr and leaves the
 * buffer untouched.
 *
 * NOTE(review): the code is looked up in the CPOS dictionary but stored with
 * word_set_pos - confirm that mixing CPOS and POS here is intended.
 *
 * wb         : word buffer to update
 * start      : offset of the first word of the segment, used to locate the word
 * end        : end of the segment; used only in the trace message
 * label      : label string of the segment
 * status_seg : not used
 * status_lab : label status; NULL or anything other than "GOLD" means "ignore"
 */
void update_segment(word_buffer *wb, int start, int end, char *label, char *status_seg, char *status_lab)
{
  int index;
  word *w;
  int label_code;
  dico *d;

  (void)status_seg;

  if(!status_lab || strcmp(status_lab, "GOLD"))
    return;

  /* a segment without a label cannot be applied; also keeps NULL away from %s */
  if(!label){
    fprintf(stderr, "segment [%d-%d] has no label\n", start, end);
    return;
  }

  /* trace goes to stderr so that it does not get mixed with data written on stdout */
  fprintf(stderr, "updating label of segment [%d-%d] with \"%s\"\n", start, end, label);

  index = word_buffer_locate_token(wb, start);
  if(index == -1){
    fprintf(stderr, "no token found at offset %d\n", start);
    return;
  }

  w = word_buffer_get_word_n(wb, index);
  if(!w){
    fprintf(stderr, "no token found at offset %d\n", start);
    return;
  }

  d = word_buffer_get_mcd(wb)->dico_array[MCD_WF_CPOS];
  /* without a dictionary the label cannot be resolved: treat it as unknown
     instead of reading an uninitialised code */
  label_code = d ? dico_string2int(d, label) : -1;

  if(label_code == -1)
    fprintf(stderr, "label %s unknown\n", label);
  else
    word_set_pos(w, label_code);
}
void process_segment(json_attr_val *avl, word_buffer *wb)
{
int start, end;
......@@ -149,14 +268,14 @@ void process_segment(json_attr_val *avl, word_buffer *wb)
if(!strcmp(av->attr, "status_lab")){status_lab = av->val->u.string; continue;}
}
update_segment(wb, start, end, label, status_seg, status_lab);
// printf("segment : start = %d end = %d label = %s status_seg = %s status_lab = %s\n", start, end, label, status_seg, status_lab);
// printf("segment : start = %d end = %d label = %s status_seg = %s status_lab = %s\n", start, end, label, status_seg, status_lab);
}
void process_segments(json_struct *segments, word_buffer *wb)
{
json_struct *segment;
printf("process_segments\n");
// printf("process_segments\n");
for(segment = segments->u.first; segment != NULL; segment = segment->next){
process_segment(segment->u.attr_val_list, wb);
}
......@@ -185,7 +304,7 @@ void process_link(json_attr_val *avl, word_buffer *wb)
void process_links(json_struct *segments, word_buffer *wb)
{
json_struct *link;
printf("process_links\n");
// printf("process_links\n");
for(link = segments->u.first; link != NULL; link = link->next){
process_link(link->u.attr_val_list, wb);
}
......@@ -195,9 +314,9 @@ void process_links(json_struct *segments, word_buffer *wb)
void process_document(json_struct *document, word_buffer *wb)
{
json_attr_val *avl = NULL;
printf("process_document\n");
// printf("process_document\n");
for(avl = document->u.attr_val_list; avl != NULL; avl = avl->next){
if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string);
// if(!strcmp(avl->attr, (char *)"id")) printf("id = %s\n", avl->val->u.string);
if(!strcmp(avl->attr, (char *)"segments")) process_segments(avl->val, wb);
if(!strcmp(avl->attr, (char *)"links")) process_links(avl->val, wb);
}
......@@ -206,7 +325,7 @@ void process_document(json_struct *document, word_buffer *wb)
void process_documents(json_struct *documents, word_buffer *wb)
{
json_struct *document;
printf("process_documents\n");
// printf("process_documents\n");
for(document = documents->u.first; document != NULL; document = document->next){
process_document(document, wb);
}
......@@ -228,6 +347,7 @@ int main(int argc, char *argv[])
json_attr_val *avl = NULL;
json2mcf_check_options(ctx);
mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename);
wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
root = json_parse_full(ctx->json_filename, 0);
......@@ -236,14 +356,14 @@ int main(int argc, char *argv[])
exit(1);
}
for(avl = root->u.attr_val_list; avl != NULL; avl = avl->next){
printf("section %s\n", avl->attr);
// printf("section %s\n", avl->attr);
if(!strcmp(avl->attr, (char *)"documents")){
process_documents(avl->val, wb);
}
}
//json_print_struct(stdout, root);
json2mcf_print_word_buffer(stdout, wb);
json_free_struct(root);
json2mcf_context_free(ctx);
return 0;
......
......@@ -144,17 +144,19 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf(output_file, "\"id\": \"\",\n");
fprintf(output_file, "\"timestamp\": \"\",\n");
fprintf(output_file, "\"labels_segment\": \"");
fprintf(output_file, "\"labels_segment\": [");
for(i=0; i < dico_pos->nbelem; i++){
fprintf(output_file, " %s", dico_pos->array[i]);
if(i != 0) fprintf(output_file, ", ");
fprintf(output_file, "\"%s\"", dico_pos->array[i]);
}
fprintf(output_file, "\",\n");
fprintf(output_file, "],\n");
fprintf(output_file, "\"labels_link\": \"");
fprintf(output_file, "\"labels_link\": [");
for(i=0; i < dico_label->nbelem; i++){
fprintf(output_file, " %s", dico_label->array[i]);
if(i != 0) fprintf(output_file, ", ");
fprintf(output_file, "\"%s\"", dico_label->array[i]);
}
fprintf(output_file, "\"\n");
fprintf(output_file, "]\n");
fprintf(output_file, "},\n");
......@@ -167,67 +169,73 @@ void print_header(FILE *output_file, mcd *mcd_struct)
}
/*
 * Writes one JSON object for the link of a word: the fields "orig", "dest",
 * "label", "status_link", "status_lab", "timestamp", "author" and "target".
 *
 * NOTE(review): two signatures and two bodies follow. This looks like the
 * pre-change version (taking the word and the column numbers) and the
 * post-change version (taking the word buffer and an index) shown together by
 * a diff view; only one of them can exist in a compilable file.
 */
void print_link(FILE *output_file, word *w, int index, int gov_col, int label_col)
void print_link(FILE *output_file, word_buffer *wb, int index)
{
fprintf(output_file, "{");
fprintf(output_file, "\"orig\": %d, ", index);
fprintf(output_file, "\"dest\":");
/* NOTE(review): gov_col is tested for non-zero while label_col below is compared
   with -1; if -1 means "column absent", this branch runs when the column is absent
   and is skipped when the governor is column 0 - confirm `gov_col != -1` was meant */
if(gov_col){
if((word_get_gov(w) == 0) || ((word_get_gov(w) + index) < 0))
fprintf(output_file, "0");
else
fprintf(output_file, "%d", word_get_gov(w) + index);
}
else
fprintf(output_file, "_");
fprintf(output_file, ", ");
fprintf(output_file, "\"label\": \"");
if(label_col != -1)
word_print_col_n(output_file, w, label_col);
else
fprintf(output_file, "_");
fprintf(output_file, "\", ");
fprintf(output_file, "\"status_link\": \"\", ");
fprintf(output_file, "\"status_lab\": \"\", ");
fprintf(output_file, "\"timestamp\": \"\", ");
fprintf(output_file, "\"author\": \"\", ");
fprintf(output_file, "\"target\": \"\"");
fprintf(output_file, "}");
/* second body: the columns and the word are derived from the word buffer.
   index_col, and gov further down, are assigned but not read in the lines shown. */
int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb));
int label_col = mcd_get_label_col(word_buffer_get_mcd(wb));
int index_col = mcd_get_index_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{");
// fprintf(output_file, "\"orig\": %d, ", word_get_offset(w));
fprintf(output_file, "\"orig\": %d, ", index);
fprintf(output_file, "\"dest\":");
/* NOTE(review): same non-zero test on gov_col as in the first body - confirm */
if(gov_col){
/* the governor is a relative position: adding it to index gives the
   governor's index; only the lower bound of that index is checked */
if((word_get_gov(w) == 0) || ((word_get_gov(w) + index) < 0))
fprintf(output_file, "0");
else{
word *gov = word_buffer_get_word_n(wb, word_get_gov(w) + index);
// fprintf(output_file, "%d", word_get_offset(gov));
fprintf(output_file, "%d", word_get_gov(w) + index);
}
}
else
/* NOTE(review): a bare _ is written as the value of "dest", without quotes */
fprintf(output_file, "_");
fprintf(output_file, ", ");
fprintf(output_file, "\"label\": \"");
if(label_col != -1)
word_print_col_n(output_file, w, label_col);
else
fprintf(output_file, "_");
fprintf(output_file, "\", ");
fprintf(output_file, "\"status_link\": \"\", ");
fprintf(output_file, "\"status_lab\": \"\", ");
fprintf(output_file, "\"timestamp\": \"\", ");
fprintf(output_file, "\"author\": \"\", ");
fprintf(output_file, "\"target\": \"\"");
fprintf(output_file, "}");
}
/*
 * Writes the "links" JSON array: one print_link entry per word index from
 * index_first_word to index_last_word inclusive, entries separated by commas,
 * each entry starting on a new line.
 *
 * NOTE(review): the separator logic and the print_link call each appear twice,
 * in a multi-line and a one-line form and with two different argument lists.
 * This looks like two revisions shown together by a diff view - confirm which
 * lines are current before relying on this text.
 */
void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
{
word *w;
int index;
int gov_col = mcd_get_gov_col(word_buffer_get_mcd(wb));
int label_col = mcd_get_label_col(word_buffer_get_mcd(wb));
/* 1 until the first entry has been written; controls the comma separator */
int first_link = 1;
fprintf(output_file, "\"links\": [");
for(index = index_first_word; index <= index_last_word; index++){
w = word_buffer_get_word_n(wb, index);
if(first_link == 1)
first_link = 0;
else
fprintf(output_file, ",");
if(first_link == 1) first_link = 0; else fprintf(output_file, ",");
fprintf(output_file, "\n");
/* this call passes a position counted from 1 within the range ... */
print_link(output_file, w, index - index_first_word + 1, gov_col, label_col);
/* ... while this one passes the word buffer and the index within it */
print_link(output_file, wb, index);
}
fprintf(output_file," ]");
}
void print_segment(FILE *output_file, word *w, int index, int pos_col)
void print_segment(FILE *output_file, word_buffer *wb, int index)
{
int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{ ");
/* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */
fprintf(output_file, "\"start\": %d, ", index);
/* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */
fprintf(output_file, "\"end\": %d, ", index);
fprintf(output_file, "\"label\": \"");
......@@ -248,28 +256,27 @@ void print_segment(FILE *output_file, word *w, int index, int pos_col)
/*
 * Writes the "segments" JSON array: one print_segment entry per word index from
 * index_first_word to index_last_word inclusive, entries separated by commas,
 * each entry starting on a new line. The array is closed with "]," and a newline.
 *
 * NOTE(review): the separator logic and the print_segment call each appear twice,
 * in a multi-line and a one-line form and with two different argument lists.
 * This looks like two revisions shown together by a diff view - confirm which
 * lines are current before relying on this text.
 */
void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
{
word *w;
int index;
int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
/* 1 until the first entry has been written; controls the comma separator */
int first_segment = 1;
fprintf(output_file, "\"segments\": [");
for(index = index_first_word; index <= index_last_word; index++){
w = word_buffer_get_word_n(wb, index);
if(first_segment == 1)
first_segment = 0;
else
fprintf(output_file, ",");
if(first_segment == 1) first_segment = 0; else fprintf(output_file, ",");
fprintf(output_file, "\n");
/* this call passes a position counted from 1 within the range ... */
print_segment(output_file, w, index - index_first_word + 1, pos_col);
/* ... while this one passes the word buffer and the index within it */
print_segment(output_file, wb, index);
}
fprintf(output_file," ],\n");
}
void print_token(FILE *output_file, word *w, int index, int form_col)
void print_token(FILE *output_file, word_buffer *wb, int index)
{
int form_col = mcd_get_form_col(word_buffer_get_mcd(wb));
int offset_col = mcd_get_offset_col(word_buffer_get_mcd(wb));
int length_col = mcd_get_length_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{ ");
fprintf(output_file, "\"id\": %d, ", index);
fprintf(output_file, "\"id\": %d, ", word_get_offset(w));
fprintf(output_file, "\"word\": \"");
if(form_col != -1)
word_print_col_n(output_file, w, form_col);
......@@ -285,20 +292,14 @@ void print_token(FILE *output_file, word *w, int index, int form_col)
/*
 * Writes the "tokens" JSON array: one print_token entry per word index from
 * index_first_word to index_last_word inclusive, entries separated by commas,
 * each entry starting on a new line. The array is closed with "]," and a newline.
 *
 * NOTE(review): the separator logic and the print_token call each appear twice,
 * in a multi-line and a one-line form and with two different argument lists.
 * This looks like two revisions shown together by a diff view - confirm which
 * lines are current before relying on this text.
 */
void print_tokens(FILE *output_file, word_buffer *wb, int index_first_word, int index_last_word)
{
word *w;
int index;
int form_col = mcd_get_form_col(word_buffer_get_mcd(wb));
/* 1 until the first entry has been written; controls the comma separator */
int first_token = 1;
fprintf(output_file, "\"tokens\": [");
for(index = index_first_word; index <= index_last_word; index++){
w = word_buffer_get_word_n(wb, index);
if(first_token == 1)
first_token = 0;
else
fprintf(output_file, ",");
if(first_token == 1) first_token = 0; else fprintf(output_file, ",");
fprintf(output_file, "\n");
/* this call passes a position counted from 1 within the range ... */
print_token(output_file, w, index - index_first_word + 1, form_col);
/* ... while this one passes the word buffer and the index within it */
print_token(output_file, wb, index);
}
fprintf(output_file," ],\n");
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment