Commit 68ee1829 authored by Alexis Nasr
Browse files

fixed a bug in word.h

parents 1d536e6c ee98b081
......@@ -8,7 +8,7 @@
#define MCD_INVALID_VALUE -1
#define MCD_WF_NB 48
#define MCD_WF_NB 51
#define MCD_WF_ID 0
#define MCD_WF_OFFSET 0 /* ID and OFFSET are synonymous */
......@@ -61,6 +61,12 @@
#define MCD_WF_Person 45
#define MCD_WF_Tense 46
/* Word-feature codes for the FILE, DIRECTORY and SPEAKER columns (48..50).
 * The highest code (50) is below the MCD_WF_NB value of 51 shown in this commit;
 * presumably MCD_WF_NB bounds the per-word feature array -- TODO confirm. */
#define MCD_WF_FILE 48
#define MCD_WF_DIRECTORY 49
#define MCD_WF_SPEAKER 50
/*Abbr
AdpType
AdvType
......
......@@ -63,6 +63,11 @@ typedef struct _word {
/* NULL-tolerant accessors: each expands to the wf_array slot named by the
 * corresponding MCD_WF_* code, or to a fallback value when w is NULL.
 * The fallback is -1 for label, stag, A, B and C, and 0 for sent_seg, file,
 * directory and speaker.
 * The argument w appears twice in each expansion, so it must not have side effects. */
#define word_get_label(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LABEL])
#define word_get_stag(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_STAG])
#define word_get_sent_seg(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_SENT_SEG])
#define word_get_file(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_FILE])
#define word_get_directory(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_DIRECTORY])
#define word_get_speaker(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_SPEAKER])
#define word_get_A(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_A])
#define word_get_B(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_B])
#define word_get_C(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_C])
......
......@@ -512,6 +512,12 @@ int mcd_wf_code(char *wf)
/* if(!strcmp(wf, "INT")) return MCD_WF_INT; */
if(!strcmp(wf, "GOV")) return MCD_WF_GOV;
if(!strcmp(wf, "SENT_SEG")) return MCD_WF_SENT_SEG;
if(!strcmp(wf, "FILE")) return MCD_WF_FILE;
if(!strcmp(wf, "DIRECTORY")) return MCD_WF_DIRECTORY;
if(!strcmp(wf, "SPEAKER")) return MCD_WF_SPEAKER;
if(!strcmp(wf, "A")) return MCD_WF_A;
if(!strcmp(wf, "B")) return MCD_WF_B;
if(!strcmp(wf, "C")) return MCD_WF_C;
......
......@@ -22,7 +22,6 @@ word *word_new(char *input)
w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV;
w->form = NULL;
w->form_char16 = NULL;
w->index = -1;
w->signature = -1;
w->is_root = 0;
......
......@@ -12,6 +12,7 @@ extern char *token;
/*%option noyywrap*/
%%
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
[ \t]+ {maca_tokenizer_segment((char *)"", yytext);}
[ ]*\. {maca_tokenizer_segment((char *)".", yytext);}
......
......@@ -26,7 +26,7 @@ nosepar [^ \t\n]
if(defait_amalgames){
BEGIN(state_defait_amalgames);
}
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
{separ}+ {maca_tokenizer_segment((char *)"", yytext);}
\. {maca_tokenizer_segment((char *)".", yytext);}
......
......@@ -2,6 +2,9 @@
#include<stdlib.h>
#include<string.h>
#include<getopt.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include"mcd.h"
#include"util.h"
......@@ -16,6 +19,7 @@ typedef struct {
char *mcf_filename;
char *mcd_filename;
mcd *mcd_struct;
char *root_dir;
} context;
/* Releases a context together with the members it owns (the visible part frees
 * mcd_filename, mcd_struct, root_dir and finally ctx itself).
 * NOTE(review): the lines between the signature and the first free() are elided
 * by the diff hunk boundary, so the opening of the function body is not shown. */
void mcf2json_context_free(context *ctx)
......@@ -31,6 +35,8 @@ void mcf2json_context_free(context *ctx)
free(ctx->mcd_filename);
if(ctx->mcd_struct)
mcd_free(ctx->mcd_struct);
/* free(NULL) is a no-op in standard C, so this guard is stylistic only */
if(ctx->root_dir)
free(ctx->root_dir);
free(ctx);
}
}
......@@ -47,6 +53,7 @@ context *mcf2json_context_new(void)
ctx->mcf_filename = NULL;
ctx->mcd_filename = NULL;
ctx->mcd_struct = NULL;
ctx->root_dir = NULL;
return ctx;
}
......@@ -58,7 +65,7 @@ void mcf2json_context_general_help_message(context *ctx)
fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
fprintf(stderr, "\t-C --mcd : mcd filename\n");
fprintf(stderr, "\t-i --mcf : mcf filename (read from stdin if absent)\n");
fprintf(stderr, "\t-o --conll : conll filename (write to stdout if absent)\n");
fprintf(stderr, "\t-r --root : root directory of the json files\n");
}
void mcf2json_check_options(context *ctx){
......@@ -81,14 +88,14 @@ context *mcf2json_context_read_options(int argc, char *argv[])
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'},
{"debug", no_argument, 0, 'd'},
{"conll", required_argument, 0, 'o'},
{"mcd", required_argument, 0, 'C'},
{"mcf", required_argument, 0, 'i'},
{"root", required_argument, 0, 'r'},
};
optind = 0;
opterr = 0;
while ((c = getopt_long (argc, argv, "hvdo:C:i:", long_options, &option_index)) != -1){
while ((c = getopt_long (argc, argv, "hvdC:i:r:", long_options, &option_index)) != -1){
switch (c)
{
case 'd':
......@@ -100,15 +107,15 @@ context *mcf2json_context_read_options(int argc, char *argv[])
case 'v':
ctx->verbose = 1;
break;
case 'o':
ctx->conll_filename = strdup(optarg);
break;
case 'i':
ctx->mcf_filename = strdup(optarg);
break;
case 'C':
ctx->mcd_filename = strdup(optarg);
break;
case 'r':
ctx->root_dir = strdup(optarg);
break;
}
}
......@@ -118,7 +125,6 @@ context *mcf2json_context_read_options(int argc, char *argv[])
else{
ctx->mcd_struct = mcd_build_wpmlgfs();
}
return ctx;
}
......@@ -129,7 +135,7 @@ void print_footer(FILE *output_file)
}
void print_header(FILE *output_file, mcd *mcd_struct)
void print_header(FILE *output_file, mcd *mcd_struct, char *filename)
{
int pos_col = mcd_get_pos_col(mcd_struct);
int label_col = mcd_get_label_col(mcd_struct);
......@@ -143,6 +149,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf(output_file, "\"header\":{\n");
fprintf(output_file, "\"id\": \"\",\n");
fprintf(output_file, "\"timestamp\": \"\",\n");
fprintf(output_file, "\"filename\": \"%s\",\n", filename);
fprintf(output_file, "\"labels_segment\": [");
for(i=0; i < dico_pos->nbelem; i++){
......@@ -160,7 +167,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf(output_file, "},\n");
fprintf(output_file, "\"annotations\":{\n");
fprintf(output_file, "\"annotation\":{\n");
fprintf(output_file, "\"name\": \"\",\n");
fprintf(output_file, "\"time_start\": \"\",\n");
fprintf(output_file, "\"time_end\": \"\"\n");
......@@ -227,16 +234,16 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i
}
void print_segment(FILE *output_file, word_buffer *wb, int index)
void print_segment(FILE *output_file, word_buffer *wb, int index_first_word, int index)
{
int pos_col = mcd_get_pos_col(word_buffer_get_mcd(wb));
word *w = word_buffer_get_word_n(wb, index);
fprintf(output_file, "{ ");
/* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */
fprintf(output_file, "\"start\": %d, ", index);
fprintf(output_file, "\"start\": %d, ", index - index_first_word);
/* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */
fprintf(output_file, "\"end\": %d, ", index);
fprintf(output_file, "\"end\": %d, ", index - index_first_word);
fprintf(output_file, "\"label\": \"");
if(pos_col != -1)
......@@ -258,12 +265,12 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in
{
int index;
int first_segment = 1;
fprintf(output_file, "\"segments\": [");
for(index = index_first_word; index <= index_last_word; index++){
if(first_segment == 1) first_segment = 0; else fprintf(output_file, ",");
fprintf(output_file, "\n");
print_segment(output_file, wb, index);
print_segment(output_file, wb, index_first_word, index);
}
fprintf(output_file," ],\n");
}
......@@ -317,7 +324,7 @@ void print_sentence(FILE *output_file, int sentence_nb, word_buffer *wb, int ind
/*
 * Entry point of mcf2json: reads the options, loads the mcf word buffer and
 * prints each sentence (a run of words closed by a word whose sent_seg is set)
 * as JSON.
 *
 * NOTE(review): this span is a rendered diff. Removed and added lines appear
 * together without +/- markers (output_file is declared twice just below, and
 * both the old conll_filename output and the new root_dir output are present),
 * so it does not compile as shown.
 *
 * As visible here: when ctx->root_dir is set, output goes to
 * <root_dir>/<DIRECTORY>/<FILE>.json, and a new file is opened each time the
 * FILE value differs from that of the previous word. Otherwise everything is
 * written to stdout.
 */
int main(int argc, char *argv[])
{
FILE *output_file;
FILE *output_file = NULL;
context *ctx = mcf2json_context_read_options(argc, argv);
word_buffer *wb = NULL;
word *w = NULL;
......@@ -326,41 +333,107 @@ int main(int argc, char *argv[])
int index_first_word;
int index_last_word;
int sentence_nb = 0;
/* Fixed 1000-byte buffers; the strcpy/strcat calls below do not check lengths. */
char current_directory[1000];
char current_file[1000];
/* NOTE(review): no initializer is shown for the two "previous" buffers; they are
 * first read by strcmp inside the loop and first written at the end of the loop
 * body -- confirm they are initialized before the first comparison. */
char previous_directory[1000];
char previous_file[1000];
char filename_for_header[1000];
char *root_directory = NULL;
char destination_file[1000];
char destination_dir[1000];
struct stat st = {0};
mcf2json_check_options(ctx);
mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->mcf_filename);
output_file = (ctx->conll_filename)? myfopen_no_exit(ctx->conll_filename, "w"): stdout;
wb = word_buffer_load_mcf(ctx->mcf_filename, ctx->mcd_struct);
print_header(output_file, ctx->mcd_struct);
do{
w = word_buffer_b0(wb);
if(new_sentence){
new_sentence = 0;
sentence_nb++;
index_first_word = word_buffer_get_current_index(wb);
}
if(word_get_sent_seg(w)){
index_last_word = word_buffer_get_current_index(wb);
new_sentence = 1;
if(first_sentence == 1)
first_sentence = 0;
else
fprintf(output_file, ",");
fprintf(output_file, "\n");
print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
if(ctx->root_dir){
/* Create the root directory when stat() fails on it; the result of mkdir is not checked. */
if(stat(ctx->root_dir, &st) == -1) {
mkdir(ctx->root_dir, 0700);
fprintf(stderr, "creating directory %s\n", ctx->root_dir);
}
} while(word_buffer_move_right(wb));
print_footer(output_file);
if(ctx->conll_filename)
do{
w = word_buffer_b0(wb);
if(w == NULL) break;
/* presumably writes the string form of the DIRECTORY / FILE column of w into the buffer -- TODO confirm */
word_sprint_col_n(current_directory, w, ctx->mcd_struct->wf2col[MCD_WF_DIRECTORY]);
word_sprint_col_n(current_file, w, ctx->mcd_struct->wf2col[MCD_WF_FILE]);
/* Directory changed: rebuild destination_dir and create it if stat() fails on it. */
if(strcmp(current_directory, previous_directory)){
strcpy(destination_dir, ctx->root_dir);
strcat(destination_dir, "/");
strcat(destination_dir, current_directory);
if (stat(destination_dir, &st) == -1) {
mkdir(destination_dir, 0700);
fprintf(stderr, "creating directory %s\n", destination_dir);
}
}
/* File changed: finish the current output file (if any) and start a new one.
 * destination_dir is only rebuilt in the directory-changed branch above. */
if(strcmp(current_file, previous_file)){
strcpy(destination_file, destination_dir);
strcat(destination_file, "/");
strcat(destination_file, current_file);
strcat(destination_file, ".json");
fprintf(stderr, "creating file %s\n", destination_file);
if(output_file){
print_footer(output_file);
fclose(output_file);
}
output_file = myfopen_no_exit(destination_file, "w");
/* The header records the path relative to the root: <directory>/<file>.json */
strcpy(filename_for_header, current_directory);
strcat(filename_for_header, "/");
strcat(filename_for_header, current_file);
strcat(filename_for_header, ".json");
print_header(output_file, ctx->mcd_struct, filename_for_header);
/* restart the comma handling so the first sentence of the new file gets no leading "," */
first_sentence = 1;
}
if(new_sentence){
new_sentence = 0;
sentence_nb++;
index_first_word = word_buffer_get_current_index(wb);
}
if(word_get_sent_seg(w)){
index_last_word = word_buffer_get_current_index(wb);
new_sentence = 1;
if(first_sentence == 1)
first_sentence = 0;
else
fprintf(output_file, ",");
fprintf(output_file, "\n");
print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
}
strcpy(previous_file, current_file);
strcpy(previous_directory, current_directory);
} while(word_buffer_move_right(wb));
/* NOTE(review): on the root_dir path output_file is only assigned inside the loop,
 * so it is still NULL here if the loop breaks on its first word -- confirm. */
print_footer(output_file);
fclose(output_file);
/* NOTE(review): mcf2json_context_free(ctx) is also called after the if/else below;
 * if both calls are part of the final file, ctx is released twice on this path -- confirm. */
mcf2json_context_free(ctx);
}
else{ //ctx->root_dir is NULL dump everything to stdout
output_file = stdout;
print_header(output_file, ctx->mcd_struct, "");
do{
w = word_buffer_b0(wb);
if(new_sentence){
new_sentence = 0;
sentence_nb++;
index_first_word = word_buffer_get_current_index(wb);
}
if(word_get_sent_seg(w)){
index_last_word = word_buffer_get_current_index(wb);
new_sentence = 1;
if(first_sentence == 1)
first_sentence = 0;
else
fprintf(output_file, ",");
fprintf(output_file, "\n");
print_sentence(output_file, sentence_nb, wb, index_first_word, index_last_word);
}
} while(word_buffer_move_right(wb));
print_footer(output_file);
}
mcf2json_context_free(ctx);
return 0;
}
......@@ -63,18 +63,19 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref, int root_label)
/* s0 is the root of the sentence */
if((s0_label == root_label)
// && (word_get_label(word_buffer_get_word_n(config_get_buffer(c), s0_index)) != root_label)
&& check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index)
&& check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index)
){
return MVT_PARSER_ROOT;
}
/* word on the top of the stack is an end of sentence marker */
if((word_get_sent_seg(word_buffer_get_word_n(ref, s0_index)) == 1)
// && (word_get_sent_seg(word_buffer_get_word_n(config_get_buffer(c), s0_index)) != 1)
&& check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index)
&& check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index)
){
return MVT_PARSER_EOS;
}
/* LEFT ARC b0 is the governor and s0 the dependent */
if(s0_gov_index == b0_index){
return movement_parser_left_code(word_get_label(word_buffer_get_word_n(ref, s0_index)));
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment