Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Alexis Nasr
macaon2
Commits
ee98b081
Commit
ee98b081
authored
Feb 07, 2018
by
Alexis Nasr
Browse files
fixed some details in mcf2json modified tokenizer to ignore comments
parent
35f18a2d
Changes
3
Hide whitespace changes
Inline
Side-by-side
maca_tokenizer/src/en_tok_rules.l
View file @
ee98b081
...
...
@@ -12,6 +12,7 @@ extern char *token;
/*%option noyywrap*/
%%
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
[ \t]+ {maca_tokenizer_segment((char *)"", yytext);}
[ ]*\. {maca_tokenizer_segment((char *)".", yytext);}
...
...
maca_tokenizer/src/fr_tok_rules.l
View file @
ee98b081
...
...
@@ -26,7 +26,7 @@ nosepar [^ \t\n]
if(defait_amalgames){
BEGIN(state_defait_amalgames);
}
#.* ECHO;
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
{separ}+ {maca_tokenizer_segment((char *)"", yytext);}
\. {maca_tokenizer_segment((char *)".", yytext);}
...
...
maca_tools/src/mcf2json.c
View file @
ee98b081
...
...
@@ -135,7 +135,7 @@ void print_footer(FILE *output_file)
}
void
print_header
(
FILE
*
output_file
,
mcd
*
mcd_struct
)
void
print_header
(
FILE
*
output_file
,
mcd
*
mcd_struct
,
char
*
filename
)
{
int
pos_col
=
mcd_get_pos_col
(
mcd_struct
);
int
label_col
=
mcd_get_label_col
(
mcd_struct
);
...
...
@@ -149,6 +149,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf
(
output_file
,
"
\"
header
\"
:{
\n
"
);
fprintf
(
output_file
,
"
\"
id
\"
:
\"\"
,
\n
"
);
fprintf
(
output_file
,
"
\"
timestamp
\"
:
\"\"
,
\n
"
);
fprintf
(
output_file
,
"
\"
filename
\"
:
\"
%s
\"
,
\n
"
,
filename
);
fprintf
(
output_file
,
"
\"
labels_segment
\"
: ["
);
for
(
i
=
0
;
i
<
dico_pos
->
nbelem
;
i
++
){
...
...
@@ -166,7 +167,7 @@ void print_header(FILE *output_file, mcd *mcd_struct)
fprintf
(
output_file
,
"},
\n
"
);
fprintf
(
output_file
,
"
\"
annotation
s
\"
:{
\n
"
);
fprintf
(
output_file
,
"
\"
annotation
\"
:{
\n
"
);
fprintf
(
output_file
,
"
\"
name
\"
:
\"\"
,
\n
"
);
fprintf
(
output_file
,
"
\"
time_start
\"
:
\"\"
,
\n
"
);
fprintf
(
output_file
,
"
\"
time_end
\"
:
\"\"\n
"
);
...
...
@@ -233,16 +234,16 @@ void print_links(FILE *output_file, word_buffer *wb, int index_first_word, int i
}
void
print_segment
(
FILE
*
output_file
,
word_buffer
*
wb
,
int
index
)
void
print_segment
(
FILE
*
output_file
,
word_buffer
*
wb
,
int
index_first_word
,
int
index
)
{
int
pos_col
=
mcd_get_pos_col
(
word_buffer_get_mcd
(
wb
));
word
*
w
=
word_buffer_get_word_n
(
wb
,
index
);
fprintf
(
output_file
,
"{ "
);
/* fprintf(output_file, "\"start\": %d, ", word_get_offset(w)); */
fprintf
(
output_file
,
"
\"
start
\"
: %d, "
,
index
);
fprintf
(
output_file
,
"
\"
start
\"
: %d, "
,
index
-
index_first_word
);
/* fprintf(output_file, "\"end\": %d, ", word_get_offset(w) + word_get_length(w) - 1); */
fprintf
(
output_file
,
"
\"
end
\"
: %d, "
,
index
);
fprintf
(
output_file
,
"
\"
end
\"
: %d, "
,
index
-
index_first_word
);
fprintf
(
output_file
,
"
\"
label
\"
:
\"
"
);
if
(
pos_col
!=
-
1
)
...
...
@@ -264,12 +265,12 @@ void print_segments(FILE *output_file, word_buffer *wb, int index_first_word, in
{
int
index
;
int
first_segment
=
1
;
fprintf
(
output_file
,
"
\"
segments
\"
: ["
);
for
(
index
=
index_first_word
;
index
<=
index_last_word
;
index
++
){
if
(
first_segment
==
1
)
first_segment
=
0
;
else
fprintf
(
output_file
,
","
);
fprintf
(
output_file
,
"
\n
"
);
print_segment
(
output_file
,
wb
,
index
);
print_segment
(
output_file
,
wb
,
index_first_word
,
index
);
}
fprintf
(
output_file
,
" ],
\n
"
);
}
...
...
@@ -336,6 +337,7 @@ int main(int argc, char *argv[])
char
current_file
[
1000
];
char
previous_directory
[
1000
];
char
previous_file
[
1000
];
char
filename_for_header
[
1000
];
char
*
root_directory
=
NULL
;
char
destination_file
[
1000
];
char
destination_dir
[
1000
];
...
...
@@ -377,7 +379,12 @@ int main(int argc, char *argv[])
fclose
(
output_file
);
}
output_file
=
myfopen_no_exit
(
destination_file
,
"w"
);
print_header
(
output_file
,
ctx
->
mcd_struct
);
strcpy
(
filename_for_header
,
current_directory
);
strcat
(
filename_for_header
,
"/"
);
strcat
(
filename_for_header
,
current_file
);
strcat
(
filename_for_header
,
".json"
);
print_header
(
output_file
,
ctx
->
mcd_struct
,
filename_for_header
);
first_sentence
=
1
;
}
if
(
new_sentence
){
new_sentence
=
0
;
...
...
@@ -404,7 +411,7 @@ int main(int argc, char *argv[])
else
{
//ctx->root_dir is NULL dump everything to stdout
output_file
=
stdout
;
print_header
(
output_file
,
ctx
->
mcd_struct
);
print_header
(
output_file
,
ctx
->
mcd_struct
,
""
);
do
{
w
=
word_buffer_b0
(
wb
);
if
(
new_sentence
){
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment