Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
O
old_macaon
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container registry
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Franck Dary
old_macaon
Commits
ca56eef3
Commit
ca56eef3
authored
Apr 10, 2019
by
Franck Dary
Browse files
Options
Downloads
Patches
Plain Diff
Added program macaon_compute_l_rules
parent
0307121c
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
maca_common/CMakeLists.txt
+5
-0
5 additions, 0 deletions
maca_common/CMakeLists.txt
maca_common/src/macaon_compute_l_rules.cpp
+131
-0
131 additions, 0 deletions
maca_common/src/macaon_compute_l_rules.cpp
maca_common/src/util.cpp
+18
-20
18 additions, 20 deletions
maca_common/src/util.cpp
with
154 additions
and
20 deletions
maca_common/CMakeLists.txt
+
5
−
0
View file @
ca56eef3
FILE
(
GLOB SOURCES src/*.cpp
)
FILE
(
GLOB SOURCES src/*.cpp
)
add_executable
(
macaon_compute_l_rules src/macaon_compute_l_rules.cpp
)
target_link_libraries
(
macaon_compute_l_rules
${
Boost_PROGRAM_OPTIONS_LIBRARY
}
)
target_link_libraries
(
macaon_compute_l_rules maca_common
)
install
(
TARGETS macaon_compute_l_rules DESTINATION bin
)
#compiling library
#compiling library
add_library
(
maca_common STATIC
${
SOURCES
}
)
add_library
(
maca_common STATIC
${
SOURCES
}
)
target_link_libraries
(
maca_common fasttext
)
target_link_libraries
(
maca_common fasttext
)
This diff is collapsed.
Click to expand it.
maca_common/src/macaon_compute_l_rules.cpp
0 → 100644
+
131
−
0
View file @
ca56eef3
/// \file macaon_compute_l_rules.cpp
/// \author Franck Dary
/// @version 1.0
/// @date 2019-04-10
#include
<cstdio>
#include
<cstdlib>
#include
<iostream>
#include
"File.hpp"
#include
"util.hpp"
#include
<boost/program_options.hpp>
namespace
po
=
boost
::
program_options
;
/// @brief Build the description of every command-line option this program accepts.
///
/// The options are organised in two groups: a "Required" group
/// (fplm, exceptions, rules, threshold) and an "Optional" group
/// (help, strict, debug). Both groups are merged into a single description.
///
/// @return The merged description of required and optional options.
po::options_description getOptionsDescription()
{
  // Options the user must provide; each one is flagged with required().
  po::options_description requiredGroup("Required");
  requiredGroup.add_options()
    ("fplm,f", po::value<std::string>()->required(),
      "fplm file that contains words and their lemmas")
    ("exceptions,e", po::value<std::string>()->required(),
      "Output filename for exceptions")
    ("rules,r", po::value<std::string>()->required(),
      "Output filename for rules")
    ("threshold,t", po::value<int>()->required(),
      "Number of times a rule must be used in the fplm before it is outputted");

  // Flags that take no value and may be omitted.
  po::options_description optionalGroup("Optional");
  optionalGroup.add_options()
    ("help,h", "Produce this help message")
    ("strict,s", "TODO : find what it does")
    ("debug,d", "Print infos on stderr");

  // Top-level description: required options are listed before optional ones.
  po::options_description allOptions("Command-Line Arguments ");
  allOptions.add(requiredGroup);
  allOptions.add(optionalGroup);

  return allOptions;
}
/// @brief Parse the command line and store the result inside a variables_map.
///
/// On a parsing or validation error, the error message and the usage are
/// printed on stderr and the program exits with status 1. If the help option
/// is present, the usage is printed on stdout and the program exits with
/// status 0.
///
/// @param od The description of all the possible options.
/// @param argc The number of arguments given to this program.
/// @param argv The values of arguments given to this program.
///
/// @return The variables map filled with the parsed options.
po::variables_map checkOptions(po::options_description & od, int argc, char ** argv)
{
  // Common failure path shared by the parsing step and the notify step.
  auto reportAndExit = [&od](std::exception & error)
  {
    std::cerr << "Error: " << error.what() << "\n";
    od.print(std::cerr);
    exit(1);
  };

  po::variables_map vm;

  try
  {
    po::store(po::parse_command_line(argc, argv, od), vm);
  }
  catch (std::exception & e)
  {
    reportAndExit(e);
  }

  // The help request is handled before notify() is called on the map.
  if (vm.count("help") != 0)
  {
    std::cout << od << "\n";
    exit(0);
  }

  try
  {
    po::notify(vm);
  }
  catch (std::exception & e)
  {
    reportAndExit(e);
  }

  return vm;
}
/// @brief Given a fplm file (pairs of word / lemma), compute rules that will transform these words into lemmas, as well as exceptions.
///
/// @param argc The number of arguments given to this program.
/// @param argv[] Array of arguments given to this program.
///
/// @return 0 if there was no crash.
int
main
(
int
argc
,
char
*
argv
[])
{
auto
od
=
getOptionsDescription
();
po
::
variables_map
vm
=
checkOptions
(
od
,
argc
,
argv
);
std
::
string
fplmFilename
=
vm
[
"fplm"
].
as
<
std
::
string
>
();
std
::
string
exceptionsFilename
=
vm
[
"exceptions"
].
as
<
std
::
string
>
();
std
::
string
rulesFilename
=
vm
[
"rules"
].
as
<
std
::
string
>
();
int
threshold
=
vm
[
"threshold"
].
as
<
int
>
();
bool
strict
=
vm
.
count
(
"strict"
)
==
0
?
false
:
true
;
File
fplm
(
fplmFilename
,
"r"
);
char
buffer
[
100000
];
std
::
map
<
std
::
string
,
int
>
rules
;
while
(
fscanf
(
fplm
.
getDescriptor
(),
"%[^
\n
]
\n
"
,
buffer
)
==
1
)
{
auto
splited
=
split
(
buffer
,
'\t'
);
if
(
splited
.
size
()
!=
4
)
{
fprintf
(
stderr
,
"ERROR (%s) : fplm line
\'
%s
\'
wrong format. Aborting.
\n
"
,
ERRINFO
,
buffer
);
exit
(
1
);
}
auto
form
=
splited
[
0
];
auto
lemma
=
splited
[
2
];
auto
rule
=
getRule
(
form
,
lemma
);
rules
[
rule
]
++
;
}
File
rulesFile
(
rulesFilename
,
"w"
);
File
exceptionsFile
(
exceptionsFilename
,
"w"
);
for
(
auto
&
it
:
rules
)
{
if
(
it
.
second
>=
threshold
)
fprintf
(
rulesFile
.
getDescriptor
(),
"%s
\n
"
,
it
.
first
.
c_str
());
else
fprintf
(
exceptionsFile
.
getDescriptor
(),
"%s
\n
"
,
it
.
first
.
c_str
());
}
return
0
;
}
This diff is collapsed.
Click to expand it.
maca_common/src/util.cpp
+
18
−
20
View file @
ca56eef3
...
@@ -206,32 +206,30 @@ std::string getRule(const std::string & Ufrom, const std::string & Uto)
...
@@ -206,32 +206,30 @@ std::string getRule(const std::string & Ufrom, const std::string & Uto)
std
::
string
from
=
toLowerCase
(
Ufrom
);
std
::
string
from
=
toLowerCase
(
Ufrom
);
std
::
string
to
=
toLowerCase
(
Uto
);
std
::
string
to
=
toLowerCase
(
Uto
);
unsigned
int
prefixFrom
=
0
;
int
fromL
=
getNbSymbols
(
from
);
unsigned
int
prefixTo
=
0
;
int
toL
=
getNbSymbols
(
to
);
int
minL
=
std
::
min
(
fromL
,
toL
);
for
(;
prefixFrom
<
Ufrom
.
size
()
&&
prefixTo
<
Uto
.
size
();)
int
longestCommonPrefix
=
0
;
{
if
(
from
[
prefixFrom
]
==
to
[
prefixTo
]
)
for
(
int
i
=
0
;
i
<
minL
;
i
++
)
{
{
prefixFrom
++
;
int
limitFrom
=
getEndIndexOfNthSymbol
(
from
,
i
);
prefixTo
++
;
int
limitTo
=
getEndIndexOfNthSymbol
(
to
,
i
);
continue
;
}
if
(
limitFrom
==
limitTo
&&
!
memcmp
(
from
.
c_str
(),
to
.
c_str
(),
limitFrom
+
1
))
longestCommonPrefix
++
;
else
break
;
break
;
}
}
std
::
string
rule
;
int
prefixEndIndex
=
getEndIndexOfNthSymbol
(
from
,
longestCommonPrefix
-
1
);
rule
.
push_back
(
'@'
);
int
suffixStartIndex
=
prefixEndIndex
+
1
;
for
(
unsigned
int
i
=
prefixFrom
;
i
<
from
.
size
();
i
++
)
rule
.
push_back
(
from
[
i
]);
rule
.
push_back
(
'@'
);
for
(
unsigned
int
i
=
prefixTo
;
i
<
to
.
size
();
i
++
)
rule
.
push_back
(
to
[
i
]);
if
(
rule
.
size
()
>=
20
)
std
::
string
toDelete
(
from
.
begin
()
+
suffixStartIndex
,
from
.
end
());
rule
=
"@@"
;
std
::
string
toAdd
(
to
.
begin
()
+
suffixStartIndex
,
to
.
end
())
;
return
rule
;
return
"@"
+
toDelete
+
"@"
+
toAdd
;
}
}
bool
ruleIsAppliable
(
const
std
::
string
&
Ufrom
,
const
std
::
string
&
rule
)
bool
ruleIsAppliable
(
const
std
::
string
&
Ufrom
,
const
std
::
string
&
rule
)
...
@@ -458,7 +456,7 @@ int getEndIndexOfNthSymbol(const std::string & s, int n)
...
@@ -458,7 +456,7 @@ int getEndIndexOfNthSymbol(const std::string & s, int n)
auto
it
=
s
.
begin
();
auto
it
=
s
.
begin
();
for
(
int
i
=
0
;
i
<
n
+
1
;
i
++
)
for
(
int
i
=
0
;
i
<
n
+
1
;
i
++
)
try
{
utf8
::
next
(
it
,
s
.
end
());}
try
{
utf8
::
next
(
it
,
s
.
end
());}
catch
(
utf8
::
not_enough_room
&
)
{
return
i
==
n
?
s
.
end
()
-
s
.
begin
()
:
-
1
;}
catch
(
utf8
::
not_enough_room
&
)
{
return
i
==
n
?
s
.
end
()
-
s
.
begin
()
-
1
:
-
1
;}
return
(
it
-
1
)
-
s
.
begin
();
return
(
it
-
1
)
-
s
.
begin
();
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment