Review notes (free text ahead of the first "diff --git" header, outside every hunk):
  * common/include/utf8string.hpp and common/src/utf8string.cpp are new files
    whose content was revised: const-correct comparison operators, no
    out-of-bounds read when nothing is decoded, byte-wise utf8string/std::string
    comparison, stale bytes cleared on single-byte assignment, linear-time
    formatting, documentation comments. Their hunk sizes were updated; the blob
    ids on their "index" lines still name the earlier content.
  * Every other hunk carries the same changes as before; only line breaks and
    indentation were restored (indentation width is a best guess).

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e305461dd134521c086308e1c4307ec633bafb57..1bef268a08f064ccfea71dfabc9a1bb132e21d2f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,12 +15,12 @@ set(CMAKE_VERBOSE_MAKEFILE 0)
 set(CMAKE_CXX_STANDARD 20)
 
 if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE Debug)
-#  set(CMAKE_BUILD_TYPE Release)
+#  set(CMAKE_BUILD_TYPE Debug)
+  set(CMAKE_BUILD_TYPE Release)
 endif()
 
 set(CMAKE_CXX_FLAGS "-Wall -Wextra")
-set(CMAKE_CXX_FLAGS_DEBUG "-g3 -Ofast")
+set(CMAKE_CXX_FLAGS_DEBUG "-g3")
 set(CMAKE_CXX_FLAGS_RELEASE "-Ofast")
 
 include_directories(common/include)
diff --git a/common/include/utf8string.hpp b/common/include/utf8string.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a3ff542140a7cf2ee89366b03a9ec4d967109ed
--- /dev/null
+++ b/common/include/utf8string.hpp
@@ -0,0 +1,85 @@
+#ifndef UTF8STRING__H
+#define UTF8STRING__H
+
+#include <string>
+#include <vector>
+#include <array>
+#include <fmt/core.h>
+
+namespace util
+{
+
+// One UTF-8 encoded character stored in a fixed 4-byte buffer.
+// Unused trailing bytes are '\0'.
+class utf8char : public std::array<char, 4>
+{
+  public :
+
+  // Builds the null character (all four bytes set to '\0').
+  utf8char();
+  // Stores a single-byte character; the three remaining bytes are cleared.
+  utf8char & operator=(char other);
+  // Stores the character encoded in 'other'. More than one character is
+  // reported through util::myThrow; when nothing can be decoded (e.g. an
+  // empty 'other') the result is the null character.
+  utf8char & operator=(const std::string & other);
+  // True when this is the single-byte character 'other'.
+  bool operator==(char other) const;
+  // True when this is the character encoded in 'other'. More than one
+  // character in 'other' is reported through util::myThrow.
+  bool operator==(const std::string & other) const;
+  bool operator!=(char other) const;
+};
+
+// Sequence of UTF-8 characters, one utf8char per element.
+class utf8string : public std::vector<utf8char>
+{
+  public :
+
+  // Replaces the content with the characters that splitAsUtf8 extracts from 'other'.
+  utf8string & operator=(const std::string & other);
+  utf8string & operator=(const char * const other);
+  // True when the bytes of 'other' are exactly the encoding of this sequence.
+  bool operator==(const std::string & other) const;
+};
+
+};
+
+// Prints the bytes of the character, skipping the '\0' padding.
+template <>
+struct fmt::formatter<util::utf8char>
+{
+  constexpr auto parse(format_parse_context & ctx) { return ctx.begin(); }
+
+  template <typename FormatContext>
+  auto format(const util::utf8char & c, FormatContext & ctx) const
+  {
+    std::string result;
+    for (char cc : c)
+      if (cc)
+        result += cc;
+    return format_to(ctx.out(), "{}", result);
+  }
+};
+
+// Prints every character of the string, one after the other.
+template <>
+struct fmt::formatter<util::utf8string>
+{
+  constexpr auto parse(format_parse_context & ctx) { return ctx.begin(); }
+
+  template <typename FormatContext>
+  auto format(const util::utf8string & s, FormatContext & ctx) const
+  {
+    // Append bytes directly: re-formatting the accumulated result for each
+    // character copied it every time (quadratic in the string length).
+    std::string result;
+    for (const auto & c : s)
+      for (char cc : c)
+        if (cc)
+          result += cc;
+    return format_to(ctx.out(), "{}", result);
+  }
+};
+
+#endif
diff --git a/common/include/util.hpp b/common/include/util.hpp
index 3351b45d2b53580a47f077db613a119fd84c6b14..17d4ee8add73219e0a4f4798aa89a885f110c8dd 100644
--- a/common/include/util.hpp
+++ b/common/include/util.hpp
@@ -22,50 +22,53 @@
 #include <experimental/source_location>
 #include <boost/flyweight.hpp>
 #include "utf8.hpp"
+#include "utf8string.hpp"
 
 namespace util
 {
-typedef std::array<char, 4> utf8char;
-typedef std::basic_string<utf8char> utf8string;
-
-std::string_view getFilenameFromPath(std::string_view s);
-
-std::vector<std::string_view> split(std::string_view s, char delimiter);
-utf8string splitAsUtf8(std::string_view s);
 
 void warning(std::string_view message, const std::experimental::source_location & location = std::experimental::source_location::current());
 void error(std::string_view message, const std::experimental::source_location & location = std::experimental::source_location::current());
 void error(const std::exception & e, const std::experimental::source_location & location = std::experimental::source_location::current());
 void myThrow(std::string_view message, const std::experimental::source_location & location = std::experimental::source_location::current());
 
+std::string_view getFilenameFromPath(std::string_view s);
+
+std::vector<std::string_view> split(std::string_view s, char delimiter);
+
+utf8string splitAsUtf8(std::string_view s);
+
 std::string int2HumanStr(int number);
 
-bool isEmpty(const std::string & s);
-bool isEmpty(const boost::flyweight<std::string> & s);
-};
+template <typename T>
+bool isEmpty(const std::vector<T> & s)
+{
+  return s.empty();
+}
 
-template <>
-struct fmt::formatter<std::experimental::source_location>
+template <typename T>
+bool isEmpty(const std::basic_string<T> & s)
 {
-  constexpr auto parse(format_parse_context & ctx) { return ctx.begin(); }
+  return s.empty();
+}
+
+template <typename T>
+bool isEmpty(const boost::flyweight<T> & s)
+{
+  return isEmpty(s.get());
+}
 
-  template <typename FormatContext>
-  auto format(const std::experimental::source_location & d, FormatContext & ctx)
-  {
-    return format_to(ctx.out(), "{},l.{},'{}'", util::getFilenameFromPath(d.file_name()), d.line(), d.function_name());
-  }
 };
 
 template <>
-struct fmt::formatter<util::utf8char>
+struct fmt::formatter<std::experimental::source_location>
 {
   constexpr auto parse(format_parse_context & ctx) { return ctx.begin(); }
 
   template <typename FormatContext>
-  auto format(const util::utf8char & c, FormatContext & ctx)
+  auto format(const std::experimental::source_location & d, FormatContext & ctx)
   {
-    char * asPtr = (char*)&c;
-    return format_to(ctx.out(), "{}{}{}{}", asPtr[0] ? asPtr[0] : '\0', asPtr[1] ? asPtr[1] : '\0', asPtr[2] ? asPtr[2] : '\0', asPtr[3] ? asPtr[3] : '\0');
+    return format_to(ctx.out(), "{},l.{},'{}'", util::getFilenameFromPath(d.file_name()), d.line(), d.function_name());
   }
 };
 
@@ -81,7 +84,4 @@ struct fmt::formatter<boost::flyweight<T>>
   }
 };
 
-std::string_view operator+(std::string_view a, std::string_view b);
-void operator+=(std::string_view & a, std::string_view b);
-
 #endif
diff --git a/common/src/utf8string.cpp b/common/src/utf8string.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..688b607ce626a8bf2bd4ecc24ed14c5f1cff3c46
--- /dev/null
+++ b/common/src/utf8string.cpp
@@ -0,0 +1,90 @@
+#include "utf8string.hpp"
+#include "util.hpp"
+
+// Every byte starts as '\0', i.e. the null character.
+util::utf8char::utf8char()
+{
+  fill('\0');
+}
+
+util::utf8char & util::utf8char::operator=(char other)
+{
+  // Clear the whole buffer first so that no continuation byte of a previous
+  // multi-byte value survives behind the new single-byte character.
+  fill('\0');
+  (*this)[0] = other;
+  return *this;
+}
+
+util::utf8char & util::utf8char::operator=(const std::string & other)
+{
+  auto splited = splitAsUtf8(other);
+  if (splited.size() > 1)
+    myThrow(fmt::format("Assigning invalid utf8 character '{}'", other));
+  // Nothing was decoded (e.g. empty string): reset to the null character
+  // instead of reading splited[0] out of bounds.
+  if (splited.empty())
+  {
+    fill('\0');
+    return *this;
+  }
+  return *this = splited[0];
+}
+
+bool util::utf8char::operator==(const std::string & other) const
+{
+  auto splited = splitAsUtf8(other);
+  if (splited.size() > 1)
+    myThrow(fmt::format("Comparing with invalid utf8 character '{}'", other));
+  // Nothing was decoded (e.g. empty string): only the null character matches.
+  if (splited.empty())
+    return *this == '\0';
+  return *this == splited[0];
+}
+
+bool util::utf8char::operator==(char other) const
+{
+  return (*this)[0] == other && !(*this)[1] && !(*this)[2] && !(*this)[3];
+}
+
+bool util::utf8char::operator!=(char other) const
+{
+  return !(*this == other);
+}
+
+util::utf8string & util::utf8string::operator=(const std::string & other)
+{
+  auto splited = splitAsUtf8(other);
+  assign(splited.begin(), splited.end());
+  return *this;
+}
+
+util::utf8string & util::utf8string::operator=(const char * const other)
+{
+  return operator=(std::string(other));
+}
+
+// Compares byte-wise against the encoding of this sequence. Comparing
+// size() with other.size() would match a character count against a byte
+// count and reject every string containing a multi-byte character.
+bool util::utf8string::operator==(const std::string & other) const
+{
+  std::size_t pos = 0;
+
+  for (const auto & c : *this)
+  {
+    // The first byte always counts (the null character is the byte '\0');
+    // the following ones count until the '\0' padding starts.
+    std::size_t len = 1;
+    while (len < c.size() && c[len])
+      ++len;
+
+    if (other.compare(pos, len, c.data(), len) != 0)
+      return false;
+
+    pos += len;
+  }
+
+  return pos == other.size();
+}
+
diff --git a/common/src/util.cpp b/common/src/util.cpp
index 0c53e98eee7555e16ee766550524678c810b38b5..288b9a96fd6e14f436f2cc974236634bda701964 100644
--- a/common/src/util.cpp
+++ b/common/src/util.cpp
@@ -42,17 +42,18 @@ utf8string splitAsUtf8(std::string_view s)
   utf8string result;
   const char * beginPtr = s.data();
   const char * currentPtr = beginPtr;
-  const char * endPtr = s.data()+s.size()-1;
-
-  if (!utf8::is_valid(beginPtr, endPtr))
-    myThrow("Not a valid utf8 input");
+  const char * endPtr = s.data()+s.size();
 
   while (currentPtr < endPtr)
   {
-    utf8::next(currentPtr, endPtr);
+    try {utf8::next(currentPtr, endPtr);}
+    catch (std::exception &)
+    {
+      break;
+    }
     if (currentPtr - beginPtr > 4 || currentPtr - beginPtr == 0)
       myThrow(fmt::format("Invalid utf8 character at index {}", beginPtr-s.data()));
-    utf8char c = {};
+    utf8char c;
     for (int i = 0; i < currentPtr - beginPtr; i++)
       ((char*)&c)[i] = beginPtr[i];
     beginPtr = currentPtr;
@@ -98,28 +99,5 @@ std::string int2HumanStr(int number)
   return result;
 }
 
-bool isEmpty(const std::string & s)
-{
-  return s.empty();
-}
-
-bool isEmpty(const boost::flyweight<std::string> & s)
-{
-  return s.get().empty();
-}
-
 };
 
-std::string_view operator+(std::string_view a, std::string_view b)
-{
-  if (b.data() <= a.data())
-    util::myThrow(fmt::format("std::string_view a+b with b <= a"));
-
-  return std::string_view(a.data(), b.data()-a.data()+b.size());
-}
-
-void operator+=(std::string_view & a, std::string_view b)
-{
-  a = a + b;
-}
-
diff --git a/dev/src/dev.cpp b/dev/src/dev.cpp
index 5ebd43520b719da2059b7c543f5690742faa6976..9eba067713160ff2f7ddd9bae3edcd5dc2ccd793 100644
--- a/dev/src/dev.cpp
+++ b/dev/src/dev.cpp
@@ -13,10 +13,10 @@ int main(int argc, char * argv[])
 
   std::vector<SubConfig> configs;
 
-  for (int i = 0; i < 1; i++)
+  for (int i = 0; i < 2; i++)
     configs.emplace_back(config);
 
-
+  configs[0].print(stdout);
   fmt::print(stderr, "ok\n");
 
   std::scanf("%*c");
diff --git a/reading_machine/include/Config.hpp b/reading_machine/include/Config.hpp
index a83449649de57ccb55af914753c199896ebedf22..1819615fd48c0d04ed51e1a687b4fc6246f8e0d7 100644
--- a/reading_machine/include/Config.hpp
+++ b/reading_machine/include/Config.hpp
@@ -5,6 +5,7 @@
 #include <string>
 #include <vector>
 #include <boost/flyweight.hpp>
+#include "util.hpp"
 
 class Config
 {
@@ -17,12 +18,13 @@ class Config
 
   private :
 
-  //using String = boost::flyweight<std::string>;
-  using String = std::string;
-  std::vector<String> lines;
+  using String = boost::flyweight<std::string>;
+  using Utf8String = boost::flyweight<util::utf8string>;
   using ValueIterator = std::vector<String>::iterator;
   using ConstValueIterator = std::vector<String>::const_iterator;
 
+  std::vector<String> lines;
+
   protected :
 
   virtual std::size_t getNbColumns() const = 0;
@@ -39,15 +41,19 @@ class Config
   void resizeLines(unsigned int nbLines);
   String & get(const std::string & colName, int lineIndex, int hypothesisIndex);
   String & get(int colIndex, int lineIndex, int hypothesisIndex);
+  const String & getConst(const std::string & colName, int lineIndex, int hypothesisIndex) const;
+  const String & getConst(int colIndex, int lineIndex, int hypothesisIndex) const;
   String & getLastNotEmpty(const std::string & colName, int lineIndex);
   String & getLastNotEmpty(int colIndex, int lineIndex);
+  const String & getLastNotEmptyConst(const std::string & colName, int lineIndex) const;
+  const String & getLastNotEmptyConst(int colIndex, int lineIndex) const;
   ValueIterator getIterator(int colIndex, int lineIndex, int hypothesisIndex);
   ConstValueIterator getConstIterator(int colIndex, int lineIndex, int hypothesisIndex) const;
 
   public :
 
   virtual ~Config() {}
-  void print(FILE * dest);
+  void print(FILE * dest) const;
 };
 
 #endif
diff --git a/reading_machine/include/SubConfig.hpp b/reading_machine/include/SubConfig.hpp
index daaa0cce2b5e749bbaa419dccb34a5fb24715364..5a6db8e4eebac5b864ca83948bfde3d5e70787d0 100644
--- a/reading_machine/include/SubConfig.hpp
+++ b/reading_machine/include/SubConfig.hpp
@@ -25,7 +25,7 @@ class SubConfig : public Config
   public :
 
   SubConfig(BaseConfig & model);
-  void update();
+  bool update();
 };
 
 #endif
diff --git a/reading_machine/src/Config.cpp b/reading_machine/src/Config.cpp
index cddcdfebb65c12dc8063293148d04903154db963..839cca9139c9f21775341e830916baf5a6575411 100644
--- a/reading_machine/src/Config.cpp
+++ b/reading_machine/src/Config.cpp
@@ -27,23 +27,33 @@ Config::String & Config::get(const std::string & colName, int lineIndex, int hyp
   return get(getColIndex(colName), lineIndex, hypothesisIndex);
 }
 
+const Config::String & Config::getConst(const std::string & colName, int lineIndex, int hypothesisIndex) const
+{
+  return getConst(getColIndex(colName), lineIndex, hypothesisIndex);
+}
+
 Config::String & Config::get(int colIndex, int lineIndex, int hypothesisIndex)
 {
   return *getIterator(colIndex, lineIndex, hypothesisIndex);
 }
 
+const Config::String & Config::getConst(int colIndex, int lineIndex, int hypothesisIndex) const
+{
+  return *getConstIterator(colIndex, lineIndex, hypothesisIndex);
+}
+
 std::size_t Config::getNbLines() const
 {
   return lines.size() / getIndexOfCol(getNbColumns());
 }
 
-void Config::print(FILE * dest)
+void Config::print(FILE * dest) const
 {
   for (unsigned int line = 0; line < getNbLines(); line++)
   {
     for (unsigned int i = 0; i < getNbColumns()-1; i++)
-      fmt::print(dest, "{}{}", getLastNotEmpty(i, line), i < getNbColumns()-2 ? "\t" : "\n");
-    if (getLastNotEmpty(EOSColName, line) == EOSSymbol1)
+      fmt::print(dest, "{}{}", getLastNotEmptyConst(i, getFirstLineIndex()+line), i < getNbColumns()-2 ? "\t" : "\n");
+    if (getLastNotEmptyConst(EOSColName, getFirstLineIndex()+line) == EOSSymbol1)
       fmt::print(dest, "\n");
   }
 }
@@ -51,6 +61,18 @@ void Config::print(FILE * dest)
 Config::String & Config::getLastNotEmpty(int colIndex, int lineIndex)
 {
   int baseIndex = getIndexOfLine(lineIndex-getFirstLineIndex()) + getIndexOfCol(colIndex);
+
+  for (int i = nbHypothesesMax; i > 0; --i)
+    if (!util::isEmpty(lines[baseIndex+i]))
+      return lines[baseIndex+i];
+
+  return lines[baseIndex];
+}
+
+const Config::String & Config::getLastNotEmptyConst(int colIndex, int lineIndex) const
+{
+  int baseIndex = getIndexOfLine(lineIndex-getFirstLineIndex()) + getIndexOfCol(colIndex);
+
   for (int i = nbHypothesesMax; i > 0; --i)
     if (!util::isEmpty(lines[baseIndex+i]))
       return lines[baseIndex+i];
@@ -63,6 +85,11 @@ Config::String & Config::getLastNotEmpty(const std::string & colName, int lineIn
   return getLastNotEmpty(getColIndex(colName), lineIndex);
 }
 
+const Config::String & Config::getLastNotEmptyConst(const std::string & colName, int lineIndex) const
+{
+  return getLastNotEmptyConst(getColIndex(colName), lineIndex);
+}
+
 Config::ValueIterator Config::getIterator(int colIndex, int lineIndex, int hypothesisIndex)
 {
   return lines.begin() + getIndexOfLine(lineIndex-getFirstLineIndex()) + getIndexOfCol(colIndex) + hypothesisIndex;
diff --git a/reading_machine/src/SubConfig.cpp b/reading_machine/src/SubConfig.cpp
index 75c6de628b581890770d2a4063c86ee7210a6e35..af9c3af22e48bf96faa6f3be142d06479a9c10f6 100644
--- a/reading_machine/src/SubConfig.cpp
+++ b/reading_machine/src/SubConfig.cpp
@@ -4,47 +4,33 @@ SubConfig::SubConfig(BaseConfig & model) : model(model)
 {
   firstLineIndex = 0;
   update();
-  update();
-  print(stdout);
 }
 
-void SubConfig::update()
+bool SubConfig::update()
 {
-  fmt::print(stderr, "Begin\n");
   unsigned int currentLastLineIndex = firstLineIndex + getNbLines();
 
   if (currentLastLineIndex >= model.getNbLines()-1)
-    return;
+    return false;
 
-  unsigned int newFirstLineIndex = 0.8*currentLastLineIndex;
+  unsigned int newFirstLineIndex = firstLineIndex + 0.8*(currentLastLineIndex-firstLineIndex);
   unsigned int newLastLineIndex = std::min(newFirstLineIndex + spanSize, model.getNbLines());
   unsigned int newLineNumber = newLastLineIndex - newFirstLineIndex;
 
-  fmt::print(stderr, "FirstlineIndex = {}\n", firstLineIndex);
-  fmt::print(stderr, "newFirstlineIndex = {}\n", newFirstLineIndex);
-
   if (getNbLines() < newLineNumber)
-  {
-    fmt::print(stderr, "Resizing because {} < {}\n", getNbLines(), newLineNumber);
     resizeLines(newLineNumber);
-  }
 
   {
     auto linesBegin = getIterator(0, firstLineIndex, 0);
     auto firstToSave = getConstIterator(0, newFirstLineIndex, 0);
     auto lastToSave = getConstIterator(0, currentLastLineIndex, 0);
 
-    fmt::print(stderr, "Copying from {} to {} into {}\n", newFirstLineIndex, currentLastLineIndex, firstLineIndex);
-
     while (firstToSave != lastToSave)
       (*linesBegin++) = (*firstToSave++);
   }
 
   if (getNbLines() > newLineNumber)
-  {
-    fmt::print(stderr, "Resizing because {} > {}\n", getNbLines(), newLineNumber);
     resizeLines(newLineNumber);
-  }
 
   {
     unsigned int nbLinesCopied = currentLastLineIndex - newFirstLineIndex;
@@ -52,17 +38,13 @@ void SubConfig::update()
     auto firstToSave = model.getConstIterator(0, currentLastLineIndex, 0);
     auto lastToSave = model.getConstIterator(0, newLastLineIndex, 0);
 
-    fmt::print(stderr, "Copying from {} to {} into {}\n", currentLastLineIndex, newLastLineIndex, firstLineIndex+nbLinesCopied);
-    if (newlinesBegin < getIterator(0, 0, 0))
-      fmt::print(stderr, "Bug\n");
-
     while (firstToSave != lastToSave)
       (*newlinesBegin++) = (*firstToSave++);
 
     firstLineIndex = newFirstLineIndex;
   }
 
-  fmt::print(stderr, "End\n");
+  return true;
 }
 
 std::size_t SubConfig::getNbColumns() const