Fixed an issue where the StringUtil::toCapitalized() function didn't correctly capitalize multi-byte Unicode characters

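Also modernized some code by switching variable initializations to brace initialization and adding explicit static_cast<unsigned char> conversions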
2025-01-18 15:15:37 +00:00 · 2024-07-16 21:13:50 +02:00 · 2024-07-16 21:13:50 +02:00 · cac5ec0402
parent be3aea7845
commit cac5ec0402
1 changed file with 43 additions and 37 deletions
--- a/es-core/src/utils/StringUtil.cpp
+++ b/es-core/src/utils/StringUtil.cpp
@ -302,8 +302,8 @@ namespace Utils

        unsigned int chars2Unicode(const std::string& stringArg, size_t& cursor)
        {
-            unsigned const char checkCharType = stringArg[cursor];
-            unsigned int result = '?';
+            unsigned const char checkCharType {static_cast<unsigned char>(stringArg[cursor])};
+            unsigned int result {'?'};

            // 0xxxxxxx, one byte character.
            if (checkCharType <= 0x7F) {
@ -376,7 +376,7 @@ namespace Utils
        std::string getFirstCharacter(const std::string& stringArg, bool toUpper)
        {
            std::string firstChar;
-            unsigned const char checkCharType = stringArg.front();
+            unsigned const char checkCharType {static_cast<unsigned char>(stringArg.front())};

            // Normal UTF-8 ASCII character.
            if (checkCharType <= 0x7F)
@ -396,7 +396,7 @@ namespace Utils

        size_t nextCursor(const std::string& stringArg, const size_t cursor)
        {
-            size_t result = cursor;
+            size_t result {cursor};

            while (result < stringArg.length()) {
                ++result;
@ -411,7 +411,7 @@ namespace Utils

        size_t prevCursor(const std::string& stringArg, const size_t cursor)
        {
-            size_t result = cursor;
+            size_t result {cursor};

            while (result > 0) {
                --result;
@ -426,14 +426,14 @@ namespace Utils

        size_t moveCursor(const std::string& stringArg, const size_t cursor, const int amount)
        {
-            size_t result = cursor;
+            size_t result {cursor};

            if (amount > 0) {
-                for (int i = 0; i < amount; ++i)
+                for (int i {0}; i < amount; ++i)
                    result = nextCursor(stringArg, result);
            }
            else if (amount < 0) {
-                for (int i = amount; i < 0; ++i)
+                for (int i {amount}; i < 0; ++i)
                    result = prevCursor(stringArg, result);
            }

@ -445,7 +445,7 @@ namespace Utils
            std::string stringLower;
            unsigned char checkCharType;

-            for (size_t i = 0; i < stringArg.length();) {
+            for (size_t i {0}; i < stringArg.length();) {
                checkCharType = stringArg[i];
                // Normal UTF-8 ASCII character.
                if (checkCharType <= 0x7F) {
@ -470,10 +470,10 @@ namespace Utils
                    wchar_t unicodeChar = firstChar | secondChar;

                    // Try to find an entry for the character in the Unicode uppercase table.
-                    wchar_t* charIndex = std::wcschr(unicodeUppercase, unicodeChar);
+                    wchar_t* charIndex {std::wcschr(unicodeUppercase, unicodeChar)};

                    if (charIndex != nullptr) {
-                        wchar_t lowerChar = *(unicodeLowercase + (charIndex - unicodeUppercase));
+                        wchar_t lowerChar {*(unicodeLowercase + (charIndex - unicodeUppercase))};
                        // Convert back to string format.
                        std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> byteConverter;

@ -497,7 +497,7 @@ namespace Utils
            std::string stringUpper;
            unsigned char checkCharType;

-            for (size_t i = 0; i < stringArg.length();) {
+            for (size_t i {0}; i < stringArg.length();) {
                checkCharType = stringArg[i];
                // Normal UTF-8 ASCII character.
                if (checkCharType <= 0x7F) {
@ -522,10 +522,10 @@ namespace Utils
                    wchar_t unicodeChar = firstChar | secondChar;

                    // Try to find an entry for the character in the Unicode lowercase table.
-                    wchar_t* charIndex = std::wcschr(unicodeLowercase, unicodeChar);
+                    wchar_t* charIndex {std::wcschr(unicodeLowercase, unicodeChar)};

                    if (charIndex != nullptr) {
-                        wchar_t upperChar = *(unicodeUppercase + (charIndex - unicodeLowercase));
+                        wchar_t upperChar {*(unicodeUppercase + (charIndex - unicodeLowercase))};
                        // Convert back to string format.
                        std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> byteConverter;

@ -549,25 +549,31 @@ namespace Utils
            if (stringArg == "")
                return stringArg;

-            std::string line {stringArg};
-            bool active {true};
+            // This is a bit tricky as Unicode characters can be up to four bytes in length,
+            // so we need to figure out how many bytes each cursor move corresponds to and
+            // extract and case convert the string content accordingly.
+            const std::string stringLower {Utils::String::toLower(stringArg)};
+            std::string stringCapitalized;

-            for (auto& chr : line) {
-                if (std::isalnum(static_cast<unsigned char>(chr))) {
-                    if (active) {
-                        chr = std::toupper(chr);
-                        active = false;
-                    }
-                    else {
-                        chr = std::tolower(chr);
-                    }
+            size_t charLength {0};
+            bool capitalize {true};
+
+            for (size_t i {0}; i < stringLower.length(); i += charLength) {
+                charLength = moveCursor(stringLower, i, 1) - i;
+                const std::string chr {stringLower.substr(i, charLength)};
+                if (capitalize) {
+                    stringCapitalized += Utils::String::toUpper(chr);
+                    capitalize = false;
                }
-                else if (chr == ' ' || chr == '-' || chr == '\n' || chr == '\r' || chr == '\t') {
-                    active = true;
+                else {
+                    stringCapitalized += chr;
                }
+
+                if (chr == " " || chr == "-" || chr == "\n" || chr == "\r" || chr == "\t")
+                    capitalize = true;
            }

-            return line;
+            return stringCapitalized;
        }

        std::string filterUtf8(const std::string& stringArg)
@ -579,7 +585,7 @@ namespace Utils

        std::string trim(const std::string& stringArg)
        {
-            std::string trimString = stringArg;
+            std::string trimString {stringArg};

            // Trim leading and trailing whitespaces.
            trimString.erase(trimString.begin(),
@ -691,16 +697,16 @@ namespace Utils

        std::string removeParenthesis(const std::string& stringArg)
        {
-            static std::vector<char> remove = {'(', ')', '[', ']'};
-            std::string stringRemove = stringArg;
+            static std::vector<char> remove {'(', ')', '[', ']'};
+            std::string stringRemove {stringArg};
            size_t start;
            size_t end;
-            bool done = false;
+            bool done {false};

            while (!done) {
                done = true;

-                for (size_t i = 0; i < remove.size(); i += 2) {
+                for (size_t i {0}; i < remove.size(); i += 2) {
                    end = stringRemove.find_first_of(remove[i + 1]);
                    start = stringRemove.find_last_of(remove[i + 0], end);

@ -720,8 +726,8 @@ namespace Utils
                                                         bool caseInsensitive)
        {
            std::vector<std::string> vectorResult;
-            size_t start = 0;
-            size_t delimPos = stringArg.find(delimiter);
+            size_t start {0};
+            size_t delimPos {stringArg.find(delimiter)};

            while (delimPos != std::string::npos) {
                vectorResult.push_back(stringArg.substr(start, delimPos - start));
@ -772,9 +778,9 @@ namespace Utils

        std::string scramble(const std::string& input, const std::string& key)
        {
-            std::string buffer = input;
+            std::string buffer {input};

-            for (size_t i = 0; i < input.size(); ++i)
+            for (size_t i {0}; i < input.size(); ++i)
                buffer[i] = input[i] ^ key[i];

            return buffer;