Fixed an issue where the StringUtil::toCapitalized() function didn't correctly capitalize multi-byte Unicode characters

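Also modernized some code: switched variable declarations to brace initialization and made the char-to-unsigned-char conversions explicit with static_cast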
This commit is contained in:
Leon Styhre 2024-07-16 21:13:50 +02:00
parent be3aea7845
commit cac5ec0402

View file

@ -302,8 +302,8 @@ namespace Utils
unsigned int chars2Unicode(const std::string& stringArg, size_t& cursor) unsigned int chars2Unicode(const std::string& stringArg, size_t& cursor)
{ {
unsigned const char checkCharType = stringArg[cursor]; unsigned const char checkCharType {static_cast<unsigned char>(stringArg[cursor])};
unsigned int result = '?'; unsigned int result {'?'};
// 0xxxxxxx, one byte character. // 0xxxxxxx, one byte character.
if (checkCharType <= 0x7F) { if (checkCharType <= 0x7F) {
@ -376,7 +376,7 @@ namespace Utils
std::string getFirstCharacter(const std::string& stringArg, bool toUpper) std::string getFirstCharacter(const std::string& stringArg, bool toUpper)
{ {
std::string firstChar; std::string firstChar;
unsigned const char checkCharType = stringArg.front(); unsigned const char checkCharType {static_cast<unsigned char>(stringArg.front())};
// Normal UTF-8 ASCII character. // Normal UTF-8 ASCII character.
if (checkCharType <= 0x7F) if (checkCharType <= 0x7F)
@ -396,7 +396,7 @@ namespace Utils
size_t nextCursor(const std::string& stringArg, const size_t cursor) size_t nextCursor(const std::string& stringArg, const size_t cursor)
{ {
size_t result = cursor; size_t result {cursor};
while (result < stringArg.length()) { while (result < stringArg.length()) {
++result; ++result;
@ -411,7 +411,7 @@ namespace Utils
size_t prevCursor(const std::string& stringArg, const size_t cursor) size_t prevCursor(const std::string& stringArg, const size_t cursor)
{ {
size_t result = cursor; size_t result {cursor};
while (result > 0) { while (result > 0) {
--result; --result;
@ -426,14 +426,14 @@ namespace Utils
size_t moveCursor(const std::string& stringArg, const size_t cursor, const int amount) size_t moveCursor(const std::string& stringArg, const size_t cursor, const int amount)
{ {
size_t result = cursor; size_t result {cursor};
if (amount > 0) { if (amount > 0) {
for (int i = 0; i < amount; ++i) for (int i {0}; i < amount; ++i)
result = nextCursor(stringArg, result); result = nextCursor(stringArg, result);
} }
else if (amount < 0) { else if (amount < 0) {
for (int i = amount; i < 0; ++i) for (int i {amount}; i < 0; ++i)
result = prevCursor(stringArg, result); result = prevCursor(stringArg, result);
} }
@ -445,7 +445,7 @@ namespace Utils
std::string stringLower; std::string stringLower;
unsigned char checkCharType; unsigned char checkCharType;
for (size_t i = 0; i < stringArg.length();) { for (size_t i {0}; i < stringArg.length();) {
checkCharType = stringArg[i]; checkCharType = stringArg[i];
// Normal UTF-8 ASCII character. // Normal UTF-8 ASCII character.
if (checkCharType <= 0x7F) { if (checkCharType <= 0x7F) {
@ -470,10 +470,10 @@ namespace Utils
wchar_t unicodeChar = firstChar | secondChar; wchar_t unicodeChar = firstChar | secondChar;
// Try to find an entry for the character in the Unicode uppercase table. // Try to find an entry for the character in the Unicode uppercase table.
wchar_t* charIndex = std::wcschr(unicodeUppercase, unicodeChar); wchar_t* charIndex {std::wcschr(unicodeUppercase, unicodeChar)};
if (charIndex != nullptr) { if (charIndex != nullptr) {
wchar_t lowerChar = *(unicodeLowercase + (charIndex - unicodeUppercase)); wchar_t lowerChar {*(unicodeLowercase + (charIndex - unicodeUppercase))};
// Convert back to string format. // Convert back to string format.
std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> byteConverter; std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> byteConverter;
@ -497,7 +497,7 @@ namespace Utils
std::string stringUpper; std::string stringUpper;
unsigned char checkCharType; unsigned char checkCharType;
for (size_t i = 0; i < stringArg.length();) { for (size_t i {0}; i < stringArg.length();) {
checkCharType = stringArg[i]; checkCharType = stringArg[i];
// Normal UTF-8 ASCII character. // Normal UTF-8 ASCII character.
if (checkCharType <= 0x7F) { if (checkCharType <= 0x7F) {
@ -522,10 +522,10 @@ namespace Utils
wchar_t unicodeChar = firstChar | secondChar; wchar_t unicodeChar = firstChar | secondChar;
// Try to find an entry for the character in the Unicode lowercase table. // Try to find an entry for the character in the Unicode lowercase table.
wchar_t* charIndex = std::wcschr(unicodeLowercase, unicodeChar); wchar_t* charIndex {std::wcschr(unicodeLowercase, unicodeChar)};
if (charIndex != nullptr) { if (charIndex != nullptr) {
wchar_t upperChar = *(unicodeUppercase + (charIndex - unicodeLowercase)); wchar_t upperChar {*(unicodeUppercase + (charIndex - unicodeLowercase))};
// Convert back to string format. // Convert back to string format.
std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> byteConverter; std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> byteConverter;
@ -549,25 +549,31 @@ namespace Utils
if (stringArg == "") if (stringArg == "")
return stringArg; return stringArg;
std::string line {stringArg}; // This is a bit tricky as Unicode characters can be up to four bytes in length,
bool active {true}; // so we need to figure out how many bytes each cursor move corresponds to and
// extract and case convert the string content accordingly.
const std::string stringLower {Utils::String::toLower(stringArg)};
std::string stringCapitalized;
for (auto& chr : line) { size_t charLength {0};
if (std::isalnum(static_cast<unsigned char>(chr))) { bool capitalize {true};
if (active) {
chr = std::toupper(chr); for (size_t i {0}; i < stringLower.length(); i += charLength) {
active = false; charLength = moveCursor(stringLower, i, 1) - i;
} const std::string chr {stringLower.substr(i, charLength)};
else { if (capitalize) {
chr = std::tolower(chr); stringCapitalized += Utils::String::toUpper(chr);
} capitalize = false;
} }
else if (chr == ' ' || chr == '-' || chr == '\n' || chr == '\r' || chr == '\t') { else {
active = true; stringCapitalized += chr;
} }
if (chr == " " || chr == "-" || chr == "\n" || chr == "\r" || chr == "\t")
capitalize = true;
} }
return line; return stringCapitalized;
} }
std::string filterUtf8(const std::string& stringArg) std::string filterUtf8(const std::string& stringArg)
@ -579,7 +585,7 @@ namespace Utils
std::string trim(const std::string& stringArg) std::string trim(const std::string& stringArg)
{ {
std::string trimString = stringArg; std::string trimString {stringArg};
// Trim leading and trailing whitespaces. // Trim leading and trailing whitespaces.
trimString.erase(trimString.begin(), trimString.erase(trimString.begin(),
@ -691,16 +697,16 @@ namespace Utils
std::string removeParenthesis(const std::string& stringArg) std::string removeParenthesis(const std::string& stringArg)
{ {
static std::vector<char> remove = {'(', ')', '[', ']'}; static std::vector<char> remove {'(', ')', '[', ']'};
std::string stringRemove = stringArg; std::string stringRemove {stringArg};
size_t start; size_t start;
size_t end; size_t end;
bool done = false; bool done {false};
while (!done) { while (!done) {
done = true; done = true;
for (size_t i = 0; i < remove.size(); i += 2) { for (size_t i {0}; i < remove.size(); i += 2) {
end = stringRemove.find_first_of(remove[i + 1]); end = stringRemove.find_first_of(remove[i + 1]);
start = stringRemove.find_last_of(remove[i + 0], end); start = stringRemove.find_last_of(remove[i + 0], end);
@ -720,8 +726,8 @@ namespace Utils
bool caseInsensitive) bool caseInsensitive)
{ {
std::vector<std::string> vectorResult; std::vector<std::string> vectorResult;
size_t start = 0; size_t start {0};
size_t delimPos = stringArg.find(delimiter); size_t delimPos {stringArg.find(delimiter)};
while (delimPos != std::string::npos) { while (delimPos != std::string::npos) {
vectorResult.push_back(stringArg.substr(start, delimPos - start)); vectorResult.push_back(stringArg.substr(start, delimPos - start));
@ -772,9 +778,9 @@ namespace Utils
std::string scramble(const std::string& input, const std::string& key) std::string scramble(const std::string& input, const std::string& key)
{ {
std::string buffer = input; std::string buffer {input};
for (size_t i = 0; i < input.size(); ++i) for (size_t i {0}; i < input.size(); ++i)
buffer[i] = input[i] ^ key[i]; buffer[i] = input[i] ^ key[i];
return buffer; return buffer;