ES-DE/es-core/src/utils/StringUtil.cpp
2024-08-09 22:54:57 +02:00

457 lines
16 KiB
C++

// SPDX-License-Identifier: MIT
//
// ES-DE Frontend
// StringUtil.cpp
//
// Low-level string functions.
//
// Suppress codecvt deprecation warnings.
#if defined(_MSC_VER) // MSVC compiler.
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#elif defined(__clang__)
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
#include "utils/StringUtil.h"
#include "Log.h"
#include "utils/PlatformUtil.h"
#include <unicode/brkiter.h>
#include <unicode/ustring.h>
#include <algorithm>
#include <cstdarg>
#include <locale>
namespace Utils
{
namespace String
{
// Decodes the UTF-8 sequence that starts at byte offset 'cursor' in 'stringArg' and returns
// its Unicode code point. 'cursor' is advanced past all bytes that were consumed.
//
// A stray continuation byte (10xxxxxx) or a lead byte whose sequence would extend beyond the
// end of the string yields '?' and advances the cursor by a single byte, so a caller that
// loops until the cursor reaches the string length always makes progress and no byte
// outside the string is ever read. The continuation bytes themselves are not validated.
unsigned int chars2Unicode(const std::string& stringArg, size_t& cursor)
{
    const size_t length {stringArg.length()};

    // Nothing left to decode. This mirrors what decoding the terminating null character
    // gives (code point 0 and the cursor advanced by one) without touching the string data.
    if (cursor >= length) {
        ++cursor;
        return 0;
    }

    const unsigned char leadByte {static_cast<unsigned char>(stringArg[cursor])};

    // 0xxxxxxx, one byte character.
    if (leadByte <= 0x7F) {
        ++cursor;
        return leadByte;
    }

    size_t sequenceLength {0};
    unsigned int result {0};

    if (leadByte >= 0xF0) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        sequenceLength = 4;
        result = leadByte & 0x07;
    }
    else if (leadByte >= 0xE0) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        sequenceLength = 3;
        result = leadByte & 0x0F;
    }
    else if (leadByte >= 0xC0) {
        // 110xxxxx 10xxxxxx
        sequenceLength = 2;
        result = leadByte & 0x1F;
    }

    // Error, either a continuation byte in lead position or a truncated sequence.
    // The comparison is written as "needed > available" so that it can't underflow.
    if (sequenceLength == 0 || sequenceLength > length - cursor) {
        ++cursor;
        return '?';
    }

    // Each continuation byte contributes its six low bits.
    for (size_t i {1}; i < sequenceLength; ++i)
        result = (result << 6) | (static_cast<unsigned char>(stringArg[cursor + i]) & 0x3F);

    cursor += sequenceLength;
    return result;
}
// Encodes the Unicode code point 'unicodeArg' as a UTF-8 byte sequence.
// Values up to 0x1FFFFF (the most a four-byte sequence can hold) are encoded, anything
// larger is returned as a single '?'.
std::string unicode2Chars(const unsigned int unicodeArg)
{
    std::string result;

    // Normal UTF-8 ASCII character.
    if (unicodeArg < 0x80) {
        result += static_cast<char>(unicodeArg);
    }
    // Two-byte character, 110xxxxx 10xxxxxx.
    else if (unicodeArg < 0x800) {
        result += static_cast<char>(0xC0 | (unicodeArg >> 6));
        result += static_cast<char>(0x80 | (unicodeArg & 0x3F));
    }
    // Three-byte character, 1110xxxx 10xxxxxx 10xxxxxx.
    // The upper limit is 0xFFFF inclusive, otherwise U+FFFF would get an overlong
    // four-byte encoding.
    else if (unicodeArg < 0x10000) {
        result += static_cast<char>(0xE0 | (unicodeArg >> 12));
        result += static_cast<char>(0x80 | ((unicodeArg >> 6) & 0x3F));
        result += static_cast<char>(0x80 | (unicodeArg & 0x3F));
    }
    // Four-byte character, 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    else if (unicodeArg <= 0x1FFFFF) {
        result += static_cast<char>(0xF0 | (unicodeArg >> 18));
        result += static_cast<char>(0x80 | ((unicodeArg >> 12) & 0x3F));
        result += static_cast<char>(0x80 | ((unicodeArg >> 6) & 0x3F));
        result += static_cast<char>(0x80 | (unicodeArg & 0x3F));
    }
    else {
        // Error, invalid character.
        result += '?';
    }

    return result;
}
// Returns the first character of 'stringArg' as a string of its own. The length of the
// character is determined from its UTF-8 lead byte, so the result holds between one and
// four bytes (fewer if the string ends in the middle of a sequence).
// If 'toUpper' is true then an ASCII character is converted to uppercase, multi-byte
// characters are always returned as-is.
// An empty string is returned if 'stringArg' is empty or starts with a continuation byte.
std::string getFirstCharacter(const std::string& stringArg, bool toUpper)
{
    // front() must not be called on an empty string.
    if (stringArg.empty())
        return "";

    const unsigned char leadByte {static_cast<unsigned char>(stringArg.front())};

    // Normal UTF-8 ASCII character.
    if (leadByte <= 0x7F) {
        const char asciiChar {toUpper ? static_cast<char>(std::toupper(leadByte)) :
                                        stringArg.front()};
        return std::string(1, asciiChar);
    }

    // Four-byte Unicode character.
    if (leadByte >= 0xF0)
        return stringArg.substr(0, 4);

    // Three-byte Unicode character.
    if (leadByte >= 0xE0)
        return stringArg.substr(0, 3);

    // Two-byte Unicode character.
    if (leadByte >= 0xC0)
        return stringArg.substr(0, 2);

    // Continuation byte in lead position, there is no valid first character.
    return "";
}
// Returns the byte offset of the character that follows the one located at 'cursor'.
// The offset is moved forward by at least one byte and then past any UTF-8 continuation
// bytes (10xxxxxx). It never moves beyond the end of the string, and a cursor that is
// already at or past the end is returned unchanged.
size_t nextCursor(const std::string& stringArg, const size_t cursor)
{
    const size_t total {stringArg.length()};
    size_t pos {cursor};

    if (pos >= total)
        return pos;

    // Step over the current byte, then keep going for as long as we're inside a sequence.
    do {
        ++pos;
    } while (pos < total && (stringArg[pos] & 0xC0) == 0x80);

    return pos;
}
// Returns the byte offset of the character that precedes the one located at 'cursor'.
// The offset is moved back by at least one byte and then further back for as long as it
// points at a UTF-8 continuation byte (10xxxxxx). A cursor of zero is returned unchanged.
size_t prevCursor(const std::string& stringArg, const size_t cursor)
{
    size_t pos {cursor};

    if (pos == 0)
        return pos;

    // Step back one byte, then keep going until a lead byte or the string start is found.
    do {
        --pos;
    } while (pos > 0 && (stringArg[pos] & 0xC0) == 0x80);

    return pos;
}
// Moves 'cursor' by 'amount' characters and returns the resulting byte offset.
// A positive amount steps forward using nextCursor() and a negative amount steps backward
// using prevCursor(). An amount of zero returns the cursor unchanged.
size_t moveCursor(const std::string& stringArg, const size_t cursor, const int amount)
{
    size_t position {cursor};
    int remaining {amount};

    // At most one of the two loops runs, depending on the sign of the amount.
    while (remaining > 0) {
        position = nextCursor(stringArg, position);
        --remaining;
    }

    while (remaining < 0) {
        position = prevCursor(stringArg, position);
        ++remaining;
    }

    return position;
}
// Returns the number of characters in 'stringArg', as opposed to its number of bytes.
// The string is walked one character at a time using moveCursor(), so the continuation
// bytes of a multi-byte UTF-8 sequence don't add to the count.
size_t unicodeLength(const std::string& stringArg)
{
    size_t count {0};
    size_t position {0};

    while (position < stringArg.length()) {
        position = moveCursor(stringArg, position, 1);
        ++count;
    }

    return count;
}
// Returns a lowercase copy of the UTF-8 encoded string 'stringArg'. The case conversion
// is done by ICU.
std::string toLower(const std::string& stringArg)
{
    // IMPORTANT: Always pass c_str() to fromUTF8() and never the std::string itself.
    // On Windows specifically the StringPiece constructor which is implicitly called by
    // fromUTF8() crashes the application if a std::string is passed as the argument.
    // This behavior has been observed with ICU 75.1.
    std::string lowered;
    icu::UnicodeString::fromUTF8(stringArg.c_str()).toLower().toUTF8String(lowered);
    return lowered;
}
// Returns an uppercase copy of the UTF-8 encoded string 'stringArg'. The case conversion
// is done by ICU. c_str() is passed to fromUTF8() rather than the std::string itself.
std::string toUpper(const std::string& stringArg)
{
    std::string uppered;
    icu::UnicodeString::fromUTF8(stringArg.c_str()).toUpper().toUTF8String(uppered);
    return uppered;
}
// Returns a copy of 'stringArg' in which the whole text is first converted to lowercase
// and the text unit found at each boundary reported by an ICU word break iterator is then
// converted to uppercase.
// The input is returned unchanged if it's empty, if the break iterator could not be
// created or if the converted string compares equal to nullptr.
std::string toCapitalized(const std::string& stringArg)
{
if (stringArg == "")
return stringArg;
UErrorCode status {U_ZERO_ERROR};
std::unique_ptr<icu::BreakIterator> iterator {nullptr};
// Since we don't know the actual text language we set it to locale en_US.
iterator.reset(icu::BreakIterator::createWordInstance(icu::Locale::getUS(), status));
if (U_FAILURE(status) || iterator == nullptr)
return stringArg;
// The text is lowercased up front so that only the boundary positions end up uppercase.
icu::UnicodeString iterateString {
icu::UnicodeString::fromUTF8(stringArg.c_str()).toLower()};
// NOTE(review): This compares an ICU string against nullptr, presumably this relies on a
// UnicodeString conversion from nullptr and works as an empty string check, confirm.
if (iterateString != nullptr) {
iterator->setText(iterateString);
int32_t pos {iterator->first()};
int32_t lastPos {pos};
// lastPos always equals pos when replace() is called, so every iteration replaces the
// single unit located at the current boundary with its uppercase form.
// NOTE(review): iterateString is modified while the iterator is walking it, this looks
// like it assumes that toUpper() never changes the length of the replaced unit, confirm.
while (pos != icu::BreakIterator::DONE) {
iterateString.replace(lastPos, 1,
icu::UnicodeString(iterateString, pos, 1).toUpper());
pos = iterator->next();
lastPos = pos;
}
}
else {
return stringArg;
}
// Convert the result back to UTF-8.
std::string stringCapitalized;
return iterateString.toUTF8String(stringCapitalized);
}
// Returns a copy of 'stringArg' that has been passed through utf8::replace_invalid(),
// which is given the full byte range of the input and appends its output to the result.
std::string filterUtf8(const std::string& stringArg)
{
    std::string filtered;
    utf8::replace_invalid(stringArg.begin(), stringArg.end(), std::back_inserter(filtered));
    return filtered;
}
// Returns a copy of 'stringArg' with all leading and trailing whitespace removed.
// Whitespace is whatever std::isspace() reports as such, whitespace inside the text is
// left alone.
std::string trim(const std::string& stringArg)
{
    const auto notSpace = [](char c) { return !std::isspace(static_cast<unsigned char>(c)); };

    // First character to keep, or the end iterator if the string is all whitespace.
    const auto first = std::find_if(stringArg.cbegin(), stringArg.cend(), notSpace);

    // One past the last character to keep. The backward search stops at 'first', so the
    // kept range can't become inverted.
    const auto last = std::find_if(stringArg.crbegin(),
                                   std::string::const_reverse_iterator(first), notSpace)
                          .base();

    return std::string(first, last);
}
// Returns a copy of 'stringArg' in which all occurrences of 'from' have been replaced
// with 'to'.
// The replacement is repeated until 'from' no longer occurs in the result, which means
// that for instance replacing "//" with "/" collapses any number of consecutive slashes
// into a single one. If 'to' itself contains 'from' then only a single pass is made, as
// repeating it would never terminate.
// The input is returned unchanged if 'from' is empty or if 'from' and 'to' are identical.
std::string replace(const std::string& stringArg,
                    const std::string& from,
                    const std::string& to)
{
    // Prevent endless loops. An empty search string matches at every position without
    // ever advancing, and identical strings would make no change at all.
    if (from.empty() || from == to)
        return stringArg;

    // Prevent endless loops, each pass would just reintroduce the search string.
    const bool singlePass {to.find(from) != std::string::npos};

    std::string result {stringArg};

    // The outer loop makes sure that we're eliminating all repeating occurrences
    // of the 'from' value.
    while (result.find(from) != std::string::npos) {
        std::string replaced;
        size_t lastPos {0};
        size_t findPos {0};

        while ((findPos = result.find(from, lastPos)) != std::string::npos) {
            replaced.append(result, lastPos, findPos - lastPos).append(to);
            lastPos = findPos + from.length();
        }

        // Whatever follows the last match is carried over as-is.
        replaced.append(result, lastPos, std::string::npos);
        result.swap(replaced);

        if (singlePass)
            break;
    }

    return result;
}
// Returns a string built from the printf-style format string 'stringArg' and the variadic
// arguments that follow it.
// An empty string is returned if the format string is empty or if the formatting fails.
// The format string is deliberately taken by value, as va_start() has undefined behavior
// if the last named parameter is a reference.
std::string format(const std::string stringArg, ...)
{
    if (stringArg.empty())
        return "";

    va_list args;
    va_start(args, stringArg);

    // A va_list can only be walked once, so the measuring pass gets its own copy.
    va_list measureArgs;
    va_copy(measureArgs, args);
    const int length {vsnprintf(nullptr, 0, stringArg.c_str(), measureArgs)};
    va_end(measureArgs);

    // vsnprintf() returns a negative value on errors, using that as the buffer size
    // would request an absurdly large allocation.
    if (length < 0) {
        va_end(args);
        return "";
    }

    std::string buffer(static_cast<size_t>(length), '\0');
    // The extra byte is for the null terminator, which lands on the string's own terminator.
    vsnprintf(&buffer[0], buffer.size() + 1, stringArg.c_str(), args);
    va_end(args);

    return buffer;
}
// Converts the UTF-8 encoded string 'stringArg' to a UTF-16 encoded wide string.
// If the conversion throws then the error and the offending string are logged and
// Utils::Platform::emergencyShutdown() is called. An empty wide string is returned if
// execution continues after that.
std::wstring stringToWideString(const std::string& stringArg)
{
    using Utf8Utf16Converter = std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>;

    Utf8Utf16Converter converter;
    std::wstring wideResult;

    try {
        wideResult = converter.from_bytes(stringArg);
    }
    catch (...) {
        LOG(LogError) << "StringUtil::stringToWideString(): Conversion failed, invalid "
                         "characters in source string?";
        LOG(LogError) << stringArg;
        Utils::Platform::emergencyShutdown();
        wideResult.clear();
    }

    return wideResult;
}
// Converts the UTF-16 encoded wide string 'stringArg' to a UTF-8 encoded string.
// If the conversion throws then the error is logged and
// Utils::Platform::emergencyShutdown() is called. An empty string is returned if
// execution continues after that.
std::string wideStringToString(const std::wstring& stringArg)
{
    using Utf8Utf16Converter = std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>;

    Utf8Utf16Converter converter;
    std::string narrowResult;

    try {
        narrowResult = converter.to_bytes(stringArg);
    }
    catch (...) {
        LOG(LogError) << "StringUtil::wideStringToString(): Conversion failed, invalid "
                         "characters in source string?";
        Utils::Platform::emergencyShutdown();
        narrowResult.clear();
    }

    return narrowResult;
}
// Returns true if 'stringArg' begins with 'start'. An empty 'start' always matches.
bool startsWith(const std::string& stringArg, const std::string& start)
{
    // Compares at most start.size() leading bytes. If stringArg is shorter than that
    // then the lengths differ and the comparison reports a mismatch.
    return stringArg.compare(0, start.size(), start) == 0;
}
// Returns true if 'stringArg' ends with 'end'. An empty 'end' always matches, which is
// consistent with how startsWith() treats an empty prefix.
bool endsWith(const std::string& stringArg, const std::string& end)
{
    // A suffix that is longer than the string can't match. Checking this first also keeps
    // the size subtraction below from wrapping around.
    if (end.size() > stringArg.size())
        return false;

    // Only the tail is compared, so an earlier occurrence of 'end' in the string has
    // no influence on the result.
    return stringArg.compare(stringArg.size() - end.size(), end.size(), end) == 0;
}
// Returns a trimmed copy of 'stringArg' from which all text enclosed in parentheses or
// square brackets has been removed, including the brackets themselves.
// For each bracket type the first closing bracket is paired with the nearest opening
// bracket before it, and this is repeated until a full round makes no more removals.
std::string removeParenthesis(const std::string& stringArg)
{
    // Stored as consecutive opening/closing pairs.
    static const std::vector<char> brackets {'(', ')', '[', ']'};

    std::string stripped {stringArg};
    bool erasedAny {true};

    while (erasedAny) {
        erasedAny = false;

        for (size_t pair {0}; pair + 1 < brackets.size(); pair += 2) {
            const size_t closePos {stripped.find(brackets[pair + 1])};
            // If there is no closing bracket then closePos is npos and the whole string is
            // searched, the pair is rejected below in that case.
            const size_t openPos {stripped.rfind(brackets[pair], closePos)};

            if (openPos == std::string::npos || closePos == std::string::npos)
                continue;

            stripped.erase(openPos, closePos - openPos + 1);
            erasedAny = true;
        }
    }

    return trim(stripped);
}
// Splits 'stringArg' at every occurrence of 'delimiter' and returns the parts as a vector.
// Empty parts, such as those produced by consecutive, leading or trailing delimiters,
// are left out. If 'delimiter' is empty then there is nothing to split on and the whole
// string is returned as a single element.
// If 'sort' is true then the parts are sorted, either using regular string comparison or,
// if 'caseInsensitive' is also true, by their uppercased first byte only. Parts that share
// that first byte are in no particular order relative to each other.
std::vector<std::string> delimitedStringToVector(const std::string& stringArg,
                                                 const std::string& delimiter,
                                                 bool sort,
                                                 bool caseInsensitive)
{
    std::vector<std::string> vectorResult;

    // Searching for an empty delimiter would match at every position.
    if (delimiter.empty()) {
        if (!stringArg.empty())
            vectorResult.push_back(stringArg);
        return vectorResult;
    }

    size_t start {0};
    size_t delimPos {stringArg.find(delimiter)};

    while (delimPos != std::string::npos) {
        // Empty parts are skipped right away, so the sort comparator below never sees them.
        if (delimPos > start)
            vectorResult.push_back(stringArg.substr(start, delimPos - start));
        // Skip the entire delimiter, it may be longer than a single character.
        start = delimPos + delimiter.length();
        delimPos = stringArg.find(delimiter, start);
    }

    if (start < stringArg.length())
        vectorResult.push_back(stringArg.substr(start));

    if (sort) {
        if (caseInsensitive) {
            // The cast to unsigned char is required by std::toupper() and makes non-ASCII
            // bytes sort after ASCII, just as regular string comparison does.
            std::sort(vectorResult.begin(), vectorResult.end(),
                      [](const std::string& a, const std::string& b) {
                          return std::toupper(static_cast<unsigned char>(a.front())) <
                                 std::toupper(static_cast<unsigned char>(b.front()));
                      });
        }
        else {
            std::sort(vectorResult.begin(), vectorResult.end());
        }
    }

    return vectorResult;
}
// Sorts the entries of 'vectorArg' and joins them into a single string, with 'delimiter'
// placed between the entries.
// The sorting uses regular string comparison, or if 'caseInsensitive' is true it's done
// by the uppercased first byte of each entry only. Entries that share that first byte are
// in no particular order relative to each other.
// No delimiter is added for as long as the result is still empty, so empty entries that
// are sorted first leave no trace in the result.
std::string vectorToDelimitedString(std::vector<std::string> vectorArg,
                                    const std::string& delimiter,
                                    bool caseInsensitive)
{
    if (caseInsensitive) {
        // An empty entry has no first byte and gets the lowest possible key. The cast to
        // unsigned char is required by std::toupper() and makes non-ASCII bytes sort after
        // ASCII, just as regular string comparison does.
        const auto sortKey = [](const std::string& entry) -> int {
            return entry.empty() ? 0 : std::toupper(static_cast<unsigned char>(entry.front()));
        };
        std::sort(vectorArg.begin(), vectorArg.end(),
                  [&sortKey](const std::string& a, const std::string& b) {
                      return sortKey(a) < sortKey(b);
                  });
    }
    else {
        std::sort(vectorArg.begin(), vectorArg.end());
    }

    std::string resultString;

    for (const std::string& entry : vectorArg) {
        if (!resultString.empty())
            resultString += delimiter;
        resultString += entry;
    }

    return resultString;
}
// Returns a copy of 'input' in which each byte has been XORed with the byte at the same
// position in 'key'. Applying the function a second time with the same key restores the
// original string.
// If 'key' is shorter than 'input' then the bytes beyond the key length are copied over
// unchanged instead of reading past the end of the key.
std::string scramble(const std::string& input, const std::string& key)
{
    std::string buffer {input};
    const size_t keyedLength {std::min(input.size(), key.size())};

    for (size_t i {0}; i < keyedLength; ++i)
        buffer[i] = static_cast<char>(input[i] ^ key[i]);

    return buffer;
}
} // namespace String
} // namespace Utils