StringUtil: Add BytePatternSearch()

This commit is contained in:
Stenzek 2024-08-13 14:47:11 +10:00
parent 9e09f53566
commit cbbfc2f11a
No known key found for this signature in database
2 changed files with 102 additions and 2 deletions

View file

@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "string_util.h"
@ -9,6 +9,12 @@
#include <cstdio>
#include <sstream>
#ifndef __APPLE__
#include <malloc.h> // alloca
#else
#include <alloca.h>
#endif
#ifdef _WIN32
#include "windows_headers.h"
#endif
@ -427,6 +433,96 @@ void StringUtil::EllipsiseInPlace(std::string& str, u32 max_length, const char*
}
}
std::optional<size_t> StringUtil::BytePatternSearch(const std::span<const u8> bytes, const std::string_view pattern)
{
// Parse the pattern into a bytemask.
size_t pattern_length = 0;
bool hinibble = true;
for (size_t i = 0; i < pattern.size(); i++)
{
if ((pattern[i] >= '0' && pattern[i] <= '9') || (pattern[i] >= 'a' && pattern[i] <= 'f') ||
(pattern[i] >= 'A' && pattern[i] <= 'F') || pattern[i] == '?')
{
hinibble ^= true;
if (hinibble)
pattern_length++;
}
else if (pattern[i] == ' ' || pattern[i] == '\r' || pattern[i] == '\n')
{
continue;
}
else
{
break;
}
}
if (pattern_length == 0)
return std::nullopt;
const bool allocate_on_heap = (pattern_length >= 512);
u8* match_bytes = allocate_on_heap ? static_cast<u8*>(alloca(pattern_length * 2)) : new u8[pattern_length * 2];
u8* match_masks = match_bytes + pattern_length;
hinibble = true;
u8 match_byte = 0;
u8 match_mask = 0;
for (size_t i = 0, match_len = 0; i < pattern.size(); i++)
{
u8 nibble = 0, nibble_mask = 0xF;
if (pattern[i] >= '0' && pattern[i] <= '9')
nibble = pattern[i] - '0';
else if (pattern[i] >= 'a' && pattern[i] <= 'f')
nibble = pattern[i] - 'a' + 0xa;
else if (pattern[i] >= 'A' && pattern[i] <= 'F')
nibble = pattern[i] - 'A' + 0xa;
else if (pattern[i] == '?')
nibble_mask = 0;
else if (pattern[i] == ' ' || pattern[i] == '\r' || pattern[i] == '\n')
continue;
else
break;
hinibble ^= true;
if (hinibble)
{
match_bytes[match_len] = nibble | (match_byte << 4);
match_masks[match_len] = nibble_mask | (match_mask << 4);
match_len++;
}
else
{
match_byte = nibble;
match_mask = nibble_mask;
}
}
if (pattern_length == 0)
return std::nullopt;
std::optional<size_t> ret;
const size_t max_search_offset = bytes.size() - pattern_length;
for (size_t offset = 0; offset < max_search_offset; offset++)
{
const u8* start = bytes.data() + offset;
for (size_t match_offset = 0;;)
{
if ((start[match_offset] & match_masks[match_offset]) != match_bytes[match_offset])
break;
match_offset++;
if (match_offset == pattern_length)
{
// found it!
ret = offset;
}
}
}
if (allocate_on_heap)
delete[] match_bytes;
return ret;
}
size_t StringUtil::DecodeUTF8(const std::string_view str, size_t offset, char32_t* ch)
{
return DecodeUTF8(str.data() + offset, str.length() - offset, ch);

View file

@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
@ -8,6 +8,7 @@
#include <cstring>
#include <iomanip>
#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <vector>
@ -275,6 +276,9 @@ size_t DecodeUTF8(const std::string& str, size_t offset, char32_t* ch);
std::string Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis = "...");
void EllipsiseInPlace(std::string& str, u32 max_length, const char* ellipsis = "...");
/// Searches the given memory span for a byte pattern written as hexadecimal text.
/// Each pair of hex digits (upper or lower case) describes one byte. A `?` is a wildcard for a single
/// nibble, so `??` matches any byte. Spaces, '\r' and '\n' between characters are ignored, and parsing
/// stops at the first character that is none of these.
/// Returns the offset of a match within `bytes`, or std::nullopt if no match was found or the pattern
/// does not contain at least one complete byte.
std::optional<size_t> BytePatternSearch(const std::span<const u8> bytes, const std::string_view pattern);
/// Strided memcpy/memcmp.
ALWAYS_INLINE static void StrideMemCpy(void* dst, std::size_t dst_stride, const void* src, std::size_t src_stride,
std::size_t copy_size, std::size_t count)