| 1 | //! @file match.cpp |
| 2 | //! @author ryftchen |
| 3 | //! @brief The definitions (match) in the algorithm module. |
| 4 | //! @version 0.1.0 |
| 5 | //! @copyright Copyright (c) 2022-2025 ryftchen. All rights reserved. |
| 6 | |
| 7 | #include "match.hpp" |
| 8 | |
| 9 | #include <cstring> |
| 10 | #include <vector> |
| 11 | |
| 12 | namespace algorithm::match |
| 13 | { |
| 14 | //! @brief Function version number. |
| 15 | //! @return version number (major.minor.patch) |
| 16 | const char* version() noexcept |
| 17 | { |
| 18 | static const char* const ver = "0.1.0" ; |
| 19 | return ver; |
| 20 | } |
| 21 | |
| 22 | std::int64_t Match::rk( |
| 23 | const unsigned char* const text, |
| 24 | const unsigned char* const pattern, |
| 25 | const std::uint32_t textLen, |
| 26 | const std::uint32_t patternLen) |
| 27 | { |
| 28 | if (!text || !pattern || (textLen == 0) || (patternLen == 0) || (textLen < patternLen)) |
| 29 | { |
| 30 | return -1; |
| 31 | } |
| 32 | |
| 33 | std::int64_t shift = -1; |
| 34 | constexpr std::int64_t rollingHashBase = maxASCII; |
| 35 | constexpr std::int64_t rollingHashMod = 1e9 + 7; |
| 36 | std::int64_t textHash = 0; |
| 37 | std::int64_t patternHash = 0; |
| 38 | std::int64_t pow = 1; |
| 39 | |
| 40 | for (std::uint32_t i = 0; i < (patternLen - 1); ++i) |
| 41 | { |
| 42 | pow = (pow * rollingHashBase) % rollingHashMod; |
| 43 | } |
| 44 | |
| 45 | for (std::uint32_t i = 0; i < patternLen; ++i) |
| 46 | { |
| 47 | patternHash = (rollingHashBase * patternHash + pattern[i]) % rollingHashMod; |
| 48 | textHash = (rollingHashBase * textHash + text[i]) % rollingHashMod; |
| 49 | } |
| 50 | |
| 51 | for (std::uint32_t i = 0; i <= (textLen - patternLen); ++i) |
| 52 | { |
| 53 | if ((patternHash == textHash) && (std::memcmp(s1: text + i, s2: pattern, n: patternLen) == 0)) |
| 54 | { |
| 55 | shift = i; |
| 56 | break; |
| 57 | } |
| 58 | |
| 59 | if (i < (textLen - patternLen)) |
| 60 | { |
| 61 | textHash = (rollingHashBase * (textHash - text[i] * pow) + text[i + patternLen]) % rollingHashMod; |
| 62 | if (textHash < 0) |
| 63 | { |
| 64 | textHash += rollingHashMod; |
| 65 | } |
| 66 | } |
| 67 | } |
| 68 | return shift; |
| 69 | } |
| 70 | |
| 71 | std::int64_t Match::kmp( |
| 72 | const unsigned char* const text, |
| 73 | const unsigned char* const pattern, |
| 74 | const std::uint32_t textLen, |
| 75 | const std::uint32_t patternLen) |
| 76 | { |
| 77 | if (!text || !pattern || (textLen == 0) || (patternLen == 0) || (textLen < patternLen)) |
| 78 | { |
| 79 | return -1; |
| 80 | } |
| 81 | |
| 82 | std::int64_t shift = -1; |
| 83 | std::vector<std::uint32_t> next(patternLen + 1, 0); |
| 84 | |
| 85 | for (std::uint32_t i = 1; i < patternLen; ++i) |
| 86 | { |
| 87 | std::uint32_t j = next[i + 1]; |
| 88 | while ((j > 0) && (pattern[j] != pattern[i])) |
| 89 | { |
| 90 | j = next[j]; |
| 91 | } |
| 92 | |
| 93 | if ((j > 0) || (pattern[j] == pattern[i])) |
| 94 | { |
| 95 | next[i + 1] = j + 1; |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | for (std::uint32_t i = 0, j = 0; i < textLen; ++i) |
| 100 | { |
| 101 | if (*(text + i) == *(pattern + j)) |
| 102 | { |
| 103 | if (++j == patternLen) |
| 104 | { |
| 105 | shift = i - j + 1; |
| 106 | break; |
| 107 | } |
| 108 | } |
| 109 | else if (j > 0) |
| 110 | { |
| 111 | j = next[j]; |
| 112 | --i; |
| 113 | } |
| 114 | } |
| 115 | return shift; |
| 116 | } |
| 117 | |
| 118 | std::int64_t Match::bm( |
| 119 | const unsigned char* const text, |
| 120 | const unsigned char* const pattern, |
| 121 | const std::uint32_t textLen, |
| 122 | const std::uint32_t patternLen) |
| 123 | { |
| 124 | if (!text || !pattern || (textLen == 0) || (patternLen == 0) || (textLen < patternLen)) |
| 125 | { |
| 126 | return -1; |
| 127 | } |
| 128 | |
| 129 | std::int64_t shift = -1; |
| 130 | std::uint32_t badCharRuleTable[maxASCII] = {}; |
| 131 | std::uint32_t goodSuffixIndexTable[maxASCII] = {}; |
| 132 | |
| 133 | fillBadCharRuleTable(badCharRuleTable, pattern, patternLen); |
| 134 | fillGoodSuffixRuleTable(goodSuffixRuleTable: goodSuffixIndexTable, pattern, patternLen); |
| 135 | |
| 136 | std::uint32_t textIdx = patternLen - 1; |
| 137 | while (textIdx < textLen) |
| 138 | { |
| 139 | std::uint32_t patternIdx = patternLen - 1; |
| 140 | while ((patternIdx > 0) && (text[textIdx] == pattern[patternIdx])) |
| 141 | { |
| 142 | --textIdx; |
| 143 | --patternIdx; |
| 144 | } |
| 145 | if ((patternIdx == 0) && (text[textIdx] == pattern[patternIdx])) |
| 146 | { |
| 147 | shift = textIdx; |
| 148 | break; |
| 149 | } |
| 150 | |
| 151 | textIdx += std::max(a: badCharRuleTable[text[textIdx]], b: goodSuffixIndexTable[patternIdx]); |
| 152 | } |
| 153 | return shift; |
| 154 | } |
| 155 | |
| 156 | void Match::fillBadCharRuleTable( |
| 157 | std::uint32_t badCharRuleTable[], const unsigned char* const pattern, const std::uint32_t patternLen) |
| 158 | { |
| 159 | for (std::uint16_t i = 0; i < maxASCII; ++i) |
| 160 | { |
| 161 | badCharRuleTable[i] = patternLen; |
| 162 | } |
| 163 | |
| 164 | for (std::uint32_t j = 0; j < patternLen; ++j) |
| 165 | { |
| 166 | badCharRuleTable[pattern[j]] = patternLen - 1 - j; |
| 167 | } |
| 168 | } |
| 169 | |
| 170 | void Match::fillGoodSuffixRuleTable( |
| 171 | std::uint32_t goodSuffixRuleTable[], const unsigned char* const pattern, const std::uint32_t patternLen) |
| 172 | { |
| 173 | std::uint32_t lastPrefixIdx = 1; |
| 174 | for (std::int64_t pos = (patternLen - 1); pos >= 0; --pos) |
| 175 | { |
| 176 | const std::uint32_t suffixLen = patternLen - (pos + 1); |
| 177 | bool isPrefix = true; |
| 178 | for (std::uint32_t i = 0; i < suffixLen; ++i) |
| 179 | { |
| 180 | if (pattern[i] != pattern[pos + 1 + i]) |
| 181 | { |
| 182 | isPrefix = false; |
| 183 | break; |
| 184 | } |
| 185 | } |
| 186 | if (isPrefix) |
| 187 | { |
| 188 | lastPrefixIdx = pos + 1; |
| 189 | } |
| 190 | |
| 191 | goodSuffixRuleTable[pos] = lastPrefixIdx + (patternLen - 1 - pos); |
| 192 | } |
| 193 | |
| 194 | for (std::uint32_t pos = 0; pos < (patternLen - 1); ++pos) |
| 195 | { |
| 196 | std::uint32_t suffixLen = 0; |
| 197 | while ((suffixLen <= pos) && (suffixLen <= (patternLen - 1)) |
| 198 | && (pattern[pos - suffixLen] == pattern[patternLen - 1 - suffixLen])) |
| 199 | { |
| 200 | ++suffixLen; |
| 201 | } |
| 202 | if ((suffixLen <= pos) && (suffixLen <= (patternLen - 1)) |
| 203 | && (pattern[pos - suffixLen] != pattern[patternLen - 1 - suffixLen])) |
| 204 | { |
| 205 | goodSuffixRuleTable[patternLen - 1 - suffixLen] = patternLen - 1 - pos + suffixLen; |
| 206 | } |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | std::int64_t Match::horspool( |
| 211 | const unsigned char* const text, |
| 212 | const unsigned char* const pattern, |
| 213 | const std::uint32_t textLen, |
| 214 | const std::uint32_t patternLen) |
| 215 | { |
| 216 | if (!text || !pattern || (textLen == 0) || (patternLen == 0) || (textLen < patternLen)) |
| 217 | { |
| 218 | return -1; |
| 219 | } |
| 220 | |
| 221 | std::int64_t shift = -1; |
| 222 | std::uint32_t badCharShiftTable[maxASCII] = {}; |
| 223 | |
| 224 | fillHorspoolBadCharShiftTable(badCharShiftTable, pattern, patternLen); |
| 225 | |
| 226 | std::uint32_t moveLen = patternLen - 1; |
| 227 | while (moveLen <= (textLen - 1)) |
| 228 | { |
| 229 | std::uint32_t matchLen = 0; |
| 230 | while ((matchLen <= (patternLen - 1)) && (pattern[patternLen - 1 - matchLen] == text[moveLen - matchLen])) |
| 231 | { |
| 232 | ++matchLen; |
| 233 | } |
| 234 | |
| 235 | if (matchLen == patternLen) |
| 236 | { |
| 237 | shift = moveLen - patternLen + 1; |
| 238 | break; |
| 239 | } |
| 240 | |
| 241 | moveLen += badCharShiftTable[text[moveLen]]; |
| 242 | } |
| 243 | return shift; |
| 244 | } |
| 245 | |
| 246 | void Match::fillHorspoolBadCharShiftTable( |
| 247 | std::uint32_t badCharShiftTable[], const unsigned char* const pattern, const std::uint32_t patternLen) |
| 248 | { |
| 249 | for (std::uint16_t i = 0; i < maxASCII; ++i) |
| 250 | { |
| 251 | badCharShiftTable[i] = patternLen; |
| 252 | } |
| 253 | |
| 254 | for (std::uint32_t j = 0; j < (patternLen - 1); ++j) |
| 255 | { |
| 256 | badCharShiftTable[pattern[j]] = patternLen - 1 - j; |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | std::int64_t Match::sunday( |
| 261 | const unsigned char* const text, |
| 262 | const unsigned char* const pattern, |
| 263 | const std::uint32_t textLen, |
| 264 | const std::uint32_t patternLen) |
| 265 | { |
| 266 | if (!text || !pattern || (textLen == 0) || (patternLen == 0) || (textLen < patternLen)) |
| 267 | { |
| 268 | return -1; |
| 269 | } |
| 270 | |
| 271 | std::int64_t shift = -1; |
| 272 | std::uint32_t badCharShiftTable[maxASCII] = {}; |
| 273 | |
| 274 | fillSundayBadCharShiftTable(badCharShiftTable, pattern, patternLen); |
| 275 | |
| 276 | std::uint32_t textIdx = 0; |
| 277 | while (textIdx <= (textLen - patternLen)) |
| 278 | { |
| 279 | std::uint32_t matchLen = 0; |
| 280 | while (text[textIdx + matchLen] == pattern[matchLen]) |
| 281 | { |
| 282 | ++matchLen; |
| 283 | if (matchLen == patternLen) |
| 284 | { |
| 285 | shift = textIdx; |
| 286 | return shift; |
| 287 | } |
| 288 | } |
| 289 | |
| 290 | textIdx += badCharShiftTable[text[textIdx + patternLen]]; |
| 291 | } |
| 292 | return shift; |
| 293 | } |
| 294 | |
| 295 | void Match::fillSundayBadCharShiftTable( |
| 296 | std::uint32_t badCharShiftTable[], const unsigned char* const pattern, const std::uint32_t patternLen) |
| 297 | { |
| 298 | for (std::uint16_t i = 0; i < maxASCII; ++i) |
| 299 | { |
| 300 | badCharShiftTable[i] = patternLen + 1; |
| 301 | } |
| 302 | |
| 303 | for (std::uint32_t j = 0; j < patternLen; ++j) |
| 304 | { |
| 305 | badCharShiftTable[pattern[j]] = patternLen - j; |
| 306 | } |
| 307 | } |
| 308 | } // namespace algorithm::match |
| 309 | |