本文最后更新于 78 天前,如有失效请评论区留言。
字符串哈希, 一般都是写困难题的, 学会上大分. 同类型的AC自动机,后缀数组都是较难的知识点
哈希分单模双模哈希:
单模哈希
根据生日攻击,设 M 为模数,仅仅计算大约 1.18⋅ sqrt(M)个不同字符串的哈希值,就有 50% 的概率会发生哈希碰撞,也就是有两个不同的字符串,哈希值是一样的。
双模哈希
设 M1和 M2为模数,此时要计算大约 1.18⋅ sqrt(M1M2)个不同字符串的哈希,才有 50% 的概率会发生哈希碰撞。(注:对于 Python 来说,把 mod 和 base 改大就行。)
原理:
n = len(target)
# 多项式字符串哈希(方便计算子串哈希值)
# 哈希函数 hash(s) = s[0] * BASE^(n-1) + s[1] * BASE^(n-2) + ... + s[n-2] * BASE + s[n-1]
MOD = 1_070_777_777
BASE = randint(8 * 10 ** 8, 9 * 10 ** 8) # 随机 BASE,防止 hack
# 双模哈希
# MOD = 10 ** 18 + 3
# BASE = randint(8 * 10 ** 17, 9 * 10 ** 17)
pow_base = [1] + [0] * n # pow_base[i] = BASE^i
pre_hash = [0] * (n + 1) # 前缀哈希值 pre_hash[i] = hash(s[:i])
for i, b in enumerate(target):
pow_base[i + 1] = pow_base[i] * BASE % MOD
pre_hash[i + 1] = (pre_hash[i] * BASE + ord(b)) % MOD # 秦九韶算法计算多项式哈希
.......
# 计算子串 target[i-sz:i] 的哈希值(计算方法类似前缀和)
sub_hash = (pre_hash[i] - pre_hash[i - sz] * pow_base[sz]) % MOD
https://leetcode.cn/problems/count-prefix-and-suffix-pairs-ii/description/
模板:
MOD = random.getrandbits(64)
BASE = random.randint(150, 200)
class Hash:
def __init__(self, s: str):
self.n = len(s)
self.mod = MOD
self.base = BASE
self.p = [1] * (self.n + 1)
self.pre = [0] * (self.n + 1)
for i in range(self.n):
self.p[i + 1] = (self.p[i] * self.base) % self.mod
self.pre[i + 1] = (self.pre[i] * self.base + ord(s[i])) % self.mod
self.h = self.hash(0, self.n - 1)
def hash(self, i: int, j: int) -> int:
return (self.pre[j + 1] - self.pre[i] * self.p[j - i + 1]) % self.mod
# MOD = 1_070_777_777
# BASE = randint(8 * 10 ** 8, 9 * 10 ** 8) # 随机 BASE,防止 hack
MOD = random.getrandbits(64)
BASE = random.randint(150, 200)
class Solution:
def countPrefixSuffixPairs(self, words: List[str]) -> int:
ans = 0
cnt = defaultdict(int)
for x in words:
n = len(x)
pow_base = [1] + [0] * n
pre_hash = [0] * (n + 1)
for i, b in enumerate(x):
pow_base[i + 1] = pow_base[i] * BASE % MOD
pre_hash[i + 1] = (pre_hash[i] * BASE + ord(b)) % MOD
# 枚举长度 l
for l in range(1, n+1):
prefix_hash = pre_hash[l] % MOD
suffix_hash = (pre_hash[-1] - pre_hash[- l - 1] * pow_base[l]) % MOD
if prefix_hash == suffix_hash and prefix_hash in cnt:
ans += cnt[prefix_hash]
cnt[pre_hash[-1]] += 1
return ans
https://leetcode.cn/problems/minimum-cost-to-convert-string-ii/description/
MOD = 1_070_777_777
BASE = randint(8 * 10 ** 8, 9 * 10 ** 8)
class Solution:
def minimumCost(self, source: str, target: str, original: List[str], changed: List[str], cost: List[int]) -> int:
cnt = defaultdict(lambda: defaultdict(lambda: inf))
for o, c, co in zip(original, changed, cost):
o_hash = c_hash = 0
for c1, o1 in zip(c, o):
o_hash = (o_hash * BASE + ord(o1)) % MOD
c_hash = (c_hash * BASE + ord(c1)) % MOD
cnt[o_hash][c_hash] = min(cnt[o_hash][c_hash], co)
# 提取 cnt 中所有出现的哈希值(包括键和值)
all_hashes = set(cnt.keys()).union(*cnt.values())
# floyd 全源最短路
for k in all_hashes:
for i in all_hashes:
if cnt[i][k] == inf: continue
for j in all_hashes:
cnt[i][j] = min(cnt[i][j], cnt[i][k] + cnt[k][j])
n = len(source)
sorted_len = sorted(set(map(len, original)))
# 计算 source 和 target 的哈希值
pow_base = [1] * (n + 1)
pre_hashs = [0] * (n + 1)
pre_hasht = [0] * (n + 1)
for i in range(n):
pow_base[i + 1] = pow_base[i] * BASE % MOD
pre_hashs[i + 1] = (pre_hashs[i] * BASE + ord(source[i])) % MOD
pre_hasht[i + 1] = (pre_hasht[i] * BASE + ord(target[i])) % MOD
@cache
def dfs(i):
if i == n:
return 0
res = inf
for j in sorted_len:
j += i
if j > n:
break
s_sub = (pre_hashs[j] - pre_hashs[i] * pow_base[j - i]) % MOD # source[i:j]
t_sub = (pre_hasht[j] - pre_hasht[i] * pow_base[j - i]) % MOD # target[i:j]
if s_sub in cnt and t_sub in cnt[s_sub]:
res = min(res, dfs(j) + cnt[s_sub][t_sub])
if source[i] == target[i]:
return min(dfs(i + 1), res)
return res
return -1 if (ans := dfs(0)) == inf else ans