Aho-Corasick算法是一种多模式匹配算法,用于在一个文本中查找多个模式串的出现位置。下面是一个使用Aho-Corasick算法实现部分文本匹配的代码示例:
class TrieNode:
def __init__(self):
self.children = {}
self.is_end_of_word = False
self.failure_link = None
self.output = []
class AhoCorasick:
def __init__(self, patterns):
self.root = TrieNode()
self.build_trie(patterns)
self.build_failure_links()
def build_trie(self, patterns):
for pattern in patterns:
current_node = self.root
for char in pattern:
if char not in current_node.children:
current_node.children[char] = TrieNode()
current_node = current_node.children[char]
current_node.is_end_of_word = True
current_node.output.append(pattern)
def build_failure_links(self):
queue = []
for child_node in self.root.children.values():
queue.append(child_node)
child_node.failure_link = self.root
while queue:
current_node = queue.pop(0)
for char, child_node in current_node.children.items():
queue.append(child_node)
failure_node = current_node.failure_link
while failure_node and char not in failure_node.children:
failure_node = failure_node.failure_link
if failure_node:
child_node.failure_link = failure_node.children[char]
child_node.output += failure_node.children[char].output
def match(self, text):
current_node = self.root
matches = []
for i, char in enumerate(text):
while current_node and char not in current_node.children:
current_node = current_node.failure_link
if not current_node:
current_node = self.root
continue
current_node = current_node.children[char]
if current_node.output:
matches.append((i - len(current_node.output[0]) + 1, current_node.output[0]))
return matches
# 示例用法
patterns = ["ab", "abc", "bc", "c"]
text = "abcdefg"
ac = AhoCorasick(patterns)
matches = ac.match(text)
for match in matches:
print("Pattern '{}' found at index {}".format(match[1], match[0]))
以上代码实现了一个Aho-Corasick算法的类AhoCorasick
,通过调用match
方法可以在指定文本中查找多个模式串的出现位置。在示例中,模式串列表为["ab", "abc", "bc", "c"]
,文本为"abcdefg"
,输出结果为:
Pattern 'ab' found at index 0
Pattern 'bc' found at index 1
Pattern 'c' found at index 2
这表明在文本中分别找到了模式串"ab"
、"bc"
和"c"
,它们的起始位置分别为0、1和2。