Skip to content

Instantly share code, notes, and snippets.

@TonyPythoneer
Created March 8, 2021 09:14
Show Gist options
  • Save TonyPythoneer/44535b053cc2c56b7e0d9b3cdc1afb1e to your computer and use it in GitHub Desktop.
Save TonyPythoneer/44535b053cc2c56b7e0d9b3cdc1afb1e to your computer and use it in GitHub Desktop.
KMP
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
IS_DEBUG = False
text_sample = "Peter told me that peter the pickle piper piped a pitted pickle before he petered out. Phew!"
no_output_text = '<No Output>'
testing_dataset = [
# known cases
{
'textToSearch': text_sample,
'subtext': 'Peter',
'output': "1, 20, 75",
},
{
'textToSearch': text_sample,
'subtext': 'peter',
'output': "1, 20, 75",
},
{
'textToSearch': text_sample,
'subtext': 'pick',
'output': "30, 58",
},
{
'textToSearch': text_sample,
'subtext': 'pi',
'output': "30, 37, 43, 51, 58",
},
{
'textToSearch': text_sample,
'subtext': 'z',
'output': no_output_text,
},
{
'textToSearch': text_sample,
'subtext': 'Peterz',
'output': no_output_text,
},
# my cases
{
'textToSearch': text_sample,
'subtext': 'phew!',
'output': '88',
},
{
'textToSearch': text_sample,
'subtext': '!',
'output': '92',
},
{
'textToSearch': text_sample,
'subtext': ' ',
'output': '6, 11, 14, 19, 25, 29, 36, 42, 48, 50, 57, 64, 71, 74, 82, 87',
},
{
'textToSearch': '上AB上ab', # include chinese
'subtext': '上aB',
'output': '1, 4',
},
{
'textToSearch': 'aaaaa', # subtext length is 1
'subtext': 'a',
'output': '1, 2, 3, 4, 5',
},
{
'textToSearch': 'aaa', # textToSearch and subtext are the same
'subtext': 'aaa',
'output': '1',
},
{
'textToSearch': 'aBcDeAbCdEabcde', # test lower
'subtext': 'AbcdE',
'output': '1, 6, 11',
},
{
'textToSearch': 'abababa', # Palindrome
'subtext': 'aba',
'output': '1, 3, 5',
},
{
'textToSearch': 'abcbabcba', # Palindrome
'subtext': 'abcba',
'output': '1, 5',
},
{
'textToSearch': 'baabaabaab',
'subtext': 'aabaa',
'output': '2, 5',
},
# edge cases
{
'textToSearch': '',
'subtext': '',
'output': no_output_text,
},
{
'textToSearch': 0,
'subtext': 'abc',
'output': no_output_text,
},
{
'textToSearch': 'abc',
'subtext': 0,
'output': no_output_text,
},
{
'textToSearch': 'I am so short',
'subtext': 'I am longer than textToSearch',
'output': no_output_text,
},
]
def solution(textToSearch=text_sample, subtext='', *args, **kwargs):
# reject unexpected params
if not textToSearch or not subtext or not isinstance(textToSearch, str) or not isinstance(subtext, str):
return no_output_text
main_text, sub_text = textToSearch.lower(), subtext.lower() # force to lower all string
main_text_index, sub_text_index = 0, 0
main_text_length, sub_text_length = len(main_text), len(sub_text)
failure = get_failure_function(main_text)
# reject when sub text is longer than main text
if main_text_length < sub_text_length:
return no_output_text
indexes = []
while main_text_index < main_text_length:
if IS_DEBUG:
print(sub_text_index, main_text_index) # test_mode
if main_text[main_text_index] == sub_text[sub_text_index]:
# if matched, it increases sub_text_index
sub_text_index += 1
else:
# if dismatched, sub_text_index is rest by failure's value
# (or it sets the previous repeated word's index for the sub_text_index)
if sub_text_index > 0:
sub_text_index = failure[sub_text_index]
continue
main_text_index += 1
if sub_text_index >= sub_text_length:
indexes.append(main_text_index - sub_text_index)
sub_text_index = failure[sub_text_index - 1] # offset the sub_text_index to the previous repeated word's index
if len(indexes) == 0:
return no_output_text
mapping = map(lambda x: str(x+1), indexes)
output_str = ', '.join(mapping)
return output_str
"""
case:
(1)
ij
index 01234567
text abcdabca
table 0 (old)
text[i] != text[j]
so, table[j] = 0 and j+=1
(2)
i j
index 01234567
text abcdabca
table 00
text[i] != text[j]
so, table[j] = 0 and j+=1
(3)
i j
index 01234567
text abcdabca
table 000
text[i] != text[j]
so, table[j] = 0 and j+=1
(4)
i j
index 01234567
text abcdabca
table 0000
text[i] == text[j]
so, table[j] = i+1 and j+=1, i+= 1
(5)
i j
index 01234567
text abcdabca
table 00001
text[i] == text[j]
so, table[j] = i+1 and j+=1, i+= 1
(6)
i j
index 01234567
text abcdabca
table 000012
text[i] == text[j]
so, table[j] = i+1 and j+=1, i+= 1
(7)
i j
index 01234567
text abcdabca
table 0000123
text[i] != text[j]
so, i = table[i-1]
(8)
i j
index 01234567
text abcdabca
table 0000123
text[i] == text[j]
so, table[j] = i+1 and j+=1, i+= 1
(9, stop)
index 01234567
text abcdabca
table 00001231
---
explanation:
it means that the index will go back to `0` the when the searching process mismatches at `index = 0` (a).
it means that the index will go back to `0` the when the searching process mismatches at `index = 1` (b).
it means that the index will go back to `0` the when the searching process mismatches at `index = 2` (c).
it means that the index will go back to `0` the when the searching process mismatches at `index = 3` (d).
it means that the index will go back to `1` the when the searching process mismatches at `index = 4` (a).
it means that the index will go back to `2` the when the searching process mismatches at `index = 5` (b).
it means that the index will go back to `3` the when the searching process mismatches at `index = 6` (c).
it means that the index will go back to `1` the when the searching process mismatches at `index = 7` (a).
---
conclustion:
the table can memorize the index of the previous repeated word and redcue the invalid matching process
---
example:
when it compares `abcdabcdabca` of the main text and `abcdabca` of the sub text,
it happens:
v
abcdabcdabca
↓↓↓↓↓↓↓↓
ooooooox
abcdabca
The sub text just moves minimal index and continues to seek the main text's substring
v
abcdabcdabca
↓↓↓↓
abcdabca
Owing to dismatching, sub_text[7] (a) is invalid but sub_text[0:7] is valid.
However, we don't expect that it will miss any potential match when moving subtext's index.
The last repeated word of the sub_text[6] (c) is sub_text[2] (c).
It means that sub_text[0:3] is valid. Also, we will start at sub_text[3](d) and continue to compare.
"""
def get_failure_function(text):
# i = inner index, it seeks the last repeated word behind the outer index and records its index in the failure function
# j = outer index, it runs the basic loop
i, j, length = 0, 1, len(text)
indexes = [0]
if length == 1:
return indexes
while j < length:
if text[j] == text[i]:
indexes.append(i+1)
i, j = i + 1, j + 1
continue
if i > 0:
i = indexes[i - 1] # jump to the previous word's index, and get the value from the previous word's index
continue
# can't find anymore, so it continues to increase the outer index
indexes.append(0)
j += 1
return indexes
if __name__ == "__main__":
"""
To provide debug mode and run
Otherwise, it will run the testcases
command:
python3 test.py debug
"""
if 'debug' in sys.argv:
IS_DEBUG = True
if IS_DEBUG:
print('=====')
print(solution('aBcDeAbCdEabcde', 'abcde'))
print(get_failure_function('abcde'))
else:
data = testing_dataset[0]
for i, data in enumerate(testing_dataset):
actual_result, expected_result = str(solution(**data)), data['output']
if actual_result != expected_result:
print('=====')
print(f'testing_dataset[{i}] is error')
print(f'your result is: {actual_result}')
print(f'expected result is: {expected_result}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment