Created
March 8, 2021 09:14
-
-
Save TonyPythoneer/44535b053cc2c56b7e0d9b3cdc1afb1e to your computer and use it in GitHub Desktop.
KMP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import sys | |
IS_DEBUG = False | |
text_sample = "Peter told me that peter the pickle piper piped a pitted pickle before he petered out. Phew!" | |
no_output_text = '<No Output>' | |
testing_dataset = [ | |
# known cases | |
{ | |
'textToSearch': text_sample, | |
'subtext': 'Peter', | |
'output': "1, 20, 75", | |
}, | |
{ | |
'textToSearch': text_sample, | |
'subtext': 'peter', | |
'output': "1, 20, 75", | |
}, | |
{ | |
'textToSearch': text_sample, | |
'subtext': 'pick', | |
'output': "30, 58", | |
}, | |
{ | |
'textToSearch': text_sample, | |
'subtext': 'pi', | |
'output': "30, 37, 43, 51, 58", | |
}, | |
{ | |
'textToSearch': text_sample, | |
'subtext': 'z', | |
'output': no_output_text, | |
}, | |
{ | |
'textToSearch': text_sample, | |
'subtext': 'Peterz', | |
'output': no_output_text, | |
}, | |
# my cases | |
{ | |
'textToSearch': text_sample, | |
'subtext': 'phew!', | |
'output': '88', | |
}, | |
{ | |
'textToSearch': text_sample, | |
'subtext': '!', | |
'output': '92', | |
}, | |
{ | |
'textToSearch': text_sample, | |
'subtext': ' ', | |
'output': '6, 11, 14, 19, 25, 29, 36, 42, 48, 50, 57, 64, 71, 74, 82, 87', | |
}, | |
{ | |
'textToSearch': '上AB上ab', # include chinese | |
'subtext': '上aB', | |
'output': '1, 4', | |
}, | |
{ | |
'textToSearch': 'aaaaa', # subtext length is 1 | |
'subtext': 'a', | |
'output': '1, 2, 3, 4, 5', | |
}, | |
{ | |
'textToSearch': 'aaa', # textToSearch and subtext are the same | |
'subtext': 'aaa', | |
'output': '1', | |
}, | |
{ | |
'textToSearch': 'aBcDeAbCdEabcde', # test lower | |
'subtext': 'AbcdE', | |
'output': '1, 6, 11', | |
}, | |
{ | |
'textToSearch': 'abababa', # Palindrome | |
'subtext': 'aba', | |
'output': '1, 3, 5', | |
}, | |
{ | |
'textToSearch': 'abcbabcba', # Palindrome | |
'subtext': 'abcba', | |
'output': '1, 5', | |
}, | |
{ | |
'textToSearch': 'baabaabaab', | |
'subtext': 'aabaa', | |
'output': '2, 5', | |
}, | |
# edge cases | |
{ | |
'textToSearch': '', | |
'subtext': '', | |
'output': no_output_text, | |
}, | |
{ | |
'textToSearch': 0, | |
'subtext': 'abc', | |
'output': no_output_text, | |
}, | |
{ | |
'textToSearch': 'abc', | |
'subtext': 0, | |
'output': no_output_text, | |
}, | |
{ | |
'textToSearch': 'I am so short', | |
'subtext': 'I am longer than textToSearch', | |
'output': no_output_text, | |
}, | |
] | |
def solution(textToSearch=text_sample, subtext='', *args, **kwargs): | |
# reject unexpected params | |
if not textToSearch or not subtext or not isinstance(textToSearch, str) or not isinstance(subtext, str): | |
return no_output_text | |
main_text, sub_text = textToSearch.lower(), subtext.lower() # force to lower all string | |
main_text_index, sub_text_index = 0, 0 | |
main_text_length, sub_text_length = len(main_text), len(sub_text) | |
failure = get_failure_function(main_text) | |
# reject when sub text is longer than main text | |
if main_text_length < sub_text_length: | |
return no_output_text | |
indexes = [] | |
while main_text_index < main_text_length: | |
if IS_DEBUG: | |
print(sub_text_index, main_text_index) # test_mode | |
if main_text[main_text_index] == sub_text[sub_text_index]: | |
# if matched, it increases sub_text_index | |
sub_text_index += 1 | |
else: | |
# if dismatched, sub_text_index is rest by failure's value | |
# (or it sets the previous repeated word's index for the sub_text_index) | |
if sub_text_index > 0: | |
sub_text_index = failure[sub_text_index] | |
continue | |
main_text_index += 1 | |
if sub_text_index >= sub_text_length: | |
indexes.append(main_text_index - sub_text_index) | |
sub_text_index = failure[sub_text_index - 1] # offset the sub_text_index to the previous repeated word's index | |
if len(indexes) == 0: | |
return no_output_text | |
mapping = map(lambda x: str(x+1), indexes) | |
output_str = ', '.join(mapping) | |
return output_str | |
""" | |
case: | |
(1) | |
ij | |
index 01234567 | |
text abcdabca | |
table 0 (old) | |
text[i] != text[j] | |
so, table[j] = 0 and j+=1 | |
(2) | |
i j | |
index 01234567 | |
text abcdabca | |
table 00 | |
text[i] != text[j] | |
so, table[j] = 0 and j+=1 | |
(3) | |
i j | |
index 01234567 | |
text abcdabca | |
table 000 | |
text[i] != text[j] | |
so, table[j] = 0 and j+=1 | |
(4) | |
i j | |
index 01234567 | |
text abcdabca | |
table 0000 | |
text[i] == text[j] | |
so, table[j] = i+1 and j+=1, i+= 1 | |
(5) | |
i j | |
index 01234567 | |
text abcdabca | |
table 00001 | |
text[i] == text[j] | |
so, table[j] = i+1 and j+=1, i+= 1 | |
(6) | |
i j | |
index 01234567 | |
text abcdabca | |
table 000012 | |
text[i] == text[j] | |
so, table[j] = i+1 and j+=1, i+= 1 | |
(7) | |
i j | |
index 01234567 | |
text abcdabca | |
table 0000123 | |
text[i] != text[j] | |
so, i = table[i-1] | |
(8) | |
i j | |
index 01234567 | |
text abcdabca | |
table 0000123 | |
text[i] == text[j] | |
so, table[j] = i+1 and j+=1, i+= 1 | |
(9, stop) | |
index 01234567 | |
text abcdabca | |
table 00001231 | |
--- | |
explanation: | |
it means that the index will go back to `0` the when the searching process mismatches at `index = 0` (a). | |
it means that the index will go back to `0` the when the searching process mismatches at `index = 1` (b). | |
it means that the index will go back to `0` the when the searching process mismatches at `index = 2` (c). | |
it means that the index will go back to `0` the when the searching process mismatches at `index = 3` (d). | |
it means that the index will go back to `1` the when the searching process mismatches at `index = 4` (a). | |
it means that the index will go back to `2` the when the searching process mismatches at `index = 5` (b). | |
it means that the index will go back to `3` the when the searching process mismatches at `index = 6` (c). | |
it means that the index will go back to `1` the when the searching process mismatches at `index = 7` (a). | |
--- | |
conclustion: | |
the table can memorize the index of the previous repeated word and redcue the invalid matching process | |
--- | |
example: | |
when it compares `abcdabcdabca` of the main text and `abcdabca` of the sub text, | |
it happens: | |
v | |
abcdabcdabca | |
↓↓↓↓↓↓↓↓ | |
ooooooox | |
abcdabca | |
The sub text just moves minimal index and continues to seek the main text's substring | |
v | |
abcdabcdabca | |
↓↓↓↓ | |
abcdabca | |
Owing to dismatching, sub_text[7] (a) is invalid but sub_text[0:7] is valid. | |
However, we don't expect that it will miss any potential match when moving subtext's index. | |
The last repeated word of the sub_text[6] (c) is sub_text[2] (c). | |
It means that sub_text[0:3] is valid. Also, we will start at sub_text[3](d) and continue to compare. | |
""" | |
def get_failure_function(text): | |
# i = inner index, it seeks the last repeated word behind the outer index and records its index in the failure function | |
# j = outer index, it runs the basic loop | |
i, j, length = 0, 1, len(text) | |
indexes = [0] | |
if length == 1: | |
return indexes | |
while j < length: | |
if text[j] == text[i]: | |
indexes.append(i+1) | |
i, j = i + 1, j + 1 | |
continue | |
if i > 0: | |
i = indexes[i - 1] # jump to the previous word's index, and get the value from the previous word's index | |
continue | |
# can't find anymore, so it continues to increase the outer index | |
indexes.append(0) | |
j += 1 | |
return indexes | |
if __name__ == "__main__": | |
""" | |
To provide debug mode and run | |
Otherwise, it will run the testcases | |
command: | |
python3 test.py debug | |
""" | |
if 'debug' in sys.argv: | |
IS_DEBUG = True | |
if IS_DEBUG: | |
print('=====') | |
print(solution('aBcDeAbCdEabcde', 'abcde')) | |
print(get_failure_function('abcde')) | |
else: | |
data = testing_dataset[0] | |
for i, data in enumerate(testing_dataset): | |
actual_result, expected_result = str(solution(**data)), data['output'] | |
if actual_result != expected_result: | |
print('=====') | |
print(f'testing_dataset[{i}] is error') | |
print(f'your result is: {actual_result}') | |
print(f'expected result is: {expected_result}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment