TonyPythoneer · March 8, 2021 09:14
diff --git a/main.py b/main.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import sys

 IS_DEBUG = False

 text_sample = "Peter told me that peter the pickle piper piped a pitted pickle before he petered out. Phew!"
 no_output_text = '<No Output>'

 testing_dataset = [
    # known cases
    {
        'textToSearch': text_sample,
        'subtext': 'Peter',
        'output': "1, 20, 75",
    },
    {
        'textToSearch': text_sample,
        'subtext': 'peter',
        'output': "1, 20, 75",
    },
    {
        'textToSearch': text_sample,
        'subtext': 'pick',
        'output': "30, 58",
    },
    {
        'textToSearch': text_sample,
        'subtext': 'pi',
        'output': "30, 37, 43, 51, 58",
    },
    {
        'textToSearch': text_sample,
        'subtext': 'z',
        'output': no_output_text,
    },
    {
        'textToSearch': text_sample,
        'subtext': 'Peterz',
        'output': no_output_text,
    },
    # my cases
    {
        'textToSearch': text_sample,
        'subtext': 'phew!',
        'output': '88',
    },
    {
        'textToSearch': text_sample,
        'subtext': '!',
        'output': '92',
    },
    {
        'textToSearch': text_sample,
        'subtext': ' ',
        'output': '6, 11, 14, 19, 25, 29, 36, 42, 48, 50, 57, 64, 71, 74, 82, 87',
    },
    {
        'textToSearch': '上AB上ab', # include chinese
        'subtext': '上aB',
        'output': '1, 4',
    },
    {
        'textToSearch': 'aaaaa', # subtext length is 1
        'subtext': 'a',
        'output': '1, 2, 3, 4, 5',
    },
    {
        'textToSearch': 'aaa', # textToSearch and subtext are the same
        'subtext': 'aaa',
        'output': '1',
    },
    {
        'textToSearch': 'aBcDeAbCdEabcde', # test lower
        'subtext': 'AbcdE',
        'output': '1, 6, 11',
    },
    {
        'textToSearch': 'abababa', # Palindrome
        'subtext': 'aba',
        'output': '1, 3, 5',
    },
    {
        'textToSearch': 'abcbabcba', # Palindrome
        'subtext': 'abcba',
        'output': '1, 5',
    },
    {
        'textToSearch': 'baabaabaab',
        'subtext': 'aabaa',
        'output': '2, 5',
    },
    # edge cases
    {
        'textToSearch': '',
        'subtext': '',
        'output': no_output_text,
    },
    {
        'textToSearch': 0,
        'subtext': 'abc',
        'output': no_output_text,
    },
    {
        'textToSearch': 'abc',
        'subtext': 0,
        'output': no_output_text,
    },
    {
        'textToSearch': 'I am so short',
        'subtext': 'I am longer than textToSearch',
        'output': no_output_text,
    },
 ]


 def solution(textToSearch=text_sample, subtext='', *args, **kwargs):
    # reject unexpected params
    if not textToSearch or not subtext or not isinstance(textToSearch, str) or not isinstance(subtext, str):
        return no_output_text

    main_text, sub_text = textToSearch.lower(), subtext.lower() # force to lower all string

    main_text_index, sub_text_index = 0, 0
    main_text_length, sub_text_length = len(main_text), len(sub_text)
    failure = get_failure_function(main_text)

    # reject when sub text is longer than main text
    if main_text_length < sub_text_length:
        return no_output_text

    indexes = []
    while main_text_index < main_text_length:
        if IS_DEBUG:
            print(sub_text_index, main_text_index) # test_mode

        if main_text[main_text_index] == sub_text[sub_text_index]:
            # if matched, it increases sub_text_index
            sub_text_index += 1
        else:
            # if dismatched, sub_text_index is rest by failure's value
            # (or it sets the previous repeated word's index for the sub_text_index)
            if sub_text_index > 0:
                sub_text_index = failure[sub_text_index]
                continue

        main_text_index += 1

        if sub_text_index >= sub_text_length:
            indexes.append(main_text_index - sub_text_index)
            sub_text_index = failure[sub_text_index - 1] # offset the sub_text_index to the previous repeated word's index

    if len(indexes) == 0:
        return no_output_text

    mapping = map(lambda x: str(x+1), indexes)
    output_str = ', '.join(mapping)
    return output_str

 """
 case:

 (1)
      ij
 index 01234567
 text  abcdabca
 table 0        (old)

 text[i] != text[j]
 so, table[j] = 0 and j+=1

 (2)
      i j
 index 01234567
 text  abcdabca
 table 00

 text[i] != text[j]
 so, table[j] = 0 and j+=1

 (3)
      i  j
 index 01234567
 text  abcdabca
 table 000

 text[i] != text[j]
 so, table[j] = 0 and j+=1

 (4)
      i   j
 index 01234567
 text  abcdabca
 table 0000

 text[i] == text[j]
 so, table[j] = i+1 and j+=1, i+= 1

 (5)
       i   j
 index 01234567
 text  abcdabca
 table 00001

 text[i] == text[j]
 so, table[j] = i+1 and j+=1, i+= 1

 (6)
        i   j
 index 01234567
 text  abcdabca
 table 000012

 text[i] == text[j]
 so, table[j] = i+1 and j+=1, i+= 1

 (7)
         i   j
 index 01234567
 text  abcdabca
 table 0000123

 text[i] != text[j]
 so, i = table[i-1]

 (8)
      i      j
 index 01234567
 text  abcdabca
 table 0000123

 text[i] == text[j]
 so, table[j] = i+1 and j+=1, i+= 1

 (9, stop)

 index 01234567
 text  abcdabca
 table 00001231

 ---

 explanation:

 it means that the index will go back to `0` the when the searching process mismatches at `index = 0` (a).
 it means that the index will go back to `0` the when the searching process mismatches at `index = 1` (b).
 it means that the index will go back to `0` the when the searching process mismatches at `index = 2` (c).
 it means that the index will go back to `0` the when the searching process mismatches at `index = 3` (d).
 it means that the index will go back to `1` the when the searching process mismatches at `index = 4` (a).
 it means that the index will go back to `2` the when the searching process mismatches at `index = 5` (b).
 it means that the index will go back to `3` the when the searching process mismatches at `index = 6` (c).
 it means that the index will go back to `1` the when the searching process mismatches at `index = 7` (a).

 ---

 conclustion:

 the table can memorize the index of the previous repeated word and redcue the invalid matching process

 ---

 example:

 when it compares `abcdabcdabca` of the main text and `abcdabca` of the sub text,
 it happens:

       v
 abcdabcdabca

 ↓↓↓↓↓↓↓↓
 ooooooox

 abcdabca

 The sub text just moves minimal index and continues to seek the main text's substring

       v
 abcdabcdabca
    ↓↓↓↓
    abcdabca

 Owing to dismatching, sub_text[7] (a) is invalid but sub_text[0:7] is valid.
 However, we don't expect that it will miss any potential match when moving subtext's index.
 The last repeated word of the sub_text[6] (c) is sub_text[2] (c).
 It means that sub_text[0:3] is valid. Also, we will start at sub_text[3](d) and continue to compare.
 """
 def get_failure_function(text):
    # i = inner index, it seeks the last repeated word behind the outer index and records its index in the failure function
    # j = outer index, it runs the basic loop
    i, j, length = 0, 1, len(text)

    indexes = [0]
    if length == 1:
        return indexes

    while j < length:
        if text[j] == text[i]:

            indexes.append(i+1)
            i, j = i + 1, j + 1
            continue

        if i > 0:
            i = indexes[i - 1] # jump to the previous word's index, and get the value from the previous word's index
            continue

        # can't find anymore, so it continues to increase the outer index
        indexes.append(0)
        j += 1

    return indexes


 if __name__ == "__main__":
    """
    To provide debug mode and run
    Otherwise, it will run the testcases

    command:
       python3 test.py debug
    """

    if 'debug' in sys.argv:
        IS_DEBUG = True

    if IS_DEBUG:
        print('=====')
        print(solution('aBcDeAbCdEabcde', 'abcde'))
        print(get_failure_function('abcde'))
    else:
        data = testing_dataset[0]
        for i, data in enumerate(testing_dataset):
            actual_result, expected_result = str(solution(**data)), data['output']
            if actual_result != expected_result:
                print('=====')
                print(f'testing_dataset[{i}] is error')
                print(f'your result is: {actual_result}')
                print(f'expected result is: {expected_result}')
	#!/usr/bin/env python3
	# -- coding: utf-8 --
	import sys

	IS_DEBUG = False

	text_sample = "Peter told me that peter the pickle piper piped a pitted pickle before he petered out. Phew!"
	no_output_text = '<No Output>'

	testing_dataset = [
	# known cases
	{
	'textToSearch': text_sample,
	'subtext': 'Peter',
	'output': "1, 20, 75",
	},
	{
	'textToSearch': text_sample,
	'subtext': 'peter',
	'output': "1, 20, 75",
	},
	{
	'textToSearch': text_sample,
	'subtext': 'pick',
	'output': "30, 58",
	},
	{
	'textToSearch': text_sample,
	'subtext': 'pi',
	'output': "30, 37, 43, 51, 58",
	},
	{
	'textToSearch': text_sample,
	'subtext': 'z',
	'output': no_output_text,
	},
	{
	'textToSearch': text_sample,
	'subtext': 'Peterz',
	'output': no_output_text,
	},
	# my cases
	{
	'textToSearch': text_sample,
	'subtext': 'phew!',
	'output': '88',
	},
	{
	'textToSearch': text_sample,
	'subtext': '!',
	'output': '92',
	},
	{
	'textToSearch': text_sample,
	'subtext': ' ',
	'output': '6, 11, 14, 19, 25, 29, 36, 42, 48, 50, 57, 64, 71, 74, 82, 87',
	},
	{
	'textToSearch': '上AB上ab', # include chinese
	'subtext': '上aB',
	'output': '1, 4',
	},
	{
	'textToSearch': 'aaaaa', # subtext length is 1
	'subtext': 'a',
	'output': '1, 2, 3, 4, 5',
	},
	{
	'textToSearch': 'aaa', # textToSearch and subtext are the same
	'subtext': 'aaa',
	'output': '1',
	},
	{
	'textToSearch': 'aBcDeAbCdEabcde', # test lower
	'subtext': 'AbcdE',
	'output': '1, 6, 11',
	},
	{
	'textToSearch': 'abababa', # Palindrome
	'subtext': 'aba',
	'output': '1, 3, 5',
	},
	{
	'textToSearch': 'abcbabcba', # Palindrome
	'subtext': 'abcba',
	'output': '1, 5',
	},
	{
	'textToSearch': 'baabaabaab',
	'subtext': 'aabaa',
	'output': '2, 5',
	},
	# edge cases
	{
	'textToSearch': '',
	'subtext': '',
	'output': no_output_text,
	},
	{
	'textToSearch': 0,
	'subtext': 'abc',
	'output': no_output_text,
	},
	{
	'textToSearch': 'abc',
	'subtext': 0,
	'output': no_output_text,
	},
	{
	'textToSearch': 'I am so short',
	'subtext': 'I am longer than textToSearch',
	'output': no_output_text,
	},
	]


	def solution(textToSearch=text_sample, subtext='', args, *kwargs):
	# reject unexpected params
	if not textToSearch or not subtext or not isinstance(textToSearch, str) or not isinstance(subtext, str):
	return no_output_text

	main_text, sub_text = textToSearch.lower(), subtext.lower() # force to lower all string

	main_text_index, sub_text_index = 0, 0
	main_text_length, sub_text_length = len(main_text), len(sub_text)
	failure = get_failure_function(main_text)

	# reject when sub text is longer than main text
	if main_text_length < sub_text_length:
	return no_output_text

	indexes = []
	while main_text_index < main_text_length:
	if IS_DEBUG:
	print(sub_text_index, main_text_index) # test_mode

	if main_text[main_text_index] == sub_text[sub_text_index]:
	# if matched, it increases sub_text_index
	sub_text_index += 1
	else:
	# if dismatched, sub_text_index is rest by failure's value
	# (or it sets the previous repeated word's index for the sub_text_index)
	if sub_text_index > 0:
	sub_text_index = failure[sub_text_index]
	continue

	main_text_index += 1

	if sub_text_index >= sub_text_length:
	indexes.append(main_text_index - sub_text_index)
	sub_text_index = failure[sub_text_index - 1] # offset the sub_text_index to the previous repeated word's index

	if len(indexes) == 0:
	return no_output_text

	mapping = map(lambda x: str(x+1), indexes)
	output_str = ', '.join(mapping)
	return output_str

	"""
	case:

	(1)
	ij
	index 01234567
	text abcdabca
	table 0 (old)

	text[i] != text[j]
	so, table[j] = 0 and j+=1

	(2)
	i j
	index 01234567
	text abcdabca
	table 00

	text[i] != text[j]
	so, table[j] = 0 and j+=1

	(3)
	i j
	index 01234567
	text abcdabca
	table 000

	text[i] != text[j]
	so, table[j] = 0 and j+=1

	(4)
	i j
	index 01234567
	text abcdabca
	table 0000

	text[i] == text[j]
	so, table[j] = i+1 and j+=1, i+= 1

	(5)
	i j
	index 01234567
	text abcdabca
	table 00001

	text[i] == text[j]
	so, table[j] = i+1 and j+=1, i+= 1

	(6)
	i j
	index 01234567
	text abcdabca
	table 000012

	text[i] == text[j]
	so, table[j] = i+1 and j+=1, i+= 1

	(7)
	i j
	index 01234567
	text abcdabca
	table 0000123

	text[i] != text[j]
	so, i = table[i-1]

	(8)
	i j
	index 01234567
	text abcdabca
	table 0000123

	text[i] == text[j]
	so, table[j] = i+1 and j+=1, i+= 1

	(9, stop)

	index 01234567
	text abcdabca
	table 00001231

	---

	explanation:

	it means that the index will go back to `0` the when the searching process mismatches at `index = 0` (a).
	it means that the index will go back to `0` the when the searching process mismatches at `index = 1` (b).
	it means that the index will go back to `0` the when the searching process mismatches at `index = 2` (c).
	it means that the index will go back to `0` the when the searching process mismatches at `index = 3` (d).
	it means that the index will go back to `1` the when the searching process mismatches at `index = 4` (a).
	it means that the index will go back to `2` the when the searching process mismatches at `index = 5` (b).
	it means that the index will go back to `3` the when the searching process mismatches at `index = 6` (c).
	it means that the index will go back to `1` the when the searching process mismatches at `index = 7` (a).

	---

	conclustion:

	the table can memorize the index of the previous repeated word and redcue the invalid matching process

	---

	example:

	when it compares `abcdabcdabca` of the main text and `abcdabca` of the sub text,
	it happens:

	v
	abcdabcdabca

	↓↓↓↓↓↓↓↓
	ooooooox

	abcdabca

	The sub text just moves minimal index and continues to seek the main text's substring

	v
	abcdabcdabca
	↓↓↓↓
	abcdabca

	Owing to dismatching, sub_text[7] (a) is invalid but sub_text[0:7] is valid.
	However, we don't expect that it will miss any potential match when moving subtext's index.
	The last repeated word of the sub_text[6] (c) is sub_text[2] (c).
	It means that sub_text[0:3] is valid. Also, we will start at sub_text[3](d) and continue to compare.
	"""
	def get_failure_function(text):
	# i = inner index, it seeks the last repeated word behind the outer index and records its index in the failure function
	# j = outer index, it runs the basic loop
	i, j, length = 0, 1, len(text)

	indexes = [0]
	if length == 1:
	return indexes

	while j < length:
	if text[j] == text[i]:

	indexes.append(i+1)
	i, j = i + 1, j + 1
	continue

	if i > 0:
	i = indexes[i - 1] # jump to the previous word's index, and get the value from the previous word's index
	continue

	# can't find anymore, so it continues to increase the outer index
	indexes.append(0)
	j += 1

	return indexes


	if __name__ == "__main__":
	"""
	To provide debug mode and run
	Otherwise, it will run the testcases

	command:
	python3 test.py debug
	"""

	if 'debug' in sys.argv:
	IS_DEBUG = True

	if IS_DEBUG:
	print('=====')
	print(solution('aBcDeAbCdEabcde', 'abcde'))
	print(get_failure_function('abcde'))
	else:
	data = testing_dataset[0]
	for i, data in enumerate(testing_dataset):
	actual_result, expected_result = str(solution(**data)), data['output']
	if actual_result != expected_result:
	print('=====')
	print(f'testing_dataset[{i}] is error')
	print(f'your result is: {actual_result}')
	print(f'expected result is: {expected_result}')