Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created August 27, 2024 18:59
Show Gist options
  • Save pszemraj/37dda050af9c63999b8125318370222d to your computer and use it in GitHub Desktop.
Save pszemraj/37dda050af9c63999b8125318370222d to your computer and use it in GitHub Desktop.
Extract non-code strings from Python files: comments, docstrings, and string literals.
import re
def extract_comments_and_docs(multiline_string):
    """Collect the human-readable text found in a Python source string.

    Three kinds of text are pulled out with regular expressions:

    * whole-line comments (the first non-whitespace character is ``#``),
    * the bodies of triple-quoted blocks,
    * the bodies of single- or double-quoted string literals.

    Each span of the input is attributed to one category only:
    triple-quoted blocks are blanked out before comments are searched for,
    and comments are blanked out before ordinary string literals are
    searched for. This keeps a docstring (or quoted text inside a comment)
    from being reported a second time as a string literal.

    This is a regex heuristic, not a parser. Comments that trail code on
    the same line are not captured, and a quote directly preceded by a word
    character (for example the prefix of an f-string) never starts a match,
    so prefixed literals are not reliably recognised.

    Args:
        multiline_string: Python source code as a single string.

    Returns:
        The extracted pieces joined by single spaces, ordered as comments,
        then triple-quoted blocks, then string literals. Empty pieces are
        skipped; an input with nothing to extract yields ``""``.
    """
    # Whole-line comments: the first non-whitespace character on the line is '#'.
    comment_pattern = r"^\s*#(.*)"
    # Triple-quoted blocks. The backreference forces the closing delimiter to
    # be the same kind of quote as the opening one.
    docstring_pattern = r"('''|\"\"\")(.*?)\1"
    # Single- or double-quoted literals that are not glued to a word character.
    string_pattern = r'(?<!\w)(\'(?:[^\']|\'\')*\'|"(?:[^"]|"")*")(?!\w)'

    docstrings = [
        match.group(2)
        for match in re.finditer(docstring_pattern, multiline_string, re.DOTALL)
    ]
    # Replace each block with a space (not nothing) so neighbouring tokens
    # do not fuse together.
    remaining = re.sub(docstring_pattern, " ", multiline_string, flags=re.DOTALL)

    comments = re.findall(comment_pattern, remaining, re.MULTILINE)
    remaining = re.sub(comment_pattern, "", remaining, flags=re.MULTILINE)

    strings = re.findall(string_pattern, remaining)
    # Drop exactly one delimiter per side; quotes that belong to the literal's
    # content (e.g. "'q'") must survive.
    cleaned_strings = [literal[1:-1] for literal in strings]

    pieces = [comment.strip() for comment in comments]
    pieces.extend(docstring.strip() for docstring in docstrings)
    pieces.extend(cleaned_strings)
    # Skipping empty pieces avoids the double spaces a fixed " " separator
    # leaves behind when a whole category is empty.
    return " ".join(piece for piece in pieces if piece).strip()
# Example usage: run the extractor on a small sample program that contains a
# header comment, a triple-quoted block, inline comments and string literals.
# Everything between the triple double-quotes below is input data for the
# extractor, not code executed by this file.
multiline_string = """
#coding utf-8
'''
斐波那契数列-循环法
'''
def Fib_circle():
msg = "This is a test string"
while True: # 去掉while循环,只用for循环
num_1 = 0
num_2 = 1
fib_array = [0] # 用于存储计算出的FB数列值
m = input('你想要查找的起始项:')
n = input('你想要查找的结束项:')
if m.isdigit() and n.isdigit(): # 在这个实现函数中,不要进行检验。每个函数只做一个事情
m = int(m) # 将输入化为整数型
n = int(n)
for i in range(n):
num_1, num_2 = num_2, num_1 + num_2
fib_array.append(num_1)
print(f'你要查找的数列为{list(enumerate(fib_array[m:], m))}')
break
else:
print('请输入有效的正整数')
if __name__ == '__main__':
Fib_circle()
"""
# Extract the text of the sample into one space-joined string.
result = extract_comments_and_docs(multiline_string)
# Bare expression: only has a visible effect in an interactive session
# (REPL / notebook cell), where the value is echoed.
result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment