Created
August 27, 2024 18:59
-
-
Save pszemraj/37dda050af9c63999b8125318370222d to your computer and use it in GitHub Desktop.
Extract non-code strings from Python files: comments, docstrings, and string literals.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
def extract_comments_and_docs(multiline_string):
    """Collect the human-readable text embedded in Python source code.

    Three kinds of text are pulled out with regular expressions and returned
    as one space-separated string, grouped in this order: comments,
    triple-quoted strings (docstrings), then ordinary string literals.

    Args:
        multiline_string: Source text to scan. It is treated as plain text
            and does not need to be valid Python.

    Returns:
        The extracted pieces joined by single spaces, or an empty string if
        nothing was found. Comments and docstrings are whitespace-stripped;
        ordinary string literals keep their inner content unchanged.

    Note:
        Only whole-line comments (first non-whitespace character is ``#``)
        are collected; trailing comments after code are ignored. Backslash
        escapes are not interpreted, and a quote that directly follows a word
        character (e.g. a prefixed literal such as ``f'...'``) does not start
        an ordinary string match.
    """
    # Lines whose first non-whitespace character is '#'.
    comment_pattern = r"^\s*#(.*)"
    # A single left-to-right scan over all quoted text. The triple-quoted
    # alternatives are tried first so a docstring is consumed whole and is not
    # matched a second time by the single/double-quote alternative.
    literal_pattern = (
        r"'''(.*?)'''"
        r'|"""(.*?)"""'
        r'|(?<!\w)(\'(?:[^\']|\'\')*\'|"(?:[^"]|"")*")(?!\w)'
    )

    comments = [
        text.strip()
        for text in re.findall(comment_pattern, multiline_string, re.MULTILINE)
    ]

    docstrings = []
    strings = []
    # DOTALL lets the triple-quoted bodies span several lines.
    for match in re.finditer(literal_pattern, multiline_string, re.DOTALL):
        triple_single, triple_double, plain = match.groups()
        if plain is not None:
            # Drop exactly one delimiter per side; quote characters that are
            # part of the content (e.g. "'quoted'") must survive.
            strings.append(plain[1:-1])
        else:
            body = triple_single if triple_single is not None else triple_double
            docstrings.append(body.strip())

    # Join only non-empty pieces so a missing category or an empty literal
    # does not leave doubled spaces in the result.
    sections = (
        " ".join(piece for piece in group if piece)
        for group in (comments, docstrings, strings)
    )
    return " ".join(section for section in sections if section).strip()
# Example usage: run the extractor over a small sample module held in a string.
# The sample's comments and literals are data for the extractor, so they are
# kept in their original language.
multiline_string = """
#coding utf-8
'''
斐波那契数列-循环法
'''
def Fib_circle():
    msg = "This is a test string"
    while True: # 去掉while循环,只用for循环
        num_1 = 0
        num_2 = 1
        fib_array = [0] # 用于存储计算出的FB数列值
        m = input('你想要查找的起始项:')
        n = input('你想要查找的结束项:')
        if m.isdigit() and n.isdigit(): # 在这个实现函数中,不要进行检验。每个函数只做一个事情
            m = int(m) # 将输入化为整数型
            n = int(n)
            for i in range(n):
                num_1, num_2 = num_2, num_1 + num_2
                fib_array.append(num_1)
            print(f'你要查找的数列为{list(enumerate(fib_array[m:], m))}')
            break
        else:
            print('请输入有效的正整数')
if __name__ == '__main__':
    Fib_circle()
"""
result = extract_comments_and_docs(multiline_string)

if __name__ == "__main__":
    # A bare `result` expression is only echoed by a REPL or notebook; print
    # it so running the file as a script also shows the extracted text.
    print(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment