Skip to content

Instantly share code, notes, and snippets.

@Witty-Kitty
Last active April 11, 2023 15:43
Show Gist options
  • Save Witty-Kitty/c863370b3e6b1f73bb1c814be1b334e1 to your computer and use it in GitHub Desktop.
Save Witty-Kitty/c863370b3e6b1f73bb1c814be1b334e1 to your computer and use it in GitHub Desktop.
Word Frequencies for TTS
#!/usr/bin/env python3
import argparse
from collections import Counter
from pathlib import Path
import string
def word_frequency(text_directory: Path) -> Counter:
    """Get the word frequency table for a directory of text files.

    Only files matching ``*.txt`` directly inside ``text_directory`` are
    counted; sub-directories are not searched.

    Args:
        text_directory: Directory holding the text files to count.

    Returns:
        A ``Counter`` holding the summed per-file counts of every matched
        file. It is empty when no file matches.

    Raises:
        NotADirectoryError: If ``text_directory`` is not an existing
            directory. This is an ``OSError`` (``IOError``) subclass, so
            handlers written for ``IOError`` keep working.
    """
    if not text_directory.is_dir():
        raise NotADirectoryError(
            f"cannot find text directory path: {text_directory}"
        )
    word_count = Counter()
    # Directory listing order is filesystem-dependent. Sorting fixes the order
    # in which entries are first inserted, which is how Counter.most_common()
    # breaks ties, so repeated runs produce identically ordered results.
    for text_file in sorted(text_directory.glob("*.txt")):
        word_count.update(file_word_count(text_file))
    return word_count
def file_word_count(text_file: Path) -> Counter:
    """Get the word frequency table for a single file.

    The file's text is lower-cased, passed through ``remove_punctuation``
    and split on whitespace; each resulting token is counted.

    Args:
        text_file: Path of the text file to read.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    # Decode as UTF-8 explicitly: the platform default encoding (for example
    # cp1252 on Windows) can mis-decode or reject non-ASCII text.
    with open(text_file, encoding="utf-8") as text_fp:
        file_text = remove_punctuation(text_fp.read().lower())
    # split() with no arguments already ignores leading/trailing whitespace,
    # so no separate strip() is needed.
    return Counter(file_text.split())
def remove_punctuation(text: str) -> str:
    """Return ``text`` with every ``string.punctuation`` character deleted."""
    # A translation table that maps a code point to None makes
    # str.translate() drop that character from the result.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)
def main():
    """Count words in a directory of text files and save the result.

    Parses the command line, builds the combined word frequency table with
    ``word_frequency`` and writes it to ``tts_frequencies.txt`` in the
    current working directory, one ``word count`` pair per line, most
    frequent first.
    """
    parser = argparse.ArgumentParser()
    # The option is required, so no default is supplied: argparse reports an
    # error for a missing required option and would never use a default.
    parser.add_argument(
        "-i",
        "--input",
        "--text-directory",
        required=True,
        type=Path,
        help="path to folder containing text files.",
    )
    args = parser.parse_args()
    word_freq = word_frequency(args.input).most_common()
    # Write UTF-8 explicitly so non-ASCII words cannot fail to encode under a
    # narrower platform default encoding.
    with open("tts_frequencies.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{word} {count}\n" for word, count in word_freq)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment