Last active
April 11, 2023 15:43
-
-
Save Witty-Kitty/c863370b3e6b1f73bb1c814be1b334e1 to your computer and use it in GitHub Desktop.
Word Frequencies for TTS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
from collections import Counter | |
from pathlib import Path | |
import string | |
def word_frequency(text_directory: Path) -> Counter:
    """Tally word occurrences across the ``*.txt`` files of a directory.

    Only files matched by ``text_directory.glob("*.txt")`` are read, i.e.
    files directly inside the directory (sub-directories are not searched).

    Args:
        text_directory: Directory holding the text files to count.

    Returns:
        A ``Counter`` mapping each word to its total number of occurrences
        over all matched files; empty when no file matches.

    Raises:
        IOError: If ``text_directory`` is not an existing directory.
    """
    if text_directory.is_dir():
        totals = Counter()
        # Lazy generator: each file is read and counted only when the loop
        # reaches it, so at most one per-file counter is pending at a time.
        per_file_counts = (
            file_word_count(path) for path in text_directory.glob("*.txt")
        )
        for counts in per_file_counts:
            totals += counts
        return totals
    raise IOError(f"cannot find text directory path: {text_directory}")
def file_word_count(text_file: Path) -> Counter:
    """Count how often each word occurs in one text file.

    The file content is lower-cased, stripped of surrounding whitespace,
    passed through ``remove_punctuation`` and then split on whitespace, so
    words are compared case-insensitively and without punctuation.

    Args:
        text_file: Path of the file to read. It is opened in text mode
            without an explicit encoding, so the platform default applies.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    with open(text_file) as handle:
        raw_text = handle.read()
    normalized = raw_text.lower().strip()
    words = remove_punctuation(normalized).split()
    return Counter(words)
def remove_punctuation(text: str) -> str:
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    all other characters, including whitespace, are kept unchanged.
    """
    # str.translate drops any character whose code point maps to None.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)
def main():
    """Command-line entry point: write a word-frequency table to disk.

    Parses ``-i`` / ``--input`` / ``--text-directory`` (default: the current
    working directory), counts the words of that directory via
    ``word_frequency`` and writes one ``<word> <count>`` line per word to
    ``tts_frequencies.txt`` in the current working directory, most frequent
    word first.

    Note: the output file itself matches ``*.txt``. When the input directory
    is also the current working directory, a ``tts_frequencies.txt`` left by
    an earlier run is counted as input on the next run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input",
        "--text-directory",
        # Deliberately not ``required=True``: a required option can never
        # fall back to its default, which made the default below dead code.
        type=Path,
        default=Path.cwd(),
        help="path to folder containing text files (default: current directory).",
    )
    args = parser.parse_args()

    # The parsed value lives under ``input``, the first long option name.
    word_freq = word_frequency(args.input).most_common()
    with open("tts_frequencies.txt", "w") as f:
        f.writelines(f"{word} {count}\n" for word, count in word_freq)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment