yshalsager · April 4, 2024 03:53
diff --git a/calc_weight.py b/calc_weight.py
 import sys
 from pathlib import Path
 import re

 # Re-weight a word list in place: on qualifying "f=" lines the numbers after
 # the first field are replaced by a weight derived from the line's position,
 # so earlier lines receive higher weights.

 # Require the word-list path as the first command-line argument
 if len(sys.argv) < 2:
    print("Usage: python calc_weight.py <filename>")
    sys.exit(1)

 filename = Path(sys.argv[1])

 # Ensure the file exists
 if not filename.exists():
    print(f"Error: File {filename} does not exist.")
    sys.exit(1)

 # Load every line; the total drives both the divider and the countdown
 with filename.open("r", encoding="utf-8") as file:
    lines = file.readlines()
    count = len(lines)

 # Calculate the divider to ensure results between 50 and 254
 divider = (count // 205) + 1

 # Matches an integer or decimal number; compiled once for the whole loop
 number_pattern = re.compile(r"(\d*[.])?\d+")

 # Process the lines and update the weight
 new_lines = []
 for line in lines:
    count -= 1  # counts down on every line, word line or not

    # Replace the weight if it's a word line, otherwise add without actions
    if "f=" in line:
        weighed = (count // divider) + 50
        name = re.search("=(.*),", line)
        if name and len(name.group(1)) > 1 and not name.group(1).isdigit():
            # Substitute numbers only after the first comma, so digits that
            # belong to the leading field (the word itself) are not
            # overwritten with the weight.
            head, comma, tail = line.partition(",")
            line = head + comma + number_pattern.sub(str(weighed), tail)
    new_lines.append(line)

 # Write the updated lines back to the file
 with filename.open("w", encoding="utf-8") as file:
    file.writelines(new_lines)
diff --git a/cleanup_non_arabic.py b/cleanup_non_arabic.py
 # pip install regex
 from pathlib import Path
 import regex

 # Drop every word_list.txt entry whose word field contains a non-Arabic
 # character, rewriting the file in place.
 pattern = regex.compile(r"word=.*\P{Arabic}.*, f=\d+")

 # Read the input file. The encoding is given explicitly because the platform
 # default is not guaranteed to be able to decode Arabic text.
 input_file = Path(".").resolve() / "word_list.txt"
 lines = input_file.read_text(encoding="utf-8").splitlines()

 # Filter out the lines that match the pattern
 arabic_lines = (line for line in lines if not pattern.search(line))

 # Write the result back to the input file, using the same encoding
 input_file.write_text("\n".join(arabic_lines), encoding="utf-8")
diff --git a/extract_words_from_epub.py b/extract_words_from_epub.py
 # pip install ebooklib parsel pyarabic
 import ebooklib
 from ebooklib import epub
 from parsel import Selector
 from pyarabic import araby as pyarabic
 from pathlib import Path


 def extract_text_from_item(item):
    """Return the text nodes of an EPUB item joined by spaces and stripped.

    The item's content is decoded to ``str`` and queried with the parsel
    selector ``*::text``.
    """
    markup = item.get_content().decode()
    fragments = Selector(text=markup).css("*::text").getall()
    return " ".join(fragments).strip()


 def process_epub(file_path):
    """Dump the alphabetic words of one EPUB into a sibling ``.txt`` file.

    The output path is *file_path* with its suffix replaced by ``.txt``. It
    is rewritten on every call (no skip-if-exists check is performed). Text
    is collected from every document item, passed through
    ``strip_tashkeel`` and tokenized with pyarabic; tokens for which
    ``str.isalpha()`` is false are dropped, and the rest are written one
    per line.
    """
    target_path = file_path.with_suffix(".txt")

    book = epub.read_epub(str(file_path))
    documents = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    full_text = " ".join(extract_text_from_item(doc) for doc in documents)

    tokens = pyarabic.tokenize(pyarabic.strip_tashkeel(full_text))

    with open(target_path, "w", encoding="utf-8") as output_file:
        output_file.writelines(
            token + "\n" for token in tokens if token.isalpha()
        )


 # Walk the current directory tree and convert every EPUB that is found
 for epub_path in Path(".").rglob("*.epub"):
    print(epub_path)
    process_epub(epub_path)
diff --git a/gen_freq.py b/gen_freq.py
 # pip install tqdm
 from collections import Counter
 import argparse
 from tqdm import tqdm

 def count_lines(filename):
    """Return how many lines *filename* contains.

    The file is read as UTF-8 with undecodable bytes ignored.
    """
    total = 0
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        for _ in handle:
            total += 1
    return total

 def word_frequency(filename, total_lines):
    """Count how often each whitespace-separated token occurs in *filename*.

    Each line is lower-cased before it is split. *total_lines* is only
    handed to tqdm as the progress-bar total. Returns a ``Counter`` keyed
    by token.
    """
    counts = Counter()
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        progress = tqdm(handle, total=total_lines, desc="Processing file", unit="lines")
        for row in progress:
            counts.update(row.lower().split())
    return counts

 def write_frequency_to_file(frequency, output_filename):
    """Write one `` word=<word>, f=<count>`` line per entry of *frequency*.

    Entries are emitted in the mapping's own iteration order; the output
    file is created or overwritten as UTF-8.
    """
    with open(output_filename, 'w', encoding='utf-8') as sink:
        sink.writelines(
            f" word={token}, f={tally}\n" for token, tally in frequency.items()
        )

 def main():
    """Parse the CLI arguments, count word frequencies, and write them out."""
    cli = argparse.ArgumentParser(description='Calculate word frequency and write to a file.')
    cli.add_argument('input_file', type=str, help='The input file to read words from.')
    cli.add_argument('output_file', type=str, help='The output file to write word frequencies to.')
    options = cli.parse_args()

    # A first pass counts the lines so the progress bar knows its total
    line_total = count_lines(options.input_file)
    write_frequency_to_file(
        word_frequency(options.input_file, line_total),
        options.output_file,
    )

 if __name__ == "__main__":
    main()
	import sys
	from pathlib import Path
	import re

	# Re-weight a word list in place: on qualifying "f=" lines the numbers after
	# the first field are replaced by a weight derived from the line's position,
	# so earlier lines receive higher weights.

	# Check if a file name is provided
	if len(sys.argv) < 2:
	    print("Usage: python calc_weight.py <filename>")
	    sys.exit(1)

	filename = Path(sys.argv[1])

	# Ensure the file exists
	if not filename.exists():
	    print(f"Error: File {filename} does not exist.")
	    sys.exit(1)

	# Count the number of lines
	with filename.open("r", encoding="utf-8") as file:
	    lines = file.readlines()
	    count = len(lines)

	# Calculate the divider to ensure results between 50 and 254
	divider = (count // 205) + 1

	# Matches an integer or decimal number; compiled once for the whole loop
	number_pattern = re.compile(r"(\d*[.])?\d+")

	# Process the lines and update the weight
	new_lines = []
	for line in lines:
	    count -= 1  # counts down on every line, word line or not

	    # Replace the weight if it's a word line, otherwise add without actions
	    if "f=" in line:
	        weighed = (count // divider) + 50
	        name = re.search("=(.*),", line)
	        if name and len(name.group(1)) > 1 and not name.group(1).isdigit():
	            # Substitute numbers only after the first comma, so digits
	            # that belong to the leading field (the word itself) are not
	            # overwritten with the weight.
	            head, comma, tail = line.partition(",")
	            line = head + comma + number_pattern.sub(str(weighed), tail)
	    new_lines.append(line)

	# Write the updated lines back to the file
	with filename.open("w", encoding="utf-8") as file:
	    file.writelines(new_lines)
	# pip install regex
	from pathlib import Path
	import regex

	# Compile the pattern to match lines with non-Arabic characters in the word
	# field. The ".*" on both sides of \P{Arabic} lets the non-Arabic character
	# sit anywhere in a word of any length; with bare "." the pattern would only
	# match three-character words.
	pattern = regex.compile(r"word=.*\P{Arabic}.*, f=\d+")

	# Read the input file. The encoding is given explicitly because the platform
	# default is not guaranteed to be able to decode Arabic text.
	input_file = Path(".").resolve() / "word_list.txt"
	lines = input_file.read_text(encoding="utf-8").splitlines()

	# Filter out the lines that match the pattern
	arabic_lines = (line for line in lines if not pattern.search(line))

	# Write the result back to the input file, using the same encoding
	input_file.write_text("\n".join(arabic_lines), encoding="utf-8")
	# pip install ebooklib parsel pyarabic
	import ebooklib
	from ebooklib import epub
	from parsel import Selector
	from pyarabic import araby as pyarabic
	from pathlib import Path


	# Function to extract text from an EPUB item using parsel
	def extract_text_from_item(item):
	    """Return the item's text nodes joined by single spaces and stripped."""
	    selector = Selector(text=item.get_content().decode())
	    text = " ".join(selector.css("*::text").getall())
	    return text.strip()


	# Function to process an EPUB file
	def process_epub(file_path):
	    """Write the alphabetic words of *file_path* to a sibling ``.txt`` file.

	    The output file is rewritten on every call; no skip-if-exists check
	    is performed.
	    """
	    output_file_path = file_path.with_suffix(".txt")

	    # Load the EPUB file
	    book = epub.read_epub(str(file_path))

	    # Extract text from each item in the EPUB using parsel
	    texts = []
	    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
	        text = extract_text_from_item(item)
	        texts.append(text)

	    # Combine all texts into a single string
	    full_text = " ".join(texts)

	    # Remove Arabic tashkeel
	    full_text = pyarabic.strip_tashkeel(full_text)
	    # Tokenize the text into words using pyarabic
	    words = pyarabic.tokenize(full_text)
	    # Write each word to the output file, skipping tokens that are not
	    # purely alphabetic
	    with open(output_file_path, "w", encoding="utf-8") as output_file:
	        for word in words:
	            if word.isalpha():
	                output_file.write(word + "\n")


	# Iterate over subfolders in the current folder looking for books
	for root in Path(".").rglob("*.epub"):
	    print(root)
	    process_epub(root)
	# pip install tqdm
	from collections import Counter
	import argparse
	from tqdm import tqdm

	def count_lines(filename):
	    """Return the number of lines in *filename* (UTF-8, bad bytes ignored)."""
	    with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
	        return sum(1 for _ in file)

	def word_frequency(filename, total_lines):
	    """Return a ``Counter`` of the lower-cased, whitespace-split tokens.

	    *total_lines* is only passed to tqdm as the progress-bar total.
	    """
	    frequency = Counter()
	    with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
	        for line in tqdm(file, total=total_lines, desc="Processing file", unit="lines"):
	            words = line.lower().split()
	            frequency.update(words)
	    return frequency

	def write_frequency_to_file(frequency, output_filename):
	    """Write `` word=<word>, f=<count>`` lines in the mapping's own order."""
	    with open(output_filename, 'w', encoding='utf-8') as file:
	        for word, count in frequency.items():
	            file.write(f" word={word}, f={count}\n")

	def main():
	    """Parse the CLI arguments, count word frequencies, and write them out."""
	    parser = argparse.ArgumentParser(description='Calculate word frequency and write to a file.')
	    parser.add_argument('input_file', type=str, help='The input file to read words from.')
	    parser.add_argument('output_file', type=str, help='The output file to write word frequencies to.')
	    args = parser.parse_args()

	    # A first pass counts the lines so the progress bar knows its total
	    total_lines = count_lines(args.input_file)
	    frequency = word_frequency(args.input_file, total_lines)

	    write_frequency_to_file(frequency, args.output_file)

	if __name__ == "__main__":
	    main()