Created
April 4, 2024 03:53
-
-
Save yshalsager/bdf484a792a57d0db2b509f883da12e0 to your computer and use it in GitHub Desktop.
Steps to create an Arabic dictionary for AOSP keyboard from shamela.ws
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from pathlib import Path | |
import re | |
# Usage: python calc_weight.py <filename>
#
# Rewrites, in place, the frequency field (``f=``) of every word line in the
# given word list. Weights are assigned by line position: the first line gets
# the highest weight and the last line the lowest, always within 50..254.
# NOTE(review): this presumably expects the list to be sorted from most to
# least frequent before it is run -- confirm against the preceding steps.

# Check if a file name is provided
if len(sys.argv) < 2:
    print("Usage: python calc_weight.py <filename>")
    sys.exit(1)

filename = Path(sys.argv[1])

# Ensure the file exists
if not filename.exists():
    print(f"Error: File {filename} does not exist.")
    sys.exit(1)

# Read every line up front: the total line count is needed before the first
# weight can be computed
with filename.open("r", encoding="utf-8") as file:
    lines = file.readlines()

count = len(lines)

# Calculate the divider so that (remaining // divider) never exceeds 204,
# which keeps every weight between 50 and 254
divider = (count // 205) + 1

# Captures everything between the first "=" and the last "," of a line
name_pattern = re.compile("=(.*),")

# Matches only the numeric value of the frequency field. Substituting every
# number on the line would also overwrite digits that belong to the word
# itself (``\d`` is Unicode-aware, so that includes Arabic-Indic digits).
weight_pattern = re.compile(r"\bf=(?:\d*[.])?\d+")

# Process the lines and update the weight
new_lines = []
for index, line in enumerate(lines):
    # Number of lines still following this one; shrinks as we move down the file
    remaining = count - index - 1

    # Replace the weight if it's a word line, otherwise keep the line untouched
    if "f=" in line:
        name = name_pattern.search(line)
        # Skip single-character and purely numeric entries
        if name and len(name.group(1)) > 1 and not name.group(1).isdigit():
            weighed = (remaining // divider) + 50
            line = weight_pattern.sub(f"f={weighed}", line)

    new_lines.append(line)

# Write the updated lines back to the file
with filename.open("w", encoding="utf-8") as file:
    file.writelines(new_lines)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install regex | |
from pathlib import Path | |
import regex | |
# Removes, in place, every entry of word_list.txt whose word field contains a
# character outside the Arabic script.

# Compile the pattern to match lines with non-Arabic characters in the word field
pattern = regex.compile(r"word=.*\P{Arabic}.*, f=\d+")

# Read the input file. The encoding is given explicitly: without it pathlib
# falls back to the locale's encoding, which can fail on or corrupt Arabic
# text on systems whose default is not UTF-8.
input_file = Path(".").resolve() / "word_list.txt"
lines = input_file.read_text(encoding="utf-8").splitlines()

# Filter out the lines that match the pattern
arabic_lines = (line for line in lines if not pattern.search(line))

# Write the result back to the input file, again as UTF-8
input_file.write_text("\n".join(arabic_lines), encoding="utf-8")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install ebooklib parsel pyarabic | |
import ebooklib | |
from ebooklib import epub | |
from parsel import Selector | |
from pyarabic import araby as pyarabic | |
from pathlib import Path | |
def extract_text_from_item(item):
    """Return the text content of one EPUB item as a single string.

    The item's content is decoded, parsed with parsel, and every text node is
    joined with a single space. Leading and trailing whitespace is stripped
    from the result.
    """
    markup = item.get_content().decode()
    fragments = Selector(text=markup).css("*::text").getall()
    return " ".join(fragments).strip()
def process_epub(file_path):
    """Write the words of one EPUB to a ``.txt`` file, one word per line.

    The output file sits next to the book (same path, ``.txt`` suffix) and is
    overwritten if it already exists. ``file_path`` must be a ``pathlib.Path``.
    """
    output_file_path = file_path.with_suffix(".txt")

    # Load the book and gather the text of every document item
    book = epub.read_epub(str(file_path))
    documents = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    full_text = " ".join(extract_text_from_item(document) for document in documents)

    # Strip tashkeel before tokenizing with pyarabic
    full_text = pyarabic.strip_tashkeel(full_text)
    words = pyarabic.tokenize(full_text)

    # Keep only purely alphabetic tokens
    with output_file_path.open("w", encoding="utf-8") as output_file:
        output_file.writelines(f"{word}\n" for word in words if word.isalpha())
# Walk the current directory tree and convert every EPUB that is found,
# printing each path before it is processed
epub_paths = Path(".").rglob("*.epub")
for epub_path in epub_paths:
    print(epub_path)
    process_epub(epub_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install tqdm | |
from collections import Counter | |
import argparse | |
from tqdm import tqdm | |
def count_lines(filename):
    """Return the number of lines in *filename*.

    The file is read as UTF-8; undecodable bytes are ignored instead of
    raising an error.
    """
    line_total = 0
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        # enumerate leaves the index of the last line in line_total;
        # an empty file never enters the loop, so the total stays 0
        for line_total, _ in enumerate(handle, start=1):
            pass
    return line_total
def word_frequency(filename, total_lines):
    """Count how often each whitespace-separated word occurs in *filename*.

    Each line is lower-cased before it is split. ``total_lines`` is only used
    to size the tqdm progress bar. Returns a ``Counter`` mapping word to count.
    """
    tally = Counter()
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        progress = tqdm(handle, total=total_lines, desc="Processing file", unit="lines")
        for raw_line in progress:
            tally.update(raw_line.lower().split())
    return tally
def write_frequency_to_file(frequency, output_filename):
    """Write every word/count pair of *frequency* to *output_filename*.

    One entry is written per line as `` word=<word>, f=<count>``, in the
    mapping's own iteration order. An existing file is overwritten.
    """
    with open(output_filename, 'w', encoding='utf-8') as handle:
        handle.writelines(
            f" word={token}, f={occurrences}\n"
            for token, occurrences in frequency.items()
        )
def main():
    """Command-line entry point: read words from one file, write their counts to another."""
    cli = argparse.ArgumentParser(description='Calculate word frequency and write to a file.')
    cli.add_argument('input_file', type=str, help='The input file to read words from.')
    cli.add_argument('output_file', type=str, help='The output file to write word frequencies to.')
    options = cli.parse_args()

    source_path = options.input_file
    # The input is read twice: once to size the progress bar, once to count words
    line_total = count_lines(source_path)
    counts = word_frequency(source_path, line_total)
    write_frequency_to_file(counts, options.output_file)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment