Skip to content

Instantly share code, notes, and snippets.

@paulgrammer
Created April 20, 2022 17:49
Show Gist options
  • Save paulgrammer/bb3c9ad79b75715b3b032fc06b390056 to your computer and use it in GitHub Desktop.
Save paulgrammer/bb3c9ad79b75715b3b032fc06b390056 to your computer and use it in GitHub Desktop.
import os
import re
import sys
from glob import glob
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
# Base URLs of the jw.org Bible book index pages (English and Luganda).
# A book name, and then "/<chapter>", is appended to these to build page URLs.
EN_URL = "https://www.jw.org/en/library/bible/nwt/books/"
LG_URL = "https://www.jw.org/lg/layibulale/bayibuli/nwt/ebitabo/"
def get_books(lang_url, timeout=30):
    """Return the book names listed on a book index page.

    The names are read from the text of the page's ``<select id="Book">``
    element, one name per line. Names made of several words are joined with
    hyphens (whitespace runs collapse to a single "-") so they can be
    appended directly to a URL; single-word and blank entries are returned
    unchanged.

    Args:
        lang_url: URL of the book index page for one language.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of book-name strings. It may still contain blank entries;
        the caller is expected to discard them.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the page has no ``<select id="Book">`` element.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(lang_url, timeout=timeout)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'lxml')

    selector = page.find('select', attrs={'id': 'Book'})
    if selector is None:
        raise ValueError("no <select id='Book'> element found at " + lang_url)

    # The element before the first newline is not a book entry; skip it.
    books = selector.text.split('\n')[1:]
    for index, name in enumerate(books):
        words = name.split()
        if len(words) > 1:
            books[index] = '-'.join(words)
    return books
# Fetch the book lists for both languages. Every blank entry is discarded:
# list.remove('') would drop only the first one and raise ValueError when
# there is none, and a surviving blank name would later be scraped as a book.
en_books = [name for name in get_books(EN_URL) if name.strip()]
lg_books = [name for name in get_books(LG_URL) if name.strip()]
def write_book_to_file(sub_url, book, lang, timeout=30):
    """Download every chapter of every book and save one text file per chapter.

    For each book name the book page is fetched to discover its chapters,
    then each chapter page is fetched and the text of every
    ``<span class="verse">`` inside ``<div id="bibleText">`` is written, one
    verse per line, to ``Scrapped/<lang><book>/<chapter>.txt``.

    Args:
        sub_url: Base URL for one language; the book name and then
            "/<chapter>" are appended to it.
        book: List of URL-ready book names.
        lang: Output sub-directory under "Scrapped/", including its trailing
            slash (for example "English/").
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.RequestException: if a request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if a page lacks the expected chapter list or verse
            container.
    """
    # Single pass per verse: non-breaking spaces and newlines become plain
    # spaces; "+" and "*" are removed (presumably cross-reference and
    # footnote markers -- confirm against the page markup).
    cleanup = str.maketrans({'\xa0': ' ', '\n': ' ', '+': None, '*': None})

    for name in book:
        book_dir = "Scrapped/" + lang + name
        # exist_ok lets an interrupted run be restarted without crashing.
        os.makedirs(book_dir, exist_ok=True)

        address = sub_url + name
        print(address)
        response = requests.get(address, timeout=timeout)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'lxml')

        chapter_list = page.find('div', attrs={'class': 'chapters cms-clearfix'})
        if chapter_list is None:
            raise ValueError("no chapter list found at " + address)
        # The element before the first newline is skipped; blank lines are
        # dropped so they are never requested as chapters.
        chapters = [ch for ch in chapter_list.text.split('\n')[1:] if ch.strip()]

        # Fetch and store each chapter of the current book.
        for ch in chapters:
            chapter_url = address + '/' + ch
            chapter_response = requests.get(chapter_url, timeout=timeout)
            print(chapter_url)
            chapter_response.raise_for_status()
            chapter_page = BeautifulSoup(chapter_response.text, 'lxml')

            bible_text = chapter_page.find('div', attrs={'id': "bibleText"})
            if bible_text is None:
                raise ValueError("no bibleText container found at " + chapter_url)
            verses = [
                verse.text.translate(cleanup)
                for verse in bible_text.find_all('span', attrs={'class': 'verse'})
            ]

            # Explicit UTF-8 keeps the output independent of the platform's
            # default encoding; the with-block guarantees the file is closed.
            chapter_path = book_dir + "/" + str(ch) + ".txt"
            with open(chapter_path, 'w', encoding='utf-8') as chapter_file:
                chapter_file.writelines(verse + "\n" for verse in verses)
# Scrape both languages; output goes to Scrapped/Luganda/ and Scrapped/English/.
write_book_to_file(LG_URL, lg_books,"Luganda/")
write_book_to_file(EN_URL, en_books,"English/")
def _chapter_sort_key(path):
    """Order chapter files numerically by file name, non-numeric names last."""
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        return (0, int(stem), stem)
    except ValueError:
        return (1, 0, stem)


def merge_books(lang, books):
    """Concatenate all chapter files of the given books into one file.

    Reads every ``Scrapped/<lang>/<book>/*.txt`` file and writes their bytes,
    back to back, to ``Scrapped/<lang>/merged.txt``. Books are merged in the
    order given; within a book, chapters are merged in numeric order
    (1, 2, ..., 10) so the result does not depend on the order in which the
    file system happens to list the files.

    Args:
        lang: Language directory name under "Scrapped/", without a trailing
            slash (for example "English").
        books: Book directory names, in the order they should be merged.
    """
    root = "Scrapped/" + lang + "/"

    # glob() returns matches in arbitrary order, so sort each book's chapters.
    chapter_files = []
    for bk in books:
        chapter_files.extend(
            sorted(glob(root + bk + "/*.txt"), key=_chapter_sort_key)
        )

    # Bytes are copied verbatim, so no decoding or re-encoding takes place.
    with open(root + "merged.txt", "wb") as merged:
        for path in chapter_files:
            with open(path, "rb") as chapter:
                merged.write(chapter.read())
# Build one merged text file per language from the scraped chapter files.
merge_books("Luganda", lg_books)
merge_books("English", en_books)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment