Skip to content

Instantly share code, notes, and snippets.

@GuyMicciche
Last active August 17, 2023 21:03
Show Gist options
  • Save GuyMicciche/25328acd1551d6a93bc95c43c9cb0ce2 to your computer and use it in GitHub Desktop.
Save GuyMicciche/25328acd1551d6a93bc95c43c9cb0ce2 to your computer and use it in GitHub Desktop.
Extracts specific content from HTML based on a dictionary input, and then processes the data into a structured output.
"""
Description:
This code implements a specialized HTML parser using Python's built-in
HTMLParser module. Its primary function is to extract specific content
from HTML based on user-defined tags and attributes. The code is tailored
to capture content from <h2> tags, <p> tags with the class 'themeScrp',
and <div> tags whose 'data-date' attribute contains today's date
(alternatively, <div> tags with the class 'tabContent').
Once parsed, the extracted data is then processed to produce a concise
result containing the date, the full scripture text, a scripture reference,
and the daily text. This parser provides a flexible and efficient solution
for extracting relevant information from structured HTML content.
Author:
Guy Micciche
"""
from html.parser import HTMLParser
import re
from collections.abc import Mapping
from datetime import datetime
# Per the original author's note, the HTML always holds three daily texts:
# yesterday (index 0), today (index 1) and tomorrow (index 2).
# `day` is the index used to pick today's entry out of the parsed results.
day = 1

# Snapshot of "now" (naive, local clock of the machine running the code),
# rendered as YYYY-MM-DD so it can be compared with a 'data-date' attribute.
current_date = datetime.now()
formatted_date = f"{current_date:%Y-%m-%d}"
class MyHTMLParser(HTMLParser):
    """Collect the text content of selected HTML elements.

    ``tags_to_search`` maps a tag name to its match criteria:

    * a mapping ``{attribute: text}`` -- the element matches when every
      listed attribute's value *contains* the given text (substring test);
      an empty mapping therefore matches every element with that tag name;
    * any other non-``None`` value -- every element with that tag name
      matches.

    After feeding HTML, ``contents[tag]`` holds one stripped text string per
    matched element, in document order. Text of descendants (including
    nested elements with the same tag name) is part of the captured string.

    Note: nesting is tracked by counting start/end tags of the same name, so
    an element whose end tag is omitted in the markup is treated as still
    open until a matching end tag is seen.
    """

    def __init__(self, tags_to_search):
        super().__init__()
        self.tags_to_search = tags_to_search
        # Whether a capture is currently in progress for each tag name.
        self.recording = {tag: False for tag in tags_to_search}
        # Text fragments gathered for the capture in progress, per tag name.
        self.current_data = {}
        # Number of currently open elements of that name inside the capture
        # (1 == only the matched element itself is open).
        self.nested_count = {}
        # Finished captures, per tag name, in document order.
        self.contents = {tag: [] for tag in tags_to_search}

    def handle_starttag(self, tag, attrs):
        """Start a capture on a matching element, or track same-name nesting."""
        if self.recording.get(tag):
            # Already capturing this tag name: any same-name child only
            # deepens the nesting. It must neither restart the capture (that
            # would discard text gathered so far) nor go uncounted (its end
            # tag would otherwise close the capture too early).
            self.nested_count[tag] += 1
            return

        criteria = self.tags_to_search.get(tag)
        if criteria is None:
            return

        if isinstance(criteria, Mapping):
            attr_map = dict(attrs)
            # Valueless attributes (e.g. <div hidden>) are reported with a
            # value of None; treat them like an empty string so the
            # substring test cannot raise TypeError.
            matched = all(
                wanted in (attr_map.get(name) or "")
                for name, wanted in criteria.items()
            )
            if not matched:
                return

        self.start_recording(tag)

    def handle_endtag(self, tag):
        """Close one nesting level; store the capture when the outermost closes."""
        if not self.recording.get(tag):
            return
        self.nested_count[tag] -= 1
        if self.nested_count[tag] > 0:
            return
        content = ''.join(self.current_data[tag]).strip()
        self.contents[tag].append(content)
        self.recording[tag] = False

    def handle_data(self, data):
        """Append text to every capture that is currently in progress."""
        for tag, recording in self.recording.items():
            if recording:
                self.current_data[tag].append(data)

    def start_recording(self, tag):
        """Begin a fresh capture for ``tag`` with an empty text buffer."""
        self.recording[tag] = True
        self.current_data[tag] = []
        self.nested_count[tag] = 1
# Match criteria per tag name. Each value maps an attribute name to text that
# the attribute's value must contain; an empty mapping accepts every element
# of that tag.
tags_to_search = {}
tags_to_search["h2"] = {}
tags_to_search["p"] = {"class": "themeScrp"}
# Selects the <div> whose data-date is today's date. To select by position
# instead, use {"class": "tabContent"} here and index the parsed "div"
# results with the `day` variable when building the output.
tags_to_search["div"] = {"data-date": formatted_date}

# Run the parser over the HTML supplied by the host environment.
parser = MyHTMLParser(tags_to_search)
html_content = inputData['htmlContent']
parser.feed(html_content)
# YOU CAN OUTPUT JUST THE TAGS, OR FORMAT THEM
# UNFORMATTED TAGS
"""
result = {}
for tag in tags_to_search:
result[tag] = parser.contents[tag][1] if len(parser.contents[tag]) > 1 else 'Not Found'
"""
# FORMATTED TAGS
h2_text = parser.contents["h2"][day] if len(parser.contents["h2"]) > day else 'Not Found'
themeScrp_text = parser.contents["p"][day] if len(parser.contents["p"]) > day else 'Not Found'
tabContent_text = parser.contents["div"][0] if len(parser.contents["div"]) > 0 else 'Not Found'
# Extracting scripture from themeScrp_text based on pattern
scripture_match = re.search(r'—(.*? \d+:\d+)', themeScrp_text)
scripture_text = scripture_match.group(1) if scripture_match else 'Not Found'
result = {
'dateText': h2_text,
'scriptureFull': themeScrp_text,
'scriptureReference': scripture_text,
'dailyText': tabContent_text
}
return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment