Skip to content

Instantly share code, notes, and snippets.

@mseri
Last active September 18, 2024 09:42
Show Gist options
  • Save mseri/69851f74ab15d73e39ac487ad11e7140 to your computer and use it in GitHub Desktop.
Save mseri/69851f74ab15d73e39ac487ad11e7140 to your computer and use it in GitHub Desktop.
Use reader-lm locally with ollama
# A faster, but not private, way to achieve the above
# is to define a bash function and source it at startup.
function html_to_md () {
    # Usage: html_to_md URL [OUTPUT_BASENAME]
    #
    # Fetches URL through the r.jina.ai endpoint. With one argument the
    # response is written to stdout; with two arguments it is saved to
    # OUTPUT_BASENAME.md. Returns 2 on a usage error, otherwise curl's status.
    if [[ $# -lt 1 || $# -gt 2 ]]; then
        echo "usage: html_to_md URL [OUTPUT_BASENAME]" >&2
        return 2
    fi

    if [[ $# -eq 2 ]]; then
        # --fail makes curl exit non-zero on HTTP errors, so the success
        # message below is only printed when the download actually worked.
        curl --fail "https://r.jina.ai/$1" > "$2".md || return
        echo "Content saved to \"$2\".md"
    else
        # Use "$1" rather than "$@": the latter would expand to several
        # separate curl arguments if more than one were ever passed.
        curl --fail "https://r.jina.ai/$1"
    fi
}
import argparse
import ollama
import re
import requests
from math import log, pow
### Start code from https://jina.ai/news/reader-lm-small-language-models-for-cleaning-and-converting-html-to-markdown/?nocache=1
def get_html_content(url):
    """Fetch *url* through r.jina.ai and return the response body as text.

    The request asks for HTML via the ``X-Return-Format`` header and uses a
    10 second timeout. Failures are not raised: any ``requests`` exception
    (including a non-2xx status) is turned into a string of the form
    ``"error: <message>"`` and returned instead.
    """
    try:
        reply = requests.get(
            f'https://r.jina.ai/{url}',
            headers={'X-Return-Format': 'html'},
            timeout=10,
        )
        reply.raise_for_status()
        return reply.text
    except requests.exceptions.RequestException as exc:
        return f"error: {exc}"
# Regular expressions used by the cleaning helpers below. The ".*?" parts are
# lazy and only span newlines when the pattern is applied with re.DOTALL.

# A whole <script ...> ... </script> element; spaces are tolerated after "<"
# and around "/script".
SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>'
# A whole <style ...> ... </style> element, with the same tolerance for spaces.
STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>'
# A single <meta ...> tag, up to the first ">" (there is no closing tag to
# match). NOTE(review): any tag whose name merely starts with "meta" matches too.
META_PATTERN = r'<[ ]*meta.*?>'
# An HTML comment, from "<!--" to "-->".
COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>'
# A single <link ...> tag, up to the first ">". NOTE(review): any tag whose
# name merely starts with "link" matches too.
LINK_PATTERN = r'<[ ]*link.*?>'
# An <img ...> tag whose double-quoted src attribute is a base64 "data:image/"
# URI.
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
# An <svg ...> ... </svg> element captured as three groups: opening tag,
# inner content, closing tag.
SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)'
def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
    """Return *html* with the inside of every ``<svg>`` element replaced.

    The opening and closing tags are kept as they were; only what sits
    between them is swapped for *new_content*, which is inserted literally.
    """
    def _swap_body(found):
        opening_tag, closing_tag = found.group(1), found.group(3)
        return f"{opening_tag}{new_content}{closing_tag}"

    # DOTALL lets the lazy inner group run across line breaks.
    svg_regex = re.compile(SVG_PATTERN, flags=re.DOTALL)
    return svg_regex.sub(_swap_body, html)
def replace_base64_images(html: str, new_image_src: str = "#") -> str:
    """Return *html* with every base64-embedded ``<img>`` tag replaced.

    Each tag matched by ``BASE64_IMG_PATTERN`` is replaced by
    ``<img src="{new_image_src}"/>``; all other attributes of the original
    tag are dropped.

    Args:
        html: The markup to process.
        new_image_src: Value placed in the ``src`` attribute of the
            replacement tag. It is inserted verbatim.
    """
    replacement = f'<img src="{new_image_src}"/>'
    # Pass a callable instead of a template string: re.sub would otherwise
    # interpret backslashes in new_image_src (e.g. a Windows path) as escape
    # sequences or group references and either fail or corrupt the output.
    return re.sub(BASE64_IMG_PATTERN, lambda _match: replacement, html)
def has_base64_images(text: str) -> bool:
    """Tell whether *text* contains a base64 ``data:image/...`` URI.

    A URI counts only if at least one non-quote character follows the
    ``;base64,`` marker.
    """
    data_uri_regex = re.compile(r'data:image/[^;]+;base64,[^"]+', flags=re.DOTALL)
    return data_uri_regex.search(text) is not None
def has_svg_components(text: str) -> bool:
    """Tell whether *text* contains at least one ``<svg>...</svg>`` element."""
    # DOTALL so an element spread over several lines is still detected.
    svg_regex = re.compile(SVG_PATTERN, flags=re.DOTALL)
    return svg_regex.search(text) is not None
def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
    """Strip non-content markup from *html* and return the result.

    Script elements, style elements, meta tags, comments and link tags are
    always removed, in that order. When *clean_svg* is true the content of
    SVG elements is replaced by a placeholder; when *clean_base64* is true
    base64-embedded images are replaced by a stub ``<img>`` tag.
    """
    removal_flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
    always_removed = (
        SCRIPT_PATTERN,
        STYLE_PATTERN,
        META_PATTERN,
        COMMENT_PATTERN,
        LINK_PATTERN,
    )
    for pattern in always_removed:
        html = re.sub(pattern, '', html, flags=removal_flags)

    if clean_svg:
        html = replace_svg(html)
    if clean_base64:
        html = replace_base64_images(html)
    return html
### End code from there
def main():
    """Command-line entry point: convert a web page to Markdown with reader-lm.

    Downloads the page via ``get_html_content``, strips markup with
    ``clean_html``, then streams the output of the local ``reader-lm`` ollama
    model either to the file given with ``-o/--output`` or to standard output.
    Progress messages go to standard error so they never end up mixed into
    the generated Markdown. Exits with status 1 if the download failed.
    """
    import sys  # local import: only this entry point needs it

    parser = argparse.ArgumentParser(description="HTML to markdown using reader-lm")
    parser.add_argument("url", type=str, help="The URL to process. Will be downloaded from r.jina.ai to ensure dynamic content gets parsed.")
    parser.add_argument("-o", "--output", type=str, help="The output filename. If not provided, defaults to the standard output", default=None)
    args = parser.parse_args()

    html = get_html_content(args.url)
    # get_html_content reports failures as an "error: ..." string instead of
    # raising; stop here rather than asking the model to convert that message.
    if html.startswith("error: "):
        parser.exit(status=1, message=f"{html}\n")
    print("Downloaded content", file=sys.stderr)

    clean_svg = has_svg_components(html)
    clean_base64 = has_base64_images(html)
    html = clean_html(html, clean_svg, clean_base64)
    print("Cleaned HTML, ready to process", file=sys.stderr)

    # Badly approximate context length (makes it much faster on my system)
    # Not using a power of 2 makes it hallucinate massively in my tests \/(0.0)\/
    #
    # The context is the smallest power of two strictly greater than the
    # number of whitespace-separated words, clamped to [1024, 262144].
    # int.bit_length keeps this in exact integer arithmetic and also copes
    # with empty content, where a logarithm would raise.
    word_count = len(html.split())
    ctx = min(262144, 1 << word_count.bit_length())
    ctx = max(1024, ctx)
    print(f"Approximate context length: {ctx}", file=sys.stderr)

    response = ollama.generate(
        model="reader-lm",
        prompt=html,
        stream=True,
        options={"temperature": 0, "num_ctx": ctx, "num_predict": -1, "top_p": 1.0, "top_k": -1, "max_tokens": 4096},
    )

    if args.output:
        # Write the streamed chunks as they arrive.
        with open(args.output, "w", encoding="utf-8") as file:
            for chunk in response:
                file.write(chunk['response'])
    else:
        for chunk in response:
            print(chunk['response'], end="")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment