mseri · September 18, 2024 09:42
diff --git a/htm_to_md.sh b/htm_to_md.sh
 # Faster but not private way to achieve the above
 # is to define a bash function and source it at startup.
 #
 # Usage: html_to_md URL [OUTPUT_BASENAME]
 #   With exactly two arguments the markdown returned by r.jina.ai is saved
 #   to OUTPUT_BASENAME.md; otherwise it is written to standard output and
 #   any arguments after URL are handed to curl unchanged.
 function html_to_md () {
  if [[ $# -eq 0 ]]; then
    # Without a URL the request would silently hit the service root.
    echo "usage: html_to_md URL [OUTPUT_BASENAME]" >&2
    return 2
  fi
  if [[ $# -eq 2 ]]; then
    # Only report success when curl itself succeeded.
    curl "https://r.jina.ai/$1" > "$2".md &&
      echo "Content saved to \"$2\".md"
  else
    # "$1" plus "${@:2}" is the explicit form of the old "…/$@" expansion.
    curl "https://r.jina.ai/$1" "${@:2}"
  fi
 }
diff --git a/mder.py b/mder.py
 import argparse
 import ollama
 import re
 import requests

 from math import log, pow

 ### Start code from https://jina.ai/news/reader-lm-small-language-models-for-cleaning-and-converting-html-to-markdown/?nocache=1

 def get_html_content(url):
    """Fetch the HTML of *url* through the r.jina.ai reader service.

    The request asks for HTML output via the ``X-Return-Format`` header and
    uses a 10 second timeout.

    Returns the response body on success. Any ``requests`` failure
    (including a non-success HTTP status) is not propagated: a string of
    the form ``"error: <details>"`` is returned instead.
    """
    try:
        reply = requests.get(
            f'https://r.jina.ai/{url}',
            headers={'X-Return-Format': 'html'},
            timeout=10,
        )
        reply.raise_for_status()
    except requests.exceptions.RequestException as exc:
        return f"error: {exc}"
    return reply.text

 # Remove <script ...> ... </script> elements, tolerating spaces inside the
 # tag delimiters.
 SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>'  # .*? lazily matches any characters up to the closing tag
 # Usage: re.sub(SCRIPT_PATTERN, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

 # Remove <style ...> ... </style> elements, tolerating spaces inside the
 # tag delimiters.
 STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>'  # .*? lazily matches any characters up to the closing tag
 # Usage: re.sub(STYLE_PATTERN, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

 # Remove <meta ...> tags: everything from "<meta" up to the next ">".
 META_PATTERN = r'<[ ]*meta.*?>'  # .*? lazily matches any characters up to the first ">"
 # Usage: re.sub(META_PATTERN, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

 # Remove HTML comments, from "<!--" to "-->" (spaces allowed next to the
 # angle brackets).
 COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>'  # .*? lazily matches any characters up to the first "-->"
 # Usage: re.sub(COMMENT_PATTERN, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

 # Remove <link ...> tags: everything from "<link" up to the next ">".
 LINK_PATTERN = r'<[ ]*link.*?>'  # .*? lazily matches any characters up to the first ">"

 # Match <img> tags whose double-quoted src attribute is an inline
 # "data:image/...;base64,..." payload (used to replace base64 images).
 BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'

 # Match a whole <svg ...> ... </svg> element in three groups:
 # (1) opening tag, (2) inner content, (3) closing tag.
 SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)'


 def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
    """Swap the inner content of every ``<svg>`` element for *new_content*.

    The opening and closing tags captured by ``SVG_PATTERN`` are kept as
    they are; only the text between them is replaced.
    """
    def _keep_tags(found):
        opening_tag, closing_tag = found.group(1, 3)
        return f"{opening_tag}{new_content}{closing_tag}"

    svg_regex = re.compile(SVG_PATTERN, re.DOTALL)
    return svg_regex.sub(_keep_tags, html)


 def replace_base64_images(html: str, new_image_src: str = "#") -> str:
    """Replace every inline base64 ``<img>`` tag with a lightweight stub.

    Each tag matched by ``BASE64_IMG_PATTERN`` is replaced by
    ``<img src="{new_image_src}"/>``.

    Args:
        html: The markup to process.
        new_image_src: Value placed in the ``src`` attribute of the stub.

    Returns:
        The markup with all matching ``<img>`` tags replaced.
    """
    stub = f'<img src="{new_image_src}"/>'
    # A callable replacement is inserted literally. A plain string would be
    # treated as a template, so a backslash in new_image_src (for example a
    # Windows path) would be read as an escape or raise re.error.
    return re.sub(BASE64_IMG_PATTERN, lambda _found: stub, html)


 def has_base64_images(text: str) -> bool:
    """Report whether *text* contains an inline ``data:image/...;base64,`` payload."""
    inline_image = re.compile(r'data:image/[^;]+;base64,[^"]+', re.DOTALL)
    return inline_image.search(text) is not None


 def has_svg_components(text: str) -> bool:
    """Report whether *text* contains at least one ``<svg ...>...</svg>`` element."""
    first_svg = re.compile(SVG_PATTERN, re.DOTALL).search(text)
    return first_svg is not None


 def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
    """Strip markup that carries no readable content from *html*.

    Script elements, style elements, meta tags, comments and link tags are
    removed, in that order. When *clean_svg* is true the content of svg
    elements is replaced through ``replace_svg``; when *clean_base64* is
    true inline base64 images are replaced through
    ``replace_base64_images``.

    Returns the cleaned markup.
    """
    strip_flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
    unwanted_patterns = (
        SCRIPT_PATTERN,
        STYLE_PATTERN,
        META_PATTERN,
        COMMENT_PATTERN,
        LINK_PATTERN,
    )
    for unwanted in unwanted_patterns:
        html = re.sub(unwanted, '', html, flags=strip_flags)

    if clean_svg:
        html = replace_svg(html)
    if clean_base64:
        html = replace_base64_images(html)
    return html

 ### End code from there

 def main():
    """Convert the page at a URL to markdown with the ``reader-lm`` model.

    Parses the command line (``url`` and optional ``-o/--output``),
    downloads the page HTML with ``get_html_content``, strips it with
    ``clean_html`` and streams the output of ``ollama.generate`` to the
    output file, or to standard output when no file is given.

    Raises:
        SystemExit: If the download failed; the error text is the exit message.
    """
    parser = argparse.ArgumentParser(description="HTML to markdown using reader-lm")
    parser.add_argument("url", type=str, help="The URL to process. Will be downloaded from r.jina.ai to ensure dynamic content gets parsed.")
    parser.add_argument("-o", "--output", type=str, help="The output filename. If not provided, defaults to the standard output", default=None)
    args = parser.parse_args()

    html = get_html_content(args.url)
    # get_html_content reports failures by returning an "error: ..." string
    # rather than raising; stop here instead of feeding that text to the model.
    if html.startswith("error: "):
        raise SystemExit(html)
    print("Downloaded content")

    clean_svg = has_svg_components(html)
    clean_base64 = has_base64_images(html)
    html = clean_html(html, clean_svg, clean_base64)
    print("Cleaned HTML, ready to process")

    # Badly approximate context length (makes it much faster on my system)
    # Not using a power of 2 makes it hallucinate massively in my tests \/(0.0)\/
    # 2 ** bit_length is the smallest power of two strictly above the word
    # count. It is exact integer arithmetic, it is defined for an empty
    # document (where log(0) would raise), and it yields an int rather than
    # the float that math.pow returns.
    word_count = len(html.split())
    ctx = min(262144, 1 << word_count.bit_length())
    ctx = max(1024, ctx)
    print(f"Approximate context length: {ctx}")

    response = ollama.generate(
        model="reader-lm",
        prompt=html,
        stream=True,
        options={"temperature":0, "num_ctx": ctx, "num_predict": -1, "top_p": 1.0, "top_k":-1, "max_tokens": 4096}
    )

    if args.output:
        # The parsed option lives in args.output; the previous name
        # `output_file` was never defined and raised NameError.
        with open(args.output, "w", encoding="utf-8") as file:
            for chunk in response:
                file.write(chunk['response'])
    else:
        for chunk in response:
            print(chunk['response'], end="")

 if __name__ == "__main__":
    main()
	# Faster but not private way to achieve the above
	# is to define a bash function and source it at startup.
	#
	# Usage: html_to_md URL [OUTPUT_BASENAME]
	#   With exactly two arguments the markdown returned by r.jina.ai is saved
	#   to OUTPUT_BASENAME.md; otherwise it is written to standard output and
	#   any arguments after URL are handed to curl unchanged.
	function html_to_md () {
	  if [[ $# -eq 0 ]]; then
	    # Without a URL the request would silently hit the service root.
	    echo "usage: html_to_md URL [OUTPUT_BASENAME]" >&2
	    return 2
	  fi
	  if [[ $# -eq 2 ]]; then
	    # Only report success when curl itself succeeded.
	    curl "https://r.jina.ai/$1" > "$2".md &&
	      echo "Content saved to \"$2\".md"
	  else
	    # "$1" plus "${@:2}" is the explicit form of the old "…/$@" expansion.
	    curl "https://r.jina.ai/$1" "${@:2}"
	  fi
	}
	import argparse
	import ollama
	import re
	import requests

	from math import log, pow

	### Start code from https://jina.ai/news/reader-lm-small-language-models-for-cleaning-and-converting-html-to-markdown/?nocache=1

	def get_html_content(url):
	    """Fetch the HTML of *url* through the r.jina.ai reader service.

	    The request asks for HTML output via the ``X-Return-Format`` header and
	    uses a 10 second timeout.

	    Returns the response body on success. Any ``requests`` failure
	    (including a non-success HTTP status) is not propagated: a string of
	    the form ``"error: <details>"`` is returned instead.
	    """
	    api_url = f'https://r.jina.ai/{url}'
	    headers = {'X-Return-Format': 'html'}
	    try:
	        response = requests.get(api_url, headers=headers, timeout=10)
	        response.raise_for_status()
	        return response.text
	    except requests.exceptions.RequestException as e:
	        return f"error: {str(e)}"

	# Remove <script ...> ... </script> elements, tolerating spaces inside the
	# tag delimiters. The "*" quantifiers are required: without them the
	# pattern only matches tags written with exactly one space and at most
	# one character of content.
	SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>'  # .*? lazily matches any characters up to the closing tag
	# Usage: re.sub(SCRIPT_PATTERN, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

	# Remove <style ...> ... </style> elements, tolerating spaces inside the
	# tag delimiters.
	STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>'  # .*? lazily matches any characters up to the closing tag
	# Usage: re.sub(STYLE_PATTERN, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

	# Remove <meta ...> tags: everything from "<meta" up to the next ">".
	META_PATTERN = r'<[ ]*meta.*?>'  # .*? lazily matches any characters up to the first ">"
	# Usage: re.sub(META_PATTERN, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

	# Remove HTML comments, from "<!--" to "-->" (spaces allowed next to the
	# angle brackets).
	COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>'  # .*? lazily matches any characters up to the first "-->"
	# Usage: re.sub(COMMENT_PATTERN, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

	# Remove <link ...> tags: everything from "<link" up to the next ">".
	LINK_PATTERN = r'<[ ]*link.*?>'  # .*? lazily matches any characters up to the first ">"

	# Match <img> tags whose double-quoted src attribute is an inline
	# "data:image/...;base64,..." payload (used to replace base64 images).
	BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'

	# Match a whole <svg ...> ... </svg> element in three groups:
	# (1) opening tag, (2) inner content, (3) closing tag.
	SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)'


	def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
	    """Swap the inner content of every ``<svg>`` element for *new_content*.

	    The opening and closing tags captured by ``SVG_PATTERN`` are kept as
	    they are; only the text between them is replaced.
	    """
	    return re.sub(
	        SVG_PATTERN,
	        lambda match: f"{match.group(1)}{new_content}{match.group(3)}",
	        html,
	        flags=re.DOTALL,
	    )


	def replace_base64_images(html: str, new_image_src: str = "#") -> str:
	    """Replace every inline base64 ``<img>`` tag with a lightweight stub.

	    Each tag matched by ``BASE64_IMG_PATTERN`` is replaced by
	    ``<img src="{new_image_src}"/>``.

	    Args:
	        html: The markup to process.
	        new_image_src: Value placed in the ``src`` attribute of the stub.

	    Returns:
	        The markup with all matching ``<img>`` tags replaced.
	    """
	    stub = f'<img src="{new_image_src}"/>'
	    # A callable replacement is inserted literally. A plain string would be
	    # treated as a template, so a backslash in new_image_src (for example a
	    # Windows path) would be read as an escape or raise re.error.
	    return re.sub(BASE64_IMG_PATTERN, lambda _found: stub, html)


	def has_base64_images(text: str) -> bool:
	    """Report whether *text* contains an inline ``data:image/...;base64,`` payload."""
	    base64_content_pattern = r'data:image/[^;]+;base64,[^"]+'
	    return bool(re.search(base64_content_pattern, text, flags=re.DOTALL))


	def has_svg_components(text: str) -> bool:
	    """Report whether *text* contains at least one ``<svg ...>...</svg>`` element."""
	    return bool(re.search(SVG_PATTERN, text, flags=re.DOTALL))


	def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
	    """Strip markup that carries no readable content from *html*.

	    Script elements, style elements, meta tags, comments and link tags are
	    removed, in that order. When *clean_svg* is true the content of svg
	    elements is replaced through ``replace_svg``; when *clean_base64* is
	    true inline base64 images are replaced through
	    ``replace_base64_images``.

	    Returns the cleaned markup.
	    """
	    # The flags are combined with a plain "|"; the escaped "\|" form is
	    # not valid Python.
	    html = re.sub(SCRIPT_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
	    html = re.sub(STYLE_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
	    html = re.sub(META_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
	    html = re.sub(COMMENT_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
	    html = re.sub(LINK_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

	    if clean_svg:
	        html = replace_svg(html)

	    if clean_base64:
	        html = replace_base64_images(html)

	    return html

	### End code from there

	def main():
	    """Convert the page at a URL to markdown with the ``reader-lm`` model.

	    Parses the command line (``url`` and optional ``-o/--output``),
	    downloads the page HTML with ``get_html_content``, strips it with
	    ``clean_html`` and streams the output of ``ollama.generate`` to the
	    output file, or to standard output when no file is given.

	    Raises:
	        SystemExit: If the download failed; the error text is the exit message.
	    """
	    parser = argparse.ArgumentParser(description="HTML to markdown using reader-lm")
	    parser.add_argument("url", type=str, help="The URL to process. Will be downloaded from r.jina.ai to ensure dynamic content gets parsed.")
	    parser.add_argument("-o", "--output", type=str, help="The output filename. If not provided, defaults to the standard output", default=None)
	    args = parser.parse_args()

	    html = get_html_content(args.url)
	    # get_html_content reports failures by returning an "error: ..." string
	    # rather than raising; stop here instead of feeding that text to the model.
	    if html.startswith("error: "):
	        raise SystemExit(html)
	    print("Downloaded content")

	    clean_svg = has_svg_components(html)
	    clean_base64 = has_base64_images(html)
	    html = clean_html(html, clean_svg, clean_base64)
	    print("Cleaned HTML, ready to process")

	    # Badly approximate context length (makes it much faster on my system)
	    # Not using a power of 2 makes it hallucinate massively in my tests \/(0.0)\/
	    # 2 ** bit_length is the smallest power of two strictly above the word
	    # count. It is exact integer arithmetic, it is defined for an empty
	    # document (where log(0) would raise), and it yields an int rather than
	    # the float that math.pow returns.
	    word_count = len(html.split())
	    ctx = min(262144, 1 << word_count.bit_length())
	    ctx = max(1024, ctx)
	    print(f"Approximate context length: {ctx}")

	    response = ollama.generate(
	        model="reader-lm",
	        prompt=html,
	        stream=True,
	        options={"temperature":0, "num_ctx": ctx, "num_predict": -1, "top_p": 1.0, "top_k":-1, "max_tokens": 4096}
	    )

	    if args.output:
	        # The parsed option lives in args.output; the previous name
	        # `output_file` was never defined and raised NameError.
	        with open(args.output, "w", encoding="utf-8") as file:
	            for chunk in response:
	                file.write(chunk['response'])
	    else:
	        for chunk in response:
	            print(chunk['response'], end="")

	if __name__ == "__main__":
	    main()