koriaf · August 18, 2015 18:40
diff --git a/habproxy.py b/habproxy.py
 #!/usr/bin/env python2.7
 # encoding: utf-8
 # src: https://gist.github.com/anonymous/06e0bd519490c8f03404
 # habraproxy.py is a very simple HTTP proxy server, run locally, that shows the content of Habr pages.
 # With one exception: every six-letter word must be followed by the «™» sign.
 # I did it for fun.
 #
 # INSTALL:
 # pip install requests lxml
 #
 # USAGE:
 # ./habproxy.py [localhost] [1666]
 #
 # TODO:
 # tests
 # different domains
 # auth
 # limit of threads
 # more blacklisted tags
 # use PhantomJS to support script-populated websites
 # logging
 # uwsgi
 # https

 import BaseHTTPServer
 import logging
 import re
 import sys
 from SocketServer import ThreadingMixIn

 import requests
 from lxml import html


 class HabrahabrModificationHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Request handler that proxies Habrahabr and marks six-letter words.

    Every GET/POST is forwarded to ``TARGET_URL``.  In HTML responses each
    word made of exactly six letters gets a trademark sign appended, and
    references to the target domain are rewritten to point back at the proxy.
    Any other response body is passed through unchanged.
    """
    # old-style class :-(
    TARGET_URL = 'http://habrahabr.ru'  # TODO: port?  # TODO: another domains support?
    # Address substituted for the target domain when the incoming request
    # carries no Host header.
    DEFAULT_LOCAL_NETLOC = 'localhost:1666'
    # Elements whose content is never modified.
    TAGS_BLACKLIST = [
        'script',
        'style',
        'img',
    ]
    # Exactly six letters (no digits, no underscore) that are not adjacent to
    # another word character.  Lookarounds keep the delimiters out of the
    # match, so two six-letter words separated by a single character are both
    # found.  Plain u'' literals (not ur'') keep the syntax valid on Python 3.
    TARGET_REGEXP = re.compile(u'(?<!\\w)([^\\W\\d_]{6})(?!\\w)', re.UNICODE)
    # The matched word followed by the trademark sign (U+2122).
    TARGET_REPLACEMENT = u'\\1\u2122'
    logger = logging.getLogger('http')

    def process_tag_text(self, text):
        """Return *text* with a trademark sign after every six-letter word."""
        return self.TARGET_REGEXP.sub(self.TARGET_REPLACEMENT, text)

    def iter_tag(self, root):
        """Recursively mark six-letter words in all descendants of *root*.

        The tree is modified in place.  Blacklisted elements are skipped
        together with everything inside them, but the text that follows them
        (their ``tail``) belongs to the parent and is still processed.

        :param root: lxml element whose descendants are processed.
        :return: the same *root* element.
        """
        for tag in root.iterchildren("*"):
            if tag.tag not in self.TAGS_BLACKLIST:
                self.iter_tag(tag)
                if tag.text:
                    tag.text = self.process_tag_text(unicode(tag.text))
            # The tail is handled independently of the element's own text:
            # in <p><b>bold</b> tail</p> the <p> has no text of its own, yet
            # " tail" is regular page text.
            if tag.tail:
                tag.tail = self.process_tag_text(unicode(tag.tail))
        return root

    def _local_netloc(self):
        """Return the host[:port] under which the client reaches this proxy."""
        headers = getattr(self, 'headers', None)
        host = headers.get('Host') if headers is not None else None
        return host or self.DEFAULT_LOCAL_NETLOC

    def process_html_page(self, source_page, url=''):
        """Rewrite an HTML page so that it can be served through the proxy.

        :param source_page: UTF-8 encoded HTML document.
        :param url: request path, used only in the debug log message.
        :return: the serialised document as returned by ``html.tostring``.
        """
        self.logger.debug("Processing page {}".format(url))
        processed_page = source_page.decode("utf-8")
        # Use the address from the request instead of a fixed port, so links
        # keep working when the proxy listens somewhere else.
        processed_page = processed_page.replace('habrahabr.ru', self._local_netloc())
        processed_page = processed_page.replace('https://', 'http://')  # it's fine
        root = html.fromstring(processed_page)  # , html.HTMLParser(encoding='utf-8')
        root = self.iter_tag(root)
        return html.tostring(root, method="html")

    def do_IT(self, habr_resp):  # NOQA
        """Send the upstream response *habr_resp* back to the client.

        HTML bodies go through :meth:`process_html_page`; everything else is
        relayed byte-for-byte.
        """
        # The header may be missing altogether; treat that as "not HTML".
        content_type = habr_resp.headers.get('content-type') or ''
        if content_type.lower().startswith('text/html'):  # TODO: text/xhtml and other exotic
            result = self.process_html_page(
                habr_resp.text.encode("utf-8"),
                self.path
            )
        else:
            # Raw bytes: a text round trip would corrupt images and other
            # binary payloads.
            result = habr_resp.content
        self.send_response(habr_resp.status_code)
        if content_type:
            self.send_header('Content-Type', content_type)
        self.end_headers()
        self.wfile.write(result)
        return

    def do_GET(self):  # NOQA
        """Forward a GET request to the target site and relay the answer."""
        habr_resp = requests.get(self.TARGET_URL + self.path)
        return self.do_IT(habr_resp)

    def do_POST(self):  # NOQA
        """Forward a POST request to the target site and relay the answer."""
        # TODO: never tested it. test it.
        try:
            content_length = int(self.headers.get('Content-Length') or 0)
        except ValueError:
            self.send_error(400, "Invalid Content-Length")
            return
        # Read exactly the announced body size; an unbounded read() would
        # block until the client closes the connection.
        in_data = self.rfile.read(content_length)
        forward_headers = {}
        content_type = self.headers.get('Content-Type')
        if content_type:
            forward_headers['Content-Type'] = content_type
        habr_resp = requests.post(
            self.TARGET_URL + self.path,
            data=in_data,
            headers=forward_headers
        )
        return self.do_IT(habr_resp)


 class ThreadedHTTPServer(ThreadingMixIn, BaseHTTPServer.HTTPServer):
    """HTTP server combined with ``ThreadingMixIn``.

    No limit on the number of threads is configured here.
    """


 def main(local_hostname, local_port):
    """Start the proxy on the given address and serve until Ctrl+C.

    :param local_hostname: host name or IP address to bind to.
    :param local_port: TCP port to listen on.
    """
    bind_address = (local_hostname, local_port)
    http_server = ThreadedHTTPServer(bind_address, HabrahabrModificationHandler)
    banner = "server is ready and listening at http://{}:{}/".format(local_hostname, local_port)
    print(banner)
    # Each handle_request() call serves a single request; keep going until
    # the user interrupts the process.
    keep_serving = True
    while keep_serving:
        try:
            http_server.handle_request()
        except KeyboardInterrupt:
            keep_serving = False
    http_server.server_close()
    return


 if __name__ == '__main__':
    # Optional command line arguments: [hostname] [port].
    # Defaults are "localhost" and 1666.
    try:
        local_port = int(sys.argv[2] if len(sys.argv) > 2 else '1666')
    except (ValueError, TypeError):
        print("Please provide integer second argument")
        # sys.exit() rather than the bare exit(): the latter is injected by
        # the site module for interactive use and is missing under "python -S".
        sys.exit(1)
    main(
        local_hostname=sys.argv[1] if len(sys.argv) > 1 else 'localhost',
        local_port=local_port
    )
 # TODO: remove trailing whitespace on their template code
	#!/usr/bin/env python2.7
	# encoding: utf-8
	# src: https://gist.github.com/anonymous/06e0bd519490c8f03404
	# habraproxy.py is a very simple HTTP proxy server, run locally, that shows the content of Habr pages.
	# With one exception: every six-letter word must be followed by the «™» sign.
	# I did it for fun.
	#
	# INSTALL:
	# pip install requests lxml
	#
	# USAGE:
	# ./habproxy.py [localhost] [1666]
	#
	# TODO:
	# tests
	# different domains
	# auth
	# limit of threads
	# more blacklisted tags
	# use PhantomJS to support script-populated websites
	# logging
	# uwsgi
	# https

	import BaseHTTPServer
	import logging
	import re
	import sys
	from SocketServer import ThreadingMixIn

	import requests
	from lxml import html


	class HabrahabrModificationHandler(BaseHTTPServer.BaseHTTPRequestHandler):
	    """Request handler that proxies Habrahabr and marks six-letter words.

	    Every GET/POST is forwarded to ``TARGET_URL``.  In HTML responses each
	    word made of exactly six letters gets a trademark sign appended, and
	    references to the target domain are rewritten to point back at the proxy.
	    Any other response body is passed through unchanged.
	    """
	    # old-style class :-(
	    TARGET_URL = 'http://habrahabr.ru'  # TODO: port?  # TODO: another domains support?
	    # Address substituted for the target domain when the incoming request
	    # carries no Host header.
	    DEFAULT_LOCAL_NETLOC = 'localhost:1666'
	    # Elements whose content is never modified.
	    TAGS_BLACKLIST = [
	        'script',
	        'style',
	        'img',
	    ]
	    # Exactly six letters (no digits, no underscore) that are not adjacent to
	    # another word character.  Lookarounds keep the delimiters out of the
	    # match, so two six-letter words separated by a single character are both
	    # found.  Plain u'' literals (not ur'') keep the syntax valid on Python 3.
	    TARGET_REGEXP = re.compile(u'(?<!\\w)([^\\W\\d_]{6})(?!\\w)', re.UNICODE)
	    # The matched word followed by the trademark sign (U+2122).
	    TARGET_REPLACEMENT = u'\\1\u2122'
	    logger = logging.getLogger('http')

	    def process_tag_text(self, text):
	        """Return *text* with a trademark sign after every six-letter word."""
	        return self.TARGET_REGEXP.sub(self.TARGET_REPLACEMENT, text)

	    def iter_tag(self, root):
	        """Recursively mark six-letter words in all descendants of *root*.

	        The tree is modified in place.  Blacklisted elements are skipped
	        together with everything inside them, but the text that follows them
	        (their ``tail``) belongs to the parent and is still processed.

	        :param root: lxml element whose descendants are processed.
	        :return: the same *root* element.
	        """
	        for tag in root.iterchildren("*"):
	            if tag.tag not in self.TAGS_BLACKLIST:
	                self.iter_tag(tag)
	                if tag.text:
	                    tag.text = self.process_tag_text(unicode(tag.text))
	            # The tail is handled independently of the element's own text:
	            # in <p><b>bold</b> tail</p> the <p> has no text of its own, yet
	            # " tail" is regular page text.
	            if tag.tail:
	                tag.tail = self.process_tag_text(unicode(tag.tail))
	        return root

	    def _local_netloc(self):
	        """Return the host[:port] under which the client reaches this proxy."""
	        headers = getattr(self, 'headers', None)
	        host = headers.get('Host') if headers is not None else None
	        return host or self.DEFAULT_LOCAL_NETLOC

	    def process_html_page(self, source_page, url=''):
	        """Rewrite an HTML page so that it can be served through the proxy.

	        :param source_page: UTF-8 encoded HTML document.
	        :param url: request path, used only in the debug log message.
	        :return: the serialised document as returned by ``html.tostring``.
	        """
	        self.logger.debug("Processing page {}".format(url))
	        processed_page = source_page.decode("utf-8")
	        # Use the address from the request instead of a fixed port, so links
	        # keep working when the proxy listens somewhere else.
	        processed_page = processed_page.replace('habrahabr.ru', self._local_netloc())
	        processed_page = processed_page.replace('https://', 'http://')  # it's fine
	        root = html.fromstring(processed_page)  # , html.HTMLParser(encoding='utf-8')
	        root = self.iter_tag(root)
	        return html.tostring(root, method="html")

	    def do_IT(self, habr_resp):  # NOQA
	        """Send the upstream response *habr_resp* back to the client.

	        HTML bodies go through :meth:`process_html_page`; everything else is
	        relayed byte-for-byte.
	        """
	        # The header may be missing altogether; treat that as "not HTML".
	        content_type = habr_resp.headers.get('content-type') or ''
	        if content_type.lower().startswith('text/html'):  # TODO: text/xhtml and other exotic
	            result = self.process_html_page(
	                habr_resp.text.encode("utf-8"),
	                self.path
	            )
	        else:
	            # Raw bytes: a text round trip would corrupt images and other
	            # binary payloads.
	            result = habr_resp.content
	        self.send_response(habr_resp.status_code)
	        if content_type:
	            self.send_header('Content-Type', content_type)
	        self.end_headers()
	        self.wfile.write(result)
	        return

	    def do_GET(self):  # NOQA
	        """Forward a GET request to the target site and relay the answer."""
	        habr_resp = requests.get(self.TARGET_URL + self.path)
	        return self.do_IT(habr_resp)

	    def do_POST(self):  # NOQA
	        """Forward a POST request to the target site and relay the answer."""
	        # TODO: never tested it. test it.
	        try:
	            content_length = int(self.headers.get('Content-Length') or 0)
	        except ValueError:
	            self.send_error(400, "Invalid Content-Length")
	            return
	        # Read exactly the announced body size; an unbounded read() would
	        # block until the client closes the connection.
	        in_data = self.rfile.read(content_length)
	        forward_headers = {}
	        content_type = self.headers.get('Content-Type')
	        if content_type:
	            forward_headers['Content-Type'] = content_type
	        habr_resp = requests.post(
	            self.TARGET_URL + self.path,
	            data=in_data,
	            headers=forward_headers
	        )
	        return self.do_IT(habr_resp)


	class ThreadedHTTPServer(ThreadingMixIn, BaseHTTPServer.HTTPServer):
	    """HTTP server combined with ``ThreadingMixIn``.

	    No limit on the number of threads is configured here.
	    """


	def main(local_hostname, local_port):
	    """Start the proxy on the given address and serve until Ctrl+C.

	    :param local_hostname: host name or IP address to bind to.
	    :param local_port: TCP port to listen on.
	    """
	    http_server = ThreadedHTTPServer((local_hostname, local_port), HabrahabrModificationHandler)
	    print("server is ready and listening at http://{}:{}/".format(
	        local_hostname,
	        local_port
	    ))
	    # Each handle_request() call serves a single request; keep going until
	    # the user interrupts the process.
	    while True:
	        try:
	            http_server.handle_request()
	        except KeyboardInterrupt:
	            break
	    http_server.server_close()
	    return


	if __name__ == '__main__':
	    # Optional command line arguments: [hostname] [port].
	    # Defaults are "localhost" and 1666.
	    try:
	        local_port = int(sys.argv[2] if len(sys.argv) > 2 else '1666')
	    except (ValueError, TypeError):
	        print("Please provide integer second argument")
	        # sys.exit() rather than the bare exit(): the latter is injected by
	        # the site module for interactive use and is missing under "python -S".
	        sys.exit(1)
	    main(
	        local_hostname=sys.argv[1] if len(sys.argv) > 1 else 'localhost',
	        local_port=local_port
	    )
	# TODO: remove trailing whitespace on their template code