Skip to content

Instantly share code, notes, and snippets.

@koriaf
Created August 18, 2015 18:40
Show Gist options
  • Save koriaf/6c5a3efa6d4700033087 to your computer and use it in GitHub Desktop.
Save koriaf/6c5a3efa6d4700033087 to your computer and use it in GitHub Desktop.
Pythonic demonstration of BaseHTTPServer and lxml usage.
#!/usr/bin/env python2.7
# encoding: utf-8
# src: https://gist.github.com/anonymous/06e0bd519490c8f03404
# habraproxy.py is a minimal HTTP proxy server, run locally, that shows the content of Habr pages.
# With one exception: every six-letter word must be followed by a "™" sign.
# I did it for fun.
#
# INSTALL:
# pip install requests lxml
#
# USAGE:
# ./habraproxy.py [localhost] [1666]
#
# TODO:
# tests
# different domains
# auth
# limit of threads
# more blacklisted tags
# use PhantomJS to support script-populated websites
# logging
# uwsgi
# https
import BaseHTTPServer
import logging
import re
import sys
from SocketServer import ThreadingMixIn
import requests
from lxml import html
class HabrahabrModificationHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Request handler that proxies GET/POST requests to ``TARGET_URL``.

    HTML responses are rewritten before being returned: links to the target
    domain are pointed back at this proxy, and a trademark sign is appended
    to every six-letter word found in element text. Any other response body
    is passed through unchanged.
    """
    # old-style class :-(  (so no ``super()`` in this class)

    TARGET_URL = 'http://habrahabr.ru'  # TODO: port?  # TODO: another domains support?
    # Domain substring that is replaced with the proxy's own address in HTML pages.
    TARGET_DOMAIN = 'habrahabr.ru'
    # Address used for link rewriting when the request has no usable Host header.
    DEFAULT_LOCAL_NETLOC = 'localhost:1666'
    # Elements whose own text must never be modified.
    TAGS_BLACKLIST = [
        'script',
        'style',
        'img',
    ]
    # A run of exactly six letters that is not glued to any other word character.
    # Lookarounds (instead of consuming the neighbouring characters) are required
    # so that two six-letter words separated by a single character both match.
    TARGET_REGEXP = re.compile(u'(?<!\\w)([^\\W\\d_]{6})(?!\\w)', re.UNICODE)
    # The matched word followed by U+2122 TRADE MARK SIGN.
    TARGET_REPLACEMENT = u'\\1\u2122'
    # Conservative whitelist for a ``host[:port]`` value taken from the Host header.
    NETLOC_REGEXP = re.compile(r'[A-Za-z0-9.\-\[\]:]+\Z')
    logger = logging.getLogger('http')

    def process_tag_text(self, text):
        """Return ``text`` with a trademark sign after every six-letter word.

        A "word" is a run of letters (no digits or underscores) that is not
        adjacent to any other word character.
        """
        return self.TARGET_REGEXP.sub(self.TARGET_REPLACEMENT, text)

    def iter_tag(self, root):
        """Rewrite the text of every element below ``root`` in place.

        For each descendant element, its leading text is processed unless the
        element is listed in ``TAGS_BLACKLIST``, and the tail text of each of
        its children is processed as well. ``root`` itself is not visited.

        Returns ``root`` so that calls can be chained.
        """
        for tag in root.iterdescendants("*"):
            if tag.text and tag.tag not in self.TAGS_BLACKLIST:
                tag.text = self.process_tag_text(unicode(tag.text))
            # Tails belong to the parent's content, so they are processed even
            # when the child itself (e.g. a <script>) is blacklisted.
            for child in tag:
                if child.tail is not None:
                    child.tail = self.process_tag_text(unicode(child.tail))
        return root

    def _local_netloc(self):
        """Return the ``host[:port]`` that rewritten links should point to.

        Uses the Host header of the current request when it is present and
        contains only hostname/port characters; otherwise falls back to
        ``DEFAULT_LOCAL_NETLOC``.
        """
        headers = getattr(self, 'headers', None)
        host = headers.get('Host') if headers is not None else None
        if host and self.NETLOC_REGEXP.match(host):
            return host
        return self.DEFAULT_LOCAL_NETLOC

    def process_html_page(self, source_page, url=''):
        """Return the rewritten HTML for ``source_page``.

        :param source_page: UTF-8 encoded HTML document (byte string).
        :param url: request path, used only for the debug log message.
        :returns: the serialized document as produced by ``html.tostring``.
        """
        self.logger.debug("Processing page %s", url)
        processed_page = source_page.decode("utf-8")
        processed_page = processed_page.replace(self.TARGET_DOMAIN, self._local_netloc())
        processed_page = processed_page.replace('https://', 'http://')  # it's fine
        root = html.fromstring(processed_page)  # , html.HTMLParser(encoding='utf-8')
        root = self.iter_tag(root)
        return html.tostring(root, method="html")

    def do_IT(self, habr_resp):  # NOQA
        """Send ``habr_resp`` (a ``requests`` response) back to the client.

        HTML bodies go through ``process_html_page``; everything else is
        forwarded as raw bytes so binary payloads are not re-encoded.
        """
        # The header may be missing altogether, hence the ``or ''``.
        content_type = habr_resp.headers.get('content-type') or ''
        if content_type.lower().startswith('text/html'):  # TODO: text/xhtml and other exotic
            result = self.process_html_page(
                habr_resp.text.encode("utf-8"),
                self.path
            )
            # NOTE(review): html.tostring is documented to emit ASCII with
            # character references by default, which is valid UTF-8 - confirm
            # if an explicit encoding is ever passed to it.
            content_type = 'text/html; charset=utf-8'
        else:
            result = habr_resp.content
        self.send_response(habr_resp.status_code)
        if content_type:
            self.send_header('Content-Type', content_type)
        self.send_header('Content-Length', str(len(result)))
        self.end_headers()
        self.wfile.write(result)
        return

    def _read_request_body(self):
        """Return the request body, reading exactly Content-Length bytes.

        Returns an empty byte string when the header is missing, not an
        integer, or not positive. An unbounded ``rfile.read()`` is avoided
        because it blocks until the client closes the connection.
        """
        try:
            length = int(self.headers.get('Content-Length') or 0)
        except ValueError:
            length = 0
        if length <= 0:
            return b''
        return self.rfile.read(length)

    def do_GET(self):  # NOQA
        """Proxy a GET request for ``self.path`` to the target site."""
        habr_resp = requests.get(self.TARGET_URL + self.path)
        return self.do_IT(habr_resp)

    def do_POST(self):  # NOQA
        """Proxy a POST request, forwarding the body and its Content-Type."""
        # TODO: never tested it. test it.
        in_data = self._read_request_body()
        forwarded_headers = {}
        content_type = self.headers.get('Content-Type')
        if content_type:
            forwarded_headers['Content-Type'] = content_type
        habr_resp = requests.post(
            self.TARGET_URL + self.path,
            data=in_data,
            headers=forwarded_headers
        )
        return self.do_IT(habr_resp)
class ThreadedHTTPServer(ThreadingMixIn, BaseHTTPServer.HTTPServer):
    """``HTTPServer`` combined with ``ThreadingMixIn``.

    No limit on the number of threads is configured here (don't think about
    too many threads).
    """
def main(local_hostname, local_port):
    """Run the proxy on ``local_hostname:local_port`` until interrupted.

    :param local_hostname: host name or address to bind the server to.
    :param local_port: TCP port (integer) to listen on.

    Ctrl-C (``KeyboardInterrupt``) stops the server quietly. The listening
    socket is closed on every exit path, including unexpected errors.
    """
    http_server = ThreadedHTTPServer((local_hostname, local_port), HabrahabrModificationHandler)
    print("server is ready and listening at http://{}:{}/".format(
        local_hostname,
        local_port
    ))
    try:
        http_server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Runs for any exception, not only Ctrl-C, so the socket never leaks.
        http_server.server_close()
    return
if __name__ == '__main__':
    # Optional positional arguments: [hostname] [port].
    # Defaults: hostname 'localhost', port 1666.
    try:
        local_port = int(sys.argv[2]) if len(sys.argv) > 2 else 1666
    except ValueError:
        # sys.exit() with a string prints it to stderr and exits with status 1.
        # The bare exit() helper comes from the ``site`` module and is not
        # guaranteed to exist (e.g. under ``python -S``).
        sys.exit("Please provide integer second argument")
    main(
        local_hostname=sys.argv[1] if len(sys.argv) > 1 else 'localhost',
        local_port=local_port
    )
# TODO: remove trailing whitespace on their template code
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment