#!/usr/bin/env python3

# syntax-highlighting-server.py: create a simple HTTP server to highlight
# source for cgit. improves performance compared to invoking python on every
# request.
#
# Requirements: Python 3, pygments.
#
# Usage: Configure your system to run this at boot. Note that this program is
# not hardened, and it can be trivially DoSed. therefore, do not configure it
# to listen on a public network. Once configured, set your cgit source filter
# to syntax-highlighting-client.sh.

import argparse
import logging
import os
import socket
import selectors
import sys
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import parse_qs, unquote

from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import LEXERS, _load_lexers, guess_lexer, guess_lexer_for_filename
from pygments.lexers.special import TextLexer
from pygments.util import ClassNotFound


class HighlightingHTTPServer(HTTPServer):
    """HTTP server holding the formatter and CSS prelude shared by handlers.

    Handlers read ``server.formatter`` and ``server.style_defs`` instead of
    rebuilding them for every request.
    """

    # set SO_REUSEADDR
    allow_reuse_address = True

    # socketserver defaults to 5. especially in listen mode 'single', a
    # sudden surge can easily overwhelm that.
    request_queue_size = 64

    def __init__(self, *args, style='pastie', **kwargs):
        """Bind the server and build the pygments formatter.

        Positional and remaining keyword arguments are passed to HTTPServer;
        ``style`` is the pygments style name given to HtmlFormatter.
        """
        super().__init__(*args, **kwargs)
        self.formatter = HtmlFormatter(style=style, nobackground=True,
                                       encoding='utf-8')
        # the formatter emits class-based markup, so the chosen style only
        # takes effect if its CSS rules are sent along with the output.
        # '.highlight' is HtmlFormatter's default wrapper class.
        self.style_defs = (
            '<style>'
            + self.formatter.get_style_defs('.highlight')
            + '</style>'
        ).encode('utf-8')


class HighlightingHandler(BaseHTTPRequestHandler):
    """Handle ``POST /?filename=NAME``: body is source, reply is HTML."""

    # read by BaseHTTPRequestHandler. need this so that curl doesn't delay
    # waiting for 100-continue
    protocol_version = 'HTTP/1.1'

    def do_POST(self):
        """Highlight the request body and stream the HTML back.

        Replies 400 when the ``filename`` query parameter is absent or
        repeated, and 411/400 when Content-Length is absent or invalid,
        rather than letting the handler die with a traceback.
        """
        # partition() yields '' when there is no '?', so a bare path falls
        # through to the 400 below instead of raising IndexError
        query = self.path.partition('?')[2]
        filenames = parse_qs(query).get('filename', [])
        if len(filenames) != 1:
            self.send_error(400, 'expected exactly one filename parameter')
            return
        # NOTE(review): parse_qs has already percent-decoded the value, so
        # this decodes a second time. kept for compatibility with the
        # existing client; confirm whether the client double-encodes.
        filename = unquote(filenames[0])

        try:
            data_len = int(self.headers['Content-Length'])
        except (TypeError, ValueError):
            # TypeError: header missing (None); ValueError: not a number
            self.send_error(411, 'missing or invalid Content-Length')
            return
        if data_len < 0:
            # a negative size would make rfile.read() block until EOF
            self.send_error(400, 'negative Content-Length')
            return

        # in theory this could be optimized, but pygments will use more peak
        # memory than this anyways
        data = self.rfile.read(data_len).decode('utf-8', errors='replace')

        # we don't need Server, Date headers
        self.log_request(200)
        self.send_response_only(200, None)
        self.send_header('Content-Type', 'text/html; charset=utf-8')
        # in theory we could use keep-alive, but cgit will only highlight one
        # file at a time, and this way we don't need to buffer the output in
        # order to calculate Content-Length
        self.send_header('Connection', 'close')
        self.end_headers()

        # prefer the filename, then the content, then fall back to plain text
        try:
            lexer = guess_lexer_for_filename(filename, data)
        except ClassNotFound:
            try:
                lexer = guess_lexer(data)
            except ClassNotFound:
                lexer = TextLexer()

        self.wfile.write(self.server.style_defs)
        highlight(data, lexer, self.server.formatter, outfile=self.wfile)


def _num_servers_arg(value):
    """argparse type for --num-servers: the string 'auto' or an int >= 1."""
    if value == 'auto':
        return value
    try:
        count = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            "expected 'auto' or a positive integer, got {!r}".format(value)
        ) from None
    if count < 1:
        raise argparse.ArgumentTypeError(
            'number of servers must be at least 1, got {}'.format(count))
    return count


def _run_reuseport_worker(server_class, args, parent_alive_fd):
    """Serve requests in a forked worker until the parent goes away.

    ``parent_alive_fd`` is the read end of a pipe whose write end is held
    only by the parent; it becomes readable (EOF) once the parent exits, and
    that is the signal for this function to return.
    """
    with server_class((args.host, args.port), HighlightingHandler,
                      style=args.style) as server:
        with selectors.DefaultSelector() as selector:
            selector.register(server, selectors.EVENT_READ)
            selector.register(parent_alive_fd, selectors.EVENT_READ)
            while True:
                ready = selector.select(None)
                if any(key.fd == parent_alive_fd for key, _events in ready):
                    return
                if ready:
                    server._handle_request_noblock()
                server.service_actions()


def _serve_reuseport(args):
    """Fork the worker servers, all bound to one port with SO_REUSEPORT.

    The parent only waits: as soon as any worker exits it returns, which
    closes its end of the pipe and lets the remaining workers shut down.

    Raises:
        Exception: if the platform has no SO_REUSEPORT.
    """
    if not hasattr(socket, 'SO_REUSEPORT'):
        raise Exception('SO_REUSEPORT not available on this platform')

    class ReusePortHTTPServer(HighlightingHTTPServer):
        def server_bind(self):
            # must be set before bind() for the port to be shareable
            self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
            super().server_bind()

    if args.num_servers == 'auto':
        if hasattr(os, 'sched_getaffinity'):
            num_servers = len(os.sched_getaffinity(0))
        else:
            # sched_getaffinity is not available on every platform
            num_servers = os.cpu_count() or 1
        logging.info('auto-detected %s CPUs', num_servers)
    else:
        num_servers = int(args.num_servers)
    logging.info('starting %s servers', num_servers)

    read_fd, write_fd = os.pipe()
    # bound up front so the KeyboardInterrupt handler below can test it even
    # when the interrupt arrives before the first fork
    pid = None
    try:
        for _ in range(num_servers):
            pid = os.fork()
            if pid == 0:
                # worker: drop our copy of the write end so that only the
                # parent keeps the pipe open
                os.close(write_fd)
                _run_reuseport_worker(ReusePortHTTPServer, args, read_fd)
                sys.exit(0)
        os.close(read_fd)
        logging.info('started syntax-highlighting-server')
        os.wait()
        logging.info('worker died, shutting down syntax-highlighting-server')
    except KeyboardInterrupt:
        # workers receive the same ctrl-c; only the parent reports it
        if pid != 0:
            logging.info('ctrl-c received, shutting down syntax-highlighting-server')


def main():
    """Parse the command line and run the server in the chosen listen mode."""
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    parser = argparse.ArgumentParser(
        description='syntax highlighting server',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--host', type=str, default='localhost', help='''
        The hostname or IP address to listen on. Note that it is insecure to
        run syntax-highlighting-server on a public network.
        ''')
    parser.add_argument('--port', type=int, default=4872,
                        help='the port to listen on')
    parser.add_argument('--style', type=str, default='pastie',
                        help='pygments formatting style')
    parser.add_argument('--listen-mode', type=str, default='single',
                        choices=['single', 'forking', 'reuseport'], help='''
        single, forking, or reuseport. single uses one process/thread for all
        requests. forking forks a new process for each request. reuseport
        forks NUM_SERVERS servers at start, then binds them using
        SO_REUSEPORT (Linux kernel does a round robin). single is best for
        low query loads. forking is faster for high loads. reuseport is
        fastest but uses more idle memory and requires Linux
        ''')
    parser.add_argument('--num-servers', type=_num_servers_arg,
                        default='auto', help='''
        The number of servers to run in reuseport mode. Ignored in other
        modes. auto means one for each CPU.
        ''')
    args = parser.parse_args()

    if args.host not in ('localhost', '127.0.0.1', '::1'):
        logging.warning(
            'Warning: it is insecure to run syntax-highlighting-server on a '
            'public network. Clients can easily attack http.server or '
            'pygments.')

    logging.info('starting syntax-highlighting on %s port %s',
                 args.host, args.port)

    # reduces first request latency, costs 5-10 MB RAM
    # for forking, this is necessary for any performance at all (otherwise it
    # defeats the whole purpose of using a separate process)
    for lexer in LEXERS.values():
        _load_lexers(lexer[0])
    __import__('pygments.styles.' + args.style)

    def start_server(MyHTTPServer):
        with MyHTTPServer((args.host, args.port), HighlightingHandler,
                          style=args.style) as server:
            logging.info('started syntax-highlighting-server')
            server.serve_forever()

    if args.listen_mode == 'single':
        start_server(HighlightingHTTPServer)
    elif args.listen_mode == 'forking':
        # note: Threading isn't useful for performance because of the GIL
        from socketserver import ForkingMixIn

        class ForkingHTTPServer(ForkingMixIn, HighlightingHTTPServer):
            pass

        start_server(ForkingHTTPServer)
    elif args.listen_mode == 'reuseport':
        _serve_reuseport(args)
    else:
        raise Exception('invalid listen mode: {}'.format(args.listen_mode))


if __name__ == '__main__':
    main()