diff options
-rw-r--r-- | README | 23 | ||||
-rwxr-xr-x | syntax-highlighting-server.py | 228 |
2 files changed, 39 insertions, 212 deletions
@@ -8,26 +8,5 @@ Usage: 1. Run `make install`. 2. Configure your system to run /usr/lib/cgit/syntax-highlighting-server.py at boot. This can be done by `systemctl enable syntax-highlighting` on systemd - machines. Note that syntax-highlighting-server is very insecure. Therefore, - do not configure it to listen on a public network. + machines. 3. Set your cgit source filter to syntax-highlighting-client.sh. - -Tuning: - -By default, syntax-highlighting-server does all work in a single thread. If you -have a high query load and multiple CPUs, consider setting --listen-mode to -forking or reuseport. - -reuseport mode uses one worker per CPU and is the highest performance mode, but -requires Linux and additional idle memory (roughly 1-3 MB per worker). - -forking mode uses less idle memory and is compatible with non-Linux systems, -but is significantly less efficient, since it forks for every request. - -Security: - -syntax-highlighting-server is not hardened against malicious clients which send -malformed data or are simply excessively slow. Once again, do not configure -syntax-highlighting-server to listen on a public network. It is also -recommended to set max-blob-size in cgitrc, as the entire file must be buffered -in memory during syntax highlighting. diff --git a/syntax-highlighting-server.py b/syntax-highlighting-server.py index aa6c570..d270c4c 100755 --- a/syntax-highlighting-server.py +++ b/syntax-highlighting-server.py @@ -11,221 +11,69 @@ # to listen on a public network. Once configured, set your cgit source filter # to syntax-highlighting-client.sh. -import argparse -import logging - -from http.server import BaseHTTPRequestHandler, HTTPServer -from urllib.parse import parse_qs, unquote - from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexers import guess_lexer, guess_lexer_for_filename from pygments.lexers.special import TextLexer from pygments.util import ClassNotFound -class HighlightingHTTPServer(HTTPServer): - # set SO_REUSEADDR - allow_reuse_address = True - # socketserver defaults to 5. especially in listen mode 'single', a - # sudden surge can easily overwhelm that. - request_queue_size = 64 - - def __init__(self, *args, formatter, style_defs, **kwargs): - super().__init__(*args, **kwargs) - self.formatter = formatter - self.style_defs = style_defs - - def log_message(self, format, *args): - logging.info("%s %s" % (self.address_string(), format % args)) - -class HighlightingHandler(BaseHTTPRequestHandler): - # read by BaseHTTPRequestHandler. need this so that curl doesn't delay - # waiting for 100-continue - protocol_version = 'HTTP/1.1' - - def do_POST(self): - qs = parse_qs(self.path.split('?', 1)[1]) - if len(qs['filename']) != 1: - raise ValueError('cannot have multiple filenames') - filename = unquote(qs['filename'][0]) - - data_len = int(self.headers['Content-Length']) - # in theory this could be optimized, but pygments will use more peak - # memory than this anyways - data = self.rfile.read(data_len).decode('utf-8', errors='replace') - - # we don't need Server, Date headers - self.log_request(200) - self.send_response_only(200, None) - self.send_header('Content-Type', 'text/html; charset=utf-8') - - # in theory we could use keep-alive, but cgit will only highlight one - # file at a time, and this way we don't need to buffer the output in - # order to calculate Content-Length - self.send_header('Connection', 'close') - self.end_headers() - +def do_highlight(filename, data, style): + try: + lexer = guess_lexer_for_filename(filename, data) + except ClassNotFound: try: - lexer = guess_lexer_for_filename(filename, data) - except ClassNotFound: - try: - lexer = guess_lexer(data) - # SqlLexer always gives 0.01 - if lexer.analyse_text(data) <= 0.01: - lexer = TextLexer() - except ClassNotFound: + lexer = guess_lexer(data) + # SqlLexer always gives 0.01 + if lexer.analyse_text(data) <= 0.01: lexer = TextLexer() - self.wfile.write(('<!-- Lexer: ' + lexer.name + - ' (' + lexer.__class__.__name__ + ') -->').encode('utf-8')) - self.wfile.write(self.server.style_defs) - highlight(data, lexer, self.server.formatter, outfile=self.wfile) + except ClassNotFound: + lexer = TextLexer() + formatter = HtmlFormatter(style=style, nobackground=True) + return ''.join([ + f'<!-- Pygments {pygments.__version__}: {lexer.name} ({lexer.__class__.__name__}) -->', + '<style>', formatter.get_style_defs('.highlight'), '</style>', + highlight(data, lexer, formatter) + ]) def parse_args(): + import argparse parser = argparse.ArgumentParser(description='syntax highlighting server', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--host', type=str, default='127.0.0.1', - help=''' - The hostname or IP address to listen on. Note that it is - insecure to run syntax-highlighting-server on a public network. - ''') + help='the host to listen on') parser.add_argument('--port', type=int, default=4872, help='the port to listen on') parser.add_argument('--style', type=str, default='pastie', help='pygments formatting style') - parser.add_argument('--listen-mode', type=str, default='single', - choices=['single', 'forking', 'reuseport'], - help=''' - single, forking, or reuseport. single uses one - process/thread for all requests. forking forks a new - process for each request. reuseport forks NUM_SERVERS - servers at start, then binds them using SO_REUSEPORT (Linux - kernel does a round robin). single is best for low query - loads. forking is faster for high loads. reuseport is - fastest but uses more idle memory and requires Linux - ''') - parser.add_argument('--num-servers', type=str, default='auto', - help=''' - The number of servers to run in reuseport mode. Ignored in - other modes. auto means one for each CPU. - ''') + parser.add_argument('--preload', type=bool, default=True, + help='preload lexers and formatters to reduce fork memory usage') return parser.parse_args() -def main(): - logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) +async def handle_highlight(request): + import asyncio + from aiohttp import web + loop = asyncio.get_running_loop() + text = await request.text() + result = await loop.run_in_executor( + request.app['pool'], do_highlight, + request.query['filename'], text, request.app['style']) + return web.Response(text=result) +def main(): args = parse_args() - if args.host not in ('localhost', '127.0.0.1', '::1'): - logging.warning(''' - Warning: it is insecure to run syntax-highlighting-server on a - public network. Clients can easily attack http.server or pygments. - ''') - - logging.info('starting syntax-highlighting on {} port {}' - .format(args.host, args.port)) - - try: - # preload stuff to avoid first-request latency (every request for - # forking) and post-fork memory usage + from aiohttp import web + from concurrent.futures import ProcessPoolExecutor - # preload lexers + if args.preload: guess_lexer('') - # preload formatter - formatter = HtmlFormatter(style=args.style, - nobackground=True, encoding='utf-8') - # pre-compute style defs - style_defs = ('<style>' + - formatter.get_style_defs('.highlight') + - '</style>').encode('utf-8') - # used internally by socket - ''.encode('idna') - - def start_server(MyHTTPServer): - with MyHTTPServer((args.host, args.port), HighlightingHandler, - formatter=formatter, style_defs=style_defs) as server: - logging.info('started syntax-highlighting-server') - server.serve_forever() - - if args.listen_mode == 'single': - start_server(HighlightingHTTPServer) - - elif args.listen_mode == 'forking': - # note: Threading isn't useful for performance because of the GIL - from socketserver import ForkingMixIn - class ForkingHTTPServer(ForkingMixIn, HighlightingHTTPServer): - pass - start_server(ForkingHTTPServer) - - elif args.listen_mode == 'reuseport': - import os - import selectors - import socket - import sys - - class ReusePortHTTPServer(HighlightingHTTPServer): - def server_bind(self): - self.socket.setsockopt( - socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) - super().server_bind() - - # check that we can bind to the port - # keep tmp_server around to avoid TOCTOU - tmp_server = ReusePortHTTPServer((args.host, args.port), None, - formatter=None, style_defs=None) - - if args.num_servers != 'auto': - num_servers = int(args.num_servers) - elif hasattr(os, 'sched_getaffinity'): - num_servers = len(os.sched_getaffinity(0)) - else: - num_servers = os.cpu_count() - logging.info('starting {} servers'.format(num_servers)) - - pipe = os.pipe() - - for i in range(num_servers): - pid = os.fork() - if pid == 0: - tmp_server.server_close() - try: - os.close(pipe[1]) - with ReusePortHTTPServer((args.host, args.port), - HighlightingHandler, formatter=formatter, - style_defs=style_defs) as server: - with selectors.DefaultSelector() as selector: - selector.register(server, selectors.EVENT_READ) - selector.register(pipe[0], selectors.EVENT_READ) - while True: - ready = selector.select(None) - for key, events in ready: - if key.fd == pipe[0]: - sys.exit(0) - if ready: - server._handle_request_noblock() - server.service_actions() - except KeyboardInterrupt: - # Ctrl-C on the command line sends SIGINT to the whole - # process group. we could wait for the pipe, but just exit - # now - sys.exit(0) - except Exception: - # try to keep exception message together - # the default virtually guarantees mangled output - import traceback - sys.stderr.write(traceback.format_exc()) - sys.exit(1) - os.close(pipe[0]) - tmp_server.server_close() - logging.info('started syntax-highlighting-server') - pid, status = os.wait() - logging.info('worker {} died, shutting down syntax-highlighting-server' - .format(pid)) - - else: - raise Exception('invalid listen mode: {}'.format(args.listen_mode)) - except KeyboardInterrupt: - logging.info('ctrl-c received, shutting down syntax-highlighting-server') + with ProcessPoolExecutor() as pool: + app = web.Application() + app['pool'] = pool + app['style'] = args.style + app.add_routes([web.post('/highlight', handle_highlight)]) + web.run_app(app, host=args.host, port=args.port) if __name__ == '__main__': main() |