#!/usr/bin/env python3

# syntax-highlighting-server.py: create a simple HTTP server to highlight
# source for cgit. Improves performance compared to invoking python on every
# request.
#
# Requirements: Python 3, pygments.
#
# Usage: Configure your system to run this at boot. Note that this program is
# not hardened, and it can be trivially DoSed. Therefore, do not configure it
# to listen on a public network. Once configured, set your cgit source filter
# to syntax-highlighting-client.sh.
#
# Provenance: reconstructed from commit
# 361d396d01f6cdeb49970c0eeb705da442879f05 ("Initial commit",
# Alex Xu (Hello71), Fri, 6 Mar 2020 18:59:57 -0500).

import argparse
import logging
import os
import selectors
import socket
import sys

from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import parse_qs, unquote

from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import LEXERS, _load_lexers, guess_lexer, guess_lexer_for_filename
from pygments.lexers.special import TextLexer
from pygments.util import ClassNotFound

# style used when --style is not given (and before main() has run)
DEFAULT_STYLE = 'pastie'


class BaseHTTPServer(HTTPServer):
    """HTTPServer with SO_REUSEADDR and a larger listen backlog."""

    # set SO_REUSEADDR
    allow_reuse_address = True
    # socketserver defaults to 5. especially in listen mode 'single', a
    # sudden surge can easily overwhelm that.
    request_queue_size = 128


class ReusePortHTTPServer(BaseHTTPServer):
    """BaseHTTPServer that sets SO_REUSEPORT so several processes can bind.

    Only usable where the socket module defines SO_REUSEPORT; main() checks
    that before instantiating this class.
    """

    def server_bind(self):
        self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
        super().server_bind()


def _build_formatter(style):
    """Return the HtmlFormatter used for every response, for pygments *style*.

    Raises pygments.util.ClassNotFound if the style name is unknown.
    Constructing the formatter also imports the style module, so no separate
    preload of the style is needed.
    """
    return HtmlFormatter(style=style, nobackground=True, encoding='utf-8')


def _build_style_defs(fmt):
    """Return the CSS rules of *fmt* wrapped in a <style> element, as bytes."""
    css = fmt.get_style_defs('.highlight')
    return ('<style>' + css + '</style>').encode('utf-8')


# Module-level defaults; main() rebinds both when --style is given. They are
# looked up by name at request time, so HighlightingHandler sees the rebound
# values (and forked workers inherit them).
formatter = _build_formatter(DEFAULT_STYLE)
style_defs = _build_style_defs(formatter)


class HighlightingHandler(BaseHTTPRequestHandler):
    """Handle `POST /?filename=NAME` by returning the body highlighted as HTML."""

    # read by BaseHTTPRequestHandler. need this so that curl doesn't delay
    # waiting for 100-continue
    protocol_version = 'HTTP/1.1'

    def do_POST(self):
        """Highlight the request body and stream the HTML back.

        The query string must carry exactly one `filename` parameter, which
        is only used to pick a lexer. Malformed requests get a 4xx response
        instead of an unhandled exception.
        """
        # partition() yields '' when there is no '?', where split()[1] would
        # raise IndexError
        query = self.path.partition('?')[2]
        filenames = parse_qs(query).get('filename', [])
        if len(filenames) != 1:
            self.send_error(400, explain='exactly one filename parameter is required')
            return
        # NOTE(review): parse_qs() has already percent-decoded the value, so
        # this second unquote() mangles names containing a literal '%'. Kept
        # because the client's encoding is not visible here -- confirm
        # against syntax-highlighting-client.sh before removing.
        filename = unquote(filenames[0])

        try:
            # headers[...] is None when the header is absent -> TypeError
            data_len = int(self.headers['Content-Length'])
        except (TypeError, ValueError):
            self.send_error(411, explain='a valid Content-Length header is required')
            return
        if data_len < 0:
            # rfile.read() with a negative size would block until EOF
            self.send_error(400, explain='Content-Length must not be negative')
            return

        # in theory this could be optimized, but pygments will use more peak
        # memory than this anyways
        data = self.rfile.read(data_len).decode('utf-8', errors='replace')

        # we don't need Server, Date headers
        self.log_request(200)
        self.send_response_only(200, None)
        self.send_header('Content-Type', 'text/html; charset=utf-8')

        # in theory we could use keep-alive, but cgit will only highlight one
        # file at a time, and this way we don't need to buffer the output in
        # order to calculate Content-Length
        self.send_header('Connection', 'close')
        self.end_headers()

        try:
            lexer = guess_lexer_for_filename(filename, data)
        except ClassNotFound:
            try:
                lexer = guess_lexer(data)
            except ClassNotFound:
                lexer = TextLexer()
        self.wfile.write(style_defs)
        highlight(data, lexer, formatter, outfile=self.wfile)


def _serve_until_parent_exits(address, parent_fd):
    """Run one reuseport worker bound to *address*.

    Serves requests until *parent_fd* (the read end of a pipe whose write end
    is held only by the parent) becomes readable, which happens when the
    parent closes its end or exits.
    """
    with ReusePortHTTPServer(address, HighlightingHandler) as server, \
            selectors.DefaultSelector() as selector:
        selector.register(server, selectors.EVENT_READ)
        selector.register(parent_fd, selectors.EVENT_READ)
        while True:
            ready = selector.select(None)
            if any(key.fd == parent_fd for key, _events in ready):
                return
            if ready:
                server._handle_request_noblock()
            server.service_actions()


def _detect_cpu_count():
    """Return the number of CPUs this process may run on (at least 1)."""
    if hasattr(os, 'sched_getaffinity'):
        return len(os.sched_getaffinity(0))
    # sched_getaffinity is not available on every platform with SO_REUSEPORT
    return os.cpu_count() or 1


def main():
    """Parse the command line and run the server in the selected listen mode."""
    global formatter, style_defs

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    parser = argparse.ArgumentParser(description='syntax highlighting server',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--host', type=str, default='localhost',
            help='''
            The hostname or IP address to listen on. Note that it is
            insecure to run syntax-highlighting-server on a public network.
            ''')
    parser.add_argument('--port', type=int, default=4872,
            help='the port to listen on')
    parser.add_argument('--style', type=str, default=DEFAULT_STYLE,
            help='pygments formatting style')
    parser.add_argument('--listen-mode', type=str, default='single',
            choices=['single', 'forking', 'reuseport'],
            help='''
            single, forking, or reuseport. single uses one
            process/thread for all requests. forking forks a new
            process for each request. reuseport forks NUM_SERVERS
            servers at start, then binds them using SO_REUSEPORT (Linux
            kernel does a round robin). single is best for low query
            loads. forking is faster for high loads. reuseport is
            fastest but uses more idle memory and requires Linux
            ''')
    parser.add_argument('--num-servers', type=str, default='auto',
            help='''
            The number of servers to run in reuseport mode. Ignored in
            other modes. auto means one for each CPU.
            ''')
    args = parser.parse_args()

    # make --style actually take effect: the handler reads these globals
    try:
        formatter = _build_formatter(args.style)
    except ClassNotFound:
        parser.error('unknown pygments style: {}'.format(args.style))
    style_defs = _build_style_defs(formatter)

    if args.host not in ('localhost', '127.0.0.1', '::1'):
        logging.warning(
            'Warning: it is insecure to run syntax-highlighting-server on a '
            'public network. Clients can easily attack http.server or pygments.')

    logging.info('starting syntax-highlighting on %s port %s', args.host, args.port)

    # reduces first request latency, costs 5-10 MB RAM
    # for forking, this is necessary for any performance at all (otherwise it
    # defeats the whole purpose of using a separate process)
    for lexer in LEXERS.values():
        _load_lexers(lexer[0])

    address = (args.host, args.port)

    def start_server(MyHTTPServer):
        with MyHTTPServer(address, HighlightingHandler) as server:
            logging.info('started syntax-highlighting-server')
            server.serve_forever()

    if args.listen_mode == 'single':
        start_server(BaseHTTPServer)

    elif args.listen_mode == 'forking':
        # note: Threading isn't useful for performance because of the GIL
        from socketserver import ForkingMixIn

        class ForkingHTTPServer(ForkingMixIn, BaseHTTPServer):
            pass

        start_server(ForkingHTTPServer)

    elif args.listen_mode == 'reuseport':
        if not hasattr(socket, 'SO_REUSEPORT'):
            raise RuntimeError('SO_REUSEPORT not available on this platform')

        if args.num_servers == 'auto':
            num_servers = _detect_cpu_count()
            logging.info('auto-detected %s CPUs', num_servers)
        else:
            try:
                num_servers = int(args.num_servers)
            except ValueError:
                parser.error("--num-servers must be 'auto' or an integer")
            if num_servers < 1:
                parser.error('--num-servers must be at least 1')
        logging.info('starting %s servers', num_servers)

        # Workers watch the read end; it turns readable (EOF) once the parent
        # closes the write end or exits, which tells them to stop.
        read_fd, write_fd = os.pipe()

        # bound before the try so the KeyboardInterrupt handler can always
        # read it, even if ctrl-c arrives before the first fork
        pid = None
        try:
            for _ in range(num_servers):
                pid = os.fork()
                if pid == 0:
                    os.close(write_fd)
                    _serve_until_parent_exits(address, read_fd)
                    return
            os.close(read_fd)
            logging.info('started syntax-highlighting-server')
            os.wait()
            logging.info('worker died, shutting down syntax-highlighting-server')
        except KeyboardInterrupt:
            # only the parent reports; workers (pid == 0) exit quietly
            if pid != 0:
                logging.info('ctrl-c received, shutting down syntax-highlighting-server')

    else:
        # unreachable while argparse enforces `choices`; kept as a safeguard
        raise ValueError('invalid listen mode: {}'.format(args.listen_mode))


if __name__ == '__main__':
    sys.exit(main())