summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README 23
-rwxr-xr-x syntax-highlighting-server.py 228
2 files changed, 39 insertions, 212 deletions
diff --git a/README b/README
index 09b1bbf..da3b255 100644
--- a/README
+++ b/README
@@ -8,26 +8,5 @@ Usage:
1. Run `make install`.
2. Configure your system to run /usr/lib/cgit/syntax-highlighting-server.py at
boot. This can be done by `systemctl enable syntax-highlighting` on systemd
- machines. Note that syntax-highlighting-server is very insecure. Therefore,
- do not configure it to listen on a public network.
+ machines.
3. Set your cgit source filter to syntax-highlighting-client.sh.
-
-Tuning:
-
-By default, syntax-highlighting-server does all work in a single thread. If you
-have a high query load and multiple CPUs, consider setting --listen-mode to
-forking or reuseport.
-
-reuseport mode uses one worker per CPU and is the highest performance mode, but
-requires Linux and additional idle memory (roughly 1-3 MB per worker).
-
-forking mode uses less idle memory and is compatible with non-Linux systems,
-but is significantly less efficient, since it forks for every request.
-
-Security:
-
-syntax-highlighting-server is not hardened against malicious clients which send
-malformed data or are simply excessively slow. Once again, do not configure
-syntax-highlighting-server to listen on a public network. It is also
-recommended to set max-blob-size in cgitrc, as the entire file must be buffered
-in memory during syntax highlighting.
diff --git a/syntax-highlighting-server.py b/syntax-highlighting-server.py
index aa6c570..d270c4c 100755
--- a/syntax-highlighting-server.py
+++ b/syntax-highlighting-server.py
@@ -11,221 +11,69 @@
# to listen on a public network. Once configured, set your cgit source filter
# to syntax-highlighting-client.sh.
-import argparse
-import logging
-
-from http.server import BaseHTTPRequestHandler, HTTPServer
-from urllib.parse import parse_qs, unquote
-
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import guess_lexer, guess_lexer_for_filename
from pygments.lexers.special import TextLexer
from pygments.util import ClassNotFound
-class HighlightingHTTPServer(HTTPServer):
- # set SO_REUSEADDR
- allow_reuse_address = True
- # socketserver defaults to 5. especially in listen mode 'single', a
- # sudden surge can easily overwhelm that.
- request_queue_size = 64
-
- def __init__(self, *args, formatter, style_defs, **kwargs):
- super().__init__(*args, **kwargs)
- self.formatter = formatter
- self.style_defs = style_defs
-
- def log_message(self, format, *args):
- logging.info("%s %s" % (self.address_string(), format % args))
-
-class HighlightingHandler(BaseHTTPRequestHandler):
- # read by BaseHTTPRequestHandler. need this so that curl doesn't delay
- # waiting for 100-continue
- protocol_version = 'HTTP/1.1'
-
- def do_POST(self):
- qs = parse_qs(self.path.split('?', 1)[1])
- if len(qs['filename']) != 1:
- raise ValueError('cannot have multiple filenames')
- filename = unquote(qs['filename'][0])
-
- data_len = int(self.headers['Content-Length'])
- # in theory this could be optimized, but pygments will use more peak
- # memory than this anyways
- data = self.rfile.read(data_len).decode('utf-8', errors='replace')
-
- # we don't need Server, Date headers
- self.log_request(200)
- self.send_response_only(200, None)
- self.send_header('Content-Type', 'text/html; charset=utf-8')
-
- # in theory we could use keep-alive, but cgit will only highlight one
- # file at a time, and this way we don't need to buffer the output in
- # order to calculate Content-Length
- self.send_header('Connection', 'close')
- self.end_headers()
-
+def do_highlight(filename, data, style):
+ try:
+ lexer = guess_lexer_for_filename(filename, data)
+ except ClassNotFound:
try:
- lexer = guess_lexer_for_filename(filename, data)
- except ClassNotFound:
- try:
- lexer = guess_lexer(data)
- # SqlLexer always gives 0.01
- if lexer.analyse_text(data) <= 0.01:
- lexer = TextLexer()
- except ClassNotFound:
+ lexer = guess_lexer(data)
+ # SqlLexer always gives 0.01
+ if lexer.analyse_text(data) <= 0.01:
lexer = TextLexer()
- self.wfile.write(('<!-- Lexer: ' + lexer.name +
- ' (' + lexer.__class__.__name__ + ') -->').encode('utf-8'))
- self.wfile.write(self.server.style_defs)
- highlight(data, lexer, self.server.formatter, outfile=self.wfile)
+ except ClassNotFound:
+ lexer = TextLexer()
+ formatter = HtmlFormatter(style=style, nobackground=True)
+ return ''.join([
+ f'<!-- Pygments {pygments.__version__}: {lexer.name} ({lexer.__class__.__name__}) -->',
+ '<style>', formatter.get_style_defs('.highlight'), '</style>',
+ highlight(data, lexer, formatter)
+ ])
def parse_args():
+ import argparse
parser = argparse.ArgumentParser(description='syntax highlighting server',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--host', type=str, default='127.0.0.1',
- help='''
- The hostname or IP address to listen on. Note that it is
- insecure to run syntax-highlighting-server on a public network.
- ''')
+ help='the host to listen on')
parser.add_argument('--port', type=int, default=4872,
help='the port to listen on')
parser.add_argument('--style', type=str, default='pastie',
help='pygments formatting style')
- parser.add_argument('--listen-mode', type=str, default='single',
- choices=['single', 'forking', 'reuseport'],
- help='''
- single, forking, or reuseport. single uses one
- process/thread for all requests. forking forks a new
- process for each request. reuseport forks NUM_SERVERS
- servers at start, then binds them using SO_REUSEPORT (Linux
- kernel does a round robin). single is best for low query
- loads. forking is faster for high loads. reuseport is
- fastest but uses more idle memory and requires Linux
- ''')
- parser.add_argument('--num-servers', type=str, default='auto',
- help='''
- The number of servers to run in reuseport mode. Ignored in
- other modes. auto means one for each CPU.
- ''')
+ parser.add_argument('--preload', type=bool, default=True,
+ help='preload lexers and formatters to reduce fork memory usage')
return parser.parse_args()
-def main():
- logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
+async def handle_highlight(request):
+ import asyncio
+ from aiohttp import web
+ loop = asyncio.get_running_loop()
+ text = await request.text()
+ result = await loop.run_in_executor(
+ request.app['pool'], do_highlight,
+ request.query['filename'], text, request.app['style'])
+ return web.Response(text=result)
+def main():
args = parse_args()
- if args.host not in ('localhost', '127.0.0.1', '::1'):
- logging.warning('''
- Warning: it is insecure to run syntax-highlighting-server on a
- public network. Clients can easily attack http.server or pygments.
- ''')
-
- logging.info('starting syntax-highlighting on {} port {}'
- .format(args.host, args.port))
-
- try:
- # preload stuff to avoid first-request latency (every request for
- # forking) and post-fork memory usage
+ from aiohttp import web
+ from concurrent.futures import ProcessPoolExecutor
- # preload lexers
+ if args.preload:
guess_lexer('')
- # preload formatter
- formatter = HtmlFormatter(style=args.style,
- nobackground=True, encoding='utf-8')
- # pre-compute style defs
- style_defs = ('<style>' +
- formatter.get_style_defs('.highlight') +
- '</style>').encode('utf-8')
- # used internally by socket
- ''.encode('idna')
-
- def start_server(MyHTTPServer):
- with MyHTTPServer((args.host, args.port), HighlightingHandler,
- formatter=formatter, style_defs=style_defs) as server:
- logging.info('started syntax-highlighting-server')
- server.serve_forever()
-
- if args.listen_mode == 'single':
- start_server(HighlightingHTTPServer)
-
- elif args.listen_mode == 'forking':
- # note: Threading isn't useful for performance because of the GIL
- from socketserver import ForkingMixIn
- class ForkingHTTPServer(ForkingMixIn, HighlightingHTTPServer):
- pass
- start_server(ForkingHTTPServer)
-
- elif args.listen_mode == 'reuseport':
- import os
- import selectors
- import socket
- import sys
-
- class ReusePortHTTPServer(HighlightingHTTPServer):
- def server_bind(self):
- self.socket.setsockopt(
- socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
- super().server_bind()
-
- # check that we can bind to the port
- # keep tmp_server around to avoid TOCTOU
- tmp_server = ReusePortHTTPServer((args.host, args.port), None,
- formatter=None, style_defs=None)
-
- if args.num_servers != 'auto':
- num_servers = int(args.num_servers)
- elif hasattr(os, 'sched_getaffinity'):
- num_servers = len(os.sched_getaffinity(0))
- else:
- num_servers = os.cpu_count()
- logging.info('starting {} servers'.format(num_servers))
-
- pipe = os.pipe()
-
- for i in range(num_servers):
- pid = os.fork()
- if pid == 0:
- tmp_server.server_close()
- try:
- os.close(pipe[1])
- with ReusePortHTTPServer((args.host, args.port),
- HighlightingHandler, formatter=formatter,
- style_defs=style_defs) as server:
- with selectors.DefaultSelector() as selector:
- selector.register(server, selectors.EVENT_READ)
- selector.register(pipe[0], selectors.EVENT_READ)
- while True:
- ready = selector.select(None)
- for key, events in ready:
- if key.fd == pipe[0]:
- sys.exit(0)
- if ready:
- server._handle_request_noblock()
- server.service_actions()
- except KeyboardInterrupt:
- # Ctrl-C on the command line sends SIGINT to the whole
- # process group. we could wait for the pipe, but just exit
- # now
- sys.exit(0)
- except Exception:
- # try to keep exception message together
- # the default virtually guarantees mangled output
- import traceback
- sys.stderr.write(traceback.format_exc())
- sys.exit(1)
- os.close(pipe[0])
- tmp_server.server_close()
- logging.info('started syntax-highlighting-server')
- pid, status = os.wait()
- logging.info('worker {} died, shutting down syntax-highlighting-server'
- .format(pid))
-
- else:
- raise Exception('invalid listen mode: {}'.format(args.listen_mode))
- except KeyboardInterrupt:
- logging.info('ctrl-c received, shutting down syntax-highlighting-server')
+ with ProcessPoolExecutor() as pool:
+ app = web.Application()
+ app['pool'] = pool
+ app['style'] = args.style
+ app.add_routes([web.post('/highlight', handle_highlight)])
+ web.run_app(app, host=args.host, port=args.port)
if __name__ == '__main__':
main()