summaryrefslogtreecommitdiff
path: root/syntax-highlighting-server.py
blob: dabd631351c8cb58e21f58c595effb4c78e581e1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python3

# syntax-highlighting-server.py: create a simple HTTP server to highlight
# source for cgit. Improves performance compared to invoking python on every
# request.
#
# Requirements: Python 3, pygments.
#
# Usage: Configure your system to run this at boot. Note that this program is
# not hardened, and it can be trivially DoSed. Therefore, do not configure it
# to listen on a public network. Once configured, set your cgit source filter
# to syntax-highlighting-client.sh.

import argparse
import logging

from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import parse_qs, unquote

from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import guess_lexer, guess_lexer_for_filename
from pygments.lexers.special import TextLexer
from pygments.util import ClassNotFound

class HighlightingHTTPServer(HTTPServer):
    """HTTPServer that carries the shared formatter and style sheet.

    Request handlers read both objects back through ``self.server``, so they
    are built once at startup rather than per request.

    Args:
        *args, **kwargs: forwarded unchanged to ``HTTPServer``.
        formatter: formatter object passed to ``highlight`` by the handler.
        style_defs: bytes the handler writes before the highlighted output.
    """
    # set SO_REUSEADDR
    allow_reuse_address = True
    # socketserver defaults to 5. especially in listen mode 'single', a
    # sudden surge can easily overwhelm that.
    request_queue_size = 64

    def __init__(self, *args, formatter, style_defs, **kwargs):
        # HTTPServer defaults to AF_INET, which cannot bind an IPv6 literal
        # such as '::1'. hostnames never contain ':', so its presence marks
        # an IPv6 address. this has to be decided before the base class
        # creates the socket.
        server_address = args[0] if args else kwargs.get('server_address')
        if server_address:
            host = server_address[0]
            if isinstance(host, str) and ':' in host:
                import socket
                self.address_family = socket.AF_INET6
        super().__init__(*args, **kwargs)
        self.formatter = formatter
        self.style_defs = style_defs

class HighlightingHandler(BaseHTTPRequestHandler):
    """Answer a POST request with the highlighted HTML of the request body.

    The file name is taken from the ``filename`` query parameter and is only
    used to pick a lexer. The formatter and the style sheet come from the
    server object (``self.server.formatter`` / ``self.server.style_defs``).
    """
    # read by BaseHTTPRequestHandler. need this so that curl doesn't delay
    # waiting for 100-continue
    protocol_version = 'HTTP/1.1'

    def do_POST(self):
        """Highlight the posted source and stream the HTML back.

        Malformed requests (no single ``filename`` parameter, missing or
        invalid Content-Length) are answered with a 4xx error instead of
        raising, so the client gets a response rather than a dropped
        connection.
        """
        # partition() yields '' when there is no '?', so a bare path ends up
        # with an empty filename list instead of an IndexError
        query = self.path.partition('?')[2]
        filenames = parse_qs(query).get('filename', [])
        if len(filenames) != 1:
            self.send_error(400, 'exactly one filename parameter is required')
            return
        # NOTE(review): parse_qs already percent-decodes values, so this
        # decodes a second time. kept as-is; confirm against what the client
        # actually sends before removing
        filename = unquote(filenames[0])

        raw_len = self.headers.get('Content-Length')
        if raw_len is None:
            self.send_error(411, 'Content-Length is required')
            return
        try:
            data_len = int(raw_len)
        except ValueError:
            data_len = -1
        if data_len < 0:
            # a negative length would make rfile.read() block until EOF
            self.send_error(400, 'invalid Content-Length')
            return

        # in theory this could be optimized, but pygments will use more peak
        # memory than this anyways
        data = self.rfile.read(data_len).decode('utf-8', errors='replace')

        # we don't need Server, Date headers
        self.log_request(200)
        self.send_response_only(200, None)
        self.send_header('Content-Type', 'text/html; charset=utf-8')

        # in theory we could use keep-alive, but cgit will only highlight one
        # file at a time, and this way we don't need to buffer the output in
        # order to calculate Content-Length
        self.send_header('Connection', 'close')
        self.end_headers()

        # prefer a lexer matching the file name, then one guessed from the
        # content alone, then plain text
        try:
            lexer = guess_lexer_for_filename(filename, data)
        except ClassNotFound:
            try:
                lexer = guess_lexer(data)
            except ClassNotFound:
                lexer = TextLexer()
        self.wfile.write(self.server.style_defs)
        highlight(data, lexer, self.server.formatter, outfile=self.wfile)

def parse_args():
    """Parse the command line of the highlighting server.

    Returns the argparse namespace with the attributes ``host``, ``port``,
    ``style``, ``listen_mode`` and ``num_servers``.
    """
    # one (flag, add_argument keywords) entry per option, in --help order
    option_table = (
        ('--host', {
            'type': str,
            'default': '127.0.0.1',
            'help': '''
                The hostname or IP address to listen on. Note that it is
                insecure to run syntax-highlighting-server on a public network.
                ''',
        }),
        ('--port', {
            'type': int,
            'default': 4872,
            'help': 'the port to listen on',
        }),
        ('--style', {
            'type': str,
            'default': 'pastie',
            'help': 'pygments formatting style',
        }),
        ('--listen-mode', {
            'type': str,
            'default': 'single',
            'choices': ['single', 'forking', 'reuseport'],
            'help': '''
                    single, forking, or reuseport. single uses one
                    process/thread for all requests. forking forks a new
                    process for each request. reuseport forks NUM_SERVERS
                    servers at start, then binds them using SO_REUSEPORT (Linux
                    kernel does a round robin). single is best for low query
                    loads. forking is faster for high loads. reuseport is
                    fastest but uses more idle memory and requires Linux
                ''',
        }),
        # kept as a string because 'auto' is a valid value
        ('--num-servers', {
            'type': str,
            'default': 'auto',
            'help': '''
                The number of servers to run in reuseport mode. Ignored in
                other modes. auto means one for each CPU.
                ''',
        }),
    )

    cli = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            description='syntax highlighting server')
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()

def main():
    """Run the syntax highlighting server in the selected listen mode.

    Sets up logging, parses the command line, preloads pygments and then
    serves until interrupted. In 'reuseport' mode the parent forks the
    workers and returns as soon as any one of them exits; each worker exits
    when its read end of the shared pipe becomes readable.

    Raises:
        ValueError: in 'reuseport' mode, if --num-servers is neither 'auto'
            nor a positive integer.
    """
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    args = parse_args()

    if args.host not in ('localhost', '127.0.0.1', '::1'):
        logging.warning('''
            Warning: it is insecure to run syntax-highlighting-server on a
            public network. Clients can easily attack http.server or pygments.
            ''')

    logging.info('starting syntax-highlighting on {} port {}'
            .format(args.host, args.port))

    try:
        # preload lexers
        guess_lexer('')
        # preload formatter
        formatter = HtmlFormatter(style=args.style,
                nobackground=True, encoding='utf-8')
        # pre-compute style defs
        style_defs = ('<style>' +
                formatter.get_style_defs('.highlight') +
                '</style>').encode('utf-8')
        # used internally by socket
        ''.encode('idna')

        def start_server(MyHTTPServer):
            # bind, serve until interrupted, and close the socket on the way
            # out
            with MyHTTPServer((args.host, args.port), HighlightingHandler,
                    formatter=formatter, style_defs=style_defs) as server:
                logging.info('started syntax-highlighting-server')
                server.serve_forever()

        if args.listen_mode == 'single':
            start_server(HighlightingHTTPServer)

        elif args.listen_mode == 'forking':
            # note: Threading isn't useful for performance because of the GIL
            from socketserver import ForkingMixIn
            class ForkingHTTPServer(ForkingMixIn, HighlightingHTTPServer):
                pass
            start_server(ForkingHTTPServer)

        elif args.listen_mode == 'reuseport':
            import os
            import selectors
            import socket
            import sys

            class ReusePortHTTPServer(HighlightingHTTPServer):
                def server_bind(self):
                    self.socket.setsockopt(
                            socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
                    super().server_bind()

            # work out the worker count before binding anything, so that a
            # bad --num-servers fails with a clear message. with zero workers
            # os.wait() below would raise ChildProcessError
            if args.num_servers != 'auto':
                try:
                    num_servers = int(args.num_servers)
                except ValueError:
                    num_servers = 0
                if num_servers < 1:
                    raise ValueError(
                            "--num-servers must be 'auto' or a positive "
                            'integer, not {!r}'.format(args.num_servers))
            elif hasattr(os, 'sched_getaffinity'):
                num_servers = len(os.sched_getaffinity(0))
            else:
                # os.cpu_count() returns None when the count is undetermined
                num_servers = os.cpu_count() or 1

            # check that we can bind to the port
            # keep tmp_server around to avoid TOCTOU
            tmp_server = ReusePortHTTPServer((args.host, args.port), None,
                    formatter=None, style_defs=None)

            logging.info('starting {} servers'.format(num_servers))

            # the parent keeps the write end and never writes to it; workers
            # watch the read end
            pipe = os.pipe()

            for i in range(num_servers):
                pid = os.fork()
                if pid == 0:
                    # worker process: never leaves this branch, every path
                    # ends in sys.exit()
                    tmp_server.server_close()
                    try:
                        os.close(pipe[1])
                        with ReusePortHTTPServer((args.host, args.port),
                                HighlightingHandler, formatter=formatter,
                                style_defs=style_defs) as server:
                            with selectors.DefaultSelector() as selector:
                                selector.register(server, selectors.EVENT_READ)
                                selector.register(pipe[0], selectors.EVENT_READ)
                                while True:
                                    ready = selector.select(None)
                                    # activity on the pipe means it is time
                                    # to stop
                                    if any(key.fd == pipe[0]
                                            for key, _ in ready):
                                        sys.exit(0)
                                    if ready:
                                        server._handle_request_noblock()
                                    server.service_actions()
                    except KeyboardInterrupt:
                        # Ctrl-C on the command line sends SIGINT to the whole
                        # process group. we could wait for the pipe, but just exit
                        # now
                        sys.exit(0)
                    except Exception:
                        # try to keep exception message together
                        # the default virtually guarantees mangled output
                        import traceback
                        sys.stderr.write(traceback.format_exc())
                        sys.exit(1)
            os.close(pipe[0])
            tmp_server.server_close()
            logging.info('started syntax-highlighting-server')
            # block until the first worker exits, then shut down
            pid, _ = os.wait()
            logging.info('worker {} died, shutting down syntax-highlighting-server'
                    .format(pid))

        else:
            raise Exception('invalid listen mode: {}'.format(args.listen_mode))

    except KeyboardInterrupt:
        logging.info('ctrl-c received, shutting down syntax-highlighting-server')

# start the server only when run as a script, not when imported
if __name__ == '__main__':
    main()