summaryrefslogtreecommitdiff
path: root/syntax-highlighting-server.py
blob: e093a9a8922ff8dacbcf94e0a1de5679584ac82d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env python3

# syntax-highlighting-server.py: a simple HTTP server that highlights source
# for cgit. This improves performance compared to invoking Python on every
# request.
#
# Requirements: Python 3, pygments.
#
# Usage: Configure your system to run this at boot. Note that this program is
# not hardened, and it can be trivially DoSed. Therefore, do not configure it
# to listen on a public network. Once configured, set your cgit source filter
# to syntax-highlighting-client.sh.

import argparse
import logging
import socket
import selectors
import sys

from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import parse_qs, unquote

from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import LEXERS, _load_lexers, guess_lexer, guess_lexer_for_filename
from pygments.lexers.special import TextLexer
from pygments.util import ClassNotFound

class BaseHTTPServer(HTTPServer):
    """HTTPServer with the socket settings used by every listen mode.

    Only class-level socketserver settings are overridden here; request
    handling is inherited unchanged from HTTPServer.
    """

    # The socketserver default backlog is 5. A sudden surge of requests can
    # easily overwhelm that, especially in listen mode 'single'.
    request_queue_size = 128

    # Tells socketserver to set SO_REUSEADDR on the listening socket.
    allow_reuse_address = True

# Shared HTML formatter; with an encoding set it writes UTF-8 bytes.
formatter = HtmlFormatter(encoding='utf-8', nobackground=True, style='pastie')
# The formatter's stylesheet wrapped in a <style> element, rendered once up
# front so each response can simply send these bytes before the markup.
style_defs = '<style>{}</style>'.format(
    formatter.get_style_defs('.highlight')).encode('utf-8')

class HighlightingHandler(BaseHTTPRequestHandler):
    """Answer POST requests with the request body rendered as highlighted HTML.

    The name of the file being highlighted is passed in the ``filename``
    query parameter; the source text is the request body.
    """

    # read by BaseHTTPRequestHandler. need this so that curl doesn't delay
    # waiting for 100-continue
    protocol_version = 'HTTP/1.1'

    def do_POST(self):
        """Highlight the posted source and stream the HTML to the client.

        Malformed requests (no single ``filename`` parameter, or a missing
        or invalid Content-Length header) are rejected with an HTTP error
        response instead of raising out of the handler.
        """
        # partition() yields an empty query string when the path has no '?'
        query = self.path.partition('?')[2]
        filenames = parse_qs(query).get('filename', [])
        if len(filenames) != 1:
            self.send_error(400, 'expected exactly one filename parameter')
            return
        # parse_qs has already percent-decoded the value. decoding it a
        # second time would corrupt names that contain a literal '%XX'.
        filename = filenames[0]

        raw_len = self.headers.get('Content-Length')
        if raw_len is None:
            self.send_error(411, 'Content-Length is required')
            return
        try:
            data_len = int(raw_len)
        except ValueError:
            self.send_error(400, 'invalid Content-Length')
            return
        if data_len < 0:
            # a negative size would make rfile.read() read until EOF
            self.send_error(400, 'invalid Content-Length')
            return

        # in theory this could be optimized, but pygments will use more peak
        # memory than this anyways
        data = self.rfile.read(data_len).decode('utf-8', errors='replace')

        # we don't need Server, Date headers
        self.log_request(200)
        self.send_response_only(200, None)
        self.send_header('Content-Type', 'text/html; charset=utf-8')

        # in theory we could use keep-alive, but cgit will only highlight one
        # file at a time, and this way we don't need to buffer the output in
        # order to calculate Content-Length
        self.send_header('Connection', 'close')
        self.end_headers()

        # prefer a lexer chosen from the file name, then one guessed from the
        # content alone, and finally fall back to plain text
        try:
            lexer = guess_lexer_for_filename(filename, data)
        except ClassNotFound:
            try:
                lexer = guess_lexer(data)
            except ClassNotFound:
                lexer = TextLexer()
        self.wfile.write(style_defs)
        highlight(data, lexer, formatter, outfile=self.wfile)

def main():
    """Parse the command line and run the highlighting server.

    Depending on ``--listen-mode`` requests are served by a single process,
    by one forked process per request, or by a fixed pool of forked worker
    processes that all bind the same port with SO_REUSEPORT.

    Exits through ``parser.error`` when ``--style`` names an unknown
    pygments style or ``--num-servers`` is not 'auto' or a positive integer.
    """
    # the handler reads these module-level names on every request; they are
    # rebuilt below so that --style actually takes effect
    global formatter, style_defs

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    parser = argparse.ArgumentParser(description='syntax highlighting server',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--host', type=str, default='localhost',
            help='''
                The hostname or IP address to listen on. Note that it is
                insecure to run syntax-highlighting-server on a public network.
                ''')
    parser.add_argument('--port', type=int, default=4872,
            help='the port to listen on')
    parser.add_argument('--style', type=str, default='pastie',
            help='pygments formatting style')
    parser.add_argument('--listen-mode', type=str, default='single',
            choices=['single', 'forking', 'reuseport'],
            help='''
                    single, forking, or reuseport. single uses one
                    process/thread for all requests. forking forks a new
                    process for each request. reuseport forks NUM_SERVERS
                    servers at start, then binds them using SO_REUSEPORT (Linux
                    kernel does a round robin). single is best for low query
                    loads. forking is faster for high loads. reuseport is
                    fastest but uses more idle memory and requires Linux
                ''')
    parser.add_argument('--num-servers', type=str, default='auto',
            help='''
                The number of servers to run in reuseport mode. Ignored in
                other modes. auto means one for each CPU.
                ''')
    args = parser.parse_args()

    # build the formatter for the requested style. this also loads the style
    # up front (before any fork), and works for styles whose module name
    # differs from the style name.
    try:
        formatter = HtmlFormatter(style=args.style, nobackground=True,
                encoding='utf-8')
    except ClassNotFound:
        parser.error('unknown pygments style: {}'.format(args.style))
    style_defs = ('<style>' + formatter.get_style_defs('.highlight')
            + '</style>').encode('utf-8')

    if args.host not in ('localhost', '127.0.0.1', '::1'):
        logging.warning('''
            Warning: it is insecure to run syntax-highlighting-server on a
            public network. Clients can easily attack http.server or pygments.
            ''')

    logging.info('starting syntax-highlighting on {} port {}'.format(args.host, args.port))

    # reduces first request latency, costs 5-10 MB RAM
    # for forking, this is necessary for any performance at all (otherwise it
    # defeats the whole purpose of using a separate process)
    for lexer in LEXERS.values():
        _load_lexers(lexer[0])

    def start_server(MyHTTPServer):
        """Bind a server of the given class and serve requests forever."""
        with MyHTTPServer((args.host, args.port), HighlightingHandler) as server:
            logging.info('started syntax-highlighting-server')
            server.serve_forever()

    if args.listen_mode == 'single':
        start_server(BaseHTTPServer)

    elif args.listen_mode == 'forking':
        # note: Threading isn't useful for performance because of the GIL
        from socketserver import ForkingMixIn
        class ForkingHTTPServer(ForkingMixIn, BaseHTTPServer):
            pass
        start_server(ForkingHTTPServer)

    elif args.listen_mode == 'reuseport':
        import os

        if not hasattr(socket, 'SO_REUSEPORT'):
            raise Exception('SO_REUSEPORT not available on this platform')

        class ReusePortHTTPServer(BaseHTTPServer):
            def server_bind(self):
                # must be set before bind() so every worker can share the port
                self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
                super().server_bind()

        if args.num_servers == 'auto':
            if hasattr(os, 'sched_getaffinity'):
                num_servers = len(os.sched_getaffinity(0))
            else:
                # sched_getaffinity is not available on every platform
                num_servers = os.cpu_count() or 1
            logging.info('auto-detected {} CPUs'.format(num_servers))
        else:
            try:
                num_servers = int(args.num_servers)
            except ValueError:
                parser.error("--num-servers must be 'auto' or an integer")
            if num_servers < 1:
                parser.error('--num-servers must be at least 1')
        logging.info('starting {} servers'.format(num_servers))

        # the parent keeps the write end open for its whole lifetime. once the
        # parent exits, the read end becomes readable (EOF) in every worker,
        # which is the workers' signal to stop.
        pipe = os.pipe()

        # bound before the loop so the KeyboardInterrupt handler can always
        # tell parent (non-zero or None) from worker (0)
        pid = None
        try:
            for _ in range(num_servers):
                pid = os.fork()
                if pid == 0:
                    os.close(pipe[1])
                    with ReusePortHTTPServer((args.host, args.port), HighlightingHandler) as server:
                        with selectors.DefaultSelector() as selector:
                            selector.register(server, selectors.EVENT_READ)
                            selector.register(pipe[0], selectors.EVENT_READ)
                            while True:
                                ready = selector.select(None)
                                for key, events in ready:
                                    if key.fd == pipe[0]:
                                        # parent is gone: stop this worker
                                        return
                                if ready:
                                    server._handle_request_noblock()
                                server.service_actions()
                    # defensive: a worker must never fall through into the
                    # parent's code below
                    sys.exit(0)
            os.close(pipe[0])
            logging.info('started syntax-highlighting-server')
            # returns as soon as any one worker exits
            os.wait()
            logging.info('worker died, shutting down syntax-highlighting-server')
        except KeyboardInterrupt:
            # only the parent logs; workers exit quietly
            if pid != 0:
                logging.info('ctrl-c received, shutting down syntax-highlighting-server')

    else:
        raise Exception('invalid listen mode: {}'.format(args.listen_mode))

# Start the server only when this file is executed as a script; importing
# it as a module does not call main().
if __name__ == '__main__':
    main()