1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
|
#!/usr/bin/env python3
# syntax-highlighting-server.py: create a simple HTTP server to highlight
# source for cgit. Improves performance compared to invoking Python on every
# request.
#
# Requirements: Python 3, pygments.
#
# Usage: Configure your system to run this at boot. Note that this program is
# not hardened, and it can be trivially DoSed. Therefore, do not configure it
# to listen on a public network. Once configured, set your cgit source filter
# to syntax-highlighting-client.sh.
import argparse
import logging
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import parse_qs, unquote
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import guess_lexer, guess_lexer_for_filename
from pygments.lexers.special import TextLexer
from pygments.util import ClassNotFound
class HighlightingHTTPServer(HTTPServer):
    """HTTPServer that carries the shared pygments formatter and style block.

    Request handlers reach both objects through ``self.server``, so they are
    built once at startup instead of once per request.
    """

    # socketserver defaults to 5. especially in listen mode 'single', a
    # sudden surge can easily overwhelm that.
    request_queue_size = 64

    # set SO_REUSEADDR
    allow_reuse_address = True

    def __init__(self, *args, formatter, style_defs, **kwargs):
        """Create the server and remember the formatter and style defs.

        Positional and remaining keyword arguments are forwarded unchanged
        to ``HTTPServer``; ``formatter`` and ``style_defs`` are keyword-only.
        """
        super().__init__(*args, **kwargs)
        self.formatter, self.style_defs = formatter, style_defs
class HighlightingHandler(BaseHTTPRequestHandler):
    """Handle highlight requests: POST source text, receive HTML.

    The file name is taken from the ``filename`` query parameter and the
    file contents from the request body. The response body is the server's
    precomputed style block followed by the pygments-rendered HTML.
    """

    # read by BaseHTTPRequestHandler. need this so that curl doesn't delay
    # waiting for 100-continue
    protocol_version = 'HTTP/1.1'

    def _parse_request(self):
        """Return ``(filename, body_length)`` for the current request.

        Raises:
            ValueError: if the ``filename`` parameter is missing or given
                more than once, or if Content-Length is missing,
                non-numeric, or negative.
        """
        # partition() yields an empty query when the path has no '?'
        query = self.path.partition('?')[2]
        filenames = parse_qs(query).get('filename', [])
        if not filenames:
            raise ValueError('missing filename parameter')
        if len(filenames) != 1:
            raise ValueError('cannot have multiple filenames')
        # NOTE(review): parse_qs already percent-decodes values, so this
        # decodes a second time. Presumably the client double-encodes;
        # confirm against syntax-highlighting-client.sh before removing.
        filename = unquote(filenames[0])

        try:
            data_len = int(self.headers.get('Content-Length'))
        except (TypeError, ValueError):
            raise ValueError('missing or invalid Content-Length') from None
        if data_len < 0:
            # a negative size would make rfile.read() block until EOF
            raise ValueError('negative Content-Length')
        return filename, data_len

    def do_POST(self):
        """Highlight the posted source and stream the HTML back.

        Malformed requests are answered with HTTP 400 rather than raising
        out of the handler. The lexer is chosen from the file name and
        contents, then from the contents alone, and finally falls back to
        plain text.
        """
        try:
            filename, data_len = self._parse_request()
        except ValueError as err:
            self.send_error(400, str(err))
            return

        # in theory this could be optimized, but pygments will use more peak
        # memory than this anyways
        data = self.rfile.read(data_len).decode('utf-8', errors='replace')

        # we don't need Server, Date headers
        self.log_request(200)
        self.send_response_only(200, None)
        self.send_header('Content-Type', 'text/html; charset=utf-8')
        # in theory we could use keep-alive, but cgit will only highlight one
        # file at a time, and this way we don't need to buffer the output in
        # order to calculate Content-Length
        self.send_header('Connection', 'close')
        self.end_headers()

        try:
            lexer = guess_lexer_for_filename(filename, data)
        except ClassNotFound:
            try:
                lexer = guess_lexer(data)
            except ClassNotFound:
                lexer = TextLexer()

        self.wfile.write(self.server.style_defs)
        highlight(data, lexer, self.server.formatter, outfile=self.wfile)
def parse_args():
    """Build the command-line parser and return the parsed arguments.

    Recognized options are --host, --port, --style, --listen-mode and
    --num-servers. --num-servers is returned as a string because it may be
    the literal 'auto'.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='syntax highlighting server')

    # one (flag, add_argument keyword arguments) pair per option, in the
    # order they should be registered. the multi-line help texts are kept
    # flush-left so the literal text stays exactly as it was.
    options = (
        ('--host', dict(type=str, default='127.0.0.1',
                        help='''
The hostname or IP address to listen on. Note that it is
insecure to run syntax-highlighting-server on a public network.
''')),
        ('--port', dict(type=int, default=4872,
                        help='the port to listen on')),
        ('--style', dict(type=str, default='pastie',
                         help='pygments formatting style')),
        ('--listen-mode', dict(type=str, default='single',
                               choices=['single', 'forking', 'reuseport'],
                               help='''
single, forking, or reuseport. single uses one
process/thread for all requests. forking forks a new
process for each request. reuseport forks NUM_SERVERS
servers at start, then binds them using SO_REUSEPORT (Linux
kernel does a round robin). single is best for low query
loads. forking is faster for high loads. reuseport is
fastest but uses more idle memory and requires Linux
''')),
        ('--num-servers', dict(type=str, default='auto',
                               help='''
The number of servers to run in reuseport mode. Ignored in
other modes. auto means one for each CPU.
''')),
    )
    for flag, settings in options:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
def main():
    """Parse the command line, preload pygments, and serve until stopped.

    The listen mode selects how requests are served:

    * ``single``: one server in this process.
    * ``forking``: one server that handles requests via ``ForkingMixIn``.
    * ``reuseport``: several worker processes are forked, each binding its
      own socket to the same address with SO_REUSEPORT. The parent waits
      until any worker exits and then returns.

    A Ctrl-C in the parent is logged and ends the function normally.

    Raises:
        ValueError: in reuseport mode, if ``--num-servers`` is neither
            ``auto`` nor a positive integer.
    """
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    args = parse_args()
    if args.host not in ('localhost', '127.0.0.1', '::1'):
        logging.warning('''
Warning: it is insecure to run syntax-highlighting-server on a
public network. Clients can easily attack http.server or pygments.
''')
    logging.info('starting syntax-highlighting on {} port {}'
                 .format(args.host, args.port))
    try:
        # preload stuff to avoid first-request latency (every request for
        # forking) and post-fork memory usage
        # preload lexers
        guess_lexer('')
        # preload formatter
        formatter = HtmlFormatter(style=args.style,
                                  nobackground=True, encoding='utf-8')
        # pre-compute style defs
        style_defs = ('<style>' +
                      formatter.get_style_defs('.highlight') +
                      '</style>').encode('utf-8')
        # used internally by socket
        ''.encode('idna')

        def start_server(MyHTTPServer):
            """Bind ``MyHTTPServer`` to the configured address and serve."""
            with MyHTTPServer((args.host, args.port), HighlightingHandler,
                              formatter=formatter,
                              style_defs=style_defs) as server:
                logging.info('started syntax-highlighting-server')
                server.serve_forever()

        if args.listen_mode == 'single':
            start_server(HighlightingHTTPServer)
        elif args.listen_mode == 'forking':
            # note: Threading isn't useful for performance because of the GIL
            from socketserver import ForkingMixIn

            class ForkingHTTPServer(ForkingMixIn, HighlightingHTTPServer):
                pass

            start_server(ForkingHTTPServer)
        elif args.listen_mode == 'reuseport':
            import os
            import selectors
            import socket
            import sys

            class ReusePortHTTPServer(HighlightingHTTPServer):
                def server_bind(self):
                    # must be set before bind() so that several sockets can
                    # share the same address
                    self.socket.setsockopt(
                        socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
                    super().server_bind()

            # work out the worker count before binding or forking so that a
            # bad value fails early with a clear message instead of leaving
            # os.wait() with no children to wait for
            if args.num_servers != 'auto':
                num_servers = int(args.num_servers)
                if num_servers < 1:
                    raise ValueError(
                        '--num-servers must be a positive integer or '
                        '"auto", got {!r}'.format(args.num_servers))
            elif hasattr(os, 'sched_getaffinity'):
                num_servers = len(os.sched_getaffinity(0))
            else:
                # os.cpu_count() returns None when the count is unknown
                num_servers = os.cpu_count() or 1

            # check that we can bind to the port
            # keep tmp_server around to avoid TOCTOU
            tmp_server = ReusePortHTTPServer((args.host, args.port), None,
                                             formatter=None, style_defs=None)
            logging.info('starting {} servers'.format(num_servers))

            # shutdown signal for the workers: nothing is ever written to
            # this pipe. each worker closes its copy of the write end and
            # exits as soon as the read end becomes readable.
            pipe = os.pipe()
            for _ in range(num_servers):
                pid = os.fork()
                if pid == 0:
                    # worker process: never returns to the fork loop
                    tmp_server.server_close()
                    try:
                        os.close(pipe[1])
                        with ReusePortHTTPServer((args.host, args.port),
                                                 HighlightingHandler,
                                                 formatter=formatter,
                                                 style_defs=style_defs) as server:
                            with selectors.DefaultSelector() as selector:
                                selector.register(server, selectors.EVENT_READ)
                                selector.register(pipe[0], selectors.EVENT_READ)
                                # hand-rolled serve loop so that the pipe can
                                # be watched alongside the listening socket
                                while True:
                                    ready = selector.select(None)
                                    for key, _events in ready:
                                        if key.fd == pipe[0]:
                                            sys.exit(0)
                                    if ready:
                                        server._handle_request_noblock()
                                    server.service_actions()
                    except KeyboardInterrupt:
                        # Ctrl-C on the command line sends SIGINT to the whole
                        # process group. we could wait for the pipe, but just
                        # exit now
                        sys.exit(0)
                    except Exception:
                        # try to keep exception message together
                        # the default virtually guarantees mangled output
                        import traceback
                        sys.stderr.write(traceback.format_exc())
                        sys.exit(1)

            # parent process: keep only the write end of the pipe and drop
            # the placeholder socket now that the workers have been forked
            os.close(pipe[0])
            tmp_server.server_close()
            logging.info('started syntax-highlighting-server')
            pid, _ = os.wait()
            logging.info('worker {} died, shutting down syntax-highlighting-server'
                         .format(pid))
        else:
            raise Exception('invalid listen mode: {}'.format(args.listen_mode))
    except KeyboardInterrupt:
        logging.info('ctrl-c received, shutting down syntax-highlighting-server')
# run the server only when executed as a script, not when imported
if __name__ == '__main__':
    main()
|