56
58
aliases = ['lp-serve']
60
help='serve on stdin/out for use from inetd or sshd'),
62
help='listen for connections on nominated port of the form '
63
'[hostname:]portnumber. Passing 0 as the port number will'
64
' result in a dynamically allocated port. Default port is'
67
Option('upload-directory',
68
help='upload branches to this directory. Defaults to '
69
'config.codehosting.hosted_branches_root.',
71
Option('mirror-directory',
72
help='serve branches from this directory. Defaults to '
73
'config.codehosting.mirrored_branches_root.'),
74
Option('codehosting-endpoint',
75
help='the url of the internal XML-RPC server. Defaults to '
76
'config.codehosting.codehosting_endpoint.',
78
RegistryOption('protocol',
79
help="Protocol to serve.",
80
lazy_registry=('bzrlib.transport', 'transport_server_registry'),
63
help="serve on stdin/out for use from inetd or sshd"),
67
"listen for connections on nominated port of the form "
68
"[hostname:]portnumber. Passing 0 as the port number will "
69
"result in a dynamically allocated port. Default port is "
75
"upload branches to this directory. Defaults to "
76
"config.codehosting.hosted_branches_root."),
81
"serve branches from this directory. Defaults to "
82
"config.codehosting.mirrored_branches_root.")),
84
'codehosting-endpoint',
86
"the url of the internal XML-RPC server. Defaults to "
87
"config.codehosting.codehosting_endpoint."),
90
'protocol', help="Protocol to serve.",
91
lazy_registry=('bzrlib.transport', 'transport_server_registry'),
84
95
takes_args = ['user_id']
167
178
"fork-env <command>\n<env>\nend\n": Request a new subprocess to be
168
179
started. <command> is the bzr command to be run, such as "rocks"
169
180
or "lp-serve --inet 12".
170
The immediate response will be the path-on-disk to a directory full
171
of named pipes (fifos) that will be the stdout/stderr/stdin (named
172
accordingly) of the new process.
181
The immediate response will be the path-on-disk to a directory
182
full of named pipes (fifos) that will be the stdout/stderr/stdin
183
(named accordingly) of the new process.
173
184
If a client holds the socket open, when the child process exits,
174
185
the exit status (as given by 'wait()') will be written to the
218
229
# There are several actions that are done when we get a new request.
219
230
# We have to create the fifos on disk, fork a new child, connect the
220
231
# child to those handles, and inform the client of the new path (not
221
# necessarily in that order.) It makes sense to wait to send the path
222
# message until after the fifos have been created. That way the
232
# necessarily in that order.) It makes sense to wait to send the
233
# path message until after the fifos have been created. That way the
223
234
# client can just try to open them immediately, and the
224
235
# client-and-child will be synchronized by the open() calls.
225
236
# However, should the client be the one doing the mkfifo, should the
226
# server? Who should be sending the message? Should we fork after the
237
# server? Who should be sending the message? Should we fork after
238
# the mkfifo or before?
228
239
# The current thoughts:
229
# 1) Try to do work in the child when possible. This should allow
230
# for 'scaling' because the server is single-threaded.
240
# 1) Try to do work in the child when possible. This should
241
# allow for 'scaling' because the server is single-threaded.
231
242
# 2) We create the directory itself in the server, because that
232
243
# allows the server to monitor whether the client failed to
233
244
# clean up after itself or not.
235
246
# the message back.
237
248
# Exit information
238
# Inform the client that the child has exited on the socket they used
239
# to request the fork.
240
# 1) Arguably they could see that stdout and stderr have been closed,
241
# and thus stop reading. In testing, I wrote a client which uses
242
# select.poll() over stdin/stdout/stderr and used that to ferry
243
# the content to the appropriate local handle. However for the
244
# FIFOs, when the remote end closed, I wouldn't see any
249
# Inform the client that the child has exited on the socket they
250
# used to request the fork.
251
# 1) Arguably they could see that stdout and stderr have been
252
# closed, and thus stop reading. In testing, I wrote a client
253
# which uses select.poll() over stdin/stdout/stderr and used that
254
# to ferry the content to the appropriate local handle. However
255
# for the FIFOs, when the remote end closed, I wouldn't see any
245
256
# corresponding information on the local end. There obviously
246
257
# wasn't any data to be read, so they wouldn't show up as
247
258
# 'readable' (for me to try to read, and get 0 bytes, indicating
248
# it was closed). I also wasn't seeing POLLHUP, which seemed to be
249
# the correct indicator. As such, we decided to inform the client
250
# on the socket that they originally made the fork request, rather
251
# than just closing the socket immediately.
259
# it was closed). I also wasn't seeing POLLHUP, which seemed to
260
# be the correct indicator. As such, we decided to inform the
261
# client on the socket that they originally made the fork
262
# request, rather than just closing the socket immediately.
252
263
# 2) We could have had the forking server close the socket, and only
253
264
# the child hold the socket open. When the child exits, then the
254
265
# OS naturally closes the socket.
265
276
# cleanup once connected
266
277
# The child process blocks during 'open()' waiting for the client to
267
# connect to its fifos. Once the client has connected, the child then
268
# deletes the temporary directory and the fifos from disk. This means
269
# that there isn't much left for diagnosis, but it also means that
270
# the client won't leave garbage around if it crashes, etc.
278
# connect to its fifos. Once the client has connected, the child
279
# then deletes the temporary directory and the fifos from disk. This
280
# means that there isn't much left for diagnosis, but it also means
281
# that the client won't leave garbage around if it crashes, etc.
271
282
# Note that the forking service itself still monitors the paths
272
283
# created, and will delete garbage if it sees that a child failed to
275
286
# os._exit(retcode) in the child
276
287
# Calling sys.exit(retcode) raises an exception, which then bubbles
277
# up the stack and runs exit functions (and finally statements). When
278
# I tried using it originally, I would see the current child bubble
279
# all the way up the stack (through the server code that it fork()
280
# through), and then get to main() returning code 0. The process
281
# would still exit nonzero. My guess is that something in the atexit
282
# functions was failing, but that it was happening after logging, etc
283
# had been shut down.
288
# up the stack and runs exit functions (and finally statements).
289
# When I tried using it originally, I would see the current child
290
# bubble all the way up the stack (through the server code that it
291
# fork() through), and then get to main() returning code 0. The
292
# process would still exit nonzero. My guess is that something in
293
# the atexit functions was failing, but that it was happening after
294
# logging, etc had been shut down.
284
295
# Any global state from the child process should be flushed before
285
# run_bzr_* has exited (which we *do* wait for), and any other global
286
# state is probably a remnant from the service process. Which will be
287
# cleaned up by the service itself, rather than the child.
296
# run_bzr_* has exited (which we *do* wait for), and any other
297
# global state is probably a remnant from the service process. Which
298
# will be cleaned up by the service itself, rather than the child.
288
299
# There is some concern that log files may not get flushed, so we
289
300
# currently call sys.exitfunc() first. The main problem is that I
290
# don't know any way to *remove* a function registered via 'atexit()'
291
# so if the forking service has some state, we may try to clean it up
301
# don't know any way to *remove* a function registered via
302
# 'atexit()' so if the forking service has some state, we may try to
303
# clean it up incorrectly.
293
304
# Note that the bzr script itself uses sys.exitfunc(); os._exit() in
294
# the 'bzr' main script, as the teardown time of all the python state
295
# was quite noticeable in real-world runtime. As such, bzrlib should
296
# be pretty safe, or it would have been failing for people already.
305
# the 'bzr' main script, as the teardown time of all the python
306
# state was quite noticeable in real-world runtime. As such, bzrlib
307
# should be pretty safe, or it would have been failing for people
298
310
# prefork vs max children vs ?
299
311
# For simplicity it seemed easiest to just fork when requested. Over
317
329
# much as we know about what went wrong.
319
331
DEFAULT_PATH = '/var/run/launchpad_forking_service.sock'
320
DEFAULT_PERMISSIONS = 00660 # Permissions on the master socket (rw-rw----)
321
WAIT_FOR_CHILDREN_TIMEOUT = 5*60 # Wait no more than 5 min for children
333
# Permissions on the master socket (rw-rw----)
334
DEFAULT_PERMISSIONS = 00660
336
# Wait no more than 5 minutes for children.
337
WAIT_FOR_CHILDREN_TIMEOUT = 5 * 60
322
339
SOCKET_TIMEOUT = 1.0
323
340
SLEEP_FOR_CHILDREN_TIMEOUT = 1.0
324
WAIT_FOR_REQUEST_TIMEOUT = 1.0 # No request should take longer than this to
326
CHILD_CONNECT_TIMEOUT = 120 # If we get a fork() request, but nobody
328
# On a heavily loaded server, it could take a
329
# couple secs, but it should never take
342
# No request should take longer than this to be read.
343
WAIT_FOR_REQUEST_TIMEOUT = 1.0
345
# If we get a fork() request, but nobody connects, just exit.
346
# On a heavily loaded server it could take a few seconds, but it
347
# should never take minutes.
348
CHILD_CONNECT_TIMEOUT = 120
332
350
_fork_function = os.fork
358
377
self._server_socket.close()
360
379
os.remove(self.master_socket_path)
361
except (OSError, IOError), e:
380
except (OSError, IOError):
362
381
# If we don't delete it, then we get 'address already in
364
trace.mutter('failed to cleanup: %s'
365
% (self.master_socket_path,))
383
trace.mutter('failed to cleanup: %s' % (self.master_socket_path,))
367
385
def _handle_sigchld(self, signum, frm):
368
# We don't actually do anything here, we just want an interrupt (EINTR)
369
# on socket.accept() when SIGCHLD occurs.
386
# We don't actually do anything here, we just want an interrupt
387
# (EINTR) on socket.accept() when SIGCHLD occurs.
372
390
def _handle_sigterm(self, signum, frm):
373
391
# Unregister this as the default handler, 2 SIGTERMs will exit us.
374
392
signal.signal(signal.SIGTERM, signal.SIG_DFL)
375
# SIGTERM should also generate EINTR on our wait loop, so this should
393
# SIGTERM should also generate EINTR on our wait loop, so this
377
395
self._should_terminate.set()
379
397
def _register_signals(self):
413
431
def _open_handles(self, base_path):
414
432
"""Open the given file handles.
416
This will attempt to open all of these file handles, but will not block
417
while opening them, timing out after self._child_connect_timeout
434
This will attempt to open all of these file handles, but will not
435
block while opening them, timing out after self._child_connect_timeout
420
:param base_path: The directory where all FIFOs are located
421
:return: (stdin_fid, stdout_fid, stderr_fid)
438
:param base_path: The directory where all FIFOs are located.
439
:return: (stdin_fid, stdout_fid, stderr_fid).
423
441
stdin_path, stdout_path, stderr_path = self._compute_paths(base_path)
424
442
# These open calls will block until another process connects (which
453
471
def _cleanup_fifos(self, base_path):
454
472
"""Remove the FIFO objects and directory from disk."""
455
473
stdin_path, stdout_path, stderr_path = self._compute_paths(base_path)
456
# Now that we've opened the handles, delete everything so that we don't
457
# leave garbage around. Because the open() is done in blocking mode, we
458
# know that someone has already connected to them, and we don't want
459
# anyone else getting confused and connecting.
474
# Now that we've opened the handles, delete everything so that
475
# we don't leave garbage around. Because the open() is done in
476
# blocking mode, we know that someone has already connected to
477
# them, and we don't want anyone else getting confused and
461
480
os.remove(stdin_path)
462
481
os.remove(stdout_path)
463
482
os.remove(stderr_path)
466
485
def _bind_child_file_descriptors(self, base_path):
467
486
# Note: by this point bzrlib has opened stderr for logging
468
# (as part of starting the service process in the first place).
469
# As such, it has a stream handler that writes to stderr. logging
470
# tries to flush and close that, but the file is already closed.
471
# This just suppresses that exception
487
# (as part of starting the service process in the first place).
488
# As such, it has a stream handler that writes to stderr.
489
# logging tries to flush and close that, but the file is already
491
# This just suppresses that exception.
472
492
stdin_fid, stdout_fid, stderr_fid = self._open_handles(base_path)
473
493
logging.raiseExceptions = False
474
494
sys.stdin.close()
503
523
self._bind_child_file_descriptors(path)
504
524
retcode = self._run_child_command(command_argv)
506
# We force os._exit() here, because we don't want to unwind the
507
# stack, which has complex results. (We can get it to unwind back
508
# to the cmd_launchpad_forking_service code, and even back to
509
# main() reporting the return code, but after that, suddenly the
510
# return code changes from a '0' to a '1', with no logging of info.
526
# We force os._exit() here, because we don't want to unwind
527
# the stack, which has complex results. (We can get it to
528
# unwind back to the cmd_launchpad_forking_service code, and
529
# even back to main() reporting the return code, but after
530
# that, suddenly the return code changes from a '0' to a
531
# '1', with no logging of info.
511
532
os._exit(retcode)
513
534
def _run_child_command(self, command_argv):
514
535
# This is the point where we would actually want to do something with
516
# TODO: We may want to consider special-casing the 'lp-serve' command.
517
# As that is the primary use-case for this service, it might be
518
# interesting to have an already-instantiated instance, where we
519
# can just pop on an extra argument and be ready to go. However,
520
# that would probably only really be measurable if we prefork. As
521
# it looks like ~200ms is 'fork()' time, but only 50ms is
522
# run-the-command time.
537
# TODO: We may want to consider special-casing the 'lp-serve'
538
# command. As that is the primary use-case for this service, it
539
# might be interesting to have an already-instantiated instance,
540
# where we can just pop on an extra argument and be ready to go.
541
# However, that would probably only really be measurable if we
542
# prefork. As it looks like ~200ms is 'fork()' time, but only
543
# 50ms is run-the-command time.
523
544
retcode = commands.run_bzr_catch_errors(command_argv)
524
545
self._close_child_file_descriptors()
525
546
trace.mutter('%d finished %r'
561
582
def fork_one_request(self, conn, client_addr, command_argv, env):
562
583
"""Fork myself and serve a request."""
563
584
temp_name = tempfile.mkdtemp(prefix='lp-forking-service-child-')
564
# Now that we've set everything up, send the response to the client we
565
# create them first, so the client can start trying to connect to them,
566
# while we fork and have the child do the same.
585
# Now that we've set everything up, send the response to the
586
# client we create them first, so the client can start trying to
587
# connect to them, while we fork and have the child do the same.
567
588
self._children_spawned += 1
568
589
pid = self._fork_function()
611
632
conn, client_addr = self._server_socket.accept()
612
633
except self._socket_timeout:
613
pass # run shutdown and children checks
634
pass # Run shutdown and children checks.
614
635
except self._socket_error, e:
615
636
if e.args[0] == errno.EINTR:
616
pass # run shutdown and children checks
637
pass # Run shutdown and children checks.
617
638
elif e.args[0] != errno.EBADF:
618
639
# We can get EBADF here while we are shutting down
619
640
# So we just ignore it for now
623
644
trace.warning("listening socket error: %s", e)
625
646
self.log(client_addr, 'connected')
626
# TODO: We should probably trap exceptions coming out of this
627
# and log them, so that we don't kill the service because
628
# of an unhandled error
629
# Note: settimeout is used so that a malformed request doesn't
630
# cause us to hang forever. Note that the particular
631
# implementation means that a malicious client could
632
# probably send us one byte every Xms, and we would just
633
# keep trying to read it. However, as a local service, we
634
# aren't worrying about it.
647
# TODO: We should probably trap exceptions coming out of
648
# this and log them, so that we don't kill the service
649
# because of an unhandled error.
650
# Note: settimeout is used so that a malformed request
651
# doesn't cause us to hang forever. Also note that the
652
# particular implementation means that a malicious
653
# client could probably send us one byte every once in a
654
# while, and we would just keep trying to read it.
655
# However, as a local service, we aren't worrying about
635
657
conn.settimeout(self.WAIT_FOR_REQUEST_TIMEOUT)
637
659
self.serve_one_connection(conn, client_addr)
638
except self._socket_timeout, e:
660
except self._socket_timeout as e:
639
661
trace.log_exception_quietly()
640
self.log(client_addr, 'request timeout failure: %s' % (e,))
663
client_addr, 'request timeout failure: %s' % (e,))
641
664
conn.sendall('FAILURE\nrequest timed out\n')
666
except Exception as e:
644
667
trace.log_exception_quietly()
645
668
self.log(client_addr, 'trapped a failure while handling'
646
669
' connection: %s' % (e,))
826
849
class cmd_launchpad_forking_service(Command):
827
850
"""Launch a long-running process, where you can ask for new processes.
829
The process will block on a given AF_UNIX socket waiting for requests to be
830
made. When a request is made, it will fork itself and redirect
852
The process will block on a given AF_UNIX socket waiting for requests to
853
be made. When a request is made, it will fork itself and redirect
831
854
stdout/in/err to fifos on the filesystem, and start running the requested
832
command. The caller will be informed where those file handles can be found.
833
Thus it only makes sense that the process connecting to the port must be on
855
command. The caller will be informed where those file handles can be
856
found. Thus it only makes sense that the process connecting to the port
857
must be on the same system.
837
860
aliases = ['lp-service']
948
970
register_command(cmd_launchpad_replay)
950
# This list was generated by run lsprofing a spawned child, and looking for
951
# <module ...> times, which indicate an import occurred. Other possibilities are
952
# to just run "bzr lp-serve --profile-imports" manually, and observe what was
953
# expensive to import. It doesn't seem very easy to get this right
972
# This list was generated by "run lsprof"ing a spawned child, and
973
# looking for <module ...> times, which indicate that an import
974
# occurred. Another option is to run "bzr lp-serve --profile-imports"
975
# manually, and observe what was expensive to import. It doesn't seem
976
# very easy to get this right automatically.
955
977
libraries_to_preload = [
957
979
'bzrlib.repofmt.groupcompress_repo',