~launchpad-pqm/launchpad/devel : revision 14571

1

2

# GNU Affero General Public License version 3 (see the file LICENSE).

3

4

"""Bazaar plugin to run the smart server on Launchpad.

28

import threading

29

import time

30

31

from bzrlib.commands import Command, register_command

32

from bzrlib.option import (

33

Option,

34

RegistryOption,

35

)

36

31

from bzrlib import (

37

32

commands,

38

33

errors,

41

36

trace,

42

37

ui,

43

38

)

44

39

from bzrlib.commands import (

40

Command,

41

register_command,

42

)

43

from bzrlib.option import (

44

Option,

45

RegistryOption,

46

)

45

47

from bzrlib.transport import (

46

48

get_transport,

47

49

transport_server_registry,

56

58

aliases = ['lp-serve']

57

59

58

60

takes_options = [

59

Option('inet',

60

help='serve on stdin/out for use from inetd or sshd'),

61

Option('port',

62

help='listen for connections on nominated port of the form '

63

'[hostname:]portnumber. Passing 0 as the port number will'

64

' result in a dynamically allocated port. Default port is'

65

' 4155.',

66

type=str),

67

Option('upload-directory',

68

help='upload branches to this directory. Defaults to '

69

'config.codehosting.hosted_branches_root.',

70

type=unicode),

71

Option('mirror-directory',

72

help='serve branches from this directory. Defaults to '

73

'config.codehosting.mirrored_branches_root.'),

74

Option('codehosting-endpoint',

75

help='the url of the internal XML-RPC server. Defaults to '

76

'config.codehosting.codehosting_endpoint.',

77

type=unicode),

78

RegistryOption('protocol',

79

help="Protocol to serve.",

80

lazy_registry=('bzrlib.transport', 'transport_server_registry'),

81

value_switches=True),

61

Option(

62

'inet',

63

help="serve on stdin/out for use from inetd or sshd"),

64

Option(

65

'port',

66

help=(

67

"listen for connections on nominated port of the form "

68

"[hostname:]portnumber. Passing 0 as the port number will "

69

"result in a dynamically allocated port. Default port is "

70

" 4155."),

71

type=str),

72

Option(

73

'upload-directory',

74

help=(

75

"upload branches to this directory. Defaults to "

76

"config.codehosting.hosted_branches_root."),

77

type=unicode),

78

Option(

79

'mirror-directory',

80

help=(

81

"serve branches from this directory. Defaults to "

82

"config.codehosting.mirrored_branches_root.")),

83

Option(

84

'codehosting-endpoint',

85

help=(

86

"the url of the internal XML-RPC server. Defaults to "

87

"config.codehosting.codehosting_endpoint."),

88

type=unicode),

89

RegistryOption(

90

'protocol', help="Protocol to serve.",

91

lazy_registry=('bzrlib.transport', 'transport_server_registry'),

92

value_switches=True),

82

93

]

83

94

84

95

takes_args = ['user_id']

167

178

"fork-env <command>\n<env>\nend\n": Request a new subprocess to be

168

179

started. <command> is the bzr command to be run, such as "rocks"

169

180

or "lp-serve --inet 12".

170

The immediate response will be the path-on-disk to a directory full

171

of named pipes (fifos) that will be the stdout/stderr/stdin (named

172

accordingly) of the new process.

181

The immediate response will be the path-on-disk to a directory

182

full of named pipes (fifos) that will be the stdout/stderr/stdin

183

(named accordingly) of the new process.

173

184

If a client holds the socket open, when the child process exits,

174

185

the exit status (as given by 'wait()') will be written to the

175

186

socket.

181

192

that should be reset.

182

193

183

194

fork-env allows you to supply environment variables such as

184

"BZR_EMAIL: joe@foo.com" which will be set in os.environ before the

185

command is run.

195

"BZR_EMAIL: joe@foo.com" which will be set in os.environ before

196

the command is run.

186

197

"""

187

198

188

199

# Design decisions. These are bits where we could have chosen a different

218

229

# There are several actions that are done when we get a new request.

219

230

# We have to create the fifos on disk, fork a new child, connect the

220

231

# child to those handles, and inform the client of the new path (not

221

# necessarily in that order.) It makes sense to wait to send the path

222

# message until after the fifos have been created. That way the

232

# necessarily in that order.) It makes sense to wait to send the

233

# path message until after the fifos have been created. That way the

223

234

# client can just try to open them immediately, and the

224

235

# client-and-child will be synchronized by the open() calls.

225

236

# However, should the client be the one doing the mkfifo, should the

226

# server? Who should be sending the message? Should we fork after the

227

# mkfifo or before.

237

# server? Who should be sending the message? Should we fork after

238

# the mkfifo or before?

228

239

# The current thoughts:

229

# 1) Try to do work in the child when possible. This should allow

230

# for 'scaling' because the server is single-threaded.

240

# 1) Try to do work in the child when possible. This should

241

# allow for 'scaling' because the server is single-threaded.

231

242

# 2) We create the directory itself in the server, because that

232

243

# allows the server to monitor whether the client failed to

233

244

# clean up after itself or not.

235

246

# the message back.

236

247

# [Decision #4]

237

248

# Exit information

238

# Inform the client that the child has exited on the socket they used

239

# to request the fork.

240

# 1) Arguably they could see that stdout and stderr have been closed,

241

# and thus stop reading. In testing, I wrote a client which uses

242

# select.poll() over stdin/stdout/stderr and used that to ferry

243

# the content to the appropriate local handle. However for the

244

# FIFOs, when the remote end closed, I wouldn't see any

249

# Inform the client that the child has exited on the socket they

250

# used to request the fork.

251

# 1) Arguably they could see that stdout and stderr have been

252

# closed, and thus stop reading. In testing, I wrote a client

253

# which uses select.poll() over stdin/stdout/stderr and used that

254

# to ferry the content to the appropriate local handle. However

255

# for the FIFOs, when the remote end closed, I wouldn't see any

245

256

# corresponding information on the local end. There obviously

246

257

# wasn't any data to be read, so they wouldn't show up as

247

258

# 'readable' (for me to try to read, and get 0 bytes, indicating

248

# it was closed). I also wasn't seeing POLLHUP, which seemed to be

249

# the correct indicator. As such, we decided to inform the client

250

# on the socket that they originally made the fork request, rather

251

# than just closing the socket immediately.

259

# it was closed). I also wasn't seeing POLLHUP, which seemed to

260

# be the correct indicator. As such, we decided to inform the

261

# client on the socket that they originally made the fork

262

# request, rather than just closing the socket immediately.

252

263

# 2) We could have had the forking server close the socket, and only

253

264

# the child hold the socket open. When the child exits, then the

254

265

# OS naturally closes the socket.

264

275

# [Decision #5]

265

276

# cleanup once connected

266

277

# The child process blocks during 'open()' waiting for the client to

267

# connect to its fifos. Once the client has connected, the child then

268

# deletes the temporary directory and the fifos from disk. This means

269

# that there isn't much left for diagnosis, but it also means that

270

# the client won't leave garbage around if it crashes, etc.

278

# connect to its fifos. Once the client has connected, the child

279

# then deletes the temporary directory and the fifos from disk. This

280

# means that there isn't much left for diagnosis, but it also means

281

# that the client won't leave garbage around if it crashes, etc.

271

282

# Note that the forking service itself still monitors the paths

272

283

# created, and will delete garbage if it sees that a child failed to

273

284

# do so.

274

285

# [Decision #6]

275

286

# os._exit(retcode) in the child

276

287

# Calling sys.exit(retcode) raises an exception, which then bubbles

277

# up the stack and runs exit functions (and finally statements). When

278

# I tried using it originally, I would see the current child bubble

279

# all the way up the stack (through the server code that it fork()

280

# through), and then get to main() returning code 0. The process

281

# would still exit nonzero. My guess is that something in the atexit

282

# functions was failing, but that it was happening after logging, etc

283

# had been shut down.

288

# up the stack and runs exit functions (and finally statements).

289

# When I tried using it originally, I would see the current child

290

# bubble all the way up the stack (through the server code that it

291

# fork() through), and then get to main() returning code 0. The

292

# process would still exit nonzero. My guess is that something in

293

# the atexit functions was failing, but that it was happening after

294

# logging, etc had been shut down.

284

295

# Any global state from the child process should be flushed before

285

# run_bzr_* has exited (which we *do* wait for), and any other global

286

# state is probably a remnant from the service process. Which will be

287

# cleaned up by the service itself, rather than the child.

296

# run_bzr_* has exited (which we *do* wait for), and any other

297

# global state is probably a remnant from the service process. Which

298

# will be cleaned up by the service itself, rather than the child.

288

299

# There is some concern that log files may not get flushed, so we

289

300

# currently call sys.exitfunc() first. The main problem is that I

290

# don't know any way to *remove* a function registered via 'atexit()'

291

# so if the forking service has some state, we my try to clean it up

292

# incorrectly.

301

# don't know any way to *remove* a function registered via

302

# 'atexit()' so if the forking service has some state, we may try to

303

# clean it up incorrectly.

293

304

# Note that the bzr script itself uses sys.exitfunc(); os._exit() in

294

# the 'bzr' main script, as the teardown time of all the python state

295

# was quite noticeable in real-world runtime. As such, bzrlib should

296

# be pretty safe, or it would have been failing for people already.

305

# the 'bzr' main script, as the teardown time of all the python

306

# state was quite noticeable in real-world runtime. As such, bzrlib

307

# should be pretty safe, or it would have been failing for people

308

# already.

297

309

# [Decision #7]

298

310

# prefork vs max children vs ?

299

311

# For simplicity it seemed easiest to just fork when requested. Over

317

329

# much as we know about what went wrong.

318

330

319

331

DEFAULT_PATH = '/var/run/launchpad_forking_service.sock'

320

DEFAULT_PERMISSIONS = 00660 # Permissions on the master socket (rw-rw----)

321

WAIT_FOR_CHILDREN_TIMEOUT = 5*60 # Wait no more than 5 min for children

332

333

# Permissions on the master socket (rw-rw----)

334

DEFAULT_PERMISSIONS = 00660

335

336

# Wait no more than 5 minutes for children.

337

WAIT_FOR_CHILDREN_TIMEOUT = 5 * 60

338

322

339

SOCKET_TIMEOUT = 1.0

323

340

SLEEP_FOR_CHILDREN_TIMEOUT = 1.0

324

WAIT_FOR_REQUEST_TIMEOUT = 1.0 # No request should take longer than this to

325

# be read

326

CHILD_CONNECT_TIMEOUT = 120 # If we get a fork() request, but nobody

327

# connects just exit

328

# On a heavily loaded server, it could take a

329

# couple secs, but it should never take

330

# minutes

341

342

# No request should take longer than this to be read.

343

WAIT_FOR_REQUEST_TIMEOUT = 1.0

344

345

# If we get a fork() request, but nobody connects, just exit.

346

# On a heavily loaded server it could take a few seconds, but it

347

# should never take minutes.

348

CHILD_CONNECT_TIMEOUT = 120

331

349

332

350

_fork_function = os.fork

333

351

346

364

self._child_connect_timeout = self.CHILD_CONNECT_TIMEOUT

347

365

348

366

def _create_master_socket(self):

349

self._server_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)

367

self._server_socket = socket.socket(

368

socket.AF_UNIX, socket.SOCK_STREAM)

350

369

self._server_socket.bind(self.master_socket_path)

351

370

if self._perms is not None:

352

371

os.chmod(self.master_socket_path, self._perms)

358

377

self._server_socket.close()

359

378

try:

360

379

os.remove(self.master_socket_path)

361

except (OSError, IOError), e:

380

except (OSError, IOError):

362

381

# If we don't delete it, then we get 'address already in

363

# use' failures

364

trace.mutter('failed to cleanup: %s'

365

% (self.master_socket_path,))

382

# use' failures.

383

trace.mutter('failed to cleanup: %s' % (self.master_socket_path,))

366

384

367

385

def _handle_sigchld(self, signum, frm):

368

# We don't actually do anything here, we just want an interrupt (EINTR)

369

# on socket.accept() when SIGCHLD occurs.

386

# We don't actually do anything here, we just want an interrupt

387

# (EINTR) on socket.accept() when SIGCHLD occurs.

370

388

pass

371

389

372

390

def _handle_sigterm(self, signum, frm):

373

391

# Unregister this as the default handler, 2 SIGTERMs will exit us.

374

392

signal.signal(signal.SIGTERM, signal.SIG_DFL)

375

# SIGTERM should also generate EINTR on our wait loop, so this should

376

# be enough

393

# SIGTERM should also generate EINTR on our wait loop, so this

394

# should be enough.

377

395

self._should_terminate.set()

378

396

379

397

def _register_signals(self):

413

431

def _open_handles(self, base_path):

414

432

"""Open the given file handles.

415

433

416

This will attempt to open all of these file handles, but will not block

417

while opening them, timing out after self._child_connect_timeout

434

This will attempt to open all of these file handles, but will not

435

block while opening them, timing out after self._child_connect_timeout

418

436

seconds.

419

437

420

:param base_path: The directory where all FIFOs are located

421

:return: (stdin_fid, stdout_fid, stderr_fid)

438

:param base_path: The directory where all FIFOs are located.

439

:return: (stdin_fid, stdout_fid, stderr_fid).

422

440

"""

423

441

stdin_path, stdout_path, stderr_path = self._compute_paths(base_path)

424

442

# These open calls will block until another process connects (which

432

450

for path, flags in to_open:

433

451

try:

434

452

fids.append(os.open(path, flags))

435

except OSError, e:

453

except OSError:

436

454

# In production code, signal.alarm will generally just kill

437

455

# us. But if something installs a signal handler for SIGALRM,

438

456

# do what we can to die gracefully.

453

471

def _cleanup_fifos(self, base_path):

454

472

"""Remove the FIFO objects and directory from disk."""

455

473

stdin_path, stdout_path, stderr_path = self._compute_paths(base_path)

456

# Now that we've opened the handles, delete everything so that we don't

457

# leave garbage around. Because the open() is done in blocking mode, we

458

# know that someone has already connected to them, and we don't want

459

# anyone else getting confused and connecting.

460

# See [Decision #5]

474

# Now that we've opened the handles, delete everything so that

475

# we don't leave garbage around. Because the open() is done in

476

# blocking mode, we know that someone has already connected to

477

# them, and we don't want anyone else getting confused and

478

# connecting.

479

# See [Decision #5].

461

480

os.remove(stdin_path)

462

481

os.remove(stdout_path)

463

482

os.remove(stderr_path)

465

484

466

485

def _bind_child_file_descriptors(self, base_path):

467

486

# Note: by this point bzrlib has opened stderr for logging

468

# (as part of starting the service process in the first place).

469

# As such, it has a stream handler that writes to stderr. logging

470

# tries to flush and close that, but the file is already closed.

471

# This just supresses that exception

487

# (as part of starting the service process in the first place).

488

# As such, it has a stream handler that writes to stderr.

489

# logging tries to flush and close that, but the file is already

490

# closed.

491

# This just suppresses that exception.

472

492

stdin_fid, stdout_fid, stderr_fid = self._open_handles(base_path)

473

493

logging.raiseExceptions = False

474

494

sys.stdin.close()

492

512

493

513

def become_child(self, command_argv, path):

494

514

"""We are in the spawned child code, do our magic voodoo."""

495

retcode = 127 # Failed in a bad way, poor cleanup, etc.

515

retcode = 127 # Failed in a bad way, poor cleanup, etc.

496

516

try:

497

517

# Stop tracking new signals

498

518

self._unregister_signals()

503

523

self._bind_child_file_descriptors(path)

504

524

retcode = self._run_child_command(command_argv)

505

525

finally:

506

# We force os._exit() here, because we don't want to unwind the

507

# stack, which has complex results. (We can get it to unwind back

508

# to the cmd_launchpad_forking_service code, and even back to

509

# main() reporting thereturn code, but after that, suddenly the

510

# return code changes from a '0' to a '1', with no logging of info.

526

# We force os._exit() here, because we don't want to unwind

527

# the stack, which has complex results. (We can get it to

528

# unwind back to the cmd_launchpad_forking_service code, and

529

# even back to main() reporting the return code, but after

530

# that, suddenly the return code changes from a '0' to a

531

# '1', with no logging of info.)

511

532

os._exit(retcode)

512

533

513

534

def _run_child_command(self, command_argv):

514

535

# This is the point where we would actually want to do something with

515

536

# our life

516

# TODO: We may want to consider special-casing the 'lp-serve' command.

517

# As that is the primary use-case for this service, it might be

518

# interesting to have an already-instantiated instance, where we

519

# can just pop on an extra argument and be ready to go. However,

520

# that would probably only really be measurable if we prefork. As

521

# it looks like ~200ms is 'fork()' time, but only 50ms is

522

# run-the-command time.

537

# TODO: We may want to consider special-casing the 'lp-serve'

538

# command. As that is the primary use-case for this service, it

539

# might be interesting to have an already-instantiated instance,

540

# where we can just pop on an extra argument and be ready to go.

541

# However, that would probably only really be measurable if we

542

# prefork. As it looks like ~200ms is 'fork()' time, but only

543

# 50ms is run-the-command time.

523

544

retcode = commands.run_bzr_catch_errors(command_argv)

524

545

self._close_child_file_descriptors()

525

546

trace.mutter('%d finished %r'

561

582

def fork_one_request(self, conn, client_addr, command_argv, env):

562

583

"""Fork myself and serve a request."""

563

584

temp_name = tempfile.mkdtemp(prefix='lp-forking-service-child-')

564

# Now that we've set everything up, send the response to the client we

565

# create them first, so the client can start trying to connect to them,

566

# while we fork and have the child do the same.

585

# Now that we've set everything up, send the response to the

586

# client we create them first, so the client can start trying to

587

# connect to them, while we fork and have the child do the same.

567

588

self._children_spawned += 1

568

589

pid = self._fork_function()

569

590

if pid == 0:

610

631

try:

611

632

conn, client_addr = self._server_socket.accept()

612

633

except self._socket_timeout:

613

pass # run shutdown and children checks

634

pass # Run shutdown and children checks.

614

635

except self._socket_error, e:

615

636

if e.args[0] == errno.EINTR:

616

pass # run shutdown and children checks

637

pass # Run shutdown and children checks.

617

638

elif e.args[0] != errno.EBADF:

618

639

# We can get EBADF here while we are shutting down

619

640

# So we just ignore it for now

623

644

trace.warning("listening socket error: %s", e)

624

645

else:

625

646

self.log(client_addr, 'connected')

626

# TODO: We should probably trap exceptions coming out of this

627

# and log them, so that we don't kill the service because

628

# of an unhandled error

629

# Note: settimeout is used so that a malformed request doesn't

630

# cause us to hang forever. Note that the particular

631

# implementation means that a malicious client could

632

# probably send us one byte every Xms, and we would just

633

# keep trying to read it. However, as a local service, we

634

# aren't worrying about it.

647

# TODO: We should probably trap exceptions coming out of

648

# this and log them, so that we don't kill the service

649

# because of an unhandled error.

650

# Note: settimeout is used so that a malformed request

651

# doesn't cause us to hang forever. Also note that the

652

# particular implementation means that a malicious

653

# client could probably send us one byte every once in a

654

# while, and we would just keep trying to read it.

655

# However, as a local service, we aren't worrying about

656

# it.

635

657

conn.settimeout(self.WAIT_FOR_REQUEST_TIMEOUT)

636

658

try:

637

659

self.serve_one_connection(conn, client_addr)

638

except self._socket_timeout, e:

660

except self._socket_timeout as e:

639

661

trace.log_exception_quietly()

640

self.log(client_addr, 'request timeout failure: %s' % (e,))

662

self.log(

663

client_addr, 'request timeout failure: %s' % (e,))

641

664

conn.sendall('FAILURE\nrequest timed out\n')

642

665

conn.close()

643

except Exception, e:

666

except Exception as e:

644

667

trace.log_exception_quietly()

645

668

self.log(client_addr, 'trapped a failure while handling'

646

669

' connection: %s' % (e,))

764

787

command, env = request[9:].split('\n', 1)

765

788

else:

766

789

command = request[5:].strip()

767

env = 'end\n' # No env set

790

env = 'end\n' # No env set.

768

791

try:

769

792

command_argv = self.command_to_argv(command)

770

793

env = self.parse_env(env)

826

849

class cmd_launchpad_forking_service(Command):

827

850

"""Launch a long-running process, where you can ask for new processes.

828

851

829

The process will block on a given AF_UNIX socket waiting for requests to be

830

made. When a request is made, it will fork itself and redirect

852

The process will block on a given AF_UNIX socket waiting for requests to

853

be made. When a request is made, it will fork itself and redirect

831

854

stdout/in/err to fifos on the filesystem, and start running the requested

832

command. The caller will be informed where those file handles can be found.

833

Thus it only makes sense that the process connecting to the port must be on

834

the same system.

855

command. The caller will be informed where those file handles can be

856

found. Thus it only makes sense that the process connecting to the port

857

must be on the same system.

835

858

"""

836

859

837

860

aliases = ['lp-service']

857

880

except ImportError, e:

858

881

trace.mutter('failed to preload %s: %s' % (pyname, e))

859

882

860

861

883

def _daemonize(self, pid_filename):

862

884

"""Turn this process into a child-of-init daemon.

863

885

947

969

948

970

register_command(cmd_launchpad_replay)

949

971

950

# This list was generated by run lsprofing a spawned child, and looking for

951

# <module ...> times, which indicate an import occured. Other possibilities are

952

# to just run "bzr lp-serve --profile-imports" manually, and observe what was

953

# expensive to import. It doesn't seem very easy to get this right

954

# automatically.

972

# This list was generated by "run lsprof"ing a spawned child, and

973

# looking for <module ...> times, which indicate that an import

974

# occurred. Another option is to run "bzr lp-serve --profile-imports"

975

# manually, and observe what was expensive to import. It doesn't seem

976

# very easy to get this right automatically.

955

977

libraries_to_preload = [

956

978

'bzrlib.errors',

957

979

'bzrlib.repofmt.groupcompress_repo',