~launchpad-pqm/launchpad/devel

Viewing changes to lib/lp/buildmaster/manager.py

Committer: Launchpad Patch Queue Manager
Date: 2012-01-05 16:28:40 UTC
mfrom: (14542.2.34 builder-rescue-if-lost-ro-crash-bug-906079)
Revision ID: launchpad@pqm.canonical.com-20120105162840-mtzomjr372d7xz6t

[r=allenap, bac, gmb, julian-edwards, wallyworld][bug=905853, 905855,
906079] In buildmaster, always shift into a read-write database
transaction access mode before updating PackageBuild statuses. Shift into
read-write transactions in the appropriate places in
TranslationTemplatesBuildBehavior. Ensure that all lp.buildmaster tests
for which it is relevant run with BuilddManagerTestFixture.

files added:
lib/lp/buildmaster/testing.py

files modified:
lib/lp/archiveuploader/tests/test_uploadprocessor.py

lib/lp/buildmaster/interfaces/builder.py

lib/lp/buildmaster/manager.py

lib/lp/buildmaster/model/builder.py

lib/lp/buildmaster/model/buildfarmjobbehavior.py

lib/lp/buildmaster/model/packagebuild.py

lib/lp/buildmaster/tests/test_builder.py

lib/lp/buildmaster/tests/test_manager.py

lib/lp/buildmaster/tests/test_packagebuild.py

lib/lp/code/model/tests/test_sourcepackagerecipebuild.py

lib/lp/services/database/tests/test_transaction_policy.py

lib/lp/services/database/transaction_policy.py

lib/lp/soyuz/tests/test_binarypackagebuild.py

lib/lp/translations/model/translationtemplatesbuildbehavior.py

lib/lp/translations/tests/test_translationtemplatesbuildbehavior.py

Show diffs side-by-side

added added

removed removed

lib/lp/buildmaster/manager.py

# GNU Affero General Public License version 3 (see the file LICENSE).

"""Soyuz buildd slave manager logic."""

BuildBehaviorMismatch,

)

from lp.buildmaster.model.builder import Builder

from lp.services.database.transaction_policy import DatabaseTransactionPolicy

BUILDD_MANAGER_LOG_NAME = "slave-scanner"

111

112

# algorithm for polling.

112

113

SCAN_INTERVAL = 15

113

114

def __init__(self, builder_name, logger):

115

def __init__(self, builder_name, logger, clock=None):

115

116

self.builder_name = builder_name

116

117

self.logger = logger

118

if clock is None:

119

clock = reactor

120

self._clock = clock

117

121

118

122

def startCycle(self):

119

123

"""Scan the builder and dispatch to it or deal with failures."""

120

124

self.loop = LoopingCall(self.singleCycle)

125

self.loop.clock = self._clock

121

126

self.stopping_deferred = self.loop.start(self.SCAN_INTERVAL)

122

127

return self.stopping_deferred

123

128

138

143

1. Print the error in the log

139

144

2. Increment and assess failure counts on the builder and job.

140

145

"""

141

# Make sure that pending database updates are removed as it

142

# could leave the database in an inconsistent state (e.g. the

143

# job says it's running but the buildqueue has no builder set).

146

# Since this is a failure path, we could be in a broken

147

# transaction. Get us a fresh one.

144

148

transaction.abort()

145

149

146

150

# If we don't recognise the exception include a stack trace with

147

151

# the error.

148

152

error_message = failure.getErrorMessage()

149

if failure.check(

153

familiar_error = failure.check(

150

154

BuildSlaveFailure, CannotBuild, BuildBehaviorMismatch,

151

CannotResumeHost, BuildDaemonError, CannotFetchFile):

152

self.logger.info("Scanning %s failed with: %s" % (

153

self.builder_name, error_message))

155

CannotResumeHost, BuildDaemonError, CannotFetchFile)

156

if familiar_error:

157

self.logger.info(

158

"Scanning %s failed with: %s",

159

self.builder_name, error_message)

154

160

else:

155

self.logger.info("Scanning %s failed with: %s\n%s" % (

161

self.logger.info(

162

"Scanning %s failed with: %s\n%s",

156

163

self.builder_name, failure.getErrorMessage(),

157

failure.getTraceback()))

164

failure.getTraceback())

158

165

159

166

# Decide if we need to terminate the job or fail the

160

167

# builder.

161

168

try:

162

169

builder = get_builder(self.builder_name)

163

builder.gotFailure()

164

if builder.currentjob is not None:

165

build_farm_job = builder.getCurrentBuildFarmJob()

166

build_farm_job.gotFailure()

167

self.logger.info(

168

"builder %s failure count: %s, "

169

"job '%s' failure count: %s" % (

170

transaction.commit()

171

172

with DatabaseTransactionPolicy(read_only=False):

173

builder.gotFailure()

174

175

if builder.currentjob is None:

176

self.logger.info(

177

"Builder %s failed a probe, count: %s",

178

self.builder_name, builder.failure_count)

179

else:

180

build_farm_job = builder.getCurrentBuildFarmJob()

181

build_farm_job.gotFailure()

182

self.logger.info(

183

"builder %s failure count: %s, "

184

"job '%s' failure count: %s",

170

185

self.builder_name,

171

186

builder.failure_count,

172

187

build_farm_job.title,

173

build_farm_job.failure_count))

174

else:

175

self.logger.info(

176

"Builder %s failed a probe, count: %s" % (

177

self.builder_name, builder.failure_count))

178

assessFailureCounts(builder, failure.getErrorMessage())

179

transaction.commit()

188

build_farm_job.failure_count)

189

190

assessFailureCounts(builder, failure.getErrorMessage())

191

transaction.commit()

180

192

except:

181

193

# Catastrophic code failure! Not much we can do.

194

transaction.abort()

182

195

self.logger.error(

183

196

"Miserable failure when trying to examine failure counts:\n",

184

197

exc_info=True)

185

transaction.abort()

186

198

187

199

def checkCancellation(self, builder):

188

200

"""See if there is a pending cancellation request.

205

217

return defer.succeed(True)

206

218

207

219

self.logger.info("Cancelling build '%s'" % build.title)

208

buildqueue.cancel()

209

transaction.commit()

220

with DatabaseTransactionPolicy(read_only=False):

221

buildqueue.cancel()

222

transaction.commit()

210

223

d = builder.resumeSlaveHost()

211

224

d.addCallback(resume_done)

212

225

return d

236

249

"""

237

250

# We need to re-fetch the builder object on each cycle as the

238

251

# Storm store is invalidated over transaction boundaries.

239

240

252

self.builder = get_builder(self.builder_name)

241

253

242

254

def status_updated(ignored):

243

# Commit the changes done while possibly rescuing jobs, to

244

# avoid holding table locks.

245

transaction.commit()

246

247

255

# See if we think there's an active build on the builder.

248

256

buildqueue = self.builder.getBuildQueue()

249

257

253

261

return self.builder.updateBuild(buildqueue)

254

262

255

263

def build_updated(ignored):

256

# Commit changes done while updating the build, to avoid

257

# holding table locks.

258

transaction.commit()

259

260

264

# If the builder is in manual mode, don't dispatch anything.

261

265

if self.builder.manual:

262

266

self.logger.debug(

263

'%s is in manual mode, not dispatching.' %

267

'%s is in manual mode, not dispatching.',

264

268

self.builder.name)

265

269

return

266

270

278

282

job = self.builder.currentjob

279

283

if job is not None and not self.builder.builderok:

280

284

self.logger.info(

281

"%s was made unavailable, resetting attached "

282

"job" % self.builder.name)

283

job.reset()

285

"%s was made unavailable; resetting attached job.",

286

self.builder.name)

284

287

transaction.commit()

288

with DatabaseTransactionPolicy(read_only=False):

289

job.reset()

290

transaction.commit()

285

291

return

286

292

287

293

# See if there is a job we can dispatch to the builder slave.

288

294

295

# XXX JeroenVermeulen 2011-10-11, bug=872112: The job's

296

# failure count will be reset once the job has started

297

# successfully. Because of intervening commits, you may see

298

# a build with a nonzero failure count that's actually going

299

# to succeed later (and have a failure count of zero). Or

300

# it may fail yet end up with a lower failure count than you

301

# saw earlier.

289

302

d = self.builder.findAndStartJob()

290

303

291

304

def job_started(candidate):

292

305

if self.builder.currentjob is not None:

293

306

# After a successful dispatch we can reset the

294

307

# failure_count.

295

self.builder.resetFailureCount()

296

308

transaction.commit()

309

with DatabaseTransactionPolicy(read_only=False):

310

self.builder.resetFailureCount()

311

transaction.commit()

297

312

return self.builder.slave

298

313

else:

299

314

return None

372

387

self.logger = self._setupLogger()

373

388

self.new_builders_scanner = NewBuildersScanner(

374

389

manager=self, clock=clock)

390

self.transaction_policy = DatabaseTransactionPolicy(read_only=True)

375

391

376

392

def _setupLogger(self):

377

393

"""Set up a 'slave-scanner' logger that redirects to twisted.

390

406

logger.setLevel(level)

391

407

return logger

392

408

409

def enterReadOnlyDatabasePolicy(self):

410

"""Set the database transaction policy to read-only.

411

412

Any previously pending changes are committed first.

413

"""

414

transaction.commit()

415

self.transaction_policy.__enter__()

416

417

def exitReadOnlyDatabasePolicy(self, *args):

418

"""Reset database transaction policy to the default read-write."""

419

self.transaction_policy.__exit__(None, None, None)

420

393

421

def startService(self):

394

422

"""Service entry point, called when the application starts."""

423

# Avoiding circular imports.

424

from lp.buildmaster.interfaces.builder import IBuilderSet

425

426

self.enterReadOnlyDatabasePolicy()

395

427

396

428

# Get a list of builders and set up scanners on each one.

397

398

# Avoiding circular imports.

399

from lp.buildmaster.interfaces.builder import IBuilderSet

400

builder_set = getUtility(IBuilderSet)

401

builders = [builder.name for builder in builder_set]

402

self.addScanForBuilders(builders)

429

self.addScanForBuilders(

430

[builder.name for builder in getUtility(IBuilderSet)])

403

431

self.new_builders_scanner.scheduleScan()

404

432

405

433

# Events will now fire in the SlaveScanner objects to scan each

420

448

# stopped, so we can wait on them all at once here before

421

449

# exiting.

422

450

d = defer.DeferredList(deferreds, consumeErrors=True)

451

d.addCallback(self.exitReadOnlyDatabasePolicy)

423

452

return d

424

453

425

454

def addScanForBuilders(self, builders):

Older »