115
116
# algorithm for polling.
116
117
SCAN_INTERVAL = 15
118
def __init__(self, builder_name, logger):
119
def __init__(self, builder_name, logger, clock=None):
119
120
self.builder_name = builder_name
120
121
self.logger = logger
122
126
def startCycle(self):
    """Scan the builder and dispatch to it or deal with failures.

    Creates a `LoopingCall` around `self.singleCycle`, stores it on
    `self.loop`, and starts it with an interval of `self.SCAN_INTERVAL`.

    :return: The Deferred returned by `LoopingCall.start`.  The same
        object is kept on `self.stopping_deferred`.
    """
    self.loop = LoopingCall(self.singleCycle)
    # Drive the loop from our own clock instead of LoopingCall's default.
    # NOTE(review): `self._clock` is presumably set in `__init__` from the
    # `clock` argument -- that assignment is not visible here; confirm.
    self.loop.clock = self._clock
    self.stopping_deferred = self.loop.start(self.SCAN_INTERVAL)
    return self.stopping_deferred
142
147
1. Print the error in the log
143
148
2. Increment and assess failure counts on the builder and job.
145
# Make sure that pending database updates are removed as it
146
# could leave the database in an inconsistent state (e.g. The
147
# job says it's running but the buildqueue has no builder set).
150
# Since this is a failure path, we could be in a broken
151
# transaction. Get us a fresh one.
148
152
transaction.abort()
150
154
# If we don't recognise the exception include a stack trace with
152
156
error_message = failure.getErrorMessage()
157
familiar_error = failure.check(
154
158
BuildSlaveFailure, CannotBuild, BuildBehaviorMismatch,
155
CannotResumeHost, BuildDaemonError, CannotFetchFile):
156
self.logger.info("Scanning %s failed with: %s" % (
157
self.builder_name, error_message))
159
CannotResumeHost, BuildDaemonError, CannotFetchFile)
162
"Scanning %s failed with: %s",
163
self.builder_name, error_message)
159
self.logger.info("Scanning %s failed with: %s\n%s" % (
166
"Scanning %s failed with: %s\n%s",
160
167
self.builder_name, failure.getErrorMessage(),
161
failure.getTraceback()))
168
failure.getTraceback())
163
170
# Decide if we need to terminate the job or fail the
166
173
builder = get_builder(self.builder_name)
168
if builder.currentjob is not None:
169
build_farm_job = builder.getCurrentBuildFarmJob()
170
build_farm_job.gotFailure()
172
"builder %s failure count: %s, "
173
"job '%s' failure count: %s" % (
176
with DatabaseTransactionPolicy(read_only=False):
179
if builder.currentjob is None:
181
"Builder %s failed a probe, count: %s",
182
self.builder_name, builder.failure_count)
184
build_farm_job = builder.getCurrentBuildFarmJob()
185
build_farm_job.gotFailure()
187
"builder %s failure count: %s, "
188
"job '%s' failure count: %s",
174
189
self.builder_name,
175
190
builder.failure_count,
176
191
build_farm_job.title,
177
build_farm_job.failure_count))
180
"Builder %s failed a probe, count: %s" % (
181
self.builder_name, builder.failure_count))
182
assessFailureCounts(builder, failure.getErrorMessage())
192
build_farm_job.failure_count)
194
assessFailureCounts(builder, failure.getErrorMessage())
185
197
# Catastrophic code failure! Not much we can do.
186
199
self.logger.error(
187
200
"Miserable failure when trying to examine failure counts:\n",
191
203
def checkCancellation(self, builder):
192
204
"""See if there is a pending cancellation request.
241
254
# We need to re-fetch the builder object on each cycle as the
242
255
# Storm store is invalidated over transaction boundaries.
244
256
self.builder = get_builder(self.builder_name)
246
258
def status_updated(ignored):
247
# Commit the changes done while possibly rescuing jobs, to
248
# avoid holding table locks.
251
259
# See if we think there's an active build on the builder.
252
260
buildqueue = self.builder.getBuildQueue()
257
265
return self.builder.updateBuild(buildqueue)
259
267
def build_updated(ignored):
260
# Commit changes done while updating the build, to avoid
261
# holding table locks.
264
268
# If the builder is in manual mode, don't dispatch anything.
265
269
if self.builder.manual:
266
270
self.logger.debug(
267
'%s is in manual mode, not dispatching.' %
271
'%s is in manual mode, not dispatching.',
268
272
self.builder.name)
282
286
job = self.builder.currentjob
283
287
if job is not None and not self.builder.builderok:
284
288
self.logger.info(
285
"%s was made unavailable, resetting attached "
286
"job" % self.builder.name)
289
"%s was made unavailable; resetting attached job.",
288
291
transaction.commit()
292
with DatabaseTransactionPolicy(read_only=False):
291
297
# See if there is a job we can dispatch to the builder slave.
299
# XXX JeroenVermeulen 2011-10-11, bug=872112: The job's
300
# failure count will be reset once the job has started
301
# successfully. Because of intervening commits, you may see
302
# a build with a nonzero failure count that's actually going
303
# to succeed later (and have a failure count of zero). Or
304
# it may fail yet end up with a lower failure count than you
293
306
d = self.builder.findAndStartJob()
295
308
def job_started(candidate):
296
309
if self.builder.currentjob is not None:
297
310
# After a successful dispatch we can reset the
299
self.builder.resetFailureCount()
300
312
transaction.commit()
313
with DatabaseTransactionPolicy(read_only=False):
314
self.builder.resetFailureCount()
301
316
return self.builder.slave
394
410
logger.setLevel(level)
413
def enterReadOnlyDatabasePolicy(self):
414
"""Set the database transaction policy to read-only.
416
Any previously pending changes are committed first.
419
self.transaction_policy.__enter__()
421
def exitReadOnlyDatabasePolicy(self, *args):
    """Reset database transaction policy to the default read-write.

    Exits the policy object held in `self.transaction_policy` by calling
    its `__exit__` directly.

    :param args: Ignored.  Accepting arbitrary positional arguments lets
        this be invoked with extra values (presumably as a callback --
        confirm against callers).
    """
    # Always report "no exception" to the policy, whatever we were passed.
    self.transaction_policy.__exit__(None, None, None)
397
425
def startService(self):
398
426
"""Service entry point, called when the application starts."""
427
# Avoiding circular imports.
428
from lp.buildmaster.interfaces.builder import IBuilderSet
430
self.enterReadOnlyDatabasePolicy()
400
432
# Get a list of builders and set up scanners on each one.
402
# Avoiding circular imports.
403
from lp.buildmaster.interfaces.builder import IBuilderSet
404
builder_set = getUtility(IBuilderSet)
405
builders = [builder.name for builder in builder_set]
406
self.addScanForBuilders(builders)
433
self.addScanForBuilders(
434
[builder.name for builder in getUtility(IBuilderSet)])
407
435
self.new_builders_scanner.scheduleScan()
409
437
# Events will now fire in the SlaveScanner objects to scan each