111
112
# algorithm for polling.
112
113
SCAN_INTERVAL = 15
114
def __init__(self, builder_name, logger):
115
def __init__(self, builder_name, logger, clock=None):
115
116
self.builder_name = builder_name
116
117
self.logger = logger
118
122
def startCycle(self):
    """Scan the builder and dispatch to it or deal with failures.

    Builds a ``LoopingCall`` around ``self.singleCycle``, points it at
    ``self._clock`` and starts it with ``self.SCAN_INTERVAL`` as the
    interval.

    :return: whatever ``self.loop.start()`` returns; the same object is
        kept on ``self.stopping_deferred``.
    """
    # NOTE(review): LoopingCall is presumably twisted.internet.task's;
    # its import is not visible in this part of the file -- confirm.
    self.loop = LoopingCall(self.singleCycle)
    # self._clock is assigned outside this method (not visible here).
    # Overriding the loop's clock lets the caller decide which clock
    # drives the polling.
    self.loop.clock = self._clock
    self.stopping_deferred = self.loop.start(self.SCAN_INTERVAL)
    return self.stopping_deferred
138
143
1. Print the error in the log
139
144
2. Increment and assess failure counts on the builder and job.
141
# Make sure that pending database updates are removed as it
142
# could leave the database in an inconsistent state (e.g. The
143
# job says it's running but the buildqueue has no builder set).
146
# Since this is a failure path, we could be in a broken
147
# transaction. Get us a fresh one.
144
148
transaction.abort()
146
150
# If we don't recognise the exception include a stack trace with
148
152
error_message = failure.getErrorMessage()
153
familiar_error = failure.check(
150
154
BuildSlaveFailure, CannotBuild, BuildBehaviorMismatch,
151
CannotResumeHost, BuildDaemonError, CannotFetchFile):
152
self.logger.info("Scanning %s failed with: %s" % (
153
self.builder_name, error_message))
155
CannotResumeHost, BuildDaemonError, CannotFetchFile)
158
"Scanning %s failed with: %s",
159
self.builder_name, error_message)
155
self.logger.info("Scanning %s failed with: %s\n%s" % (
162
"Scanning %s failed with: %s\n%s",
156
163
self.builder_name, failure.getErrorMessage(),
157
failure.getTraceback()))
164
failure.getTraceback())
159
166
# Decide if we need to terminate the job or fail the
162
169
builder = get_builder(self.builder_name)
164
if builder.currentjob is not None:
165
build_farm_job = builder.getCurrentBuildFarmJob()
166
build_farm_job.gotFailure()
168
"builder %s failure count: %s, "
169
"job '%s' failure count: %s" % (
172
with DatabaseTransactionPolicy(read_only=False):
175
if builder.currentjob is None:
177
"Builder %s failed a probe, count: %s",
178
self.builder_name, builder.failure_count)
180
build_farm_job = builder.getCurrentBuildFarmJob()
181
build_farm_job.gotFailure()
183
"builder %s failure count: %s, "
184
"job '%s' failure count: %s",
170
185
self.builder_name,
171
186
builder.failure_count,
172
187
build_farm_job.title,
173
build_farm_job.failure_count))
176
"Builder %s failed a probe, count: %s" % (
177
self.builder_name, builder.failure_count))
178
assessFailureCounts(builder, failure.getErrorMessage())
188
build_farm_job.failure_count)
190
assessFailureCounts(builder, failure.getErrorMessage())
181
193
# Catastrophic code failure! Not much we can do.
182
195
self.logger.error(
183
196
"Miserable failure when trying to examine failure counts:\n",
187
199
def checkCancellation(self, builder):
188
200
"""See if there is a pending cancellation request.
237
249
# We need to re-fetch the builder object on each cycle as the
238
250
# Storm store is invalidated over transaction boundaries.
240
251
self.builder = get_builder(self.builder_name)
242
253
def status_updated(ignored):
243
# Commit the changes done while possibly rescuing jobs, to
244
# avoid holding table locks.
247
254
# See if we think there's an active build on the builder.
248
255
buildqueue = self.builder.getBuildQueue()
253
260
return self.builder.updateBuild(buildqueue)
255
262
def build_updated(ignored):
256
# Commit changes done while updating the build, to avoid
257
# holding table locks.
260
263
# If the builder is in manual mode, don't dispatch anything.
261
264
if self.builder.manual:
262
265
self.logger.debug(
263
'%s is in manual mode, not dispatching.' %
266
'%s is in manual mode, not dispatching.',
264
267
self.builder.name)
278
281
job = self.builder.currentjob
279
282
if job is not None and not self.builder.builderok:
280
283
self.logger.info(
281
"%s was made unavailable, resetting attached "
282
"job" % self.builder.name)
284
"%s was made unavailable; resetting attached job.",
284
286
transaction.commit()
287
with DatabaseTransactionPolicy(read_only=False):
287
292
# See if there is a job we can dispatch to the builder slave.
294
# XXX JeroenVermeulen 2011-10-11, bug=872112: The job's
295
# failure count will be reset once the job has started
296
# successfully. Because of intervening commits, you may see
297
# a build with a nonzero failure count that's actually going
298
# to succeed later (and have a failure count of zero). Or
299
# it may fail yet end up with a lower failure count than you
289
301
d = self.builder.findAndStartJob()
291
303
def job_started(candidate):
292
304
if self.builder.currentjob is not None:
293
305
# After a successful dispatch we can reset the
295
self.builder.resetFailureCount()
296
307
transaction.commit()
308
with DatabaseTransactionPolicy(read_only=False):
309
self.builder.resetFailureCount()
297
311
return self.builder.slave
390
405
logger.setLevel(level)
408
def enterReadOnlyDatabasePolicy(self):
409
"""Set the database transaction policy to read-only.
411
Any previously pending changes are committed first.
414
self.transaction_policy.__enter__()
416
def exitReadOnlyDatabasePolicy(self, *args):
    """Reset database transaction policy to the default read-write.

    Leaves the policy held in ``self.transaction_policy`` by calling its
    ``__exit__`` directly.

    :param args: accepted and ignored, so the method can be invoked with
        arbitrary positional arguments.
    """
    # Always report "no exception" to the policy, whatever was passed in.
    self.transaction_policy.__exit__(None, None, None)
393
420
def startService(self):
394
421
"""Service entry point, called when the application starts."""
422
# Avoiding circular imports.
423
from lp.buildmaster.interfaces.builder import IBuilderSet
425
self.enterReadOnlyDatabasePolicy()
396
427
# Get a list of builders and set up scanners on each one.
398
# Avoiding circular imports.
399
from lp.buildmaster.interfaces.builder import IBuilderSet
400
builder_set = getUtility(IBuilderSet)
401
builders = [builder.name for builder in builder_set]
402
self.addScanForBuilders(builders)
428
self.addScanForBuilders(
429
[builder.name for builder in getUtility(IBuilderSet)])
403
430
self.new_builders_scanner.scheduleScan()
405
432
# Events will now fire in the SlaveScanner objects to scan each