111
112
# algorithm for polling.
112
113
SCAN_INTERVAL = 15
114
def __init__(self, builder_name, logger):
115
def __init__(self, builder_name, logger, clock=None):
115
116
self.builder_name = builder_name
116
117
self.logger = logger
118
122
def startCycle(self):
119
123
"""Scan the builder and dispatch to it or deal with failures."""
120
124
self.loop = LoopingCall(self.singleCycle)
125
self.loop.clock = self._clock
121
126
self.stopping_deferred = self.loop.start(self.SCAN_INTERVAL)
122
127
return self.stopping_deferred
138
143
1. Print the error in the log
139
144
2. Increment and assess failure counts on the builder and job.
141
# Make sure that pending database updates are removed as it
142
# could leave the database in an inconsistent state (e.g. The
143
# job says it's running but the buildqueue has no builder set).
146
# Since this is a failure path, we could be in a broken
147
# transaction. Get us a fresh one.
144
148
transaction.abort()
146
150
# If we don't recognise the exception include a stack trace with
148
152
error_message = failure.getErrorMessage()
153
familiar_error = failure.check(
150
154
BuildSlaveFailure, CannotBuild, BuildBehaviorMismatch,
151
CannotResumeHost, BuildDaemonError, CannotFetchFile):
152
self.logger.info("Scanning %s failed with: %s" % (
153
self.builder_name, error_message))
155
CannotResumeHost, BuildDaemonError, CannotFetchFile)
158
"Scanning %s failed with: %s",
159
self.builder_name, error_message)
155
self.logger.info("Scanning %s failed with: %s\n%s" % (
162
"Scanning %s failed with: %s\n%s",
156
163
self.builder_name, failure.getErrorMessage(),
157
failure.getTraceback()))
164
failure.getTraceback())
159
166
# Decide if we need to terminate the job or fail the
162
169
builder = get_builder(self.builder_name)
164
if builder.currentjob is not None:
165
build_farm_job = builder.getCurrentBuildFarmJob()
166
build_farm_job.gotFailure()
168
"builder %s failure count: %s, "
169
"job '%s' failure count: %s" % (
172
with DatabaseTransactionPolicy(read_only=False):
175
if builder.currentjob is None:
177
"Builder %s failed a probe, count: %s",
178
self.builder_name, builder.failure_count)
180
build_farm_job = builder.getCurrentBuildFarmJob()
181
build_farm_job.gotFailure()
183
"builder %s failure count: %s, "
184
"job '%s' failure count: %s",
170
185
self.builder_name,
171
186
builder.failure_count,
172
187
build_farm_job.title,
173
build_farm_job.failure_count))
176
"Builder %s failed a probe, count: %s" % (
177
self.builder_name, builder.failure_count))
178
assessFailureCounts(builder, failure.getErrorMessage())
188
build_farm_job.failure_count)
190
assessFailureCounts(builder, failure.getErrorMessage())
181
193
# Catastrophic code failure! Not much we can do.
182
195
self.logger.error(
183
196
"Miserable failure when trying to examine failure counts:\n",
187
199
def checkCancellation(self, builder):
188
200
"""See if there is a pending cancellation request.
237
250
# We need to re-fetch the builder object on each cycle as the
238
251
# Storm store is invalidated over transaction boundaries.
240
252
self.builder = get_builder(self.builder_name)
242
254
def status_updated(ignored):
243
# Commit the changes done while possibly rescuing jobs, to
244
# avoid holding table locks.
247
255
# See if we think there's an active build on the builder.
248
256
buildqueue = self.builder.getBuildQueue()
253
261
return self.builder.updateBuild(buildqueue)
255
263
def build_updated(ignored):
256
# Commit changes done while updating the build, to avoid
257
# holding table locks.
260
264
# If the builder is in manual mode, don't dispatch anything.
261
265
if self.builder.manual:
262
266
self.logger.debug(
263
'%s is in manual mode, not dispatching.' %
267
'%s is in manual mode, not dispatching.',
264
268
self.builder.name)
278
282
job = self.builder.currentjob
279
283
if job is not None and not self.builder.builderok:
280
284
self.logger.info(
281
"%s was made unavailable, resetting attached "
282
"job" % self.builder.name)
285
"%s was made unavailable; resetting attached job.",
284
287
transaction.commit()
288
with DatabaseTransactionPolicy(read_only=False):
287
293
# See if there is a job we can dispatch to the builder slave.
295
# XXX JeroenVermeulen 2011-10-11, bug=872112: The job's
296
# failure count will be reset once the job has started
297
# successfully. Because of intervening commits, you may see
298
# a build with a nonzero failure count that's actually going
299
# to succeed later (and have a failure count of zero). Or
300
# it may fail yet end up with a lower failure count than you
289
302
d = self.builder.findAndStartJob()
291
304
def job_started(candidate):
292
305
if self.builder.currentjob is not None:
293
306
# After a successful dispatch we can reset the
295
self.builder.resetFailureCount()
296
308
transaction.commit()
309
with DatabaseTransactionPolicy(read_only=False):
310
self.builder.resetFailureCount()
297
312
return self.builder.slave
390
406
logger.setLevel(level)
409
def enterReadOnlyDatabasePolicy(self):
410
"""Set the database transaction policy to read-only.
412
Any previously pending changes are committed first.
415
self.transaction_policy.__enter__()
417
def exitReadOnlyDatabasePolicy(self, *args):
418
"""Reset database transaction policy to the default read-write."""
419
self.transaction_policy.__exit__(None, None, None)
393
421
def startService(self):
394
422
"""Service entry point, called when the application starts."""
423
# Avoiding circular imports.
424
from lp.buildmaster.interfaces.builder import IBuilderSet
426
self.enterReadOnlyDatabasePolicy()
396
428
# Get a list of builders and set up scanners on each one.
398
# Avoiding circular imports.
399
from lp.buildmaster.interfaces.builder import IBuilderSet
400
builder_set = getUtility(IBuilderSet)
401
builders = [builder.name for builder in builder_set]
402
self.addScanForBuilders(builders)
429
self.addScanForBuilders(
430
[builder.name for builder in getUtility(IBuilderSet)])
403
431
self.new_builders_scanner.scheduleScan()
405
433
# Events will now fire in the SlaveScanner objects to scan each