~launchpad-pqm/launchpad/devel

« back to all changes in this revision

Viewing changes to lib/lp/buildmaster/manager.py

  • Committer: Raphael Badin
  • Date: 2012-01-06 08:27:55 UTC
  • mfrom: (14513.5.4 builder-history-lfa)
  • mto: This revision was merged to the branch mainline in revision 14654.
  • Revision ID: raphael.badin@canonical.com-20120106082755-95a0eh6nakv5hj3b
Merge devel.

Show diffs side-by-side

added

removed

Lines of Context:
35
35
    )
36
36
from lp.buildmaster.model.builder import Builder
37
37
from lp.services.propertycache import get_property_cache
 
38
from lp.services.database.transaction_policy import DatabaseTransactionPolicy
38
39
 
39
40
 
40
41
BUILDD_MANAGER_LOG_NAME = "slave-scanner"
115
116
    # algorithm for polling.
116
117
    SCAN_INTERVAL = 15
117
118
 
118
 
    def __init__(self, builder_name, logger):
 
119
    def __init__(self, builder_name, logger, clock=None):
119
120
        self.builder_name = builder_name
120
121
        self.logger = logger
 
122
        if clock is None:
 
123
            clock = reactor
 
124
        self._clock = clock
121
125
 
122
126
    def startCycle(self):
123
127
        """Scan the builder and dispatch to it or deal with failures."""
124
128
        self.loop = LoopingCall(self.singleCycle)
 
129
        self.loop.clock = self._clock
125
130
        self.stopping_deferred = self.loop.start(self.SCAN_INTERVAL)
126
131
        return self.stopping_deferred
127
132
 
142
147
        1. Print the error in the log
143
148
        2. Increment and assess failure counts on the builder and job.
144
149
        """
145
 
        # Make sure that pending database updates are removed as it
146
 
        # could leave the database in an inconsistent state (e.g. The
147
 
        # job says it's running but the buildqueue has no builder set).
 
150
        # Since this is a failure path, we could be in a broken
 
151
        # transaction.  Get us a fresh one.
148
152
        transaction.abort()
149
153
 
150
154
        # If we don't recognise the exception include a stack trace with
151
155
        # the error.
152
156
        error_message = failure.getErrorMessage()
153
 
        if failure.check(
 
157
        familiar_error = failure.check(
154
158
            BuildSlaveFailure, CannotBuild, BuildBehaviorMismatch,
155
 
            CannotResumeHost, BuildDaemonError, CannotFetchFile):
156
 
            self.logger.info("Scanning %s failed with: %s" % (
157
 
                self.builder_name, error_message))
 
159
            CannotResumeHost, BuildDaemonError, CannotFetchFile)
 
160
        if familiar_error:
 
161
            self.logger.info(
 
162
                "Scanning %s failed with: %s",
 
163
                self.builder_name, error_message)
158
164
        else:
159
 
            self.logger.info("Scanning %s failed with: %s\n%s" % (
 
165
            self.logger.info(
 
166
                "Scanning %s failed with: %s\n%s",
160
167
                self.builder_name, failure.getErrorMessage(),
161
 
                failure.getTraceback()))
 
168
                failure.getTraceback())
162
169
 
163
170
        # Decide if we need to terminate the job or fail the
164
171
        # builder.
165
172
        try:
166
173
            builder = get_builder(self.builder_name)
167
 
            builder.gotFailure()
168
 
            if builder.currentjob is not None:
169
 
                build_farm_job = builder.getCurrentBuildFarmJob()
170
 
                build_farm_job.gotFailure()
171
 
                self.logger.info(
172
 
                    "builder %s failure count: %s, "
173
 
                    "job '%s' failure count: %s" % (
 
174
            transaction.commit()
 
175
 
 
176
            with DatabaseTransactionPolicy(read_only=False):
 
177
                builder.gotFailure()
 
178
 
 
179
                if builder.currentjob is None:
 
180
                    self.logger.info(
 
181
                        "Builder %s failed a probe, count: %s",
 
182
                        self.builder_name, builder.failure_count)
 
183
                else:
 
184
                    build_farm_job = builder.getCurrentBuildFarmJob()
 
185
                    build_farm_job.gotFailure()
 
186
                    self.logger.info(
 
187
                        "builder %s failure count: %s, "
 
188
                        "job '%s' failure count: %s",
174
189
                        self.builder_name,
175
190
                        builder.failure_count,
176
191
                        build_farm_job.title,
177
 
                        build_farm_job.failure_count))
178
 
            else:
179
 
                self.logger.info(
180
 
                    "Builder %s failed a probe, count: %s" % (
181
 
                        self.builder_name, builder.failure_count))
182
 
            assessFailureCounts(builder, failure.getErrorMessage())
183
 
            transaction.commit()
 
192
                        build_farm_job.failure_count)
 
193
 
 
194
                assessFailureCounts(builder, failure.getErrorMessage())
 
195
                transaction.commit()
184
196
        except:
185
197
            # Catastrophic code failure! Not much we can do.
 
198
            transaction.abort()
186
199
            self.logger.error(
187
200
                "Miserable failure when trying to examine failure counts:\n",
188
201
                exc_info=True)
189
 
            transaction.abort()
190
202
 
191
203
    def checkCancellation(self, builder):
192
204
        """See if there is a pending cancellation request.
209
221
            return defer.succeed(True)
210
222
 
211
223
        self.logger.info("Cancelling build '%s'" % build.title)
212
 
        buildqueue.cancel()
213
 
        transaction.commit()
 
224
        with DatabaseTransactionPolicy(read_only=False):
 
225
            buildqueue.cancel()
 
226
            transaction.commit()
214
227
        d = builder.resumeSlaveHost()
215
228
        d.addCallback(resume_done)
216
229
        return d
240
253
        """
241
254
        # We need to re-fetch the builder object on each cycle as the
242
255
        # Storm store is invalidated over transaction boundaries.
243
 
 
244
256
        self.builder = get_builder(self.builder_name)
245
257
 
246
258
        def status_updated(ignored):
247
 
            # Commit the changes done while possibly rescuing jobs, to
248
 
            # avoid holding table locks.
249
 
            transaction.commit()
250
 
 
251
259
            # See if we think there's an active build on the builder.
252
260
            buildqueue = self.builder.getBuildQueue()
253
261
 
257
265
                return self.builder.updateBuild(buildqueue)
258
266
 
259
267
        def build_updated(ignored):
260
 
            # Commit changes done while updating the build, to avoid
261
 
            # holding table locks.
262
 
            transaction.commit()
263
 
 
264
268
            # If the builder is in manual mode, don't dispatch anything.
265
269
            if self.builder.manual:
266
270
                self.logger.debug(
267
 
                    '%s is in manual mode, not dispatching.' %
 
271
                    '%s is in manual mode, not dispatching.',
268
272
                    self.builder.name)
269
273
                return
270
274
 
282
286
                job = self.builder.currentjob
283
287
                if job is not None and not self.builder.builderok:
284
288
                    self.logger.info(
285
 
                        "%s was made unavailable, resetting attached "
286
 
                        "job" % self.builder.name)
287
 
                    job.reset()
 
289
                        "%s was made unavailable; resetting attached job.",
 
290
                        self.builder.name)
288
291
                    transaction.commit()
 
292
                    with DatabaseTransactionPolicy(read_only=False):
 
293
                        job.reset()
 
294
                        transaction.commit()
289
295
                return
290
296
 
291
297
            # See if there is a job we can dispatch to the builder slave.
292
298
 
 
299
            # XXX JeroenVermeulen 2011-10-11, bug=872112: The job's
 
300
            # failure count will be reset once the job has started
 
301
            # successfully.  Because of intervening commits, you may see
 
302
            # a build with a nonzero failure count that's actually going
 
303
            # to succeed later (and have a failure count of zero).  Or
 
304
            # it may fail yet end up with a lower failure count than you
 
305
            # saw earlier.
293
306
            d = self.builder.findAndStartJob()
294
307
 
295
308
            def job_started(candidate):
296
309
                if self.builder.currentjob is not None:
297
310
                    # After a successful dispatch we can reset the
298
311
                    # failure_count.
299
 
                    self.builder.resetFailureCount()
300
312
                    transaction.commit()
 
313
                    with DatabaseTransactionPolicy(read_only=False):
 
314
                        self.builder.resetFailureCount()
 
315
                        transaction.commit()
301
316
                    return self.builder.slave
302
317
                else:
303
318
                    return None
376
391
        self.logger = self._setupLogger()
377
392
        self.new_builders_scanner = NewBuildersScanner(
378
393
            manager=self, clock=clock)
 
394
        self.transaction_policy = DatabaseTransactionPolicy(read_only=True)
379
395
 
380
396
    def _setupLogger(self):
381
397
        """Set up a 'slave-scanner' logger that redirects to twisted.
394
410
        logger.setLevel(level)
395
411
        return logger
396
412
 
 
413
    def enterReadOnlyDatabasePolicy(self):
 
414
        """Set the database transaction policy to read-only.
 
415
 
 
416
        Any previously pending changes are committed first.
 
417
        """
 
418
        transaction.commit()
 
419
        self.transaction_policy.__enter__()
 
420
 
 
421
    def exitReadOnlyDatabasePolicy(self, *args):
 
422
        """Reset database transaction policy to the default read-write."""
 
423
        self.transaction_policy.__exit__(None, None, None)
 
424
 
397
425
    def startService(self):
398
426
        """Service entry point, called when the application starts."""
 
427
        # Avoiding circular imports.
 
428
        from lp.buildmaster.interfaces.builder import IBuilderSet
 
429
 
 
430
        self.enterReadOnlyDatabasePolicy()
399
431
 
400
432
        # Get a list of builders and set up scanners on each one.
401
 
 
402
 
        # Avoiding circular imports.
403
 
        from lp.buildmaster.interfaces.builder import IBuilderSet
404
 
        builder_set = getUtility(IBuilderSet)
405
 
        builders = [builder.name for builder in builder_set]
406
 
        self.addScanForBuilders(builders)
 
433
        self.addScanForBuilders(
 
434
            [builder.name for builder in getUtility(IBuilderSet)])
407
435
        self.new_builders_scanner.scheduleScan()
408
436
 
409
437
        # Events will now fire in the SlaveScanner objects to scan each
424
452
        # stopped, so we can wait on them all at once here before
425
453
        # exiting.
426
454
        d = defer.DeferredList(deferreds, consumeErrors=True)
 
455
        d.addCallback(self.exitReadOnlyDatabasePolicy)
427
456
        return d
428
457
 
429
458
    def addScanForBuilders(self, builders):