~launchpad-pqm/launchpad/devel

« back to all changes in this revision

Viewing changes to lib/lp/buildmaster/manager.py

  • Committer: Launchpad Patch Queue Manager
  • Date: 2011-12-09 07:52:37 UTC
  • mfrom: (14047.3.42 bug-717969)
  • Revision ID: launchpad@pqm.canonical.com-20111209075237-4u2pxpo653uecm4i
[r=bigjools][bug=717969] Read-only transactions by default in buildmaster.

Show diffs side-by-side

added

removed

Lines of Context:
1
 
# Copyright 2009 Canonical Ltd.  This software is licensed under the
 
1
# Copyright 2009-2011 Canonical Ltd.  This software is licensed under the
2
2
# GNU Affero General Public License version 3 (see the file LICENSE).
3
3
 
4
4
"""Soyuz buildd slave manager logic."""
23
23
from zope.component import getUtility
24
24
 
25
25
from lp.buildmaster.enums import BuildStatus
26
 
from lp.buildmaster.interfaces.buildfarmjobbehavior import (
27
 
    BuildBehaviorMismatch,
28
 
    )
29
 
from lp.buildmaster.model.builder import Builder
30
26
from lp.buildmaster.interfaces.builder import (
31
27
    BuildDaemonError,
32
28
    BuildSlaveFailure,
34
30
    CannotFetchFile,
35
31
    CannotResumeHost,
36
32
    )
 
33
from lp.buildmaster.interfaces.buildfarmjobbehavior import (
 
34
    BuildBehaviorMismatch,
 
35
    )
 
36
from lp.buildmaster.model.builder import Builder
 
37
from lp.services.database.transaction_policy import DatabaseTransactionPolicy
37
38
 
38
39
 
39
40
BUILDD_MANAGER_LOG_NAME = "slave-scanner"
111
112
    # algorithm for polling.
112
113
    SCAN_INTERVAL = 15
113
114
 
114
 
    def __init__(self, builder_name, logger):
 
115
    def __init__(self, builder_name, logger, clock=None):
115
116
        self.builder_name = builder_name
116
117
        self.logger = logger
 
118
        if clock is None:
 
119
            clock = reactor
 
120
        self._clock = clock
117
121
 
118
122
    def startCycle(self):
119
123
        """Scan the builder and dispatch to it or deal with failures."""
120
124
        self.loop = LoopingCall(self.singleCycle)
 
125
        self.loop.clock = self._clock
121
126
        self.stopping_deferred = self.loop.start(self.SCAN_INTERVAL)
122
127
        return self.stopping_deferred
123
128
 
138
143
        1. Print the error in the log
139
144
        2. Increment and assess failure counts on the builder and job.
140
145
        """
141
 
        # Make sure that pending database updates are removed as it
142
 
        # could leave the database in an inconsistent state (e.g. The
143
 
        # job says it's running but the buildqueue has no builder set).
 
146
        # Since this is a failure path, we could be in a broken
 
147
        # transaction.  Get us a fresh one.
144
148
        transaction.abort()
145
149
 
146
150
        # If we don't recognise the exception include a stack trace with
147
151
        # the error.
148
152
        error_message = failure.getErrorMessage()
149
 
        if failure.check(
 
153
        familiar_error = failure.check(
150
154
            BuildSlaveFailure, CannotBuild, BuildBehaviorMismatch,
151
 
            CannotResumeHost, BuildDaemonError, CannotFetchFile):
152
 
            self.logger.info("Scanning %s failed with: %s" % (
153
 
                self.builder_name, error_message))
 
155
            CannotResumeHost, BuildDaemonError, CannotFetchFile)
 
156
        if familiar_error:
 
157
            self.logger.info(
 
158
                "Scanning %s failed with: %s",
 
159
                self.builder_name, error_message)
154
160
        else:
155
 
            self.logger.info("Scanning %s failed with: %s\n%s" % (
 
161
            self.logger.info(
 
162
                "Scanning %s failed with: %s\n%s",
156
163
                self.builder_name, failure.getErrorMessage(),
157
 
                failure.getTraceback()))
 
164
                failure.getTraceback())
158
165
 
159
166
        # Decide if we need to terminate the job or fail the
160
167
        # builder.
161
168
        try:
162
169
            builder = get_builder(self.builder_name)
163
 
            builder.gotFailure()
164
 
            if builder.currentjob is not None:
165
 
                build_farm_job = builder.getCurrentBuildFarmJob()
166
 
                build_farm_job.gotFailure()
167
 
                self.logger.info(
168
 
                    "builder %s failure count: %s, "
169
 
                    "job '%s' failure count: %s" % (
 
170
            transaction.commit()
 
171
 
 
172
            with DatabaseTransactionPolicy(read_only=False):
 
173
                builder.gotFailure()
 
174
 
 
175
                if builder.currentjob is None:
 
176
                    self.logger.info(
 
177
                        "Builder %s failed a probe, count: %s",
 
178
                        self.builder_name, builder.failure_count)
 
179
                else:
 
180
                    build_farm_job = builder.getCurrentBuildFarmJob()
 
181
                    build_farm_job.gotFailure()
 
182
                    self.logger.info(
 
183
                        "builder %s failure count: %s, "
 
184
                        "job '%s' failure count: %s",
170
185
                        self.builder_name,
171
186
                        builder.failure_count,
172
187
                        build_farm_job.title,
173
 
                        build_farm_job.failure_count))
174
 
            else:
175
 
                self.logger.info(
176
 
                    "Builder %s failed a probe, count: %s" % (
177
 
                        self.builder_name, builder.failure_count))
178
 
            assessFailureCounts(builder, failure.getErrorMessage())
179
 
            transaction.commit()
 
188
                        build_farm_job.failure_count)
 
189
 
 
190
                assessFailureCounts(builder, failure.getErrorMessage())
 
191
                transaction.commit()
180
192
        except:
181
193
            # Catastrophic code failure! Not much we can do.
 
194
            transaction.abort()
182
195
            self.logger.error(
183
196
                "Miserable failure when trying to examine failure counts:\n",
184
197
                exc_info=True)
185
 
            transaction.abort()
186
198
 
187
199
    def checkCancellation(self, builder):
188
200
        """See if there is a pending cancellation request.
236
248
        """
237
249
        # We need to re-fetch the builder object on each cycle as the
238
250
        # Storm store is invalidated over transaction boundaries.
239
 
 
240
251
        self.builder = get_builder(self.builder_name)
241
252
 
242
253
        def status_updated(ignored):
243
 
            # Commit the changes done while possibly rescuing jobs, to
244
 
            # avoid holding table locks.
245
 
            transaction.commit()
246
 
 
247
254
            # See if we think there's an active build on the builder.
248
255
            buildqueue = self.builder.getBuildQueue()
249
256
 
253
260
                return self.builder.updateBuild(buildqueue)
254
261
 
255
262
        def build_updated(ignored):
256
 
            # Commit changes done while updating the build, to avoid
257
 
            # holding table locks.
258
 
            transaction.commit()
259
 
 
260
263
            # If the builder is in manual mode, don't dispatch anything.
261
264
            if self.builder.manual:
262
265
                self.logger.debug(
263
 
                    '%s is in manual mode, not dispatching.' %
 
266
                    '%s is in manual mode, not dispatching.',
264
267
                    self.builder.name)
265
268
                return
266
269
 
278
281
                job = self.builder.currentjob
279
282
                if job is not None and not self.builder.builderok:
280
283
                    self.logger.info(
281
 
                        "%s was made unavailable, resetting attached "
282
 
                        "job" % self.builder.name)
283
 
                    job.reset()
 
284
                        "%s was made unavailable; resetting attached job.",
 
285
                        self.builder.name)
284
286
                    transaction.commit()
 
287
                    with DatabaseTransactionPolicy(read_only=False):
 
288
                        job.reset()
 
289
                        transaction.commit()
285
290
                return
286
291
 
287
292
            # See if there is a job we can dispatch to the builder slave.
288
293
 
 
294
            # XXX JeroenVermeulen 2011-10-11, bug=872112: The job's
 
295
            # failure count will be reset once the job has started
 
296
            # successfully.  Because of intervening commits, you may see
 
297
            # a build with a nonzero failure count that's actually going
 
298
            # to succeed later (and have a failure count of zero).  Or
 
299
            # it may fail yet end up with a lower failure count than you
 
300
            # saw earlier.
289
301
            d = self.builder.findAndStartJob()
290
302
 
291
303
            def job_started(candidate):
292
304
                if self.builder.currentjob is not None:
293
305
                    # After a successful dispatch we can reset the
294
306
                    # failure_count.
295
 
                    self.builder.resetFailureCount()
296
307
                    transaction.commit()
 
308
                    with DatabaseTransactionPolicy(read_only=False):
 
309
                        self.builder.resetFailureCount()
 
310
                        transaction.commit()
297
311
                    return self.builder.slave
298
312
                else:
299
313
                    return None
372
386
        self.logger = self._setupLogger()
373
387
        self.new_builders_scanner = NewBuildersScanner(
374
388
            manager=self, clock=clock)
 
389
        self.transaction_policy = DatabaseTransactionPolicy(read_only=True)
375
390
 
376
391
    def _setupLogger(self):
377
392
        """Set up a 'slave-scanner' logger that redirects to twisted.
390
405
        logger.setLevel(level)
391
406
        return logger
392
407
 
 
408
    def enterReadOnlyDatabasePolicy(self):
 
409
        """Set the database transaction policy to read-only.
 
410
 
 
411
        Any previously pending changes are committed first.
 
412
        """
 
413
        transaction.commit()
 
414
        self.transaction_policy.__enter__()
 
415
 
 
416
    def exitReadOnlyDatabasePolicy(self, *args):
 
417
        """Reset database transaction policy to the default read-write."""
 
418
        self.transaction_policy.__exit__(None, None, None)
 
419
 
393
420
    def startService(self):
394
421
        """Service entry point, called when the application starts."""
 
422
        # Avoiding circular imports.
 
423
        from lp.buildmaster.interfaces.builder import IBuilderSet
 
424
 
 
425
        self.enterReadOnlyDatabasePolicy()
395
426
 
396
427
        # Get a list of builders and set up scanners on each one.
397
 
 
398
 
        # Avoiding circular imports.
399
 
        from lp.buildmaster.interfaces.builder import IBuilderSet
400
 
        builder_set = getUtility(IBuilderSet)
401
 
        builders = [builder.name for builder in builder_set]
402
 
        self.addScanForBuilders(builders)
 
428
        self.addScanForBuilders(
 
429
            [builder.name for builder in getUtility(IBuilderSet)])
403
430
        self.new_builders_scanner.scheduleScan()
404
431
 
405
432
        # Events will now fire in the SlaveScanner objects to scan each
420
447
        # stopped, so we can wait on them all at once here before
421
448
        # exiting.
422
449
        d = defer.DeferredList(deferreds, consumeErrors=True)
 
450
        d.addCallback(self.exitReadOnlyDatabasePolicy)
423
451
        return d
424
452
 
425
453
    def addScanForBuilders(self, builders):