~launchpad-pqm/launchpad/devel

« back to all changes in this revision

Viewing changes to lib/lp/buildmaster/manager.py

[r=allenap, bac, gmb, julian-edwards, wallyworld][bug=905853, 905855, 906079]
In buildmaster, always shift into a read-write database transaction access
mode before updating PackageBuild statuses. Shift into read-write
transactions in the appropriate places in TranslationTemplatesBuildBehavior.
Ensure that all lp.buildmaster tests for which it is relevant run with
BuilddManagerTestFixture.

Show diffs side-by-side

added

removed

Lines of Context:
1
 
# Copyright 2009 Canonical Ltd.  This software is licensed under the
 
1
# Copyright 2009-2011 Canonical Ltd.  This software is licensed under the
2
2
# GNU Affero General Public License version 3 (see the file LICENSE).
3
3
 
4
4
"""Soyuz buildd slave manager logic."""
34
34
    BuildBehaviorMismatch,
35
35
    )
36
36
from lp.buildmaster.model.builder import Builder
 
37
from lp.services.database.transaction_policy import DatabaseTransactionPolicy
37
38
 
38
39
 
39
40
BUILDD_MANAGER_LOG_NAME = "slave-scanner"
111
112
    # algorithm for polling.
112
113
    SCAN_INTERVAL = 15
113
114
 
114
 
    def __init__(self, builder_name, logger):
 
115
    def __init__(self, builder_name, logger, clock=None):
115
116
        self.builder_name = builder_name
116
117
        self.logger = logger
 
118
        if clock is None:
 
119
            clock = reactor
 
120
        self._clock = clock
117
121
 
118
122
    def startCycle(self):
119
123
        """Scan the builder and dispatch to it or deal with failures."""
120
124
        self.loop = LoopingCall(self.singleCycle)
 
125
        self.loop.clock = self._clock
121
126
        self.stopping_deferred = self.loop.start(self.SCAN_INTERVAL)
122
127
        return self.stopping_deferred
123
128
 
138
143
        1. Print the error in the log
139
144
        2. Increment and assess failure counts on the builder and job.
140
145
        """
141
 
        # Make sure that pending database updates are removed as it
142
 
        # could leave the database in an inconsistent state (e.g. The
143
 
        # job says it's running but the buildqueue has no builder set).
 
146
        # Since this is a failure path, we could be in a broken
 
147
        # transaction.  Get us a fresh one.
144
148
        transaction.abort()
145
149
 
146
150
        # If we don't recognise the exception include a stack trace with
147
151
        # the error.
148
152
        error_message = failure.getErrorMessage()
149
 
        if failure.check(
 
153
        familiar_error = failure.check(
150
154
            BuildSlaveFailure, CannotBuild, BuildBehaviorMismatch,
151
 
            CannotResumeHost, BuildDaemonError, CannotFetchFile):
152
 
            self.logger.info("Scanning %s failed with: %s" % (
153
 
                self.builder_name, error_message))
 
155
            CannotResumeHost, BuildDaemonError, CannotFetchFile)
 
156
        if familiar_error:
 
157
            self.logger.info(
 
158
                "Scanning %s failed with: %s",
 
159
                self.builder_name, error_message)
154
160
        else:
155
 
            self.logger.info("Scanning %s failed with: %s\n%s" % (
 
161
            self.logger.info(
 
162
                "Scanning %s failed with: %s\n%s",
156
163
                self.builder_name, failure.getErrorMessage(),
157
 
                failure.getTraceback()))
 
164
                failure.getTraceback())
158
165
 
159
166
        # Decide if we need to terminate the job or fail the
160
167
        # builder.
161
168
        try:
162
169
            builder = get_builder(self.builder_name)
163
 
            builder.gotFailure()
164
 
            if builder.currentjob is not None:
165
 
                build_farm_job = builder.getCurrentBuildFarmJob()
166
 
                build_farm_job.gotFailure()
167
 
                self.logger.info(
168
 
                    "builder %s failure count: %s, "
169
 
                    "job '%s' failure count: %s" % (
 
170
            transaction.commit()
 
171
 
 
172
            with DatabaseTransactionPolicy(read_only=False):
 
173
                builder.gotFailure()
 
174
 
 
175
                if builder.currentjob is None:
 
176
                    self.logger.info(
 
177
                        "Builder %s failed a probe, count: %s",
 
178
                        self.builder_name, builder.failure_count)
 
179
                else:
 
180
                    build_farm_job = builder.getCurrentBuildFarmJob()
 
181
                    build_farm_job.gotFailure()
 
182
                    self.logger.info(
 
183
                        "builder %s failure count: %s, "
 
184
                        "job '%s' failure count: %s",
170
185
                        self.builder_name,
171
186
                        builder.failure_count,
172
187
                        build_farm_job.title,
173
 
                        build_farm_job.failure_count))
174
 
            else:
175
 
                self.logger.info(
176
 
                    "Builder %s failed a probe, count: %s" % (
177
 
                        self.builder_name, builder.failure_count))
178
 
            assessFailureCounts(builder, failure.getErrorMessage())
179
 
            transaction.commit()
 
188
                        build_farm_job.failure_count)
 
189
 
 
190
                assessFailureCounts(builder, failure.getErrorMessage())
 
191
                transaction.commit()
180
192
        except:
181
193
            # Catastrophic code failure! Not much we can do.
 
194
            transaction.abort()
182
195
            self.logger.error(
183
196
                "Miserable failure when trying to examine failure counts:\n",
184
197
                exc_info=True)
185
 
            transaction.abort()
186
198
 
187
199
    def checkCancellation(self, builder):
188
200
        """See if there is a pending cancellation request.
205
217
            return defer.succeed(True)
206
218
 
207
219
        self.logger.info("Cancelling build '%s'" % build.title)
208
 
        buildqueue.cancel()
209
 
        transaction.commit()
 
220
        with DatabaseTransactionPolicy(read_only=False):
 
221
            buildqueue.cancel()
 
222
            transaction.commit()
210
223
        d = builder.resumeSlaveHost()
211
224
        d.addCallback(resume_done)
212
225
        return d
236
249
        """
237
250
        # We need to re-fetch the builder object on each cycle as the
238
251
        # Storm store is invalidated over transaction boundaries.
239
 
 
240
252
        self.builder = get_builder(self.builder_name)
241
253
 
242
254
        def status_updated(ignored):
243
 
            # Commit the changes done while possibly rescuing jobs, to
244
 
            # avoid holding table locks.
245
 
            transaction.commit()
246
 
 
247
255
            # See if we think there's an active build on the builder.
248
256
            buildqueue = self.builder.getBuildQueue()
249
257
 
253
261
                return self.builder.updateBuild(buildqueue)
254
262
 
255
263
        def build_updated(ignored):
256
 
            # Commit changes done while updating the build, to avoid
257
 
            # holding table locks.
258
 
            transaction.commit()
259
 
 
260
264
            # If the builder is in manual mode, don't dispatch anything.
261
265
            if self.builder.manual:
262
266
                self.logger.debug(
263
 
                    '%s is in manual mode, not dispatching.' %
 
267
                    '%s is in manual mode, not dispatching.',
264
268
                    self.builder.name)
265
269
                return
266
270
 
278
282
                job = self.builder.currentjob
279
283
                if job is not None and not self.builder.builderok:
280
284
                    self.logger.info(
281
 
                        "%s was made unavailable, resetting attached "
282
 
                        "job" % self.builder.name)
283
 
                    job.reset()
 
285
                        "%s was made unavailable; resetting attached job.",
 
286
                        self.builder.name)
284
287
                    transaction.commit()
 
288
                    with DatabaseTransactionPolicy(read_only=False):
 
289
                        job.reset()
 
290
                        transaction.commit()
285
291
                return
286
292
 
287
293
            # See if there is a job we can dispatch to the builder slave.
288
294
 
 
295
            # XXX JeroenVermeulen 2011-10-11, bug=872112: The job's
 
296
            # failure count will be reset once the job has started
 
297
            # successfully.  Because of intervening commits, you may see
 
298
            # a build with a nonzero failure count that's actually going
 
299
            # to succeed later (and have a failure count of zero).  Or
 
300
            # it may fail yet end up with a lower failure count than you
 
301
            # saw earlier.
289
302
            d = self.builder.findAndStartJob()
290
303
 
291
304
            def job_started(candidate):
292
305
                if self.builder.currentjob is not None:
293
306
                    # After a successful dispatch we can reset the
294
307
                    # failure_count.
295
 
                    self.builder.resetFailureCount()
296
308
                    transaction.commit()
 
309
                    with DatabaseTransactionPolicy(read_only=False):
 
310
                        self.builder.resetFailureCount()
 
311
                        transaction.commit()
297
312
                    return self.builder.slave
298
313
                else:
299
314
                    return None
372
387
        self.logger = self._setupLogger()
373
388
        self.new_builders_scanner = NewBuildersScanner(
374
389
            manager=self, clock=clock)
 
390
        self.transaction_policy = DatabaseTransactionPolicy(read_only=True)
375
391
 
376
392
    def _setupLogger(self):
377
393
        """Set up a 'slave-scanner' logger that redirects to twisted.
390
406
        logger.setLevel(level)
391
407
        return logger
392
408
 
 
409
    def enterReadOnlyDatabasePolicy(self):
 
410
        """Set the database transaction policy to read-only.
 
411
 
 
412
        Any previously pending changes are committed first.
 
413
        """
 
414
        transaction.commit()
 
415
        self.transaction_policy.__enter__()
 
416
 
 
417
    def exitReadOnlyDatabasePolicy(self, *args):
 
418
        """Reset database transaction policy to the default read-write."""
 
419
        self.transaction_policy.__exit__(None, None, None)
 
420
 
393
421
    def startService(self):
394
422
        """Service entry point, called when the application starts."""
 
423
        # Avoiding circular imports.
 
424
        from lp.buildmaster.interfaces.builder import IBuilderSet
 
425
 
 
426
        self.enterReadOnlyDatabasePolicy()
395
427
 
396
428
        # Get a list of builders and set up scanners on each one.
397
 
 
398
 
        # Avoiding circular imports.
399
 
        from lp.buildmaster.interfaces.builder import IBuilderSet
400
 
        builder_set = getUtility(IBuilderSet)
401
 
        builders = [builder.name for builder in builder_set]
402
 
        self.addScanForBuilders(builders)
 
429
        self.addScanForBuilders(
 
430
            [builder.name for builder in getUtility(IBuilderSet)])
403
431
        self.new_builders_scanner.scheduleScan()
404
432
 
405
433
        # Events will now fire in the SlaveScanner objects to scan each
420
448
        # stopped, so we can wait on them all at once here before
421
449
        # exiting.
422
450
        d = defer.DeferredList(deferreds, consumeErrors=True)
 
451
        d.addCallback(self.exitReadOnlyDatabasePolicy)
423
452
        return d
424
453
 
425
454
    def addScanForBuilders(self, builders):