~launchpad-pqm/launchpad/devel

« back to all changes in this revision

Viewing changes to lib/lp/buildmaster/manager.py

  • Committer: Launchpad Patch Queue Manager
  • Date: 2012-01-05 01:16:46 UTC
  • mfrom: (14628.1.2 js-client-link)
  • Revision ID: launchpad@pqm.canonical.com-20120105011646-dlhd7oqm9pui6yk1
[r=benji][bug=911973] Set lp_original_uri properly for objects
        returned by named_get and named_post that have a self_link.

Show diffs side-by-side

added

removed

Lines of Context:
1
 
# Copyright 2009-2011 Canonical Ltd.  This software is licensed under the
 
1
# Copyright 2009 Canonical Ltd.  This software is licensed under the
2
2
# GNU Affero General Public License version 3 (see the file LICENSE).
3
3
 
4
4
"""Soyuz buildd slave manager logic."""
34
34
    BuildBehaviorMismatch,
35
35
    )
36
36
from lp.buildmaster.model.builder import Builder
37
 
from lp.services.database.transaction_policy import DatabaseTransactionPolicy
38
37
 
39
38
 
40
39
BUILDD_MANAGER_LOG_NAME = "slave-scanner"
112
111
    # algorithm for polling.
113
112
    SCAN_INTERVAL = 15
114
113
 
115
 
    def __init__(self, builder_name, logger, clock=None):
 
114
    def __init__(self, builder_name, logger):
116
115
        self.builder_name = builder_name
117
116
        self.logger = logger
118
 
        if clock is None:
119
 
            clock = reactor
120
 
        self._clock = clock
121
117
 
122
118
    def startCycle(self):
123
119
        """Scan the builder and dispatch to it or deal with failures."""
124
120
        self.loop = LoopingCall(self.singleCycle)
125
 
        self.loop.clock = self._clock
126
121
        self.stopping_deferred = self.loop.start(self.SCAN_INTERVAL)
127
122
        return self.stopping_deferred
128
123
 
143
138
        1. Print the error in the log
144
139
        2. Increment and assess failure counts on the builder and job.
145
140
        """
146
 
        # Since this is a failure path, we could be in a broken
147
 
        # transaction.  Get us a fresh one.
 
141
        # Make sure that pending database updates are removed, as they
 
142
        # could leave the database in an inconsistent state (e.g. the
 
143
        # job says it's running but the buildqueue has no builder set).
148
144
        transaction.abort()
149
145
 
150
146
        # If we don't recognise the exception include a stack trace with
151
147
        # the error.
152
148
        error_message = failure.getErrorMessage()
153
 
        familiar_error = failure.check(
 
149
        if failure.check(
154
150
            BuildSlaveFailure, CannotBuild, BuildBehaviorMismatch,
155
 
            CannotResumeHost, BuildDaemonError, CannotFetchFile)
156
 
        if familiar_error:
157
 
            self.logger.info(
158
 
                "Scanning %s failed with: %s",
159
 
                self.builder_name, error_message)
 
151
            CannotResumeHost, BuildDaemonError, CannotFetchFile):
 
152
            self.logger.info("Scanning %s failed with: %s" % (
 
153
                self.builder_name, error_message))
160
154
        else:
161
 
            self.logger.info(
162
 
                "Scanning %s failed with: %s\n%s",
 
155
            self.logger.info("Scanning %s failed with: %s\n%s" % (
163
156
                self.builder_name, failure.getErrorMessage(),
164
 
                failure.getTraceback())
 
157
                failure.getTraceback()))
165
158
 
166
159
        # Decide if we need to terminate the job or fail the
167
160
        # builder.
168
161
        try:
169
162
            builder = get_builder(self.builder_name)
170
 
            transaction.commit()
171
 
 
172
 
            with DatabaseTransactionPolicy(read_only=False):
173
 
                builder.gotFailure()
174
 
 
175
 
                if builder.currentjob is None:
176
 
                    self.logger.info(
177
 
                        "Builder %s failed a probe, count: %s",
178
 
                        self.builder_name, builder.failure_count)
179
 
                else:
180
 
                    build_farm_job = builder.getCurrentBuildFarmJob()
181
 
                    build_farm_job.gotFailure()
182
 
                    self.logger.info(
183
 
                        "builder %s failure count: %s, "
184
 
                        "job '%s' failure count: %s",
 
163
            builder.gotFailure()
 
164
            if builder.currentjob is not None:
 
165
                build_farm_job = builder.getCurrentBuildFarmJob()
 
166
                build_farm_job.gotFailure()
 
167
                self.logger.info(
 
168
                    "builder %s failure count: %s, "
 
169
                    "job '%s' failure count: %s" % (
185
170
                        self.builder_name,
186
171
                        builder.failure_count,
187
172
                        build_farm_job.title,
188
 
                        build_farm_job.failure_count)
189
 
 
190
 
                assessFailureCounts(builder, failure.getErrorMessage())
191
 
                transaction.commit()
 
173
                        build_farm_job.failure_count))
 
174
            else:
 
175
                self.logger.info(
 
176
                    "Builder %s failed a probe, count: %s" % (
 
177
                        self.builder_name, builder.failure_count))
 
178
            assessFailureCounts(builder, failure.getErrorMessage())
 
179
            transaction.commit()
192
180
        except:
193
181
            # Catastrophic code failure! Not much we can do.
194
 
            transaction.abort()
195
182
            self.logger.error(
196
183
                "Miserable failure when trying to examine failure counts:\n",
197
184
                exc_info=True)
 
185
            transaction.abort()
198
186
 
199
187
    def checkCancellation(self, builder):
200
188
        """See if there is a pending cancellation request.
217
205
            return defer.succeed(True)
218
206
 
219
207
        self.logger.info("Cancelling build '%s'" % build.title)
220
 
        with DatabaseTransactionPolicy(read_only=False):
221
 
            buildqueue.cancel()
222
 
            transaction.commit()
 
208
        buildqueue.cancel()
 
209
        transaction.commit()
223
210
        d = builder.resumeSlaveHost()
224
211
        d.addCallback(resume_done)
225
212
        return d
249
236
        """
250
237
        # We need to re-fetch the builder object on each cycle as the
251
238
        # Storm store is invalidated over transaction boundaries.
 
239
 
252
240
        self.builder = get_builder(self.builder_name)
253
241
 
254
242
        def status_updated(ignored):
 
243
            # Commit the changes done while possibly rescuing jobs, to
 
244
            # avoid holding table locks.
 
245
            transaction.commit()
 
246
 
255
247
            # See if we think there's an active build on the builder.
256
248
            buildqueue = self.builder.getBuildQueue()
257
249
 
261
253
                return self.builder.updateBuild(buildqueue)
262
254
 
263
255
        def build_updated(ignored):
 
256
            # Commit changes done while updating the build, to avoid
 
257
            # holding table locks.
 
258
            transaction.commit()
 
259
 
264
260
            # If the builder is in manual mode, don't dispatch anything.
265
261
            if self.builder.manual:
266
262
                self.logger.debug(
267
 
                    '%s is in manual mode, not dispatching.',
 
263
                    '%s is in manual mode, not dispatching.' %
268
264
                    self.builder.name)
269
265
                return
270
266
 
282
278
                job = self.builder.currentjob
283
279
                if job is not None and not self.builder.builderok:
284
280
                    self.logger.info(
285
 
                        "%s was made unavailable; resetting attached job.",
286
 
                        self.builder.name)
 
281
                        "%s was made unavailable, resetting attached "
 
282
                        "job" % self.builder.name)
 
283
                    job.reset()
287
284
                    transaction.commit()
288
 
                    with DatabaseTransactionPolicy(read_only=False):
289
 
                        job.reset()
290
 
                        transaction.commit()
291
285
                return
292
286
 
293
287
            # See if there is a job we can dispatch to the builder slave.
294
288
 
295
 
            # XXX JeroenVermeulen 2011-10-11, bug=872112: The job's
296
 
            # failure count will be reset once the job has started
297
 
            # successfully.  Because of intervening commits, you may see
298
 
            # a build with a nonzero failure count that's actually going
299
 
            # to succeed later (and have a failure count of zero).  Or
300
 
            # it may fail yet end up with a lower failure count than you
301
 
            # saw earlier.
302
289
            d = self.builder.findAndStartJob()
303
290
 
304
291
            def job_started(candidate):
305
292
                if self.builder.currentjob is not None:
306
293
                    # After a successful dispatch we can reset the
307
294
                    # failure_count.
 
295
                    self.builder.resetFailureCount()
308
296
                    transaction.commit()
309
 
                    with DatabaseTransactionPolicy(read_only=False):
310
 
                        self.builder.resetFailureCount()
311
 
                        transaction.commit()
312
297
                    return self.builder.slave
313
298
                else:
314
299
                    return None
387
372
        self.logger = self._setupLogger()
388
373
        self.new_builders_scanner = NewBuildersScanner(
389
374
            manager=self, clock=clock)
390
 
        self.transaction_policy = DatabaseTransactionPolicy(read_only=True)
391
375
 
392
376
    def _setupLogger(self):
393
377
        """Set up a 'slave-scanner' logger that redirects to twisted.
406
390
        logger.setLevel(level)
407
391
        return logger
408
392
 
409
 
    def enterReadOnlyDatabasePolicy(self):
410
 
        """Set the database transaction policy to read-only.
411
 
 
412
 
        Any previously pending changes are committed first.
413
 
        """
414
 
        transaction.commit()
415
 
        self.transaction_policy.__enter__()
416
 
 
417
 
    def exitReadOnlyDatabasePolicy(self, *args):
418
 
        """Reset database transaction policy to the default read-write."""
419
 
        self.transaction_policy.__exit__(None, None, None)
420
 
 
421
393
    def startService(self):
422
394
        """Service entry point, called when the application starts."""
 
395
 
 
396
        # Get a list of builders and set up scanners on each one.
 
397
 
423
398
        # Avoiding circular imports.
424
399
        from lp.buildmaster.interfaces.builder import IBuilderSet
425
 
 
426
 
        self.enterReadOnlyDatabasePolicy()
427
 
 
428
 
        # Get a list of builders and set up scanners on each one.
429
 
        self.addScanForBuilders(
430
 
            [builder.name for builder in getUtility(IBuilderSet)])
 
400
        builder_set = getUtility(IBuilderSet)
 
401
        builders = [builder.name for builder in builder_set]
 
402
        self.addScanForBuilders(builders)
431
403
        self.new_builders_scanner.scheduleScan()
432
404
 
433
405
        # Events will now fire in the SlaveScanner objects to scan each
448
420
        # stopped, so we can wait on them all at once here before
449
421
        # exiting.
450
422
        d = defer.DeferredList(deferreds, consumeErrors=True)
451
 
        d.addCallback(self.exitReadOnlyDatabasePolicy)
452
423
        return d
453
424
 
454
425
    def addScanForBuilders(self, builders):