~launchpad-pqm/launchpad/devel

7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
1
#!/usr/bin/python2.6 -S
7675.395.224 by Stuart Bishop
Update copyright
2
# Copyright 2011-2012 Canonical Ltd.  This software is licensed under the
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
3
# GNU Affero General Public License version 3 (see the file LICENSE).
4
5
"""Confirm the database systems are ready to be patched as best we can."""
6
13465.2.10 by Stuart Bishop
Inline preflight checks
7
__all__ = [
8
    'DatabasePreflight',
9
    'KillConnectionsPreflight',
10
    'NoConnectionCheckPreflight',
11
    ]
12
14606.3.4 by William Grant
Replace canonical.database usage everywhere, and format-imports.
13
import _pythonpath
13465.2.10 by Stuart Bishop
Inline preflight checks
14
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
15
from datetime import timedelta
16
from optparse import OptionParser
7675.395.207 by Stuart Bishop
Report what patches are pending in preflight check
17
import os.path
13465.2.36 by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting)
18
import time
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
19
7675.395.196 by Stuart Bishop
preflight check for database upgrades
20
import psycopg2
21
14606.3.4 by William Grant
Replace canonical.database usage everywhere, and format-imports.
22
from lp.services.database.sqlbase import (
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
23
    connect,
24
    ISOLATION_LEVEL_AUTOCOMMIT,
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
25
    sqlvalues,
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
26
    )
14565.2.15 by Curtis Hovey
Moved canonical.launchpad.scripts __init__ to lp.services.scripts.
27
from lp.services.scripts import (
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
28
    db_options,
29
    logger,
30
    logger_options,
31
    )
32
import replication.helpers
7675.395.207 by Stuart Bishop
Report what patches are pending in preflight check
33
import upgrade
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
34
35
# Ignore connections by these users.
7675.395.202 by Stuart Bishop
Add lagmon as a system user, correct variable name
36
SYSTEM_USERS = frozenset(['postgres', 'slony', 'nagios', 'lagmon'])
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
37
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
38
# Fail checks if these users are connected. If a process should not be
39
# interrupted by a rollout, the database user it connects as should be
40
# added here. The preflight check will fail if any of these users are
41
# connected, so these systems will need to be shut down manually before
42
# a database update.
9893.9.11 by Stuart Bishop
process_accepted is fragile
43
FRAGILE_USERS = frozenset([
44
    'buildd_manager',
45
    # process_accepted is fragile, but also fast so we likely shouldn't
46
    # need to ever manually shut it down.
47
    'process_accepted',
9893.9.12 by Stuart Bishop
process-upload.py connects as a distinct database user and is fragile
48
    'process_upload',
9893.9.13 by Stuart Bishop
publish-distro.py connects as a distinct database user and is fragile
49
    'publish_distro',
14016.1.1 by William Grant
publish_ftpmaster is fragile too.
50
    'publish_ftpmaster',
9893.9.11 by Stuart Bishop
process_accepted is fragile
51
    ])
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
52
7675.395.205 by Stuart Bishop
Don't abort rollouts for scripts we know are badly behaved - steamroller over them
53
# If these users have long running transactions, just kill 'em. Entries
54
# added here must come with a bug number, a if part of Launchpad holds
55
# open a long running transaction it is a bug we need to fix.
56
BAD_USERS = frozenset([
57
    'karma',  # Bug #863109
58
    'rosettaadmin',  # Bug #863122
7675.395.222 by Stuart Bishop
More BAD_USERS we can kill automatically during fastdowntime deployments
59
    'update-pkg-cache',  # Bug #912144
60
    'process_death_row',  # Bug #912146
61
    'langpack',  # Bug #912147
7675.395.205 by Stuart Bishop
Don't abort rollouts for scripts we know are badly behaved - steamroller over them
62
    ])
63
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
64
# How lagged the cluster can be before failing the preflight check.
7675.395.205 by Stuart Bishop
Don't abort rollouts for scripts we know are badly behaved - steamroller over them
65
# If this is set too low, perfectly normal state will abort rollouts. If
66
# this is set too high, then we will have unacceptable downtime as
67
# replication needs to catch up before the database patches will apply.
13465.2.18 by Stuart Bishop
delint
68
MAX_LAG = timedelta(seconds=60)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
69
70
71
class DatabasePreflight:
13465.2.10 by Stuart Bishop
Inline preflight checks
72
    def __init__(self, log):
13879.1.6 by William Grant
Some more that I missed.
73
        master_con = connect(isolation=ISOLATION_LEVEL_AUTOCOMMIT)
13465.2.10 by Stuart Bishop
Inline preflight checks
74
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
75
        self.log = log
76
        self.is_replicated = replication.helpers.slony_installed(master_con)
77
        if self.is_replicated:
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
78
            self.nodes = set(
79
                replication.helpers.get_all_cluster_nodes(master_con))
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
80
            for node in self.nodes:
81
                node.con = psycopg2.connect(node.connection_string)
82
                node.con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
13465.2.2 by Stuart Bishop
Add option to skip open connection check
83
84
            # Create a list of nodes subscribed to the replicated sets we
85
            # are modifying.
86
            cur = master_con.cursor()
87
            cur.execute("""
88
                WITH subscriptions AS (
89
                    SELECT *
90
                    FROM _sl.sl_subscribe
91
                    WHERE sub_set = 1 AND sub_active IS TRUE)
92
                SELECT sub_provider FROM subscriptions
93
                UNION
94
                SELECT sub_receiver FROM subscriptions
95
                """)
96
            lpmain_node_ids = set(row[0] for row in cur.fetchall())
97
            self.lpmain_nodes = set(
98
                node for node in self.nodes
99
                if node.node_id in lpmain_node_ids)
7675.395.207 by Stuart Bishop
Report what patches are pending in preflight check
100
101
            # Store a reference to the lpmain origin.
102
            lpmain_master_node_id = replication.helpers.get_master_node(
103
                master_con, 1).node_id
104
            self.lpmain_master_node = [
105
                node for node in self.lpmain_nodes
106
                    if node.node_id == lpmain_master_node_id][0]
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
107
        else:
108
            node = replication.helpers.Node(None, None, None, True)
109
            node.con = master_con
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
110
            self.nodes = set([node])
13465.2.2 by Stuart Bishop
Add option to skip open connection check
111
            self.lpmain_nodes = self.nodes
7675.395.207 by Stuart Bishop
Report what patches are pending in preflight check
112
            self.lpmain_master_node = node
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
113
114
    def check_is_superuser(self):
115
        """Return True if all the node connections are as superusers."""
116
        success = True
117
        for node in self.nodes:
118
            cur = node.con.cursor()
119
            cur.execute("""
120
                SELECT current_database(), pg_user.usesuper
121
                FROM pg_user
122
                WHERE usename = current_user
123
                """)
124
            dbname, is_super = cur.fetchone()
125
            if is_super:
126
                self.log.debug("Connected to %s as a superuser.", dbname)
127
            else:
128
                self.log.fatal("Not connected to %s as a superuser.", dbname)
129
                success = False
130
        return success
131
132
    def check_open_connections(self):
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
133
        """False if any lpmain nodes have connections from non-system users.
134
135
        We only check on subscribed nodes, as there will be active systems
136
        connected to other nodes in the replication cluster (such as the
137
        SSO servers).
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
138
139
        System users are defined by SYSTEM_USERS.
140
        """
141
        success = True
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
142
        for node in self.lpmain_nodes:
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
143
            cur = node.con.cursor()
144
            cur.execute("""
145
                SELECT datname, usename, COUNT(*) AS num_connections
146
                FROM pg_stat_activity
7675.395.196 by Stuart Bishop
preflight check for database upgrades
147
                WHERE
148
                    datname=current_database()
149
                    AND procpid <> pg_backend_pid()
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
150
                GROUP BY datname, usename
151
                """)
152
            for datname, usename, num_connections in cur.fetchall():
153
                if usename in SYSTEM_USERS:
154
                    self.log.debug(
155
                        "%s has %d connections by %s",
156
                        datname, num_connections, usename)
157
                else:
158
                    self.log.fatal(
159
                        "%s has %d connections by %s",
160
                        datname, num_connections, usename)
161
                    success = False
7675.395.196 by Stuart Bishop
preflight check for database upgrades
162
        if success:
163
            self.log.info("Only system users connected to the cluster")
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
164
        return success
165
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
166
    def check_fragile_connections(self):
167
        """Fail if any FRAGILE_USERS are connected to the cluster.
168
169
        If we interrupt these processes, we may have a mess to clean
170
        up. If they are connected, the preflight check should fail.
171
        """
172
        success = True
173
        for node in self.lpmain_nodes:
174
            cur = node.con.cursor()
175
            cur.execute("""
176
                SELECT datname, usename, COUNT(*) AS num_connections
177
                FROM pg_stat_activity
178
                WHERE
179
                    datname=current_database()
180
                    AND procpid <> pg_backend_pid()
181
                    AND usename IN %s
182
                GROUP BY datname, usename
183
                """ % sqlvalues(FRAGILE_USERS))
184
            for datname, usename, num_connections in cur.fetchall():
185
                self.log.fatal(
186
                    "Fragile system %s running. %s has %d connections.",
187
                    usename, datname, num_connections)
188
                success = False
189
        if success:
190
            self.log.info(
191
                "No fragile systems connected to the cluster (%s)"
192
                % ', '.join(FRAGILE_USERS))
193
        return success
194
7675.1212.16 by Stuart Bishop
Saner definition of 'long running transactions'
195
    def check_long_running_transactions(self, max_secs=60):
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
196
        """Return False if any nodes have long running transactions open.
197
198
        max_secs defines what is long running. For database rollouts,
199
        this will be short. Even if the transaction is benign like a
200
        autovacuum task, we should wait until things have settled down.
7675.395.205 by Stuart Bishop
Don't abort rollouts for scripts we know are badly behaved - steamroller over them
201
7675.395.206 by Stuart Bishop
Fix old name
202
        We ignore transactions held open by BAD_USERS. These are bugs
7675.395.205 by Stuart Bishop
Don't abort rollouts for scripts we know are badly behaved - steamroller over them
203
        that need to be fixed, but we have determined that rudely aborting
204
        them is fine for now and there is no need to block a rollout on
205
        their behalf.
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
206
        """
207
        success = True
208
        for node in self.nodes:
209
            cur = node.con.cursor()
210
            cur.execute("""
211
                SELECT
212
                    datname, usename,
213
                    age(current_timestamp, xact_start) AS age, current_query
214
                FROM pg_stat_activity
215
                WHERE
216
                    age(current_timestamp, xact_start) > interval '%d secs'
217
                    AND datname=current_database()
218
                """ % max_secs)
219
            for datname, usename, age, current_query in cur.fetchall():
7675.395.205 by Stuart Bishop
Don't abort rollouts for scripts we know are badly behaved - steamroller over them
220
                if usename in BAD_USERS:
221
                    self.log.info(
222
                        "%s has transactions by %s open %s (ignoring)",
223
                        datname, usename, age)
224
                else:
225
                    self.log.fatal(
226
                        "%s has transaction by %s open %s",
227
                        datname, usename, age)
228
                    success = False
7675.395.196 by Stuart Bishop
preflight check for database upgrades
229
        if success:
230
            self.log.info("No long running transactions detected.")
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
231
        return success
232
233
    def check_replication_lag(self):
234
        """Return False if the replication cluster is badly lagged."""
235
        if not self.is_replicated:
236
            self.log.debug("Not replicated - no replication lag.")
237
            return True
238
239
        # Check replication lag on every node just in case there are
240
        # disagreements.
7675.395.196 by Stuart Bishop
preflight check for database upgrades
241
        max_lag = timedelta(seconds=-1)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
242
        for node in self.nodes:
243
            cur = node.con.cursor()
244
            cur.execute("""
245
                SELECT current_database(),
246
                max(st_lag_time) AS lag FROM _sl.sl_status
247
            """)
248
            dbname, lag = cur.fetchone()
249
            if lag > max_lag:
250
                max_lag = lag
251
            self.log.debug(
7675.395.196 by Stuart Bishop
preflight check for database upgrades
252
                "%s reports database lag of %s.", dbname, lag)
253
        if max_lag <= MAX_LAG:
254
            self.log.info("Database cluster lag is ok (%s)", max_lag)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
255
            return True
256
        else:
7675.395.196 by Stuart Bishop
preflight check for database upgrades
257
            self.log.fatal("Database cluster lag is high (%s)", max_lag)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
258
            return False
259
7675.395.196 by Stuart Bishop
preflight check for database upgrades
260
    def check_can_sync(self):
261
        """Return True if a sync event is acknowledged by all nodes.
262
263
        We only wait 30 seconds for the sync, because we require the
264
        cluster to be quiescent.
265
        """
266
        if self.is_replicated:
13465.2.32 by Stuart Bishop
Continue preflight checks if sync check failed, rather than abort
267
            success = replication.helpers.sync(30, exit_on_fail=False)
7675.395.196 by Stuart Bishop
preflight check for database upgrades
268
            if success:
269
                self.log.info(
270
                    "Replication events are being propagated.")
271
            else:
272
                self.log.fatal(
273
                    "Replication events are not being propagated.")
274
                self.log.fatal(
275
                    "One or more replication daemons may be down.")
276
                self.log.fatal(
277
                    "Bounce the replication daemons and check the logs.")
278
            return success
279
        else:
280
            return True
281
7675.395.207 by Stuart Bishop
Report what patches are pending in preflight check
282
    def report_patches(self):
283
        """Report what patches are due to be applied from this tree."""
284
        con = self.lpmain_master_node.con
285
        upgrade.log = self.log
286
        for patch_num, patch_file in upgrade.get_patchlist(con):
287
            self.log.info("%s is pending", os.path.basename(patch_file))
288
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
289
    def check_all(self):
290
        """Run all checks.
291
292
        If any failed, return False. Otherwise return True.
293
        """
294
        if not self.check_is_superuser():
295
            # No point continuing - results will be bogus without access
296
            # to pg_stat_activity
297
            return False
298
7675.395.207 by Stuart Bishop
Report what patches are pending in preflight check
299
        self.report_patches()
300
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
301
        success = True
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
302
        if not self.check_replication_lag():
303
            success = False
304
        if not self.check_can_sync():
305
            success = False
13465.2.13 by Stuart Bishop
note
306
        # Do checks on open transactions last to minimize race
307
        # conditions.
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
308
        if not self.check_open_connections():
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
309
            success = False
310
        if not self.check_long_running_transactions():
311
            success = False
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
312
        if not self.check_fragile_connections():
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
313
            success = False
314
        return success
315
316
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
317
class NoConnectionCheckPreflight(DatabasePreflight):
318
    def check_open_connections(self):
319
        return True
320
321
322
class KillConnectionsPreflight(DatabasePreflight):
323
    def check_open_connections(self):
324
        """Kill all non-system connections to Launchpad databases.
325
326
        We only check on subscribed nodes, as there will be active systems
327
        connected to other nodes in the replication cluster (such as the
328
        SSO servers).
329
330
        System users are defined by SYSTEM_USERS.
331
        """
13465.2.36 by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting)
332
        # We keep trying to terminate connections every 0.5 seconds for
333
        # up to 10 seconds.
334
        num_tries = 20
335
        seconds_to_pause = 0.5
336
        for loop_count in range(num_tries):
337
            all_clear = True
338
            for node in self.lpmain_nodes:
339
                cur = node.con.cursor()
340
                cur.execute("""
341
                    SELECT
342
                        procpid, datname, usename,
343
                        pg_terminate_backend(procpid)
344
                    FROM pg_stat_activity
345
                    WHERE
346
                        datname=current_database()
347
                        AND procpid <> pg_backend_pid()
348
                        AND usename NOT IN %s
349
                    """ % sqlvalues(SYSTEM_USERS))
350
                for procpid, datname, usename, ignored in cur.fetchall():
351
                    all_clear = False
352
                    if loop_count == num_tries - 1:
353
                        self.log.fatal(
354
                            "Unable to kill %s [%s] on %s",
355
                            usename, procpid, datname)
7675.395.205 by Stuart Bishop
Don't abort rollouts for scripts we know are badly behaved - steamroller over them
356
                    elif usename in BAD_USERS:
357
                        self.log.info(
358
                            "Killed %s [%s] on %s", usename, procpid, datname)
13465.2.36 by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting)
359
                    else:
360
                        self.log.warning(
361
                            "Killed %s [%s] on %s", usename, procpid, datname)
362
            if all_clear:
363
                break
364
365
            # Wait a little for any terminated connections to actually
366
            # terminate.
367
            time.sleep(seconds_to_pause)
368
        return all_clear
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
369
370
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
371
def main():
372
    parser = OptionParser()
373
    db_options(parser)
374
    logger_options(parser)
13465.2.2 by Stuart Bishop
Add option to skip open connection check
375
    parser.add_option(
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
376
        "--skip-connection-check", dest='skip_connection_check',
377
        default=False, action="store_true",
13465.2.2 by Stuart Bishop
Add option to skip open connection check
378
        help="Don't check open connections.")
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
379
    parser.add_option(
13465.2.4 by Stuart Bishop
Simplify cli
380
        "--kill-connections", dest='kill_connections',
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
381
        default=False, action="store_true",
13465.2.4 by Stuart Bishop
Simplify cli
382
        help="Kill non-system connections instead of reporting an error.")
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
383
    (options, args) = parser.parse_args()
384
    if args:
385
        parser.error("Too many arguments")
386
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
387
    if options.kill_connections and options.skip_connection_check:
388
        parser.error(
13465.2.4 by Stuart Bishop
Simplify cli
389
            "--skip-connection-check conflicts with --kill-connections")
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
390
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
391
    log = logger(options)
392
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
393
    if options.kill_connections:
13465.2.10 by Stuart Bishop
Inline preflight checks
394
        preflight_check = KillConnectionsPreflight(log)
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
395
    elif options.skip_connection_check:
13465.2.10 by Stuart Bishop
Inline preflight checks
396
        preflight_check = NoConnectionCheckPreflight(log)
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
397
    else:
13465.2.10 by Stuart Bishop
Inline preflight checks
398
        preflight_check = DatabasePreflight(log)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
399
400
    if preflight_check.check_all():
7675.395.196 by Stuart Bishop
preflight check for database upgrades
401
        log.info('Preflight check succeeded. Good to go.')
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
402
        return 0
403
    else:
7675.395.196 by Stuart Bishop
preflight check for database upgrades
404
        log.error('Preflight check failed.')
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
405
        return 1
406
407
408
if __name__ == '__main__':
13465.2.36 by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting)
409
    raise SystemExit(main())