~launchpad-pqm/launchpad/devel

7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
1
#!/usr/bin/python2.6 -S
2
# Copyright 2011 Canonical Ltd.  This software is licensed under the
3
# GNU Affero General Public License version 3 (see the file LICENSE).
4
5
"""Confirm the database systems are ready to be patched as best we can."""
6
7
import _pythonpath
8
13465.2.10 by Stuart Bishop
Inline preflight checks
9
__all__ = [
10
    'DatabasePreflight',
11
    'KillConnectionsPreflight',
12
    'NoConnectionCheckPreflight',
13
    ]
14
15
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
16
from datetime import timedelta
17
from optparse import OptionParser
13465.2.36 by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting)
18
import time
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
19
7675.395.196 by Stuart Bishop
preflight check for database upgrades
20
import psycopg2
21
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
22
from canonical.database.sqlbase import (
23
    connect,
24
    ISOLATION_LEVEL_AUTOCOMMIT,
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
25
    sqlvalues,
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
26
    )
27
from canonical.launchpad.scripts import (
28
    db_options,
29
    logger,
30
    logger_options,
31
    )
32
import replication.helpers
33
34
35
# Ignore connections by these users.
7675.395.202 by Stuart Bishop
Add lagmon as a system user, correct variable name
36
SYSTEM_USERS = frozenset(['postgres', 'slony', 'nagios', 'lagmon'])
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
37
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
38
# Fail checks if these users are connected. If a process should not be
39
# interrupted by a rollout, the database user it connects as should be
40
# added here. The preflight check will fail if any of these users are
41
# connected, so these systems will need to be shut down manually before
42
# a database update.
9893.9.6 by Stuart Bishop
fiera is no longer fragile
43
FRAGILE_USERS = frozenset(['archivepublisher', 'buildd_manager'])
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
44
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
45
# How lagged the cluster can be before failing the preflight check.
13465.2.18 by Stuart Bishop
delint
46
MAX_LAG = timedelta(seconds=60)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
47
48
49
class DatabasePreflight:
13465.2.10 by Stuart Bishop
Inline preflight checks
50
    def __init__(self, log):
13879.1.6 by William Grant
Some more that I missed.
51
        master_con = connect(isolation=ISOLATION_LEVEL_AUTOCOMMIT)
13465.2.10 by Stuart Bishop
Inline preflight checks
52
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
53
        self.log = log
54
        self.is_replicated = replication.helpers.slony_installed(master_con)
55
        if self.is_replicated:
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
56
            self.nodes = set(
57
                replication.helpers.get_all_cluster_nodes(master_con))
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
58
            for node in self.nodes:
59
                node.con = psycopg2.connect(node.connection_string)
60
                node.con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
13465.2.2 by Stuart Bishop
Add option to skip open connection check
61
62
            # Create a list of nodes subscribed to the replicated sets we
63
            # are modifying.
64
            cur = master_con.cursor()
65
            cur.execute("""
66
                WITH subscriptions AS (
67
                    SELECT *
68
                    FROM _sl.sl_subscribe
69
                    WHERE sub_set = 1 AND sub_active IS TRUE)
70
                SELECT sub_provider FROM subscriptions
71
                UNION
72
                SELECT sub_receiver FROM subscriptions
73
                """)
74
            lpmain_node_ids = set(row[0] for row in cur.fetchall())
75
            self.lpmain_nodes = set(
76
                node for node in self.nodes
77
                if node.node_id in lpmain_node_ids)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
78
        else:
79
            node = replication.helpers.Node(None, None, None, True)
80
            node.con = master_con
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
81
            self.nodes = set([node])
13465.2.2 by Stuart Bishop
Add option to skip open connection check
82
            self.lpmain_nodes = self.nodes
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
83
84
    def check_is_superuser(self):
85
        """Return True if all the node connections are as superusers."""
86
        success = True
87
        for node in self.nodes:
88
            cur = node.con.cursor()
89
            cur.execute("""
90
                SELECT current_database(), pg_user.usesuper
91
                FROM pg_user
92
                WHERE usename = current_user
93
                """)
94
            dbname, is_super = cur.fetchone()
95
            if is_super:
96
                self.log.debug("Connected to %s as a superuser.", dbname)
97
            else:
98
                self.log.fatal("Not connected to %s as a superuser.", dbname)
99
                success = False
100
        return success
101
102
    def check_open_connections(self):
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
103
        """False if any lpmain nodes have connections from non-system users.
104
105
        We only check on subscribed nodes, as there will be active systems
106
        connected to other nodes in the replication cluster (such as the
107
        SSO servers).
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
108
109
        System users are defined by SYSTEM_USERS.
110
        """
111
        success = True
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
112
        for node in self.lpmain_nodes:
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
113
            cur = node.con.cursor()
114
            cur.execute("""
115
                SELECT datname, usename, COUNT(*) AS num_connections
116
                FROM pg_stat_activity
7675.395.196 by Stuart Bishop
preflight check for database upgrades
117
                WHERE
118
                    datname=current_database()
119
                    AND procpid <> pg_backend_pid()
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
120
                GROUP BY datname, usename
121
                """)
122
            for datname, usename, num_connections in cur.fetchall():
123
                if usename in SYSTEM_USERS:
124
                    self.log.debug(
125
                        "%s has %d connections by %s",
126
                        datname, num_connections, usename)
127
                else:
128
                    self.log.fatal(
129
                        "%s has %d connections by %s",
130
                        datname, num_connections, usename)
131
                    success = False
7675.395.196 by Stuart Bishop
preflight check for database upgrades
132
        if success:
133
            self.log.info("Only system users connected to the cluster")
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
134
        return success
135
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
136
    def check_fragile_connections(self):
137
        """Fail if any FRAGILE_USERS are connected to the cluster.
138
139
        If we interrupt these processes, we may have a mess to clean
140
        up. If they are connected, the preflight check should fail.
141
        """
142
        success = True
143
        for node in self.lpmain_nodes:
144
            cur = node.con.cursor()
145
            cur.execute("""
146
                SELECT datname, usename, COUNT(*) AS num_connections
147
                FROM pg_stat_activity
148
                WHERE
149
                    datname=current_database()
150
                    AND procpid <> pg_backend_pid()
151
                    AND usename IN %s
152
                GROUP BY datname, usename
153
                """ % sqlvalues(FRAGILE_USERS))
154
            for datname, usename, num_connections in cur.fetchall():
155
                self.log.fatal(
156
                    "Fragile system %s running. %s has %d connections.",
157
                    usename, datname, num_connections)
158
                success = False
159
        if success:
160
            self.log.info(
161
                "No fragile systems connected to the cluster (%s)"
162
                % ', '.join(FRAGILE_USERS))
163
        return success
164
7675.1212.16 by Stuart Bishop
Saner definition of 'long running transactions'
165
    def check_long_running_transactions(self, max_secs=60):
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
166
        """Return False if any nodes have long running transactions open.
167
168
        max_secs defines what is long running. For database rollouts,
169
        this will be short. Even if the transaction is benign like a
170
        autovacuum task, we should wait until things have settled down.
171
        """
172
        success = True
173
        for node in self.nodes:
174
            cur = node.con.cursor()
175
            cur.execute("""
176
                SELECT
177
                    datname, usename,
178
                    age(current_timestamp, xact_start) AS age, current_query
179
                FROM pg_stat_activity
180
                WHERE
181
                    age(current_timestamp, xact_start) > interval '%d secs'
182
                    AND datname=current_database()
183
                """ % max_secs)
184
            for datname, usename, age, current_query in cur.fetchall():
185
                self.log.fatal(
186
                    "%s has transaction by %s open %s",
187
                    datname, usename, age)
188
                success = False
7675.395.196 by Stuart Bishop
preflight check for database upgrades
189
        if success:
190
            self.log.info("No long running transactions detected.")
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
191
        return success
192
193
    def check_replication_lag(self):
194
        """Return False if the replication cluster is badly lagged."""
195
        if not self.is_replicated:
196
            self.log.debug("Not replicated - no replication lag.")
197
            return True
198
199
        # Check replication lag on every node just in case there are
200
        # disagreements.
7675.395.196 by Stuart Bishop
preflight check for database upgrades
201
        max_lag = timedelta(seconds=-1)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
202
        for node in self.nodes:
203
            cur = node.con.cursor()
204
            cur.execute("""
205
                SELECT current_database(),
206
                max(st_lag_time) AS lag FROM _sl.sl_status
207
            """)
208
            dbname, lag = cur.fetchone()
209
            if lag > max_lag:
210
                max_lag = lag
211
            self.log.debug(
7675.395.196 by Stuart Bishop
preflight check for database upgrades
212
                "%s reports database lag of %s.", dbname, lag)
213
        if max_lag <= MAX_LAG:
214
            self.log.info("Database cluster lag is ok (%s)", max_lag)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
215
            return True
216
        else:
7675.395.196 by Stuart Bishop
preflight check for database upgrades
217
            self.log.fatal("Database cluster lag is high (%s)", max_lag)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
218
            return False
219
7675.395.196 by Stuart Bishop
preflight check for database upgrades
220
    def check_can_sync(self):
221
        """Return True if a sync event is acknowledged by all nodes.
222
223
        We only wait 30 seconds for the sync, because we require the
224
        cluster to be quiescent.
225
        """
226
        if self.is_replicated:
13465.2.32 by Stuart Bishop
Continue preflight checks if sync check failed, rather than abort
227
            success = replication.helpers.sync(30, exit_on_fail=False)
7675.395.196 by Stuart Bishop
preflight check for database upgrades
228
            if success:
229
                self.log.info(
230
                    "Replication events are being propagated.")
231
            else:
232
                self.log.fatal(
233
                    "Replication events are not being propagated.")
234
                self.log.fatal(
235
                    "One or more replication daemons may be down.")
236
                self.log.fatal(
237
                    "Bounce the replication daemons and check the logs.")
238
            return success
239
        else:
240
            return True
241
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
242
    def check_all(self):
243
        """Run all checks.
244
245
        If any failed, return False. Otherwise return True.
246
        """
247
        if not self.check_is_superuser():
248
            # No point continuing - results will be bogus without access
249
            # to pg_stat_activity
250
            return False
251
252
        success = True
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
253
        if not self.check_replication_lag():
254
            success = False
255
        if not self.check_can_sync():
256
            success = False
13465.2.13 by Stuart Bishop
note
257
        # Do checks on open transactions last to minimize race
258
        # conditions.
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
259
        if not self.check_open_connections():
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
260
            success = False
261
        if not self.check_long_running_transactions():
262
            success = False
13465.2.12 by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting
263
        if not self.check_fragile_connections():
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
264
            success = False
265
        return success
266
267
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
268
class NoConnectionCheckPreflight(DatabasePreflight):
269
    def check_open_connections(self):
270
        return True
271
272
273
class KillConnectionsPreflight(DatabasePreflight):
274
    def check_open_connections(self):
275
        """Kill all non-system connections to Launchpad databases.
276
277
        We only check on subscribed nodes, as there will be active systems
278
        connected to other nodes in the replication cluster (such as the
279
        SSO servers).
280
281
        System users are defined by SYSTEM_USERS.
282
        """
13465.2.36 by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting)
283
        # We keep trying to terminate connections every 0.5 seconds for
284
        # up to 10 seconds.
285
        num_tries = 20
286
        seconds_to_pause = 0.5
287
        for loop_count in range(num_tries):
288
            all_clear = True
289
            for node in self.lpmain_nodes:
290
                cur = node.con.cursor()
291
                cur.execute("""
292
                    SELECT
293
                        procpid, datname, usename,
294
                        pg_terminate_backend(procpid)
295
                    FROM pg_stat_activity
296
                    WHERE
297
                        datname=current_database()
298
                        AND procpid <> pg_backend_pid()
299
                        AND usename NOT IN %s
300
                    """ % sqlvalues(SYSTEM_USERS))
301
                for procpid, datname, usename, ignored in cur.fetchall():
302
                    all_clear = False
303
                    if loop_count == num_tries - 1:
304
                        self.log.fatal(
305
                            "Unable to kill %s [%s] on %s",
306
                            usename, procpid, datname)
307
                    else:
308
                        self.log.warning(
309
                            "Killed %s [%s] on %s", usename, procpid, datname)
310
            if all_clear:
311
                break
312
313
            # Wait a little for any terminated connections to actually
314
            # terminate.
315
            time.sleep(seconds_to_pause)
316
        return all_clear
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
317
318
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
319
def main():
320
    parser = OptionParser()
321
    db_options(parser)
322
    logger_options(parser)
13465.2.2 by Stuart Bishop
Add option to skip open connection check
323
    parser.add_option(
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
324
        "--skip-connection-check", dest='skip_connection_check',
325
        default=False, action="store_true",
13465.2.2 by Stuart Bishop
Add option to skip open connection check
326
        help="Don't check open connections.")
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
327
    parser.add_option(
13465.2.4 by Stuart Bishop
Simplify cli
328
        "--kill-connections", dest='kill_connections',
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
329
        default=False, action="store_true",
13465.2.4 by Stuart Bishop
Simplify cli
330
        help="Kill non-system connections instead of reporting an error.")
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
331
    (options, args) = parser.parse_args()
332
    if args:
333
        parser.error("Too many arguments")
334
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
335
    if options.kill_connections and options.skip_connection_check:
336
        parser.error(
13465.2.4 by Stuart Bishop
Simplify cli
337
            "--skip-connection-check conflicts with --kill-connections")
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
338
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
339
    log = logger(options)
340
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
341
    if options.kill_connections:
13465.2.10 by Stuart Bishop
Inline preflight checks
342
        preflight_check = KillConnectionsPreflight(log)
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
343
    elif options.skip_connection_check:
13465.2.10 by Stuart Bishop
Inline preflight checks
344
        preflight_check = NoConnectionCheckPreflight(log)
13465.2.3 by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections
345
    else:
13465.2.10 by Stuart Bishop
Inline preflight checks
346
        preflight_check = DatabasePreflight(log)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
347
348
    if preflight_check.check_all():
7675.395.196 by Stuart Bishop
preflight check for database upgrades
349
        log.info('Preflight check succeeded. Good to go.')
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
350
        return 0
351
    else:
7675.395.196 by Stuart Bishop
preflight check for database upgrades
352
        log.error('Preflight check failed.')
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
353
        return 1
354
355
356
if __name__ == '__main__':
13465.2.36 by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting)
357
    raise SystemExit(main())