~launchpad-pqm/launchpad/devel

7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
1
#!/usr/bin/python2.6 -S
2
# Copyright 2011 Canonical Ltd.  This software is licensed under the
3
# GNU Affero General Public License version 3 (see the file LICENSE).
4
5
"""Confirm the database systems are ready to be patched as best we can."""
6
7
import _pythonpath
8
9
from datetime import timedelta
10
from optparse import OptionParser
11
import sys
12
7675.395.196 by Stuart Bishop
preflight check for database upgrades
13
import psycopg2
14
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
15
from canonical.database.sqlbase import (
16
    connect,
17
    ISOLATION_LEVEL_AUTOCOMMIT,
18
    )
19
from canonical.launchpad.scripts import (
20
    db_options,
21
    logger,
22
    logger_options,
23
    )
24
from canonical import lp
25
import replication.helpers
26
27
28
# Ignore connections by these users.
7675.395.202 by Stuart Bishop
Add lagmon as a system user, correct variable name
29
SYSTEM_USERS = frozenset(['postgres', 'slony', 'nagios', 'lagmon'])
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
30
31
# How lagged the cluster can be before failing the preflight check.
7675.395.196 by Stuart Bishop
preflight check for database upgrades
32
MAX_LAG = timedelta(seconds=45)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
33
34
35
class DatabasePreflight:
36
    def __init__(self, log, master_con):
37
        self.log = log
38
        self.is_replicated = replication.helpers.slony_installed(master_con)
39
        if self.is_replicated:
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
40
            self.nodes = set(
41
                replication.helpers.get_all_cluster_nodes(master_con))
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
42
            for node in self.nodes:
43
                node.con = psycopg2.connect(node.connection_string)
44
                node.con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
45
        else:
46
            node = replication.helpers.Node(None, None, None, True)
47
            node.con = master_con
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
48
            self.nodes = set([node])
49
50
        # Create a list of nodes subscribed to the replicated sets we
51
        # are modifying.
52
        cur = master_con.cursor()
53
        cur.execute("""
54
            WITH subscriptions AS (
55
                SELECT *
56
                FROM _sl.sl_subscribe
57
                WHERE sub_set = 1 AND sub_active IS TRUE)
58
            SELECT sub_provider FROM subscriptions
59
            UNION
60
            SELECT sub_receiver FROM subscriptions
61
            """)
62
        lpmain_node_ids = set(row[0] for row in cur.fetchall())
63
        self.lpmain_nodes = set(
7675.395.202 by Stuart Bishop
Add lagmon as a system user, correct variable name
64
            node for node in self.nodes
65
            if node.node_id in lpmain_node_ids)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
66
67
    def check_is_superuser(self):
68
        """Return True if all the node connections are as superusers."""
69
        success = True
70
        for node in self.nodes:
71
            cur = node.con.cursor()
72
            cur.execute("""
73
                SELECT current_database(), pg_user.usesuper
74
                FROM pg_user
75
                WHERE usename = current_user
76
                """)
77
            dbname, is_super = cur.fetchone()
78
            if is_super:
79
                self.log.debug("Connected to %s as a superuser.", dbname)
80
            else:
81
                self.log.fatal("Not connected to %s as a superuser.", dbname)
82
                success = False
83
        return success
84
85
    def check_open_connections(self):
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
86
        """False if any lpmain nodes have connections from non-system users.
87
88
        We only check on subscribed nodes, as there will be active systems
89
        connected to other nodes in the replication cluster (such as the
90
        SSO servers).
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
91
92
        System users are defined by SYSTEM_USERS.
93
        """
94
        success = True
7675.395.201 by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set.
95
        for node in self.lpmain_nodes:
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
96
            cur = node.con.cursor()
97
            cur.execute("""
98
                SELECT datname, usename, COUNT(*) AS num_connections
99
                FROM pg_stat_activity
7675.395.196 by Stuart Bishop
preflight check for database upgrades
100
                WHERE
101
                    datname=current_database()
102
                    AND procpid <> pg_backend_pid()
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
103
                GROUP BY datname, usename
104
                """)
105
            for datname, usename, num_connections in cur.fetchall():
106
                if usename in SYSTEM_USERS:
107
                    self.log.debug(
108
                        "%s has %d connections by %s",
109
                        datname, num_connections, usename)
110
                else:
111
                    self.log.fatal(
112
                        "%s has %d connections by %s",
113
                        datname, num_connections, usename)
114
                    success = False
7675.395.196 by Stuart Bishop
preflight check for database upgrades
115
        if success:
116
            self.log.info("Only system users connected to the cluster")
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
117
        return success
118
119
    def check_long_running_transactions(self, max_secs=10):
120
        """Return False if any nodes have long running transactions open.
121
122
        max_secs defines what is long running. For database rollouts,
123
        this will be short. Even if the transaction is benign like a
124
        autovacuum task, we should wait until things have settled down.
125
        """
126
        success = True
127
        for node in self.nodes:
128
            cur = node.con.cursor()
129
            cur.execute("""
130
                SELECT
131
                    datname, usename,
132
                    age(current_timestamp, xact_start) AS age, current_query
133
                FROM pg_stat_activity
134
                WHERE
135
                    age(current_timestamp, xact_start) > interval '%d secs'
136
                    AND datname=current_database()
137
                """ % max_secs)
138
            for datname, usename, age, current_query in cur.fetchall():
139
                self.log.fatal(
140
                    "%s has transaction by %s open %s",
141
                    datname, usename, age)
142
                success = False
7675.395.196 by Stuart Bishop
preflight check for database upgrades
143
        if success:
144
            self.log.info("No long running transactions detected.")
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
145
        return success
146
147
    def check_replication_lag(self):
148
        """Return False if the replication cluster is badly lagged."""
149
        if not self.is_replicated:
150
            self.log.debug("Not replicated - no replication lag.")
151
            return True
152
153
        # Check replication lag on every node just in case there are
154
        # disagreements.
7675.395.196 by Stuart Bishop
preflight check for database upgrades
155
        max_lag = timedelta(seconds=-1)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
156
        max_lag_node = None
157
        for node in self.nodes:
158
            cur = node.con.cursor()
159
            cur.execute("""
160
                SELECT current_database(),
161
                max(st_lag_time) AS lag FROM _sl.sl_status
162
            """)
163
            dbname, lag = cur.fetchone()
164
            if lag > max_lag:
165
                max_lag = lag
166
                max_lag_node = node
167
            self.log.debug(
7675.395.196 by Stuart Bishop
preflight check for database upgrades
168
                "%s reports database lag of %s.", dbname, lag)
169
        if max_lag <= MAX_LAG:
170
            self.log.info("Database cluster lag is ok (%s)", max_lag)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
171
            return True
172
        else:
7675.395.196 by Stuart Bishop
preflight check for database upgrades
173
            self.log.fatal("Database cluster lag is high (%s)", max_lag)
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
174
            return False
175
7675.395.196 by Stuart Bishop
preflight check for database upgrades
176
    def check_can_sync(self):
177
        """Return True if a sync event is acknowledged by all nodes.
178
179
        We only wait 30 seconds for the sync, because we require the
180
        cluster to be quiescent.
181
        """
182
        if self.is_replicated:
183
            success = replication.helpers.sync(30)
184
            if success:
185
                self.log.info(
186
                    "Replication events are being propagated.")
187
            else:
188
                self.log.fatal(
189
                    "Replication events are not being propagated.")
190
                self.log.fatal(
191
                    "One or more replication daemons may be down.")
192
                self.log.fatal(
193
                    "Bounce the replication daemons and check the logs.")
194
            return success
195
        else:
196
            return True
197
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
198
    def check_all(self):
199
        """Run all checks.
200
201
        If any failed, return False. Otherwise return True.
202
        """
203
        if not self.check_is_superuser():
204
            # No point continuing - results will be bogus without access
205
            # to pg_stat_activity
206
            return False
207
208
        success = True
209
        if not self.check_open_connections():
210
            success = False
211
        if not self.check_long_running_transactions():
212
            success = False
213
        if not self.check_replication_lag():
214
            success = False
215
        if not self.check_can_sync():
216
            success = False
217
        return success
218
219
220
def main():
221
    parser = OptionParser()
222
    db_options(parser)
223
    logger_options(parser)
224
    (options, args) = parser.parse_args()
225
    if args:
226
        parser.error("Too many arguments")
227
228
    log = logger(options)
229
230
    master_con = connect(lp.dbuser)
231
    master_con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
232
233
    preflight_check = DatabasePreflight(log, master_con)
234
235
    if preflight_check.check_all():
7675.395.196 by Stuart Bishop
preflight check for database upgrades
236
        log.info('Preflight check succeeded. Good to go.')
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
237
        return 0
238
    else:
7675.395.196 by Stuart Bishop
preflight check for database upgrades
239
        log.error('Preflight check failed.')
7675.395.195 by Stuart Bishop
database deploy preflight check work in progress
240
        return 1
241
242
243
if __name__ == '__main__':
244
    sys.exit(main())