1
#!/usr/bin/python2.6 -S
2
# Copyright 2011 Canonical Ltd. This software is licensed under the
3
# GNU Affero General Public License version 3 (see the file LICENSE).
5
"""Confirm the database systems are ready to be patched as best we can."""
9
from datetime import timedelta
from optparse import OptionParser

import psycopg2

from canonical.database.sqlbase import (
    connect,
    ISOLATION_LEVEL_AUTOCOMMIT,
    )
from canonical.launchpad.scripts import (
    logger,
    logger_options,
    )
from canonical import lp
import replication.helpers
28
# Ignore connections by these users.
29
SYSTEM_USERS = frozenset(['postgres', 'slony', 'nagios'])
31
# How lagged the cluster can be before failing the preflight check.
32
MAX_LAG = timedelta(seconds=45)
35
class DatabasePreflight:
    """Run sanity checks against every database node before patching.

    Each ``check_*`` method reports its findings through ``self.log`` and
    returns True when the check passes and False when it fails.
    """

    def __init__(self, log, master_con):
        """Gather a connection for every node that needs checking.

        :param log: Logger used to report the outcome of each check.
        :param master_con: Open connection to the master database.
        """
        self.log = log
        self.is_replicated = replication.helpers.slony_installed(master_con)
        if self.is_replicated:
            self.nodes = replication.helpers.get_all_cluster_nodes(master_con)
            for node in self.nodes:
                node.con = psycopg2.connect(node.connection_string)
                node.con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        else:
            # Without replication the master is the only node to check,
            # so wrap the existing connection in a lone Node.
            node = replication.helpers.Node(None, None, None, True)
            node.con = master_con
            self.nodes = [node]

    def check_is_superuser(self):
        """Return True if all the node connections are as superusers."""
        success = True
        for node in self.nodes:
            cur = node.con.cursor()
            cur.execute("""
                SELECT current_database(), pg_user.usesuper
                FROM pg_user
                WHERE usename = current_user
                """)
            dbname, is_super = cur.fetchone()
            if is_super:
                self.log.debug("Connected to %s as a superuser.", dbname)
            else:
                self.log.fatal("Not connected to %s as a superuser.", dbname)
                success = False
        return success

    def check_open_connections(self):
        """Return False if any nodes have connections from non-system users.

        System users are defined by SYSTEM_USERS.
        """
        success = True
        for node in self.nodes:
            cur = node.con.cursor()
            # Our own backend is excluded so this check does not trip
            # over the connection it is running on.
            cur.execute("""
                SELECT datname, usename, COUNT(*) AS num_connections
                FROM pg_stat_activity
                WHERE
                    datname=current_database()
                    AND procpid <> pg_backend_pid()
                GROUP BY datname, usename
                """)
            for datname, usename, num_connections in cur.fetchall():
                if usename in SYSTEM_USERS:
                    self.log.debug(
                        "%s has %d connections by %s",
                        datname, num_connections, usename)
                else:
                    self.log.fatal(
                        "%s has %d connections by %s",
                        datname, num_connections, usename)
                    success = False
        if success:
            self.log.info("Only system users connected to the cluster")
        return success

    def check_long_running_transactions(self, max_secs=10):
        """Return False if any nodes have long running transactions open.

        max_secs defines what is long running. For database rollouts,
        this will be short. Even if the transaction is benign like an
        autovacuum task, we should wait until things have settled down.
        """
        success = True
        for node in self.nodes:
            cur = node.con.cursor()
            # %d coerces max_secs to an integer before it is spliced
            # into the interval literal.
            cur.execute("""
                SELECT
                    datname, usename,
                    age(current_timestamp, xact_start) AS age, current_query
                FROM pg_stat_activity
                WHERE
                    age(current_timestamp, xact_start) > interval '%d secs'
                    AND datname=current_database()
                """ % max_secs)
            for datname, usename, age, current_query in cur.fetchall():
                self.log.fatal(
                    "%s has transaction by %s open %s",
                    datname, usename, age)
                success = False
        if success:
            self.log.info("No long running transactions detected.")
        return success

    def check_replication_lag(self):
        """Return False if the replication cluster is badly lagged."""
        if not self.is_replicated:
            self.log.debug("Not replicated - no replication lag.")
            return True

        # Check replication lag on every node just in case there are
        # disagreements between them; the worst value is what counts.
        max_lag = timedelta(seconds=-1)
        for node in self.nodes:
            cur = node.con.cursor()
            cur.execute("""
                SELECT current_database(),
                max(st_lag_time) AS lag FROM _sl.sl_status
                """)
            dbname, lag = cur.fetchone()
            # max() is NULL when sl_status has no rows, and None cannot
            # be ordered against a timedelta, so skip it explicitly.
            if lag is not None and lag > max_lag:
                max_lag = lag
            self.log.debug(
                "%s reports database lag of %s.", dbname, lag)
        if max_lag <= MAX_LAG:
            self.log.info("Database cluster lag is ok (%s)", max_lag)
            return True
        else:
            self.log.fatal("Database cluster lag is high (%s)", max_lag)
            return False

    def check_can_sync(self):
        """Return True if a sync event is acknowledged by all nodes.

        We only wait 30 seconds for the sync, because we require the
        cluster to be quiescent.
        """
        if not self.is_replicated:
            # Nothing to synchronise on a standalone database.
            return True

        success = replication.helpers.sync(30)
        if success:
            self.log.info(
                "Replication events are being propagated.")
        else:
            self.log.fatal(
                "Replication events are not being propagated.")
            self.log.fatal(
                "One or more replication daemons may be down.")
            self.log.fatal(
                "Bounce the replication daemons and check the logs.")
        return success

    def check_all(self):
        """Run all checks.

        If any failed, return False. Otherwise return True.
        """
        if not self.check_is_superuser():
            # No point continuing - results will be bogus without access
            # to pg_stat_activity
            return False

        # Run every remaining check even after a failure so that all
        # problems are reported in a single pass.
        success = True
        if not self.check_open_connections():
            success = False
        if not self.check_long_running_transactions():
            success = False
        if not self.check_replication_lag():
            success = False
        if not self.check_can_sync():
            success = False
        return success
199
def main():
    """Run the database preflight checks from the command line.

    Parses the logging options, connects to the master database as
    ``lp.dbuser`` in autocommit mode and runs every check.

    :return: 0 when all checks pass, 1 otherwise.
    """
    # NOTE(review): the extracted source is missing lines around this
    # point; confirm no further option groups were registered here.
    parser = OptionParser()
    logger_options(parser)
    (options, args) = parser.parse_args()
    if args:
        # The script takes options only, no positional arguments.
        parser.error("Too many arguments")

    log = logger(options)

    master_con = connect(lp.dbuser)
    master_con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

    preflight_check = DatabasePreflight(log, master_con)

    if preflight_check.check_all():
        log.info('Preflight check succeeded. Good to go.')
        return 0
    else:
        log.error('Preflight check failed.')
        return 1
221
if __name__ == '__main__':