7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
1 |
#!/usr/bin/python2.6 -S
|
2 |
# Copyright 2011 Canonical Ltd. This software is licensed under the
|
|
3 |
# GNU Affero General Public License version 3 (see the file LICENSE).
|
|
4 |
||
5 |
"""Confirm the database systems are ready to be patched as best we can."""
|
|
6 |
||
7 |
import _pythonpath |
|
8 |
||
9 |
from datetime import timedelta |
|
10 |
from optparse import OptionParser |
|
11 |
import sys |
|
12 |
||
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
13 |
import psycopg2 |
14 |
||
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
15 |
from canonical.database.sqlbase import ( |
16 |
connect, |
|
17 |
ISOLATION_LEVEL_AUTOCOMMIT, |
|
18 |
)
|
|
19 |
from canonical.launchpad.scripts import ( |
|
20 |
db_options, |
|
21 |
logger, |
|
22 |
logger_options, |
|
23 |
)
|
|
24 |
from canonical import lp |
|
25 |
import replication.helpers |
|
26 |
||
27 |
||
28 |
# Ignore connections by these users.
|
|
7675.395.202
by Stuart Bishop
Add lagmon as a system user, correct variable name |
29 |
SYSTEM_USERS = frozenset(['postgres', 'slony', 'nagios', 'lagmon']) |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
30 |
|
31 |
# How lagged the cluster can be before failing the preflight check.
|
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
32 |
MAX_LAG = timedelta(seconds=45) |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
33 |
|
34 |
||
35 |
class DatabasePreflight: |
|
36 |
def __init__(self, log, master_con): |
|
37 |
self.log = log |
|
38 |
self.is_replicated = replication.helpers.slony_installed(master_con) |
|
39 |
if self.is_replicated: |
|
7675.395.201
by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set. |
40 |
self.nodes = set( |
41 |
replication.helpers.get_all_cluster_nodes(master_con)) |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
42 |
for node in self.nodes: |
43 |
node.con = psycopg2.connect(node.connection_string) |
|
44 |
node.con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) |
|
45 |
else: |
|
46 |
node = replication.helpers.Node(None, None, None, True) |
|
47 |
node.con = master_con |
|
7675.395.201
by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set. |
48 |
self.nodes = set([node]) |
49 |
||
50 |
# Create a list of nodes subscribed to the replicated sets we
|
|
51 |
# are modifying.
|
|
52 |
cur = master_con.cursor() |
|
53 |
cur.execute(""" |
|
54 |
WITH subscriptions AS (
|
|
55 |
SELECT *
|
|
56 |
FROM _sl.sl_subscribe
|
|
57 |
WHERE sub_set = 1 AND sub_active IS TRUE)
|
|
58 |
SELECT sub_provider FROM subscriptions
|
|
59 |
UNION
|
|
60 |
SELECT sub_receiver FROM subscriptions
|
|
61 |
""") |
|
62 |
lpmain_node_ids = set(row[0] for row in cur.fetchall()) |
|
63 |
self.lpmain_nodes = set( |
|
7675.395.202
by Stuart Bishop
Add lagmon as a system user, correct variable name |
64 |
node for node in self.nodes |
65 |
if node.node_id in lpmain_node_ids) |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
66 |
|
67 |
def check_is_superuser(self): |
|
68 |
"""Return True if all the node connections are as superusers."""
|
|
69 |
success = True |
|
70 |
for node in self.nodes: |
|
71 |
cur = node.con.cursor() |
|
72 |
cur.execute(""" |
|
73 |
SELECT current_database(), pg_user.usesuper
|
|
74 |
FROM pg_user
|
|
75 |
WHERE usename = current_user
|
|
76 |
""") |
|
77 |
dbname, is_super = cur.fetchone() |
|
78 |
if is_super: |
|
79 |
self.log.debug("Connected to %s as a superuser.", dbname) |
|
80 |
else: |
|
81 |
self.log.fatal("Not connected to %s as a superuser.", dbname) |
|
82 |
success = False |
|
83 |
return success |
|
84 |
||
85 |
def check_open_connections(self): |
|
7675.395.201
by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set. |
86 |
"""False if any lpmain nodes have connections from non-system users.
|
87 |
||
88 |
We only check on subscribed nodes, as there will be active systems
|
|
89 |
connected to other nodes in the replication cluster (such as the
|
|
90 |
SSO servers).
|
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
91 |
|
92 |
System users are defined by SYSTEM_USERS.
|
|
93 |
"""
|
|
94 |
success = True |
|
7675.395.201
by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set. |
95 |
for node in self.lpmain_nodes: |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
96 |
cur = node.con.cursor() |
97 |
cur.execute(""" |
|
98 |
SELECT datname, usename, COUNT(*) AS num_connections
|
|
99 |
FROM pg_stat_activity
|
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
100 |
WHERE
|
101 |
datname=current_database()
|
|
102 |
AND procpid <> pg_backend_pid()
|
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
103 |
GROUP BY datname, usename
|
104 |
""") |
|
105 |
for datname, usename, num_connections in cur.fetchall(): |
|
106 |
if usename in SYSTEM_USERS: |
|
107 |
self.log.debug( |
|
108 |
"%s has %d connections by %s", |
|
109 |
datname, num_connections, usename) |
|
110 |
else: |
|
111 |
self.log.fatal( |
|
112 |
"%s has %d connections by %s", |
|
113 |
datname, num_connections, usename) |
|
114 |
success = False |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
115 |
if success: |
116 |
self.log.info("Only system users connected to the cluster") |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
117 |
return success |
118 |
||
119 |
def check_long_running_transactions(self, max_secs=10): |
|
120 |
"""Return False if any nodes have long running transactions open.
|
|
121 |
||
122 |
max_secs defines what is long running. For database rollouts,
|
|
123 |
this will be short. Even if the transaction is benign like a
|
|
124 |
autovacuum task, we should wait until things have settled down.
|
|
125 |
"""
|
|
126 |
success = True |
|
127 |
for node in self.nodes: |
|
128 |
cur = node.con.cursor() |
|
129 |
cur.execute(""" |
|
130 |
SELECT
|
|
131 |
datname, usename,
|
|
132 |
age(current_timestamp, xact_start) AS age, current_query
|
|
133 |
FROM pg_stat_activity
|
|
134 |
WHERE
|
|
135 |
age(current_timestamp, xact_start) > interval '%d secs' |
|
136 |
AND datname=current_database()
|
|
137 |
""" % max_secs) |
|
138 |
for datname, usename, age, current_query in cur.fetchall(): |
|
139 |
self.log.fatal( |
|
140 |
"%s has transaction by %s open %s", |
|
141 |
datname, usename, age) |
|
142 |
success = False |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
143 |
if success: |
144 |
self.log.info("No long running transactions detected.") |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
145 |
return success |
146 |
||
147 |
def check_replication_lag(self): |
|
148 |
"""Return False if the replication cluster is badly lagged."""
|
|
149 |
if not self.is_replicated: |
|
150 |
self.log.debug("Not replicated - no replication lag.") |
|
151 |
return True |
|
152 |
||
153 |
# Check replication lag on every node just in case there are
|
|
154 |
# disagreements.
|
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
155 |
max_lag = timedelta(seconds=-1) |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
156 |
max_lag_node = None |
157 |
for node in self.nodes: |
|
158 |
cur = node.con.cursor() |
|
159 |
cur.execute(""" |
|
160 |
SELECT current_database(),
|
|
161 |
max(st_lag_time) AS lag FROM _sl.sl_status
|
|
162 |
""") |
|
163 |
dbname, lag = cur.fetchone() |
|
164 |
if lag > max_lag: |
|
165 |
max_lag = lag |
|
166 |
max_lag_node = node |
|
167 |
self.log.debug( |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
168 |
"%s reports database lag of %s.", dbname, lag) |
169 |
if max_lag <= MAX_LAG: |
|
170 |
self.log.info("Database cluster lag is ok (%s)", max_lag) |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
171 |
return True |
172 |
else: |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
173 |
self.log.fatal("Database cluster lag is high (%s)", max_lag) |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
174 |
return False |
175 |
||
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
176 |
def check_can_sync(self): |
177 |
"""Return True if a sync event is acknowledged by all nodes.
|
|
178 |
||
179 |
We only wait 30 seconds for the sync, because we require the
|
|
180 |
cluster to be quiescent.
|
|
181 |
"""
|
|
182 |
if self.is_replicated: |
|
183 |
success = replication.helpers.sync(30) |
|
184 |
if success: |
|
185 |
self.log.info( |
|
186 |
"Replication events are being propagated.") |
|
187 |
else: |
|
188 |
self.log.fatal( |
|
189 |
"Replication events are not being propagated.") |
|
190 |
self.log.fatal( |
|
191 |
"One or more replication daemons may be down.") |
|
192 |
self.log.fatal( |
|
193 |
"Bounce the replication daemons and check the logs.") |
|
194 |
return success |
|
195 |
else: |
|
196 |
return True |
|
197 |
||
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
198 |
def check_all(self): |
199 |
"""Run all checks.
|
|
200 |
||
201 |
If any failed, return False. Otherwise return True.
|
|
202 |
"""
|
|
203 |
if not self.check_is_superuser(): |
|
204 |
# No point continuing - results will be bogus without access
|
|
205 |
# to pg_stat_activity
|
|
206 |
return False |
|
207 |
||
208 |
success = True |
|
209 |
if not self.check_open_connections(): |
|
210 |
success = False |
|
211 |
if not self.check_long_running_transactions(): |
|
212 |
success = False |
|
213 |
if not self.check_replication_lag(): |
|
214 |
success = False |
|
215 |
if not self.check_can_sync(): |
|
216 |
success = False |
|
217 |
return success |
|
218 |
||
219 |
||
220 |
def main(): |
|
221 |
parser = OptionParser() |
|
222 |
db_options(parser) |
|
223 |
logger_options(parser) |
|
224 |
(options, args) = parser.parse_args() |
|
225 |
if args: |
|
226 |
parser.error("Too many arguments") |
|
227 |
||
228 |
log = logger(options) |
|
229 |
||
230 |
master_con = connect(lp.dbuser) |
|
231 |
master_con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) |
|
232 |
||
233 |
preflight_check = DatabasePreflight(log, master_con) |
|
234 |
||
235 |
if preflight_check.check_all(): |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
236 |
log.info('Preflight check succeeded. Good to go.') |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
237 |
return 0 |
238 |
else: |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
239 |
log.error('Preflight check failed.') |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
240 |
return 1 |
241 |
||
242 |
||
243 |
if __name__ == '__main__': |
|
244 |
sys.exit(main()) |