7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
1 |
#!/usr/bin/python2.6 -S
|
2 |
# Copyright 2011 Canonical Ltd. This software is licensed under the
|
|
3 |
# GNU Affero General Public License version 3 (see the file LICENSE).
|
|
4 |
||
5 |
"""Confirm the database systems are ready to be patched as best we can."""
|
|
6 |
||
7 |
import _pythonpath |
|
8 |
||
13465.2.10
by Stuart Bishop
Inline preflight checks |
9 |
__all__ = [ |
10 |
'DatabasePreflight', |
|
11 |
'KillConnectionsPreflight', |
|
12 |
'NoConnectionCheckPreflight', |
|
13 |
]
|
|
14 |
||
15 |
||
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
16 |
from datetime import timedelta |
17 |
from optparse import OptionParser |
|
13465.2.36
by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting) |
18 |
import time |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
19 |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
20 |
import psycopg2 |
21 |
||
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
22 |
from canonical.database.sqlbase import ( |
23 |
connect, |
|
24 |
ISOLATION_LEVEL_AUTOCOMMIT, |
|
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
25 |
sqlvalues, |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
26 |
)
|
27 |
from canonical.launchpad.scripts import ( |
|
28 |
db_options, |
|
29 |
logger, |
|
30 |
logger_options, |
|
31 |
)
|
|
32 |
import replication.helpers |
|
33 |
||
34 |
||
35 |
# Ignore connections by these users.
|
|
7675.395.202
by Stuart Bishop
Add lagmon as a system user, correct variable name |
36 |
SYSTEM_USERS = frozenset(['postgres', 'slony', 'nagios', 'lagmon']) |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
37 |
|
13465.2.12
by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting |
38 |
# Fail checks if these users are connected. If a process should not be
|
39 |
# interrupted by a rollout, the database user it connects as should be
|
|
40 |
# added here. The preflight check will fail if any of these users are
|
|
41 |
# connected, so these systems will need to be shut down manually before
|
|
42 |
# a database update.
|
|
9893.9.6
by Stuart Bishop
fiera is no longer fragile |
43 |
FRAGILE_USERS = frozenset(['archivepublisher', 'buildd_manager']) |
13465.2.12
by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting |
44 |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
45 |
# How lagged the cluster can be before failing the preflight check.
|
13465.2.18
by Stuart Bishop
delint |
46 |
MAX_LAG = timedelta(seconds=60) |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
47 |
|
48 |
||
49 |
class DatabasePreflight: |
|
13465.2.10
by Stuart Bishop
Inline preflight checks |
50 |
def __init__(self, log): |
13879.1.6
by William Grant
Some more that I missed. |
51 |
master_con = connect(isolation=ISOLATION_LEVEL_AUTOCOMMIT) |
13465.2.10
by Stuart Bishop
Inline preflight checks |
52 |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
53 |
self.log = log |
54 |
self.is_replicated = replication.helpers.slony_installed(master_con) |
|
55 |
if self.is_replicated: |
|
7675.395.201
by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set. |
56 |
self.nodes = set( |
57 |
replication.helpers.get_all_cluster_nodes(master_con)) |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
58 |
for node in self.nodes: |
59 |
node.con = psycopg2.connect(node.connection_string) |
|
60 |
node.con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) |
|
13465.2.2
by Stuart Bishop
Add option to skip open connection check |
61 |
|
62 |
# Create a list of nodes subscribed to the replicated sets we
|
|
63 |
# are modifying.
|
|
64 |
cur = master_con.cursor() |
|
65 |
cur.execute(""" |
|
66 |
WITH subscriptions AS (
|
|
67 |
SELECT *
|
|
68 |
FROM _sl.sl_subscribe
|
|
69 |
WHERE sub_set = 1 AND sub_active IS TRUE)
|
|
70 |
SELECT sub_provider FROM subscriptions
|
|
71 |
UNION
|
|
72 |
SELECT sub_receiver FROM subscriptions
|
|
73 |
""") |
|
74 |
lpmain_node_ids = set(row[0] for row in cur.fetchall()) |
|
75 |
self.lpmain_nodes = set( |
|
76 |
node for node in self.nodes |
|
77 |
if node.node_id in lpmain_node_ids) |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
78 |
else: |
79 |
node = replication.helpers.Node(None, None, None, True) |
|
80 |
node.con = master_con |
|
7675.395.201
by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set. |
81 |
self.nodes = set([node]) |
13465.2.2
by Stuart Bishop
Add option to skip open connection check |
82 |
self.lpmain_nodes = self.nodes |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
83 |
|
84 |
def check_is_superuser(self): |
|
85 |
"""Return True if all the node connections are as superusers."""
|
|
86 |
success = True |
|
87 |
for node in self.nodes: |
|
88 |
cur = node.con.cursor() |
|
89 |
cur.execute(""" |
|
90 |
SELECT current_database(), pg_user.usesuper
|
|
91 |
FROM pg_user
|
|
92 |
WHERE usename = current_user
|
|
93 |
""") |
|
94 |
dbname, is_super = cur.fetchone() |
|
95 |
if is_super: |
|
96 |
self.log.debug("Connected to %s as a superuser.", dbname) |
|
97 |
else: |
|
98 |
self.log.fatal("Not connected to %s as a superuser.", dbname) |
|
99 |
success = False |
|
100 |
return success |
|
101 |
||
102 |
def check_open_connections(self): |
|
7675.395.201
by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set. |
103 |
"""False if any lpmain nodes have connections from non-system users.
|
104 |
||
105 |
We only check on subscribed nodes, as there will be active systems
|
|
106 |
connected to other nodes in the replication cluster (such as the
|
|
107 |
SSO servers).
|
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
108 |
|
109 |
System users are defined by SYSTEM_USERS.
|
|
110 |
"""
|
|
111 |
success = True |
|
7675.395.201
by Stuart Bishop
Only check for open database connections on nodes subscribed to the main launchpad replication set. |
112 |
for node in self.lpmain_nodes: |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
113 |
cur = node.con.cursor() |
114 |
cur.execute(""" |
|
115 |
SELECT datname, usename, COUNT(*) AS num_connections
|
|
116 |
FROM pg_stat_activity
|
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
117 |
WHERE
|
118 |
datname=current_database()
|
|
119 |
AND procpid <> pg_backend_pid()
|
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
120 |
GROUP BY datname, usename
|
121 |
""") |
|
122 |
for datname, usename, num_connections in cur.fetchall(): |
|
123 |
if usename in SYSTEM_USERS: |
|
124 |
self.log.debug( |
|
125 |
"%s has %d connections by %s", |
|
126 |
datname, num_connections, usename) |
|
127 |
else: |
|
128 |
self.log.fatal( |
|
129 |
"%s has %d connections by %s", |
|
130 |
datname, num_connections, usename) |
|
131 |
success = False |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
132 |
if success: |
133 |
self.log.info("Only system users connected to the cluster") |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
134 |
return success |
135 |
||
13465.2.12
by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting |
136 |
def check_fragile_connections(self): |
137 |
"""Fail if any FRAGILE_USERS are connected to the cluster.
|
|
138 |
||
139 |
If we interrupt these processes, we may have a mess to clean
|
|
140 |
up. If they are connected, the preflight check should fail.
|
|
141 |
"""
|
|
142 |
success = True |
|
143 |
for node in self.lpmain_nodes: |
|
144 |
cur = node.con.cursor() |
|
145 |
cur.execute(""" |
|
146 |
SELECT datname, usename, COUNT(*) AS num_connections
|
|
147 |
FROM pg_stat_activity
|
|
148 |
WHERE
|
|
149 |
datname=current_database()
|
|
150 |
AND procpid <> pg_backend_pid()
|
|
151 |
AND usename IN %s |
|
152 |
GROUP BY datname, usename
|
|
153 |
""" % sqlvalues(FRAGILE_USERS)) |
|
154 |
for datname, usename, num_connections in cur.fetchall(): |
|
155 |
self.log.fatal( |
|
156 |
"Fragile system %s running. %s has %d connections.", |
|
157 |
usename, datname, num_connections) |
|
158 |
success = False |
|
159 |
if success: |
|
160 |
self.log.info( |
|
161 |
"No fragile systems connected to the cluster (%s)" |
|
162 |
% ', '.join(FRAGILE_USERS)) |
|
163 |
return success |
|
164 |
||
7675.1212.16
by Stuart Bishop
Saner definition of 'long running transactions' |
165 |
def check_long_running_transactions(self, max_secs=60): |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
166 |
"""Return False if any nodes have long running transactions open.
|
167 |
||
168 |
max_secs defines what is long running. For database rollouts,
|
|
169 |
this will be short. Even if the transaction is benign like a
|
|
170 |
autovacuum task, we should wait until things have settled down.
|
|
171 |
"""
|
|
172 |
success = True |
|
173 |
for node in self.nodes: |
|
174 |
cur = node.con.cursor() |
|
175 |
cur.execute(""" |
|
176 |
SELECT
|
|
177 |
datname, usename,
|
|
178 |
age(current_timestamp, xact_start) AS age, current_query
|
|
179 |
FROM pg_stat_activity
|
|
180 |
WHERE
|
|
181 |
age(current_timestamp, xact_start) > interval '%d secs' |
|
182 |
AND datname=current_database()
|
|
183 |
""" % max_secs) |
|
184 |
for datname, usename, age, current_query in cur.fetchall(): |
|
185 |
self.log.fatal( |
|
186 |
"%s has transaction by %s open %s", |
|
187 |
datname, usename, age) |
|
188 |
success = False |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
189 |
if success: |
190 |
self.log.info("No long running transactions detected.") |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
191 |
return success |
192 |
||
193 |
def check_replication_lag(self): |
|
194 |
"""Return False if the replication cluster is badly lagged."""
|
|
195 |
if not self.is_replicated: |
|
196 |
self.log.debug("Not replicated - no replication lag.") |
|
197 |
return True |
|
198 |
||
199 |
# Check replication lag on every node just in case there are
|
|
200 |
# disagreements.
|
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
201 |
max_lag = timedelta(seconds=-1) |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
202 |
for node in self.nodes: |
203 |
cur = node.con.cursor() |
|
204 |
cur.execute(""" |
|
205 |
SELECT current_database(),
|
|
206 |
max(st_lag_time) AS lag FROM _sl.sl_status
|
|
207 |
""") |
|
208 |
dbname, lag = cur.fetchone() |
|
209 |
if lag > max_lag: |
|
210 |
max_lag = lag |
|
211 |
self.log.debug( |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
212 |
"%s reports database lag of %s.", dbname, lag) |
213 |
if max_lag <= MAX_LAG: |
|
214 |
self.log.info("Database cluster lag is ok (%s)", max_lag) |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
215 |
return True |
216 |
else: |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
217 |
self.log.fatal("Database cluster lag is high (%s)", max_lag) |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
218 |
return False |
219 |
||
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
220 |
def check_can_sync(self): |
221 |
"""Return True if a sync event is acknowledged by all nodes.
|
|
222 |
||
223 |
We only wait 30 seconds for the sync, because we require the
|
|
224 |
cluster to be quiescent.
|
|
225 |
"""
|
|
226 |
if self.is_replicated: |
|
13465.2.32
by Stuart Bishop
Continue preflight checks if sync check failed, rather than abort |
227 |
success = replication.helpers.sync(30, exit_on_fail=False) |
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
228 |
if success: |
229 |
self.log.info( |
|
230 |
"Replication events are being propagated.") |
|
231 |
else: |
|
232 |
self.log.fatal( |
|
233 |
"Replication events are not being propagated.") |
|
234 |
self.log.fatal( |
|
235 |
"One or more replication daemons may be down.") |
|
236 |
self.log.fatal( |
|
237 |
"Bounce the replication daemons and check the logs.") |
|
238 |
return success |
|
239 |
else: |
|
240 |
return True |
|
241 |
||
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
242 |
def check_all(self): |
243 |
"""Run all checks.
|
|
244 |
||
245 |
If any failed, return False. Otherwise return True.
|
|
246 |
"""
|
|
247 |
if not self.check_is_superuser(): |
|
248 |
# No point continuing - results will be bogus without access
|
|
249 |
# to pg_stat_activity
|
|
250 |
return False |
|
251 |
||
252 |
success = True |
|
13465.2.12
by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting |
253 |
if not self.check_replication_lag(): |
254 |
success = False |
|
255 |
if not self.check_can_sync(): |
|
256 |
success = False |
|
13465.2.13
by Stuart Bishop
note |
257 |
# Do checks on open transactions last to minimize race
|
258 |
# conditions.
|
|
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
259 |
if not self.check_open_connections(): |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
260 |
success = False |
261 |
if not self.check_long_running_transactions(): |
|
262 |
success = False |
|
13465.2.12
by Stuart Bishop
Fragile user check for connections we don't want to risk interrupting |
263 |
if not self.check_fragile_connections(): |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
264 |
success = False |
265 |
return success |
|
266 |
||
267 |
||
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
268 |
class NoConnectionCheckPreflight(DatabasePreflight): |
269 |
def check_open_connections(self): |
|
270 |
return True |
|
271 |
||
272 |
||
273 |
class KillConnectionsPreflight(DatabasePreflight): |
|
274 |
def check_open_connections(self): |
|
275 |
"""Kill all non-system connections to Launchpad databases.
|
|
276 |
||
277 |
We only check on subscribed nodes, as there will be active systems
|
|
278 |
connected to other nodes in the replication cluster (such as the
|
|
279 |
SSO servers).
|
|
280 |
||
281 |
System users are defined by SYSTEM_USERS.
|
|
282 |
"""
|
|
13465.2.36
by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting) |
283 |
# We keep trying to terminate connections every 0.5 seconds for
|
284 |
# up to 10 seconds.
|
|
285 |
num_tries = 20 |
|
286 |
seconds_to_pause = 0.5 |
|
287 |
for loop_count in range(num_tries): |
|
288 |
all_clear = True |
|
289 |
for node in self.lpmain_nodes: |
|
290 |
cur = node.con.cursor() |
|
291 |
cur.execute(""" |
|
292 |
SELECT
|
|
293 |
procpid, datname, usename,
|
|
294 |
pg_terminate_backend(procpid)
|
|
295 |
FROM pg_stat_activity
|
|
296 |
WHERE
|
|
297 |
datname=current_database()
|
|
298 |
AND procpid <> pg_backend_pid()
|
|
299 |
AND usename NOT IN %s |
|
300 |
""" % sqlvalues(SYSTEM_USERS)) |
|
301 |
for procpid, datname, usename, ignored in cur.fetchall(): |
|
302 |
all_clear = False |
|
303 |
if loop_count == num_tries - 1: |
|
304 |
self.log.fatal( |
|
305 |
"Unable to kill %s [%s] on %s", |
|
306 |
usename, procpid, datname) |
|
307 |
else: |
|
308 |
self.log.warning( |
|
309 |
"Killed %s [%s] on %s", usename, procpid, datname) |
|
310 |
if all_clear: |
|
311 |
break
|
|
312 |
||
313 |
# Wait a little for any terminated connections to actually
|
|
314 |
# terminate.
|
|
315 |
time.sleep(seconds_to_pause) |
|
316 |
return all_clear |
|
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
317 |
|
318 |
||
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
319 |
def main(): |
320 |
parser = OptionParser() |
|
321 |
db_options(parser) |
|
322 |
logger_options(parser) |
|
13465.2.2
by Stuart Bishop
Add option to skip open connection check |
323 |
parser.add_option( |
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
324 |
"--skip-connection-check", dest='skip_connection_check', |
325 |
default=False, action="store_true", |
|
13465.2.2
by Stuart Bishop
Add option to skip open connection check |
326 |
help="Don't check open connections.") |
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
327 |
parser.add_option( |
13465.2.4
by Stuart Bishop
Simplify cli |
328 |
"--kill-connections", dest='kill_connections', |
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
329 |
default=False, action="store_true", |
13465.2.4
by Stuart Bishop
Simplify cli |
330 |
help="Kill non-system connections instead of reporting an error.") |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
331 |
(options, args) = parser.parse_args() |
332 |
if args: |
|
333 |
parser.error("Too many arguments") |
|
334 |
||
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
335 |
if options.kill_connections and options.skip_connection_check: |
336 |
parser.error( |
|
13465.2.4
by Stuart Bishop
Simplify cli |
337 |
"--skip-connection-check conflicts with --kill-connections") |
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
338 |
|
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
339 |
log = logger(options) |
340 |
||
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
341 |
if options.kill_connections: |
13465.2.10
by Stuart Bishop
Inline preflight checks |
342 |
preflight_check = KillConnectionsPreflight(log) |
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
343 |
elif options.skip_connection_check: |
13465.2.10
by Stuart Bishop
Inline preflight checks |
344 |
preflight_check = NoConnectionCheckPreflight(log) |
13465.2.3
by Stuart Bishop
Command line arguments allowing preflight check ignore or kill open connections |
345 |
else: |
13465.2.10
by Stuart Bishop
Inline preflight checks |
346 |
preflight_check = DatabasePreflight(log) |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
347 |
|
348 |
if preflight_check.check_all(): |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
349 |
log.info('Preflight check succeeded. Good to go.') |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
350 |
return 0 |
351 |
else: |
|
7675.395.196
by Stuart Bishop
preflight check for database upgrades |
352 |
log.error('Preflight check failed.') |
7675.395.195
by Stuart Bishop
database deploy preflight check work in progress |
353 |
return 1 |
354 |
||
355 |
||
356 |
if __name__ == '__main__': |
|
13465.2.36
by Stuart Bishop
Retry until connections in need of termination die, although this shouldn't happen as killable connections should be connecting via pgbouncer (because we can't stop them reconnecting) |
357 |
raise SystemExit(main()) |