5
5
"""Confirm the database systems are ready to be patched as best we can."""
8
10
'DatabasePreflight',
9
11
'KillConnectionsPreflight',
10
12
'NoConnectionCheckPreflight',
15
16
from datetime import timedelta
16
17
from optparse import OptionParser
22
from lp.services.database.sqlbase import (
22
from canonical.database.sqlbase import (
24
24
ISOLATION_LEVEL_AUTOCOMMIT,
27
from lp.services.scripts import (
27
from canonical.launchpad.scripts import (
32
from canonical import lp
32
33
import replication.helpers
35
36
# Ignore connections by these users.
SYSTEM_USERS = frozenset(['postgres', 'slony', 'nagios', 'lagmon'])

# Fail checks if these users are connected. If a process should not be
# interrupted by a rollout, the database user it connects as should be
# added here. The preflight check will fail if any of these users are
# connected, so these systems will need to be shut down manually before
# a database update.
FRAGILE_USERS = frozenset([
    'archivepublisher',
    # process_accepted is fragile, but also fast so we likely shouldn't
    # need to ever manually shut it down.
    'process_accepted',
    ])

# If these users have long running transactions, just kill 'em. Entries
# added here must come with a bug number, as if part of Launchpad holds
# open a long running transaction it is a bug we need to fix.
BAD_USERS = frozenset([
    'karma',  # Bug #863109
    'rosettaadmin',  # Bug #863122
    ])

# How lagged the cluster can be before failing the preflight check.
# If this is set too low, perfectly normal state will abort rollouts. If
# this is set too high, then we will have unacceptable downtime as
# replication needs to catch up before the database patches will apply.
MAX_LAG = timedelta(seconds=60)
68
50
class DatabasePreflight:
69
51
def __init__(self, log):
70
master_con = connect(isolation=ISOLATION_LEVEL_AUTOCOMMIT)
52
master_con = connect(lp.dbuser)
53
master_con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
73
56
self.is_replicated = replication.helpers.slony_installed(master_con)
94
77
self.lpmain_nodes = set(
95
78
node for node in self.nodes
96
79
if node.node_id in lpmain_node_ids)
98
# Store a reference to the lpmain origin.
99
lpmain_master_node_id = replication.helpers.get_master_node(
100
master_con, 1).node_id
101
self.lpmain_master_node = [
102
node for node in self.lpmain_nodes
103
if node.node_id == lpmain_master_node_id][0]
105
81
node = replication.helpers.Node(None, None, None, True)
106
82
node.con = master_con
107
83
self.nodes = set([node])
108
84
self.lpmain_nodes = self.nodes
109
self.lpmain_master_node = node
111
86
def check_is_superuser(self):
112
87
"""Return True if all the node connections are as superusers."""
189
164
% ', '.join(FRAGILE_USERS))
192
def check_long_running_transactions(self, max_secs=60):
167
def check_long_running_transactions(self, max_secs=10):
193
168
"""Return False if any nodes have long running transactions open.
195
170
max_secs defines what is long running. For database rollouts,
196
171
this will be short. Even if the transaction is benign like a
197
172
autovacuum task, we should wait until things have settled down.
199
We ignore transactions held open by BAD_USERS. These are bugs
200
that need to be fixed, but we have determined that rudely aborting
201
them is fine for now and there is no need to block a rollout on
205
175
for node in self.nodes:
214
184
AND datname=current_database()
216
186
for datname, usename, age, current_query in cur.fetchall():
217
if usename in BAD_USERS:
219
"%s has transactions by %s open %s (ignoring)",
220
datname, usename, age)
223
"%s has transaction by %s open %s",
224
datname, usename, age)
188
"%s has transaction by %s open %s",
189
datname, usename, age)
227
192
self.log.info("No long running transactions detected.")
279
def report_patches(self):
280
"""Report what patches are due to be applied from this tree."""
281
con = self.lpmain_master_node.con
282
upgrade.log = self.log
283
for patch_num, patch_file in upgrade.get_patchlist(con):
284
self.log.info("%s is pending", os.path.basename(patch_file))
286
244
def check_all(self):
287
245
"""Run all checks.
327
283
System users are defined by SYSTEM_USERS.
329
# We keep trying to terminate connections every 0.5 seconds for
332
seconds_to_pause = 0.5
333
for loop_count in range(num_tries):
335
for node in self.lpmain_nodes:
336
cur = node.con.cursor()
339
procpid, datname, usename,
340
pg_terminate_backend(procpid)
341
FROM pg_stat_activity
343
datname=current_database()
344
AND procpid <> pg_backend_pid()
345
AND usename NOT IN %s
346
""" % sqlvalues(SYSTEM_USERS))
347
for procpid, datname, usename, ignored in cur.fetchall():
349
if loop_count == num_tries - 1:
351
"Unable to kill %s [%s] on %s",
352
usename, procpid, datname)
353
elif usename in BAD_USERS:
355
"Killed %s [%s] on %s", usename, procpid, datname)
358
"Killed %s [%s] on %s", usename, procpid, datname)
362
# Wait a little for any terminated connections to actually
364
time.sleep(seconds_to_pause)
285
for node in self.lpmain_nodes:
286
cur = node.con.cursor()
289
procpid, datname, usename, pg_terminate_backend(procpid)
290
FROM pg_stat_activity
292
datname=current_database()
293
AND procpid <> pg_backend_pid()
294
AND usename NOT IN %s
295
""" % sqlvalues(SYSTEM_USERS))
296
for procpid, datname, usename, ignored in cur.fetchall():
298
"Killed %s [%s] on %s", usename, procpid, datname)