#!/usr/bin/python -S
# Copyright 2010-2011 Canonical Ltd.  This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).

"""Generate the database statistics report."""

__metaclass__ = type

import _pythonpath

from datetime import datetime
from operator import attrgetter
from textwrap import (
    dedent,
    fill,
    )

from lp.scripts.helpers import LPOptionParser
from lp.services.database.namedrow import named_fetchall
from lp.services.database.sqlbase import (
    connect,
    sqlvalues,
    )
from lp.services.scripts import db_options


class Table:
    pass


def get_where_clause(options, fuzz='0 seconds'):
    "Generate a WHERE clause referencing the date_created column."
    # We are given at most two of the from timestamp, the until
    # timestamp and an interval. The interval is in a format unsuitable
    # for processing in Python. If the interval is set, it represents
    # the period before the until timestamp or the period after the
    # from timestamp, depending on which of these is set. From this
    # information, generate the SQL representation of the from
    # timestamp and the until timestamp.
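    # For example, with no options set the generated clause looks
    # roughly like:
    #   date_created BETWEEN
    #       (CAST('1970-01-01' AS timestamp without time zone)
    #           - CAST('0 seconds' AS interval))
    #       AND (CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
    #           + CAST('0 seconds' AS interval))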
    if options.from_ts:
        from_sql = ("CAST(%s AS timestamp without time zone)"
            % sqlvalues(options.from_ts))
    elif options.interval and options.until_ts:
        from_sql = (
            "CAST(%s AS timestamp without time zone) - CAST(%s AS interval)"
            % sqlvalues(options.until_ts, options.interval))
    elif options.interval:
        from_sql = (
            "(CURRENT_TIMESTAMP AT TIME ZONE 'UTC') - CAST(%s AS interval)"
            % sqlvalues(options.interval))
    else:
        from_sql = "CAST('1970-01-01' AS timestamp without time zone)"

    if options.until_ts:
        until_sql = (
            "CAST(%s AS timestamp without time zone)"
            % sqlvalues(options.until_ts))
    elif options.interval and options.from_ts:
        until_sql = (
            "CAST(%s AS timestamp without time zone) + CAST(%s AS interval)"
            % sqlvalues(options.from_ts, options.interval))
    else:
        until_sql = "CURRENT_TIMESTAMP AT TIME ZONE 'UTC'"

    fuzz_sql = "CAST(%s AS interval)" % sqlvalues(fuzz)
    clause = "date_created BETWEEN (%s - %s) AND (%s + %s)" % (
        from_sql, fuzz_sql, until_sql, fuzz_sql)

    return clause


def get_table_stats(cur, options):
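    """Return a set of Table objects for the requested time range.

    Each Table combines the earliest and latest DatabaseTableStats
    snapshots in the range, exposing counter deltas plus derived
    total_tup_read and total_tup_written values.
    """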
    params = {'where': get_where_clause(options)}
    tablestats_query = dedent("""\
        SELECT
            Earliest.date_created AS date_start,
            Latest.date_created AS date_end,
            Latest.schemaname,
            Latest.relname,
            Latest.seq_scan - Earliest.seq_scan AS seq_scan,
            Latest.seq_tup_read - Earliest.seq_tup_read AS seq_tup_read,
            Latest.idx_scan - Earliest.idx_scan AS idx_scan,
            Latest.idx_tup_fetch - Earliest.idx_tup_fetch AS idx_tup_fetch,
            Latest.n_tup_ins - Earliest.n_tup_ins AS n_tup_ins,
            Latest.n_tup_upd - Earliest.n_tup_upd AS n_tup_upd,
            Latest.n_tup_del - Earliest.n_tup_del AS n_tup_del,
            Latest.n_tup_hot_upd - Earliest.n_tup_hot_upd AS n_tup_hot_upd,
            Latest.n_live_tup,
            Latest.n_dead_tup,
            Latest.last_vacuum,
            Latest.last_autovacuum,
            Latest.last_analyze,
            Latest.last_autoanalyze
        FROM
            DatabaseTableStats AS Earliest,
            DatabaseTableStats AS Latest
        WHERE
            Earliest.date_created = (
                SELECT min(date_created) FROM DatabaseTableStats
                WHERE %(where)s)
            AND Latest.date_created = (
                SELECT max(date_created) FROM DatabaseTableStats
                WHERE %(where)s)
            AND Earliest.schemaname = Latest.schemaname
            AND Earliest.relname = Latest.relname
        """ % params)
    cur.execute(tablestats_query)

    # description[0] is the column name, per PEP-0249
    fields = [description[0] for description in cur.description]
    tables = set()
    for row in cur.fetchall():
        table = Table()
        for index in range(len(fields)):
            setattr(table, fields[index], row[index])
        table.total_tup_read = table.seq_tup_read + table.idx_tup_fetch
        table.total_tup_written = (
            table.n_tup_ins + table.n_tup_upd + table.n_tup_del)
        tables.add(table)

    return tables


def get_cpu_stats(cur, options):
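    """Return a set of (average cpu, username) tuples for the time range."""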
    # This query calculates the average cpu utilization from the
    # samples. It assumes samples are taken at regular intervals over
    # the period.
    # Note that we have to use SUM()/COUNT() instead of AVG() as
    # database users not connected when the sample was taken are not
    # recorded - we want the average utilization over the time period,
    # not the subset of the time period the user was actually connected.
    params = {'where': get_where_clause(options)}
    query = dedent("""\
        SELECT (
            CAST(SUM(cpu) AS float) / (
                SELECT COUNT(DISTINCT date_created) FROM DatabaseCpuStats
                WHERE %(where)s
            )) AS avg_cpu, username
        FROM DatabaseCpuStats
        WHERE %(where)s
        GROUP BY username
        """ % params)
    cur.execute(query)
    cpu_stats = set(cur.fetchall())

    # Fold edge into lpnet, as they are now running the same code.
    # This is a temporary hack until we drop edge entirely. See
    # Bug #667883 for details.
    lpnet_avg_cpu = 0.0
    edge_avg_cpu = 0.0
    for stats_tuple in list(cpu_stats):
        avg_cpu, username = stats_tuple
        if username == 'lpnet':
            lpnet_avg_cpu = avg_cpu
            cpu_stats.discard(stats_tuple)
        elif username == 'edge':
            edge_avg_cpu = avg_cpu
            cpu_stats.discard(stats_tuple)
    cpu_stats.add((lpnet_avg_cpu + edge_avg_cpu, 'lpnet'))

    return cpu_stats


def get_bloat_stats(cur, options, kind):
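    """Return bloat statistics for the given relation kind.

    `kind` matches the kind column of DatabaseDiskUtilization; the
    report passes 'r' for tables and 'i' for indexes.
    """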
    # Return information on bloated tables and indexes, as of the end of
    # the requested time period.
    params = {
        # We only collect these statistics daily, so add some fuzz
        # to ensure bloat information ends up on the daily reports;
        # we cannot guarantee the disk utilization statistics occur
        # exactly 24 hours apart. Our most recent snapshot could be 1
        # day ago, give or take a few hours.
        'where': get_where_clause(options, fuzz='1 day 6 hours'),
        'bloat': options.bloat,
        'min_bloat': options.min_bloat,
        'kind': kind,
        }
    query = dedent("""
        SELECT * FROM (
            SELECT DISTINCT
                namespace,
                name,
                sub_namespace,
                sub_name,
                count(*) OVER t AS num_samples,
                last_value(table_len) OVER t AS table_len,
                pg_size_pretty(last_value(table_len) OVER t) AS table_size,
                last_value(dead_tuple_len + free_space) OVER t AS bloat_len,
                pg_size_pretty(last_value(dead_tuple_len + free_space) OVER t)
                    AS bloat_size,
                first_value(dead_tuple_percent + free_percent) OVER t
                    AS start_bloat_percent,
                last_value(dead_tuple_percent + free_percent) OVER t
                    AS end_bloat_percent,
                (last_value(dead_tuple_percent + free_percent) OVER t
                    - first_value(dead_tuple_percent + free_percent) OVER t
                    ) AS delta_bloat_percent,
                (last_value(table_len) OVER t
                    - first_value(table_len) OVER t) AS delta_bloat_len,
                pg_size_pretty(
                    last_value(table_len) OVER t
                    - first_value(table_len) OVER t) AS delta_bloat_size
            FROM DatabaseDiskUtilization
            WHERE
                %(where)s
                AND kind = %%(kind)s
            WINDOW t AS (
                PARTITION BY sort ORDER BY date_created
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
            ) AS whatever
        WHERE
            table_len >= %(min_bloat)s
            AND end_bloat_percent >= %(bloat)s
        ORDER BY bloat_len DESC
        """ % params)
    cur.execute(query, params)
    bloat_stats = named_fetchall(cur)
    return list(bloat_stats)


def main():
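    """Parse the command line options and print the report to stdout.

    Illustrative invocations (exact timestamp formats depend on
    LPOptionParser's datetime option type):
        report-database-stats.py --interval '1 week'
        report-database-stats.py --from '2011-01-01' --until '2011-02-01'
    """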
    parser = LPOptionParser()
    db_options(parser)
    parser.add_option(
        "-f", "--from", dest="from_ts", type=datetime,
        default=None, metavar="TIMESTAMP",
        help="Use statistics collected since TIMESTAMP.")
    parser.add_option(
        "-u", "--until", dest="until_ts", type=datetime,
        default=None, metavar="TIMESTAMP",
        help="Use statistics collected up until TIMESTAMP.")
    parser.add_option(
        "-i", "--interval", dest="interval", type=str,
        default=None, metavar="INTERVAL",
        help=(
            "Use statistics collected over the last INTERVAL period. "
            "INTERVAL is a string parsable by PostgreSQL "
            "such as '5 minutes'."))
    parser.add_option(
        "-n", "--limit", dest="limit", type=int,
        default=15, metavar="NUM",
        help="Display the top NUM items in each category.")
    parser.add_option(
        "-b", "--bloat", dest="bloat", type=float,
        default=40, metavar="BLOAT",
        help="Display tables and indexes bloated by more than BLOAT%.")
    parser.add_option(
        "--min-bloat", dest="min_bloat", type=int,
        default=10000000, metavar="BLOAT",
        help="Don't report tables bloated less than BLOAT bytes.")
    parser.set_defaults(dbuser="database_stats_report")
    options, args = parser.parse_args()

    if options.from_ts and options.until_ts and options.interval:
        parser.error(
            "Only two of --from, --until and --interval may be specified.")

    con = connect()
    cur = con.cursor()

    tables = list(get_table_stats(cur, options))
    if len(tables) == 0:
        parser.error("No statistics available in that time range.")
    arbitrary_table = tables[0]
    interval = arbitrary_table.date_end - arbitrary_table.date_start
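    # Convert the length of the reporting period to seconds so counter
    # deltas can be reported as per-second rates.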
    per_second = float(interval.days * 24 * 60 * 60 + interval.seconds)
    if per_second == 0:
        parser.error("Only one sample in that time range.")

    user_cpu = get_cpu_stats(cur, options)
    print "== Most Active Users =="
    print
    for cpu, username in sorted(user_cpu, reverse=True)[:options.limit]:
        print "%40s || %10.2f%% CPU" % (username, float(cpu) / 10)

    print
    print "== Most Written Tables =="
    print
    tables_sort = [
        'total_tup_written', 'n_tup_upd', 'n_tup_ins', 'n_tup_del', 'relname']
    most_written_tables = sorted(
        tables, key=attrgetter(*tables_sort), reverse=True)
    for table in most_written_tables[:options.limit]:
        print "%40s || %10.2f tuples/sec" % (
            table.relname, table.total_tup_written / per_second)

    print
    print "== Most Read Tables =="
    print
    # These match the pg_user_table_stats view. schemaname is the
    # namespace (normally 'public'), relname is the table (relation)
    # name. total_tup_read is the total number of rows read.
    # idx_tup_fetch is the number of rows looked up using an index.
    tables_sort = ['total_tup_read', 'idx_tup_fetch', 'schemaname', 'relname']
    most_read_tables = sorted(
        tables, key=attrgetter(*tables_sort), reverse=True)
    for table in most_read_tables[:options.limit]:
        print "%40s || %10.2f tuples/sec" % (
            table.relname, table.total_tup_read / per_second)

    table_bloat_stats = get_bloat_stats(cur, options, 'r')

    if not table_bloat_stats:
        print
        print "(There is no bloat information available in this time range.)"

    else:
        print
        print "== Most Bloated Tables =="
        print
        for bloated_table in table_bloat_stats[:options.limit]:
            print "%40s || %2d%% || %s of %s" % (
                bloated_table.name,
                bloated_table.end_bloat_percent,
                bloated_table.bloat_size,
                bloated_table.table_size)

        index_bloat_stats = get_bloat_stats(cur, options, 'i')

        print
        print "== Most Bloated Indexes =="
        print
        for bloated_index in index_bloat_stats[:options.limit]:
            print "%65s || %2d%% || %s of %s" % (
                bloated_index.sub_name,
                bloated_index.end_bloat_percent,
                bloated_index.bloat_size,
                bloated_index.table_size)

        # Order bloat delta report by size of bloat increase.
        # We might want to change this to percentage bloat increase.
        bloating_sort_key = lambda x: x.delta_bloat_len

        table_bloating_stats = sorted(
            table_bloat_stats, key=bloating_sort_key, reverse=True)

        if table_bloating_stats[0].num_samples <= 1:
            print
            print fill(dedent("""\
                (There are not enough samples in this time range to display
                bloat change statistics)
                """))
        else:
            print
            print "== Most Bloating Tables =="
            print

            for bloated_table in table_bloating_stats[:options.limit]:
                # Bloat decreases are uninteresting, and would need to be in
                # a separate table sorted in reverse anyway.
                if bloated_table.delta_bloat_percent > 0:
                    print "%40s || +%4.2f%% || +%s" % (
                        bloated_table.name,
                        bloated_table.delta_bloat_percent,
                        bloated_table.delta_bloat_size)

            index_bloating_stats = sorted(
                index_bloat_stats, key=bloating_sort_key, reverse=True)

            print
            print "== Most Bloating Indexes =="
            print
            for bloated_index in index_bloating_stats[:options.limit]:
                # Bloat decreases are uninteresting, and would need to be in
                # a separate table sorted in reverse anyway.
                if bloated_index.delta_bloat_percent > 0:
                    print "%65s || +%4.2f%% || +%s" % (
                        bloated_index.sub_name,
                        bloated_index.delta_bloat_percent,
                        bloated_index.delta_bloat_size)


if __name__ == '__main__':
    main()