    def __cmp__(self, other):
        return cmp(self.title.lower(), other.title.lower())

    def __deepcopy__(self, memo):
        # We provide __deepcopy__ because the copy module doesn't handle
        # compiled regular expressions by default.
        return Category(self.title, self.regexp)


class OnlineStatsCalculator:
    """Object that can compute count, sum, mean, variance and median.

    It computes these values incrementally and using minimal storage
    using the Welford / Knuth algorithm described at
    http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
    """

    def __init__(self):
        self.count = 0
        self.sum = 0
        self.mean = 0.0
        self.M2 = 0.0  # Sum of squared differences from the mean.

    def update(self, x):
        """Incrementally update the stats when adding x to the set.

        None values are ignored.
        """
        if x is None:
            return
        self.count += 1
        self.sum += x
        delta = x - self.mean
        self.mean = float(self.sum)/self.count
        self.M2 += delta*(x - self.mean)

    @property
    def variance(self):
        """Return the population variance."""
        if self.count == 0:
            return 0
        return self.M2/self.count

    @property
    def std(self):
        """Return the standard deviation."""
        if self.count == 0:
            return 0
        return math.sqrt(self.variance)
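
    # Illustrative example (not in the original module): how the Welford
    # update behaves on a tiny data set, assuming the reconstruction of
    # __init__() and update() above. None values are simply skipped.
    #
    #     >>> calc = OnlineStatsCalculator()
    #     >>> for x in [2, 4, None, 6]:
    #     ...     calc.update(x)
    #     >>> calc.count, calc.sum, calc.mean
    #     (3, 12, 4.0)
    #     >>> round(calc.variance, 3)  # ((2-4)**2 + 0 + (6-4)**2) / 3
    #     2.667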

    def __add__(self, other):
        """Adds this and another OnlineStatsCalculator.

        The result combines the stats of the two objects.
        """
        results = OnlineStatsCalculator()
        results.count = self.count + other.count
        results.sum = self.sum + other.sum
        if self.count > 0 and other.count > 0:
            # This is 2.1b in Chan, Tony F.; Golub, Gene H.; LeVeque,
            # Randall J. (1979), "Updating Formulae and a Pairwise Algorithm
            # for Computing Sample Variances.",
            # Technical Report STAN-CS-79-773,
            # Department of Computer Science, Stanford University,
            # ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
            results.M2 = self.M2 + other.M2 + (
                (float(self.count) / (other.count * results.count)) *
                ((float(other.count) / self.count) * self.sum - other.sum)**2)
        else:
            results.M2 = self.M2 + other.M2  # One of them is 0.
        if results.count > 0:
            results.mean = float(results.sum) / results.count
        return results
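
    # Illustrative example (not in the original module): the pairwise merge
    # above gives the same variance as feeding every value to one calculator.
    #
    #     >>> left, right, combined = (OnlineStatsCalculator(),
    #     ...     OnlineStatsCalculator(), OnlineStatsCalculator())
    #     >>> for x in [1, 2, 3]:
    #     ...     left.update(x); combined.update(x)
    #     >>> for x in [10, 20]:
    #     ...     right.update(x); combined.update(x)
    #     >>> merged = left + right
    #     >>> merged.count == combined.count == 5
    #     True
    #     >>> abs(merged.variance - combined.variance) < 1e-9
    #     True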


class OnlineApproximateMedian:
    """Approximate the median of a set of elements.

    This implements a space-efficient algorithm which only sees each value
    once. (It will hold in memory log bucket_size of n elements.)

    It was described and analysed in
    D. Cantone and M. Hofri,
    "Analysis of An Approximate Median Selection Algorithm"
    ftp://ftp.cs.wpi.edu/pub/techreports/pdf/06-17.pdf

    This algorithm is similar to Tukey's median of medians technique.
    It will compute the median among bucket_size values, and then the median
    among those medians, and so on.
    """

    def __init__(self, bucket_size=9):
        """Creates a new estimator.

        It approximates the median by finding the median among each
        successive group of bucket_size elements, and then using these
        medians for another round of selection.

        The bucket size should be a low odd integer.
        """
        self.bucket_size = bucket_size
        # Index of the median in a completed bucket.
        self.median_idx = (bucket_size-1)//2
        self.buckets = []

    def update(self, x, order=0):
        """Update the estimate with x; None values are ignored."""
        if x is None:
            return

        i = order
        while True:
            # Create bucket on demand.
            if i >= len(self.buckets):
                for n in range((i+1)-len(self.buckets)):
                    self.buckets.append([])
            bucket = self.buckets[i]
            bucket.append(x)
            if len(bucket) == self.bucket_size:
                # Select the median in this bucket, and promote it.
                x = sorted(bucket)[self.median_idx]
                # Free the bucket for the next round.
                self.buckets[i] = []
                i += 1
            else:
                break
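
    # Illustrative example (not in the original module), assuming the
    # reconstruction of update() above: with bucket_size=3, every third value
    # promotes that bucket's median to the next level and empties the bucket,
    # so memory use stays logarithmic in the number of values seen.
    #
    #     >>> estimator = OnlineApproximateMedian(bucket_size=3)
    #     >>> for x in [5, 1, 9]:
    #     ...     estimator.update(x)
    #     >>> estimator.buckets
    #     [[], [5]]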

    @property
    def median(self):
        """Return the median."""
        # Find the 'weighted' median by assigning a weight to each
        # element proportional to how many rounds of selection it survived.
        candidates = []
        total_weight = 0
        for i, bucket in enumerate(self.buckets):
            weight = self.bucket_size ** i
            for x in bucket:
                total_weight += weight
                candidates.append([x, weight])
        if len(candidates) == 0:
            return 0
        # Each weight is the equivalent of having the candidates appear
        # that number of times in the array.
        # So buckets like [[1, 2], [2, 3], [4, 2]] would be expanded to
        # [1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4,
        # 4, 4, 4, 4, 4] and we find the median of that list (2).
        # We don't expand the items to conserve memory.
        median = (total_weight-1) / 2
        weighted_idx = 0
        for x, weight in sorted(candidates):
            weighted_idx += weight
            if weighted_idx > median:
                return x
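
    # Illustrative example (not in the original module), assuming the
    # reconstructions above: candidates that survived more rounds of
    # selection stand in for more of the original values, so they carry a
    # proportionally larger weight in the final weighted median.
    #
    #     >>> estimator = OnlineApproximateMedian(bucket_size=3)
    #     >>> for x in [7, 3, 5, 2, 4, 6, 1]:
    #     ...     estimator.update(x)
    #     >>> estimator.median
    #     4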

    def __add__(self, other):
        """Merge two approximators together.

        All candidates from the other are merged through the standard
        algorithm, starting at the same level. So an item that went through
        two rounds of selection will be compared with other items having
        gone through the same number of rounds.
        """
        results = OnlineApproximateMedian(self.bucket_size)
        results.buckets = copy.deepcopy(self.buckets)
        for i, bucket in enumerate(other.buckets):
            for x in bucket:
                results.update(x, i)
        return results
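
    # Illustrative example (not in the original module), assuming the
    # reconstructions above: merging replays the other estimator's surviving
    # candidates at their original selection depth, keeping levels aligned.
    #
    #     >>> first = OnlineApproximateMedian(bucket_size=3)
    #     >>> second = OnlineApproximateMedian(bucket_size=3)
    #     >>> for x in [1, 2, 3]:
    #     ...     first.update(x)
    #     >>> for x in [7, 8]:
    #     ...     second.update(x)
    #     >>> (first + second).buckets
    #     [[7, 8], [2]]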


class Stats:
    """Bag to hold and compute request statistics.

    All times are in seconds.
    """
    median_sqlstatements = 0
    std_sqlstatements = 0

    def __init__(self, times, timeout):
        """Compute the stats based on times.

        Times is a list of (app_time, sql_statements, sql_times).

        The histogram is a list of request counts per 1 second bucket.
        ie. histogram[0] contains the number of requests taking between 0 and
        1 second, histogram[1] contains the number of requests taking between
        1 and 2 seconds etc. histogram is None if there are no requests in
        the set.
        """
        self.total_hits = len(times)

        # Ignore missing values (-1) in computation.
        times_array = numpy.ma.masked_values(
            numpy.asarray(times, dtype=numpy.float32), -1.)

        self.total_time, self.total_sqlstatements, self.total_sqltime = (
            times_array.sum(axis=0))
        self.mean, self.mean_sqlstatements, self.mean_sqltime = (
            times_array.mean(axis=0))
        self.median, self.median_sqlstatements, self.median_sqltime = (
            numpy.median(times_array, axis=0))
        self.std, self.std_sqlstatements, self.std_sqltime = (
            numpy.std(times_array, axis=0))

        # This is an approximation which may not be true: we don't know if we
        # have a std distribution or not. We could just find the 99th
        # percentile by counting. Shock. Horror; however this appears pretty
        # good based on eyeballing things so far - once we're down in the 2-3
        # second range for everything we may want to revisit.
        self.ninetyninth_percentile_time = self.mean + self.std*3

        histogram_width = int(timeout*1.5)
        histogram_times = numpy.clip(times_array[:, 0], 0, histogram_width)
        histogram = numpy.histogram(
            histogram_times, normed=True, range=(0, histogram_width),
            bins=histogram_width)
        self.histogram = zip(histogram[1], histogram[0])

    def ninetyninth_percentile_time(self):
        """Time under which 99% of requests are rendered.

        This is estimated as 3 std deviations from the mean. Given that
        in a daily report, many URLs or PageIds won't have 100 requests, it's
        more useful to use this estimator.
        """
        return self.mean + 3*self.std

    def relative_histogram(self):
        """Return an histogram where the frequency is relative."""
        return [[x, float(f)/self.total_hits] for x, f in self.histogram]

    def text(self):
        """Return a textual version of the stats."""
        return textwrap.dedent("""
        <Stats for %d requests:
        Time:     total=%.2f; mean=%.2f; median=%.2f; std=%.2f
        SQL time: total=%.2f; mean=%.2f; median=%.2f; std=%.2f
        SQL stmt: total=%.f;  mean=%.2f; median=%.f; std=%.2f
        >""" % (
            self.total_hits, self.total_time, self.mean, self.median,
            self.std, self.total_sqltime, self.mean_sqltime,
            self.median_sqltime, self.std_sqltime,
            self.total_sqlstatements, self.mean_sqlstatements,
            self.median_sqlstatements, self.std_sqlstatements))


class SQLiteRequestTimes:
    """SQLite-based request times computation."""
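
    # Illustrative example (not in the original module): missing values are
    # stored as -1 and masked out before computing statistics, so they do
    # not skew the aggregates.
    #
    #     >>> times = [(1.0, 5, 0.5), (3.0, -1, 1.5)]
    #     >>> masked = numpy.ma.masked_values(
    #     ...     numpy.asarray(times, dtype=numpy.float32), -1.)
    #     >>> float(masked.mean(axis=0)[1])  # Mean SQL statements: 5, not 2.
    #     5.0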


class OnlineStats(Stats):
    """Implementation of stats that can be computed online.

    You call update() for each request and the stats are updated incrementally
    with minimum storage space.
    """

    def __init__(self, histogram_width):
        self.time_stats = OnlineStatsCalculator()
        self.time_median_approximate = OnlineApproximateMedian()
        self.sql_time_stats = OnlineStatsCalculator()
        self.sql_time_median_approximate = OnlineApproximateMedian()
        self.sql_statements_stats = OnlineStatsCalculator()
        self.sql_statements_median_approximate = OnlineApproximateMedian()
        self._histogram = [
            [x, 0] for x in range(histogram_width)]
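
    # Illustrative example (not in the original module), assuming the
    # _histogram layout reconstructed above: one [second, count] pair per
    # one-second bin.
    #
    #     >>> OnlineStats(histogram_width=3)._histogram
    #     [[0, 0], [1, 0], [2, 0]]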

    @property
    def total_hits(self):
        return self.time_stats.count

    @property
    def total_time(self):
        return self.time_stats.sum

    @property
    def mean(self):
        return self.time_stats.mean

    @property
    def median(self):
        return self.time_median_approximate.median

    @property
    def std(self):
        return self.time_stats.std

    @property
    def total_sqltime(self):
        return self.sql_time_stats.sum

    @property
    def mean_sqltime(self):
        return self.sql_time_stats.mean

    @property
    def median_sqltime(self):
        return self.sql_time_median_approximate.median

    @property
    def std_sqltime(self):
        return self.sql_time_stats.std

    @property
    def total_sqlstatements(self):
        return self.sql_statements_stats.sum

    @property
    def mean_sqlstatements(self):
        return self.sql_statements_stats.mean

    @property
    def median_sqlstatements(self):
        return self.sql_statements_median_approximate.median

    @property
    def std_sqlstatements(self):
        return self.sql_statements_stats.std

    @property
    def histogram(self):
        if self.time_stats.count:
            return self._histogram
        else:
            return None

    def update(self, request):
        """Update the stats based on request."""
        self.time_stats.update(request.app_seconds)
        self.time_median_approximate.update(request.app_seconds)
        self.sql_time_stats.update(request.sql_seconds)
        self.sql_time_median_approximate.update(request.sql_seconds)
        self.sql_statements_stats.update(request.sql_statements)
        self.sql_statements_median_approximate.update(request.sql_statements)

        idx = int(min(len(self.histogram)-1, request.app_seconds))
        self.histogram[idx][1] += 1
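
    # Illustrative example (not in the original module): requests slower than
    # the histogram width are counted in the last bin rather than dropped.
    #
    #     >>> histogram = [[x, 0] for x in range(3)]
    #     >>> idx = int(min(len(histogram)-1, 12.3))
    #     >>> histogram[idx][1] += 1
    #     >>> histogram
    #     [[0, 0], [1, 0], [2, 1]]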

    def __add__(self, other):
        """Merge another OnlineStats with this one."""
        results = copy.deepcopy(self)
        results.time_stats += other.time_stats
        results.time_median_approximate += other.time_median_approximate
        results.sql_time_stats += other.sql_time_stats
        results.sql_time_median_approximate += (
            other.sql_time_median_approximate)
        results.sql_statements_stats += other.sql_statements_stats
        results.sql_statements_median_approximate += (
            other.sql_statements_median_approximate)
        for i, (n, f) in enumerate(other._histogram):
            results._histogram[i][1] += f
        return results
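
    # Illustrative example (not in the original module): merging two
    # OnlineStats adds the per-bin request counts as well as the calculators.
    # FakeRequest is a hypothetical stand-in for the parsed tracelog requests.
    #
    #     >>> class FakeRequest:
    #     ...     app_seconds = 1.5
    #     ...     sql_seconds = 0.2
    #     ...     sql_statements = 3
    #     >>> a, b = OnlineStats(3), OnlineStats(3)
    #     >>> a.update(FakeRequest()); b.update(FakeRequest())
    #     >>> merged = a + b
    #     >>> merged.total_hits
    #     2
    #     >>> merged.histogram
    #     [[0, 0], [1, 2], [2, 0]]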


class RequestTimes:
    """Collect statistics from requests.

    Statistics are updated by calling the add_request() method.

    Statistics for mean/stddev/total/median for request times, SQL times and
    number of SQL statements are collected.

    They are grouped by Category, URL or PageID.
    """

    def __init__(self, categories, options):
        if options.db_file is None:
            fd, self.filename = tempfile.mkstemp(suffix='.db', prefix='ppr')
        else:
            self.filename = options.db_file
        self.con = sqlite3.connect(self.filename, isolation_level='EXCLUSIVE')
        log.debug('Using request database %s' % self.filename)
        # Some speed optimization.
        self.con.execute('PRAGMA synchronous = off')
        self.con.execute('PRAGMA journal_mode = off')

        self.categories = categories
        self.store_all_request = options.pageids or options.top_urls
        self.timeout = options.timeout
        self.cur = self.con.cursor()

        # Create the tables, ignore errors about them being already present.
        try:
            self.cur.execute('''
                CREATE TABLE category_request (
                    category INTEGER,
                    time REAL,
                    sql_statements INTEGER,
                    sql_time REAL)
                ''')
        except sqlite3.OperationalError, e:
            if 'already exists' in str(e):
                pass
            else:
                raise

        if self.store_all_request:
            try:
                self.cur.execute('''
                    CREATE TABLE request (
                        pageid TEXT,
                        url TEXT,
                        time REAL,
                        sql_statements INTEGER,
                        sql_time REAL)
                    ''')
            except sqlite3.OperationalError, e:
                if 'already exists' in str(e):
                    pass
                else:
                    raise

        self.by_pageids = options.pageids
        self.top_urls = options.top_urls
        # We only keep in memory 50 times the number of URLs we want to
        # return. The number of URLs can go pretty high (because of the
        # distinct query parameters).
        #
        # Keeping all in memory at once is prohibitive. On a small but
        # representative sample, keeping 50 times the possible number of
        # candidates and culling to 90% on overflow generated an identical
        # report to keeping all the candidates in memory.
        #
        # Keeping 10 times or culling at 90% generated a near-identical report
        # (it differed a little in the tail).
        #
        # The size/cull parameters might need to change if the requests
        # distribution becomes very different from what it currently is.
        self.top_urls_cache_size = self.top_urls * 50

        # Histogram has a bin per second up to 1.5 times our timeout.
        self.histogram_width = int(options.timeout*1.5)
        self.category_times = [
            (category, OnlineStats(self.histogram_width))
            for category in categories]
        self.url_times = {}
        self.pageid_times = {}
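
        # Illustrative example (not in the original module): on overflow the
        # URL cache keeps only the entries with the largest total time, which
        # is the same ordering the top-URL report uses.
        #
        #     >>> totals = {'/a': 9.0, '/b': 1.0, '/c': 5.0, '/d': 3.0}
        #     >>> cutoff = 2  # int(top_urls_cache_size * 0.90) in practice.
        #     >>> sorted(sorted(totals, key=totals.get, reverse=True)[:cutoff])
        #     ['/a', '/c']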

    def add_request(self, request):
        """Add a request to the cache."""
        sql_statements = request.sql_statements
        sql_seconds = request.sql_seconds

        # Store missing values as -1, as it makes dealing with them easier.
        if sql_statements is None:
            sql_statements = -1
        if sql_seconds is None:
            sql_seconds = -1
        for idx, category in enumerate(self.categories):
            if category.match(request):
                self.cur.execute(
                    "INSERT INTO category_request VALUES (?,?,?,?)",
                    (idx, request.app_seconds, sql_statements, sql_seconds))

        if self.store_all_request:
            pageid = request.pageid or 'Unknown'
            self.cur.execute(
                "INSERT INTO request VALUES (?,?,?,?,?)",
                (pageid, request.url, request.app_seconds, sql_statements,
                 sql_seconds))

    def commit(self):
        """Call commit on the underlying connection."""
        self.con.commit()

    def add_request(self, request):
        """Add request to the set of requests we collect stats for."""
        for category, stats in self.category_times:
            if category.match(request):
                stats.update(request)

        if self.by_pageids:
            pageid = request.pageid or 'Unknown'
            stats = self.pageid_times.setdefault(
                pageid, OnlineStats(self.histogram_width))
            stats.update(request)

        if self.top_urls:
            stats = self.url_times.setdefault(
                request.url, OnlineStats(self.histogram_width))
            stats.update(request)
            # Whenever we have more URLs than we need to, discard 10%
            # that is less likely to end up in the top.
            if len(self.url_times) > self.top_urls_cache_size:
                cutoff = int(self.top_urls_cache_size*0.90)
                self.url_times = dict(
                    sorted(self.url_times.items(),
                           key=lambda (url, stats): stats.total_time,
                           reverse=True)[:cutoff])

    def get_category_times(self):
        """Return the times for each category."""
        category_query = 'SELECT * FROM category_request ORDER BY category'

        empty_stats = Stats([], 0)
        categories = dict(self.get_times(category_query))
        return [
            (category, categories.get(idx, empty_stats))
            for idx, category in enumerate(self.categories)]

    def get_top_urls_times(self, top_n):
        """Return the times for the Top URL by total time."""
        top_url_query = '''
            SELECT url, time, sql_statements, sql_time
            FROM request WHERE url IN (
                SELECT url FROM (SELECT url, sum(time) FROM request
                    GROUP BY url
                    ORDER BY sum(time) DESC
                    LIMIT %d))
            ORDER BY url''' % top_n
        # Sort the result by total time
        return sorted(
            self.get_times(top_url_query), key=lambda x: x[1].total_time,
            reverse=True)

    def get_pageid_times(self):
        """Return the times for the pageids."""
        pageid_query = '''
            SELECT pageid, time, sql_statements, sql_time
            FROM request
            ORDER BY pageid'''
        return self.get_times(pageid_query)

    def get_times(self, query):
        """Return a list of (key, stats) based on the query.

        The query should return rows of the form:
        [key, app_time, sql_statements, sql_times]

        And should be sorted on key.
        """
        results = []
        times = []
        current_key = None
        self.cur.execute(query)
        while True:
            rows = self.cur.fetchmany()
            if len(rows) == 0:
                break
            for row in rows:
                # We are encountering a new group...
                if row[0] != current_key:
                    # Compute the stats of the previous group
                    if current_key != None:
                        results.append(
                            (current_key, Stats(times, self.timeout)))
                    # Initialize the new group.
                    current_key = row[0]
                    times = []
                times.append(row[1:])
        # Compute the stats of the last group
        if current_key != None:
            results.append((current_key, Stats(times, self.timeout)))
        return results

    def get_category_times(self):
        """Return the times for each category."""
        return self.category_times

    def get_top_urls_times(self):
        """Return the times for the Top URL by total time."""
        # Sort the result by total time
        return sorted(
            self.url_times.items(),
            key=lambda (url, stats): stats.total_time,
            reverse=True)[:self.top_urls]

    def get_pageid_times(self):
        """Return the times for the pageids."""
        # Sort the result by pageid
        return sorted(self.pageid_times.items())

    def __add__(self, other):
        """Merge two RequestTimes together."""
        results = copy.deepcopy(self)
        for other_category, other_stats in other.category_times:
            for i, (category, stats) in enumerate(self.category_times):
                if category.title == other_category.title:
                    results.category_times[i] = (
                        category, stats + other_stats)
                    break
            else:
                results.category_times.append(
                    (other_category, copy.deepcopy(other_stats)))

        url_times = results.url_times
        for url, stats in other.url_times.items():
            if url in url_times:
                url_times[url] += stats
            else:
                url_times[url] = copy.deepcopy(stats)
        # Only keep top_urls_cache_size entries.
        if len(url_times) > self.top_urls_cache_size:
            results.url_times = dict(
                sorted(url_times.items(),
                       key=lambda (url, stats): stats.total_time,
                       reverse=True)[:self.top_urls_cache_size])

        pageid_times = results.pageid_times
        for pageid, stats in other.pageid_times.items():
            if pageid in pageid_times:
                pageid_times[pageid] += stats
            else:
                pageid_times[pageid] = copy.deepcopy(stats)

        return results

    def close(self, remove=False):
        """Close the SQLite connection.

        :param remove: If true, the DB file will be removed.
        """
        self.con.close()
        if remove:
            log.debug('Deleting request database.')
            os.unlink(self.filename)
        else:
            log.debug('Keeping request database %s.' % self.filename)


def main():
    parser = LPOptionParser("%prog [args] tracelog [...]")