1
# Copyright 2009 Canonical Ltd. This software is licensed under the
2
# GNU Affero General Public License version 3 (see the file LICENSE).
4
"""Scrub a Launchpad database of private data."""
16
from storm.expr import (
21
from zope.component import getUtility
23
from canonical.database.constants import UTC_NOW
24
from canonical.database.sqlbase import cursor, sqlvalues
25
from canonical.database.postgresql import ConnectionString, listReferences
26
from canonical.launchpad.scripts.logger import DEBUG2, DEBUG3
27
from canonical.launchpad.webapp.interfaces import (
28
IStoreSelector, MAIN_STORE, MASTER_FLAVOR)
29
from canonical.lp import initZopeless
30
from lp.services.scripts.base import LaunchpadScript
33
class SanitizeDb(LaunchpadScript):
34
usage = "%prog [options] pg_connection_string"
35
description = "Destroy private information in a Launchpad database."
37
def add_my_options(self):
38
self.parser.add_option(
39
"-f", "--force", action="store_true", default=False,
40
help="Force running against a possible production database.")
41
self.parser.add_option(
42
"-n", "--dry-run", action="store_true", default=False,
43
help="Don't commit changes.")
45
def _init_db(self, isolation):
46
if len(self.args) == 0:
47
self.parser.error("PostgreSQL connection string required.")
48
elif len(self.args) > 1:
49
self.parser.error("Too many arguments.")
51
self.pg_connection_string = ConnectionString(self.args[0])
53
if ('prod' in str(self.pg_connection_string)
54
and not self.options.force):
56
"Attempting to sanitize a potential production database '%s'. "
57
"--force required." % self.pg_connection_string.dbname)
59
self.logger.debug("Connect using '%s'." % self.pg_connection_string)
61
self.txn = initZopeless(
62
dbname=self.pg_connection_string.dbname,
63
dbhost=self.pg_connection_string.host,
64
dbuser=self.pg_connection_string.user,
67
self.store = getUtility(IStoreSelector).get(MAIN_STORE, MASTER_FLAVOR)
70
self.allForeignKeysCascade()
71
triggers_to_disable = [
72
('bugmessage', 'set_bug_message_count_t'),
73
('bugmessage', 'set_date_last_message_t'),
75
self.disableTriggers(triggers_to_disable)
83
'commercialsubscription',
88
'mailinglistsubscription',
94
'openidconsumerassociation',
95
'openidconsumernonce',
102
'shipitsurveyanswer',
103
'shipitsurveyquestion',
104
'shipitsurveyresult',
108
'sprintattendance', # Is this private?
109
'standardshipitrequest',
110
'temporaryblobstorage',
116
for table in tables_to_empty:
117
self.removeTableRows(table)
119
self.removePrivatePeople()
120
self.removePrivateTeams()
121
self.removePrivateBugs()
122
self.removePrivateBugMessages()
123
self.removePrivateBranches()
124
self.removePrivateHwSubmissions()
125
self.removePrivateSpecifications()
126
self.removePrivateLocations()
127
self.removePrivateArchives()
128
self.removePrivateAnnouncements()
129
self.removePrivateLibrarianFiles()
130
self.removeInactiveProjects()
131
self.removeInactiveProducts()
132
self.removeInvalidEmailAddresses()
133
self.removePPAArchivePermissions()
134
self.scrambleHiddenEmailAddresses()
136
self.removeDeactivatedPeopleAndAccounts()
138
# Remove unlinked records. These might contain private data.
139
self.removeUnlinkedEmailAddresses()
140
self.removeUnlinkedAccounts()
141
self.removeUnlinked('revision', [
142
('revisioncache', 'revision'),
143
('revisionparent', 'revision'),
144
('revisionproperty', 'revision'),
146
self.removeUnlinked('libraryfilealias', [
147
('libraryfiledownloadcount', 'libraryfilealias')])
148
self.removeUnlinked('libraryfilecontent')
149
self.removeUnlinked('message', [('messagechunk', 'message')])
150
self.removeUnlinked('staticdiff')
151
self.removeUnlinked('previewdiff')
152
self.removeUnlinked('diff')
154
# Scrub data after removing all the records we are going to.
155
# No point scrubbing data that is going to get removed later.
157
('account', ['status_comment']),
158
('distribution', ['reviewer_whiteboard']),
159
('distributionmirror', ['whiteboard']),
160
('hwsubmission', ['raw_emailaddress']),
161
('nameblacklist', ['comment']),
163
'personal_standing_reason',
164
'mail_resumption_date']),
165
('product', ['reviewer_whiteboard']),
166
('project', ['reviewer_whiteboard']),
167
('revisionauthor', ['email']),
168
('signedcodeofconduct', ['admincomment']),
170
for table, column in columns_to_scrub:
171
self.scrubColumn(table, column)
173
self.enableTriggers(triggers_to_disable)
176
self.resetForeignKeysCascade()
177
if self.options.dry_run:
178
self.logger.info("Dry run - rolling back.")
181
self.logger.info("Committing.")
184
def removeDeactivatedPeopleAndAccounts(self):
185
"""Remove all suspended and deactivated people & their accounts.
187
Launchpad celebrities are ignored.
189
from canonical.launchpad.database.account import Account
190
from canonical.launchpad.database.emailaddress import EmailAddress
191
from canonical.launchpad.interfaces.account import AccountStatus
192
from lp.app.interfaces.launchpad import ILaunchpadCelebrities
193
from lp.registry.model.person import Person
194
celebrities = getUtility(ILaunchpadCelebrities)
195
# This is a slow operation due to the huge amount of cascading.
196
# We remove one row at a time for better reporting and PostgreSQL
198
deactivated_people = self.store.find(
200
Person.account == Account.id,
201
Account.status != AccountStatus.ACTIVE)
202
total_deactivated_count = deactivated_people.count()
203
deactivated_count = 0
204
for person in deactivated_people:
206
if celebrities.isCelebrityPerson(person.name):
208
deactivated_count += 1
210
"Removing %d of %d deactivated people (%s)",
211
deactivated_count, total_deactivated_count, person.name)
212
# Clean out the EmailAddress and Account for this person
213
# while we are here, making subsequent unbatched steps
214
# faster. These don't cascade due to the lack of a foreign
215
# key constraint between Person and EmailAddress, and the
216
# ON DELETE SET NULL foreign key constraint between
217
# EmailAddress and Account.
219
EmailAddress, EmailAddress.person == person).remove()
220
self.store.find(Account, Account.id == person.accountID).remove()
221
self.store.remove(person)
224
"Removed %d suspended or deactivated people + email + accounts",
227
def removePrivatePeople(self):
228
"""Remove all private people."""
229
from lp.registry.interfaces.person import PersonVisibility
230
from lp.registry.model.person import Person
231
count = self.store.find(
233
Person.teamowner == None,
234
Person.visibility != PersonVisibility.PUBLIC).remove()
236
self.logger.info("Removed %d private people.", count)
238
def removePrivateTeams(self):
239
"""Remove all private people."""
240
from lp.registry.interfaces.person import PersonVisibility
241
from lp.registry.model.person import Person
242
count = self.store.find(
244
Person.teamowner != None,
245
Person.visibility != PersonVisibility.PUBLIC).remove()
247
self.logger.info("Removed %d private teams.", count)
249
def removePrivateBugs(self):
250
"""Remove all private bugs."""
251
from lp.bugs.model.bug import Bug
252
count = self.store.find(Bug, Bug.private == True).remove()
254
self.logger.info("Removed %d private bugs.", count)
256
def removePrivateBugMessages(self):
257
"""Remove all hidden bug messages."""
258
from lp.bugs.model.bugmessage import BugMessage
259
from lp.services.messages.model.message import Message
260
message_ids = list(self.store.using(*[
262
Join(Message, BugMessage.messageID == Message.id),
263
]).find(BugMessage.id, Message.visible == False))
265
count = self.store.find(
266
BugMessage, BugMessage.id.is_in(message_ids)).remove()
268
self.logger.info("Removed %d private bug messages.", count)
270
def removePrivateBranches(self):
271
"""Remove all private branches."""
272
from lp.code.model.branch import Branch
273
count = self.store.find(Branch, Branch.private == True).remove()
275
self.logger.info("Removed %d private branches.", count)
277
def removePrivateHwSubmissions(self):
278
"""Remove all private hardware submissions."""
279
from lp.hardwaredb.model.hwdb import HWSubmission
280
count = self.store.find(
281
HWSubmission, HWSubmission.private == True).remove()
283
self.logger.info("Removed %d private hardware submissions.", count)
285
def removePrivateSpecifications(self):
286
"""Remove all private specifications."""
287
from lp.blueprints.model.specification import Specification
288
count = self.store.find(
289
Specification, Specification.private == True).remove()
291
self.logger.info("Removed %d private specifications.", count)
293
def removePrivateLocations(self):
294
"""Remove private person locations."""
295
from lp.registry.model.personlocation import PersonLocation
296
count = self.store.find(
297
PersonLocation, PersonLocation.visible == False).remove()
299
self.logger.info("Removed %d person locations.", count)
301
def removePrivateArchives(self):
302
"""Remove private archives.
304
This might over delete, but lets be conservative for now.
306
from lp.soyuz.model.archive import Archive
307
count = self.store.find(Archive, Archive.private == True).remove()
310
"Removed %d private archives.", count)
312
def removePrivateAnnouncements(self):
313
"""Remove announcements that have not yet been published."""
314
from lp.registry.model.announcement import Announcement
315
count = self.store.find(
317
Announcement.date_announced == None,
318
Announcement.date_announced > UTC_NOW,
319
Announcement.active == False)).remove()
322
"Removed %d unpublished announcements.", count)
324
def removePrivateLibrarianFiles(self):
325
"""Remove librarian files only available via the restricted librarian.
327
from canonical.launchpad.database.librarian import LibraryFileAlias
328
count = self.store.find(
329
LibraryFileAlias, LibraryFileAlias.restricted == True).remove()
331
self.logger.info("Removed %d restricted librarian files.", count)
333
def removeInactiveProjects(self):
334
"""Remove inactive projects."""
335
from lp.registry.model.projectgroup import ProjectGroup
336
count = self.store.find(
337
ProjectGroup, ProjectGroup.active == False).remove()
339
self.logger.info("Removed %d inactive product groups.", count)
341
def removeInactiveProducts(self):
342
"""Remove inactive products."""
343
from lp.registry.model.product import Product
344
count = self.store.find(
345
Product, Product.active == False).remove()
347
self.logger.info("Removed %d inactive products.", count)
349
def removeTableRows(self, table):
350
"""Remove all data from a table."""
351
count = self.store.execute("DELETE FROM %s" % table).rowcount
352
self.store.execute("ANALYZE %s" % table)
353
self.logger.info("Removed %d %s rows (all).", count, table)
355
def removeUnlinked(self, table, ignores=()):
356
"""Remove all unlinked entries in the table.
358
References from the ignores list are ignored.
360
:param table: table name.
362
:param ignores: list of (table, column) references to ignore.
365
for result in listReferences(cursor(), table, 'id'):
366
(from_table, from_column, to_table,
367
to_column, update, delete) = result
368
if (to_table == table and to_column == 'id'
369
and (from_table, from_column) not in ignores):
371
"EXCEPT SELECT %s FROM %s" % (from_column, from_table))
373
"DELETE FROM %s USING (SELECT id FROM %s %s) AS Unreferenced "
374
"WHERE %s.id = Unreferenced.id"
375
% (table, table, ' '.join(references), table))
376
self.logger.log(DEBUG2, query)
377
count = self.store.execute(query).rowcount
378
self.logger.info("Removed %d unlinked %s rows.", count, table)
380
def removeInvalidEmailAddresses(self):
381
"""Remove all invalid and old email addresses."""
382
from canonical.launchpad.database.emailaddress import EmailAddress
383
from canonical.launchpad.interfaces.emailaddress import (
385
count = self.store.find(
387
EmailAddress.status == EmailAddressStatus.NEW,
388
EmailAddress.status == EmailAddressStatus.OLD,
389
EmailAddress.email.lower().like(
390
u'%@example.com', case_sensitive=True))).remove()
393
"Removed %d invalid, unvalidated and old email addresses.", count)
395
def removePPAArchivePermissions(self):
396
"""Remove ArchivePermission records for PPAs."""
397
from lp.soyuz.enums import ArchivePurpose
398
count = self.store.execute("""
399
DELETE FROM ArchivePermission
401
WHERE ArchivePermission.archive = Archive.id
402
AND Archive.purpose = %s
403
""" % sqlvalues(ArchivePurpose.PPA)).rowcount
405
"Removed %d ArchivePermission records linked to PPAs.", count)
407
def scrambleHiddenEmailAddresses(self):
408
"""Hide email addresses users have requested to not be public.
410
Call after removeInvalidEmailAddresses to avoid any possible
413
This replaces the email addresses of all people with
414
hide_email_addresses set with an @example.com email address.
416
# One day there might be Storm documentation telling me how to
417
# do this via the ORM.
418
count = self.store.execute("""
420
SET email='e' || text(EmailAddress.id) || '@example.com'
422
WHERE EmailAddress.person = Person.id
423
AND Person.hide_email_addresses IS TRUE
426
"Replaced %d hidden email addresses with @example.com", count)
428
def removeUnlinkedEmailAddresses(self):
429
"""Remove EmailAddresses not linked to a Person.
431
We call this before removeUnlinkedAccounts to avoid the
432
ON DELETE SET NULL overhead from the EmailAddress -> Account
433
foreign key constraint.
435
from canonical.launchpad.database.emailaddress import EmailAddress
436
count = self.store.find(
437
EmailAddress, EmailAddress.person == None).remove()
440
"Removed %d email addresses not linked to people.", count)
442
def removeUnlinkedAccounts(self):
443
"""Remove Accounts not linked to a Person."""
444
from canonical.launchpad.database.account import Account
445
from lp.registry.model.person import Person
446
all_accounts = self.store.find(Account)
447
linked_accounts = self.store.find(
448
Account, Account.id == Person.accountID)
449
unlinked_accounts = all_accounts.difference(linked_accounts)
450
total_unlinked_accounts = unlinked_accounts.count()
452
for account in unlinked_accounts:
453
self.store.remove(account)
457
"Removed %d of %d unlinked accounts."
458
% (count, total_unlinked_accounts))
459
self.logger.info("Removed %d accounts not linked to a person", count)
461
def scrubColumn(self, table, columns):
462
"""Remove production admin related notes."""
463
query = ["UPDATE %s SET" % table]
464
for column in columns:
465
query.append("%s = NULL" % column)
468
query.append("WHERE")
469
for column in columns:
470
query.append("%s IS NOT NULL" % column)
473
self.logger.log(DEBUG3, ' '.join(query))
474
count = self.store.execute(' '.join(query)).rowcount
476
"Scrubbed %d %s.{%s} entries."
477
% (count, table, ','.join(columns)))
479
def allForeignKeysCascade(self):
480
"""Set all foreign key constraints to ON DELETE CASCADE.
482
The current state is recorded first so resetForeignKeysCascade
483
can repair the changes.
485
Only tables in the public schema are modified.
487
# Get the SQL needed to create the foreign key constraints.
488
# pg_dump seems the only sane way of getting this. We could
489
# generate the SQL ourselves using the pg_constraints table,
490
# but that can change between PostgreSQL releases.
491
# Ideally we could use ALTER CONSTRAINT, but that doesn't exist.
492
# Or modify pg_constraints, but that doesn't work.
494
'pg_dump', '--no-privileges', '--no-owner', '--schema-only',
497
self.pg_connection_string.asPGCommandLineArgs().split(' '))
498
self.logger.debug("Running %s", ' '.join(cmd))
499
pg_dump = subprocess.Popen(
500
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
501
stdin=subprocess.PIPE)
502
(pg_dump_out, pg_dump_err) = pg_dump.communicate()
503
if pg_dump.returncode != 0:
504
self.fail("pg_dump returned %d" % pg_dump.returncode)
509
(?x) ALTER \s+ TABLE \s+ ONLY \s+ (".*?"|\w+?) \s+
510
ADD \s+ CONSTRAINT \s+ (".*?"|\w+?) \s+ FOREIGN \s+ KEY [^;]+;
512
for match in re.finditer(pattern, pg_dump_out):
513
table = match.group(1)
514
constraint = match.group(2)
518
# Drop the existing constraint so we can recreate it.
519
drop_sql = 'ALTER TABLE %s DROP CONSTRAINT %s;' % (
521
restore_sql.append(drop_sql)
522
cascade_sql.append(drop_sql)
524
# Store the SQL needed to restore the constraint.
525
restore_sql.append(sql)
527
# Recreate the constraint as ON DELETE CASCADE
528
sql = re.sub(r"""(?xs)^
530
(?:ON \s+ DELETE \s+ (?:NO\s+|SET\s+)?\w+)? \s*
531
((?:NOT\s+)? DEFERRABLE|) \s*
532
(INITIALLY\s+(?:DEFERRED|IMMEDIATE)|) \s*;
533
""", r"\1 ON DELETE CASCADE \2 \3;", sql)
534
cascade_sql.append(sql)
536
# Set all the foreign key constraints to ON DELETE CASCADE, really.
538
"Setting %d constraints to ON DELETE CASCADE",
539
len(cascade_sql) / 2)
540
for statement in cascade_sql:
541
self.logger.log(DEBUG3, statement)
542
self.store.execute(statement)
544
# Store the recovery SQL.
545
self._reset_foreign_key_sql = restore_sql
547
def resetForeignKeysCascade(self):
548
"""Reset the foreign key constraints' ON DELETE mode."""
550
"Resetting %d foreign key constraints to initial state.",
551
len(self._reset_foreign_key_sql)/2)
552
for statement in self._reset_foreign_key_sql:
553
self.store.execute(statement)
555
def disableTriggers(self, triggers_to_disable):
556
"""Disable a set of triggers.
558
:param triggers_to_disable: List of (table_name, trigger_name).
560
self.logger.debug("Disabling %d triggers." % len(triggers_to_disable))
561
for table_name, trigger_name in triggers_to_disable:
563
"Disabling trigger %s.%s." % (table_name, trigger_name))
565
"ALTER TABLE %s DISABLE TRIGGER %s"
566
% (table_name, trigger_name))
568
def enableTriggers(self, triggers_to_enable):
569
"""Renable a set of triggers.
571
:param triggers_to_enable: List of (table_name, trigger_name).
573
self.logger.debug("Enabling %d triggers." % len(triggers_to_enable))
574
for table_name, trigger_name in triggers_to_enable:
576
"Enabling trigger %s.%s." % (table_name, trigger_name))
578
"ALTER TABLE %s ENABLE TRIGGER %s"
579
% (table_name, trigger_name))
581
def repairData(self):
582
"""After scrubbing, repair any data possibly damaged in the process.
584
# Repair Bug.message_count and Bug.date_last_message.
585
# The triggers where disabled while we where doing the cascading
586
# deletes because they fail (attempting to change a mutating table).
587
# We can repair these caches by forcing the triggers to run for
589
self.store.execute("""
590
UPDATE Message SET visible=visible
592
WHERE BugMessage.message = Message.id
595
def _fail(self, error_message):
596
self.logger.fatal(error_message)