1
# Copyright 2009 Canonical Ltd. This software is licensed under the
2
# GNU Affero General Public License version 3 (see the file LICENSE).
4
"""Scrub a Launchpad database of private data."""
16
from storm.expr import (
21
from zope.component import getUtility
23
from canonical.database.constants import UTC_NOW
24
from canonical.database.sqlbase import cursor, sqlvalues
25
from canonical.database.postgresql import ConnectionString, listReferences
26
from canonical.launchpad.scripts.logger import DEBUG2, DEBUG3
27
from canonical.launchpad.webapp.interfaces import (
28
IStoreSelector, MAIN_STORE, MASTER_FLAVOR)
29
from canonical.lp import initZopeless
30
from lp.services.scripts.base import LaunchpadScript
33
class SanitizeDb(LaunchpadScript):
34
usage = "%prog [options] pg_connection_string"
35
description = "Destroy private information in a Launchpad database."
37
def add_my_options(self):
38
self.parser.add_option(
39
"-f", "--force", action="store_true", default=False,
40
help="Force running against a possible production database.")
41
self.parser.add_option(
42
"-n", "--dry-run", action="store_true", default=False,
43
help="Don't commit changes.")
45
def _init_db(self, isolation):
46
if len(self.args) == 0:
47
self.parser.error("PostgreSQL connection string required.")
48
elif len(self.args) > 1:
49
self.parser.error("Too many arguments.")
51
self.pg_connection_string = ConnectionString(self.args[0])
53
if ('prod' in str(self.pg_connection_string)
54
and not self.options.force):
56
"Attempting to sanitize a potential production database '%s'. "
57
"--force required." % self.pg_connection_string.dbname)
59
self.logger.debug("Connect using '%s'." % self.pg_connection_string)
61
self.txn = initZopeless(
62
dbname=self.pg_connection_string.dbname,
63
dbhost=self.pg_connection_string.host,
64
dbuser=self.pg_connection_string.user,
67
self.store = getUtility(IStoreSelector).get(MAIN_STORE, MASTER_FLAVOR)
70
self.allForeignKeysCascade()
71
triggers_to_disable = [
72
('bugmessage', 'set_bug_message_count_t'),
73
('bugmessage', 'set_date_last_message_t'),
75
self.disableTriggers(triggers_to_disable)
83
'commercialsubscription',
88
'mailinglistsubscription',
94
'openidconsumerassociation',
95
'openidconsumernonce',
102
'shipitsurveyanswer',
103
'shipitsurveyquestion',
104
'shipitsurveyresult',
108
'sprintattendance', # Is this private?
109
'standardshipitrequest',
110
'temporaryblobstorage',
116
for table in tables_to_empty:
117
self.removeTableRows(table)
119
self.removePrivatePeople()
120
self.removePrivateTeams()
121
self.removePrivateBugs()
122
self.removePrivateBugMessages()
123
self.removePrivateBranches()
124
self.removePrivateHwSubmissions()
125
self.removePrivateSpecifications()
126
self.removePrivateLocations()
127
self.removePrivateArchives()
128
self.removePrivateAnnouncements()
129
self.removePrivateLibrarianFiles()
130
self.removeInactiveProjects()
131
self.removeInactiveProducts()
132
self.removeInvalidEmailAddresses()
133
self.removePPAArchivePermissions()
134
self.scrambleHiddenEmailAddresses()
136
self.removeDeactivatedPeopleAndAccounts()
138
# Remove unlinked records. These might contain private data.
139
self.removeUnlinkedEmailAddresses()
140
self.removeUnlinkedAccounts()
141
self.removeUnlinked('revision', [
142
('revisioncache', 'revision'),
143
('revisionparent', 'revision'),
144
('revisionproperty', 'revision'),
146
self.removeUnlinked('libraryfilealias', [
147
('libraryfiledownloadcount', 'libraryfilealias')])
148
self.removeUnlinked('libraryfilecontent')
149
self.removeUnlinked('message', [('messagechunk', 'message')])
150
self.removeUnlinked('staticdiff')
151
self.removeUnlinked('previewdiff')
152
self.removeUnlinked('diff')
154
# Scrub data after removing all the records we are going to.
155
# No point scrubbing data that is going to get removed later.
157
('account', ['status_comment']),
158
('distribution', ['reviewer_whiteboard']),
159
('distributionmirror', ['whiteboard']),
160
('hwsubmission', ['raw_emailaddress']),
161
('nameblacklist', ['comment']),
163
'personal_standing_reason',
164
'mail_resumption_date']),
165
('product', ['reviewer_whiteboard']),
166
('project', ['reviewer_whiteboard']),
167
('revisionauthor', ['email']),
168
('signedcodeofconduct', ['admincomment']),
170
for table, column in columns_to_scrub:
171
self.scrubColumn(table, column)
173
self.enableTriggers(triggers_to_disable)
176
self.resetForeignKeysCascade()
177
if self.options.dry_run:
178
self.logger.info("Dry run - rolling back.")
181
self.logger.info("Committing.")
184
def removeDeactivatedPeopleAndAccounts(self):
185
"""Remove all suspended and deactivated people & their accounts.
187
Launchpad celebrities are ignored.
189
from canonical.launchpad.database.account import Account
190
from canonical.launchpad.database.emailaddress import EmailAddress
191
from canonical.launchpad.interfaces.account import AccountStatus
192
from canonical.launchpad.interfaces.launchpad import (
193
ILaunchpadCelebrities)
194
from lp.registry.model.person import Person
195
celebrities = getUtility(ILaunchpadCelebrities)
196
# This is a slow operation due to the huge amount of cascading.
197
# We remove one row at a time for better reporting and PostgreSQL
199
deactivated_people = self.store.find(
201
Person.account == Account.id,
202
Account.status != AccountStatus.ACTIVE)
203
total_deactivated_count = deactivated_people.count()
204
deactivated_count = 0
205
for person in deactivated_people:
207
if celebrities.isCelebrityPerson(person.name):
209
deactivated_count += 1
211
"Removing %d of %d deactivated people (%s)",
212
deactivated_count, total_deactivated_count, person.name)
213
# Clean out the EmailAddress and Account for this person
214
# while we are here, making subsequent unbatched steps
215
# faster. These don't cascade due to the lack of a foreign
216
# key constraint between Person and EmailAddress, and the
217
# ON DELETE SET NULL foreign key constraint between
218
# EmailAddress and Account.
220
EmailAddress, EmailAddress.person == person).remove()
221
self.store.find(Account, Account.id == person.accountID).remove()
222
self.store.remove(person)
225
"Removed %d suspended or deactivated people + email + accounts",
228
def removePrivatePeople(self):
229
"""Remove all private people."""
230
from lp.registry.interfaces.person import PersonVisibility
231
from lp.registry.model.person import Person
232
count = self.store.find(
234
Person.teamowner == None,
235
Person.visibility != PersonVisibility.PUBLIC).remove()
237
self.logger.info("Removed %d private people.", count)
239
def removePrivateTeams(self):
240
"""Remove all private people."""
241
from lp.registry.interfaces.person import PersonVisibility
242
from lp.registry.model.person import Person
243
count = self.store.find(
245
Person.teamowner != None,
246
Person.visibility != PersonVisibility.PUBLIC).remove()
248
self.logger.info("Removed %d private teams.", count)
250
def removePrivateBugs(self):
251
"""Remove all private bugs."""
252
from lp.bugs.model.bug import Bug
253
count = self.store.find(Bug, Bug.private == True).remove()
255
self.logger.info("Removed %d private bugs.", count)
257
def removePrivateBugMessages(self):
258
"""Remove all hidden bug messages."""
259
from lp.bugs.model.bugmessage import BugMessage
260
from canonical.launchpad.database.message import Message
261
message_ids = list(self.store.using(*[
263
Join(Message, BugMessage.messageID == Message.id),
264
]).find(BugMessage.id, Message.visible == False))
266
count = self.store.find(
267
BugMessage, BugMessage.id.is_in(message_ids)).remove()
269
self.logger.info("Removed %d private bug messages.", count)
271
def removePrivateBranches(self):
272
"""Remove all private branches."""
273
from lp.code.model.branch import Branch
274
count = self.store.find(Branch, Branch.private == True).remove()
276
self.logger.info("Removed %d private branches.", count)
278
def removePrivateHwSubmissions(self):
279
"""Remove all private hardware submissions."""
280
from lp.hardwaredb.model.hwdb import HWSubmission
281
count = self.store.find(
282
HWSubmission, HWSubmission.private == True).remove()
284
self.logger.info("Removed %d private hardware submissions.", count)
286
def removePrivateSpecifications(self):
287
"""Remove all private specifications."""
288
from lp.blueprints.model.specification import Specification
289
count = self.store.find(
290
Specification, Specification.private == True).remove()
292
self.logger.info("Removed %d private specifications.", count)
294
def removePrivateLocations(self):
295
"""Remove private person locations."""
296
from lp.registry.model.personlocation import PersonLocation
297
count = self.store.find(
298
PersonLocation, PersonLocation.visible == False).remove()
300
self.logger.info("Removed %d person locations.", count)
302
def removePrivateArchives(self):
303
"""Remove private archives.
305
This might over delete, but lets be conservative for now.
307
from lp.soyuz.model.archive import Archive
308
count = self.store.find(Archive, Archive.private == True).remove()
311
"Removed %d private archives.", count)
313
def removePrivateAnnouncements(self):
314
"""Remove announcements that have not yet been published."""
315
from lp.registry.model.announcement import Announcement
316
count = self.store.find(
318
Announcement.date_announced == None,
319
Announcement.date_announced > UTC_NOW,
320
Announcement.active == False)).remove()
323
"Removed %d unpublished announcements.", count)
325
def removePrivateLibrarianFiles(self):
326
"""Remove librarian files only available via the restricted librarian.
328
from canonical.launchpad.database.librarian import LibraryFileAlias
329
count = self.store.find(
330
LibraryFileAlias, LibraryFileAlias.restricted == True).remove()
332
self.logger.info("Removed %d restricted librarian files.", count)
334
def removeInactiveProjects(self):
335
"""Remove inactive projects."""
336
from lp.registry.model.projectgroup import ProjectGroup
337
count = self.store.find(
338
ProjectGroup, ProjectGroup.active == False).remove()
340
self.logger.info("Removed %d inactive product groups.", count)
342
def removeInactiveProducts(self):
343
"""Remove inactive products."""
344
from lp.registry.model.product import Product
345
count = self.store.find(
346
Product, Product.active == False).remove()
348
self.logger.info("Removed %d inactive products.", count)
350
def removeTableRows(self, table):
351
"""Remove all data from a table."""
352
count = self.store.execute("DELETE FROM %s" % table).rowcount
353
self.store.execute("ANALYZE %s" % table)
354
self.logger.info("Removed %d %s rows (all).", count, table)
356
def removeUnlinked(self, table, ignores=()):
357
"""Remove all unlinked entries in the table.
359
References from the ignores list are ignored.
361
:param table: table name.
363
:param ignores: list of (table, column) references to ignore.
366
for result in listReferences(cursor(), table, 'id'):
367
(from_table, from_column, to_table,
368
to_column, update, delete) = result
369
if (to_table == table and to_column == 'id'
370
and (from_table, from_column) not in ignores):
372
"EXCEPT SELECT %s FROM %s" % (from_column, from_table))
374
"DELETE FROM %s USING (SELECT id FROM %s %s) AS Unreferenced "
375
"WHERE %s.id = Unreferenced.id"
376
% (table, table, ' '.join(references), table))
377
self.logger.log(DEBUG2, query)
378
count = self.store.execute(query).rowcount
379
self.logger.info("Removed %d unlinked %s rows.", count, table)
381
def removeInvalidEmailAddresses(self):
382
"""Remove all invalid and old email addresses."""
383
from canonical.launchpad.database.emailaddress import EmailAddress
384
from canonical.launchpad.interfaces.emailaddress import (
386
count = self.store.find(
388
EmailAddress.status == EmailAddressStatus.NEW,
389
EmailAddress.status == EmailAddressStatus.OLD,
390
EmailAddress.email.lower().like(
391
u'%@example.com', case_sensitive=True))).remove()
394
"Removed %d invalid, unvalidated and old email addresses.", count)
396
def removePPAArchivePermissions(self):
397
"""Remove ArchivePermission records for PPAs."""
398
from lp.soyuz.enums import ArchivePurpose
399
count = self.store.execute("""
400
DELETE FROM ArchivePermission
402
WHERE ArchivePermission.archive = Archive.id
403
AND Archive.purpose = %s
404
""" % sqlvalues(ArchivePurpose.PPA)).rowcount
406
"Removed %d ArchivePermission records linked to PPAs.", count)
408
def scrambleHiddenEmailAddresses(self):
409
"""Hide email addresses users have requested to not be public.
411
Call after removeInvalidEmailAddresses to avoid any possible
414
This replaces the email addresses of all people with
415
hide_email_addresses set with an @example.com email address.
417
# One day there might be Storm documentation telling me how to
418
# do this via the ORM.
419
count = self.store.execute("""
421
SET email='e' || text(EmailAddress.id) || '@example.com'
423
WHERE EmailAddress.person = Person.id
424
AND Person.hide_email_addresses IS TRUE
427
"Replaced %d hidden email addresses with @example.com", count)
429
def removeUnlinkedEmailAddresses(self):
430
"""Remove EmailAddresses not linked to a Person.
432
We call this before removeUnlinkedAccounts to avoid the
433
ON DELETE SET NULL overhead from the EmailAddress -> Account
434
foreign key constraint.
436
from canonical.launchpad.database.emailaddress import EmailAddress
437
count = self.store.find(
438
EmailAddress, EmailAddress.person == None).remove()
441
"Removed %d email addresses not linked to people.", count)
443
def removeUnlinkedAccounts(self):
444
"""Remove Accounts not linked to a Person."""
445
from canonical.launchpad.database.account import Account
446
from lp.registry.model.person import Person
447
all_accounts = self.store.find(Account)
448
linked_accounts = self.store.find(
449
Account, Account.id == Person.accountID)
450
unlinked_accounts = all_accounts.difference(linked_accounts)
451
total_unlinked_accounts = unlinked_accounts.count()
453
for account in unlinked_accounts:
454
self.store.remove(account)
458
"Removed %d of %d unlinked accounts."
459
% (count, total_unlinked_accounts))
460
self.logger.info("Removed %d accounts not linked to a person", count)
462
def scrubColumn(self, table, columns):
463
"""Remove production admin related notes."""
464
query = ["UPDATE %s SET" % table]
465
for column in columns:
466
query.append("%s = NULL" % column)
469
query.append("WHERE")
470
for column in columns:
471
query.append("%s IS NOT NULL" % column)
474
self.logger.log(DEBUG3, ' '.join(query))
475
count = self.store.execute(' '.join(query)).rowcount
477
"Scrubbed %d %s.{%s} entries."
478
% (count, table, ','.join(columns)))
480
def allForeignKeysCascade(self):
481
"""Set all foreign key constraints to ON DELETE CASCADE.
483
The current state is recorded first so resetForeignKeysCascade
484
can repair the changes.
486
Only tables in the public schema are modified.
488
# Get the SQL needed to create the foreign key constraints.
489
# pg_dump seems the only sane way of getting this. We could
490
# generate the SQL ourselves using the pg_constraints table,
491
# but that can change between PostgreSQL releases.
492
# Ideally we could use ALTER CONSTRAINT, but that doesn't exist.
493
# Or modify pg_constraints, but that doesn't work.
495
'pg_dump', '--no-privileges', '--no-owner', '--schema-only',
498
self.pg_connection_string.asPGCommandLineArgs().split(' '))
499
self.logger.debug("Running %s", ' '.join(cmd))
500
pg_dump = subprocess.Popen(
501
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
502
stdin=subprocess.PIPE)
503
(pg_dump_out, pg_dump_err) = pg_dump.communicate()
504
if pg_dump.returncode != 0:
505
self.fail("pg_dump returned %d" % pg_dump.returncode)
510
(?x) ALTER \s+ TABLE \s+ ONLY \s+ (".*?"|\w+?) \s+
511
ADD \s+ CONSTRAINT \s+ (".*?"|\w+?) \s+ FOREIGN \s+ KEY [^;]+;
513
for match in re.finditer(pattern, pg_dump_out):
514
table = match.group(1)
515
constraint = match.group(2)
519
# Drop the existing constraint so we can recreate it.
520
drop_sql = 'ALTER TABLE %s DROP CONSTRAINT %s;' % (
522
restore_sql.append(drop_sql)
523
cascade_sql.append(drop_sql)
525
# Store the SQL needed to restore the constraint.
526
restore_sql.append(sql)
528
# Recreate the constraint as ON DELETE CASCADE
529
sql = re.sub(r"""(?xs)^
531
(?:ON \s+ DELETE \s+ (?:NO\s+|SET\s+)?\w+)? \s*
532
((?:NOT\s+)? DEFERRABLE|) \s*
533
(INITIALLY\s+(?:DEFERRED|IMMEDIATE)|) \s*;
534
""", r"\1 ON DELETE CASCADE \2 \3;", sql)
535
cascade_sql.append(sql)
537
# Set all the foreign key constraints to ON DELETE CASCADE, really.
539
"Setting %d constraints to ON DELETE CASCADE",
540
len(cascade_sql) / 2)
541
for statement in cascade_sql:
542
self.logger.log(DEBUG3, statement)
543
self.store.execute(statement)
545
# Store the recovery SQL.
546
self._reset_foreign_key_sql = restore_sql
548
def resetForeignKeysCascade(self):
549
"""Reset the foreign key constraints' ON DELETE mode."""
551
"Resetting %d foreign key constraints to initial state.",
552
len(self._reset_foreign_key_sql)/2)
553
for statement in self._reset_foreign_key_sql:
554
self.store.execute(statement)
556
def disableTriggers(self, triggers_to_disable):
557
"""Disable a set of triggers.
559
:param triggers_to_disable: List of (table_name, trigger_name).
561
self.logger.debug("Disabling %d triggers." % len(triggers_to_disable))
562
for table_name, trigger_name in triggers_to_disable:
564
"Disabling trigger %s.%s." % (table_name, trigger_name))
566
"ALTER TABLE %s DISABLE TRIGGER %s"
567
% (table_name, trigger_name))
569
def enableTriggers(self, triggers_to_enable):
570
"""Renable a set of triggers.
572
:param triggers_to_enable: List of (table_name, trigger_name).
574
self.logger.debug("Enabling %d triggers." % len(triggers_to_enable))
575
for table_name, trigger_name in triggers_to_enable:
577
"Enabling trigger %s.%s." % (table_name, trigger_name))
579
"ALTER TABLE %s ENABLE TRIGGER %s"
580
% (table_name, trigger_name))
582
def repairData(self):
583
"""After scrubbing, repair any data possibly damaged in the process.
585
# Repair Bug.message_count and Bug.date_last_message.
586
# The triggers where disabled while we where doing the cascading
587
# deletes because they fail (attempting to change a mutating table).
588
# We can repair these caches by forcing the triggers to run for
590
self.store.execute("""
591
UPDATE Message SET visible=visible
593
WHERE BugMessage.message = Message.id
596
def _fail(self, error_message):
597
self.logger.fatal(error_message)