~launchpad-pqm/launchpad/devel

8687.15.18 by Karl Fogel
Add the copyright header block to files under lib/canonical/.
1
# Copyright 2009 Canonical Ltd.  This software is licensed under the
2
# GNU Affero General Public License version 3 (see the file LICENSE).
3
2816.1.3 by Stuart Bishop
Work in progress
4
"""Librarian garbage collection routines"""
5
6
__metaclass__ = type
7
3691.357.6 by Stuart Bishop
Updates based on review feedback
8
from datetime import datetime, timedelta
8814.2.6 by Stuart Bishop
Review feedback
9
import errno
9055.6.6 by Stuart Bishop
Instead of os.stat to discover if files exist, refactor to use os.walk instead for much speed gain.
10
import re
2816.1.18 by Stuart Bishop
Post review updates
11
import sys
3691.357.1 by Stuart Bishop
Remove files found on disk with no corresponding record in the database
12
from time import time
13
import os
2816.1.8 by Stuart Bishop
Add tests of existing functionality
14
8814.2.1 by Stuart Bishop
Make delete_unreferenced_aliases() use DBLoopTuner and temporary tables to be database and RAM friendly
15
from zope.interface import implements
16
2816.1.7 by Stuart Bishop
Work in progress
17
from canonical.config import config
7675.415.7 by Abel Deuring
implemented reviewer's comments
18
from canonical.database.postgresql import drop_tables, quoteIdentifier
14560.2.12 by Curtis Hovey
Move looptuner to lp.services.
19
from lp.services.looptuner import (
20
    DBLoopTuner,
21
    ITunableLoop,
22
    )
2816.1.7 by Stuart Bishop
Work in progress
23
from canonical.librarian.storage import _relFileLocation as relative_file_path
2816.1.18 by Stuart Bishop
Post review updates
24
from canonical.librarian.storage import _sameFile
2816.1.11 by Stuart Bishop
Work in progress
25
from canonical.database.postgresql import listReferences
2816.1.3 by Stuart Bishop
Work in progress
26
9893.6.52 by Stuart Bishop
delint
27
log = None  # This is set by cronscripts/librarian-gc.py
3587.1.4 by Stuart Bishop
Output LibraryFileAlias information of duplicated LibraryFileContents
28
debug = False
2816.1.3 by Stuart Bishop
Work in progress
29
9893.6.52 by Stuart Bishop
delint
30
3691.357.6 by Stuart Bishop
Updates based on review feedback
31
def confirm_no_clock_skew(con):
    """Refuse to continue when our clock and the database's clock disagree.

    The database's current UTC timestamp is compared with this machine's
    UTC time. A difference of five minutes or more, in either direction,
    raises an Exception whose message contains the measured skew.

    It is theoretically possible to lose data if there is more than several
    hours of skew.
    """
    cursor = con.cursor()
    cursor.execute("SELECT CURRENT_TIMESTAMP AT TIME ZONE 'UTC'")
    database_time = cursor.fetchone()[0]
    skew = datetime.utcnow() - database_time
    tolerance = timedelta(minutes=5)

    if abs(skew) >= tolerance:
        raise Exception(
            "%s clock skew between librarian and database" % (skew,))
50
7500.2.1 by Stuart Bishop
When all aliases are expired, flag a LibraryFileContent as deleted
51
3691.239.2 by Stuart Bishop
Update to use Librarian as blob storage. Use Bytes types instead of String
52
def delete_expired_blobs(con):
    """Purge expired TemporaryBlobStorage rows and the rows hanging off them.

    The LibraryFileAliases of expired blobs are deleted here as well,
    because the default behavior of the garbage collector could leave them
    hanging around indefinitely. ApportJob records linked to an expired
    blob, and their Job records, are removed too once the Job has finished.

    The statements run in order on one cursor and are committed together.
    """
    cursor = con.cursor()

    statements = (
        # Generate the list of expired blobs.
        """
        SELECT file_alias
        INTO TEMPORARY TABLE BlobAliasesToDelete
        FROM LibraryFileAlias, TemporaryBlobStorage
        WHERE file_alias = LibraryFileAlias.id
            AND expires < CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
        """,
        # Generate the list of expired Jobs. We ignore jobs that have not
        # finished.
        """
        SELECT job
        INTO TEMPORARY TABLE JobsToDelete
        FROM Job, ApportJob, TemporaryBlobStorage, LibraryFileAlias
        WHERE
            ApportJob.blob = TemporaryBlobStorage.id
            AND Job.id = ApportJob.job
            AND Job.date_finished < CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
            AND TemporaryBlobStorage.file_alias = LibraryFileAlias.id
                AND expires < CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
        """,
        # Delete expired ApportJob records.
        """
        DELETE FROM ApportJob
        USING JobsToDelete
        WHERE ApportJob.job = JobsToDelete.job
        """,
        # Delete expired Job records.
        """
        DELETE FROM Job
        USING JobsToDelete
        WHERE Job.id = JobsToDelete.job
        """,
        # Delete expired blobs.
        """
        DELETE FROM TemporaryBlobStorage
        USING BlobAliasesToDelete
        WHERE TemporaryBlobStorage.file_alias = BlobAliasesToDelete.file_alias
        """,
        # Delete LibraryFileAliases referencing expired blobs.
        """
        DELETE FROM LibraryFileAlias
        USING BlobAliasesToDelete
        WHERE file_alias = LibraryFileAlias.id
        """,
        )
    for statement in statements:
        cursor.execute(statement)

    # The rowcount reported is the one left by the final DELETE above.
    log.info("Removed %d expired blobs" % cursor.rowcount)
    con.commit()
115
116
3322.1.11 by Stuart Bishop
Refactor librarian garbage collection to only use one connection
117
def merge_duplicates(con):
    """Repoint LibraryFileAliases at a single copy of duplicated content.

    This is the first step in a full garbage collection run. We assume files
    are identical if their sha1 hashes and filesizes are identical. For every
    duplicated (sha1, filesize) pair, all LibraryFileAlias entries that
    reference one of the duplicates are updated to reference the most
    recently created LibraryFileContent instead. Each pair is committed in
    its own transaction.

    Nothing is deleted by this function: the superseded LibraryFileContent
    rows are simply left with no aliases pointing at them.

    A pair is skipped (and reported) when the file of the surviving
    LibraryFileContent is missing on disk. If two files share sha1 and size
    but are not byte-for-byte identical, an error is logged and the process
    exits with status 1.

    :param con: Database connection. commit() is called once per merged
        (sha1, filesize) pair.
    """

    # Get a list of all (sha1, filesize) that are duplicated in
    # LibraryFileContent
    cur = con.cursor()
    cur.execute("""
        SELECT sha1, filesize
        FROM LibraryFileContent
        GROUP BY sha1, filesize
        HAVING COUNT(*) > 1
        """)
    rows = list(cur.fetchall())

    # Merge the duplicate entries, each one in a separate transaction
    for sha1, filesize in rows:
        cur = con.cursor()

        sha1 = sha1.encode('US-ASCII')  # Can't pass Unicode to execute (yet)

        # Get a list of our dupes. Where multiple files exist, we return
        # the most recently added one first, because this is the version
        # most likely to exist on the staging server (it should be
        # irrelevant on production).
        cur.execute("""
            SELECT id
            FROM LibraryFileContent
            WHERE sha1=%(sha1)s AND filesize=%(filesize)s
            ORDER BY datecreated DESC
            """, {'sha1': sha1, 'filesize': filesize})
        dupes = [row[0] for row in cur.fetchall()]

        # The rows may have changed since the grouping query ran. With
        # fewer than two candidates there is nothing to merge, and
        # indexing an empty list below would crash the whole run.
        if len(dupes) < 2:
            continue

        prime_id = dupes[0]
        other_ids = dupes[1:]

        if debug:
            log.debug("Found duplicate LibraryFileContents")
            # Spit out more info in case it helps work out where
            # dupes are coming from.
            for dupe_id in dupes:
                cur.execute("""
                    SELECT id, filename, mimetype FROM LibraryFileAlias
                    WHERE content = %(dupe_id)s
                    """, {'dupe_id': dupe_id})
                for alias_id, filename, mimetype in cur.fetchall():
                    log.debug("> %d %s %s" % (alias_id, filename, mimetype))

        # Make sure the first file exists on disk. Don't merge if it
        # doesn't. This shouldn't happen on production, so we don't try
        # and cope - just report and skip. However, on staging this will
        # be more common because database records have been synced from
        # production but the actual librarian contents have not.
        prime_path = get_file_path(prime_id)
        if not os.path.exists(prime_path):
            # Same message either way; only the severity differs.
            if config.instance_name == 'staging':
                report_missing = log.debug
            else:
                report_missing = log.warning
            report_missing(
                "LibraryFileContent %d data is missing (%s)",
                prime_id, prime_path
                )
            continue

        # Do a manual check that they really are identical, because we
        # employ paranoids. And we might as well cope with someone breaking
        # SHA1 enough that it becomes possible to create a SHA1 collision
        # with an identical filesize to an existing file. Which is pretty
        # unlikely. Where did I leave my tin foil hat?
        for other_id in other_ids:
            other_path = get_file_path(other_id)
            # Check paths exist, because on staging they may not!
            if (os.path.exists(other_path)
                and not _sameFile(prime_path, other_path)):
                log.error(
                        "SHA-1 collision found. LibraryFileContent %d and "
                        "%d have the same SHA1 and filesize, but are not "
                        "byte-for-byte identical.",
                        prime_id, other_id
                        )
                sys.exit(1)

        # Update all the LibraryFileAlias entries to point to a single
        # LibraryFileContent
        log.debug(
            "Making LibraryFileAliases referencing %s reference %s instead",
            ', '.join(str(other_id) for other_id in other_ids), prime_id
            )
        # One UPDATE per duplicate rather than a single IN (...) statement.
        for other_id in other_ids:
            cur.execute("""
                UPDATE LibraryFileAlias SET content=%(prime_id)s
                WHERE content = %(other_id)s
                """, {'prime_id': prime_id, 'other_id': other_id})

        log.debug("Committing")
        con.commit()
222
223
9572.1.10 by Stuart Bishop
Expire expired LibraryFileAliases
224
class ExpireAliases:
    """Tunable loop that expires LibraryFileAlias records in chunks.

    Expiring a record means setting LibraryFileAlias.content to NULL; the
    rows picked are those that still have content and whose expires is
    more than one week in the past. Unreferenced LibraryFileContent
    records are cleaned up elsewhere.
    """
    implements(ITunableLoop)

    def __init__(self, con):
        self.con = con
        self.total_expired = 0
        self._done = False

    def isDone(self):
        """Return True once a chunk expired nothing, logging the total."""
        if not self._done:
            return False
        log.info(
            "Expired %d LibraryFileAlias records." % self.total_expired)
        return True

    def __call__(self, chunksize):
        """Expire at most int(chunksize) records, then commit."""
        limit = int(chunksize)
        cursor = self.con.cursor()
        cursor.execute("""
            UPDATE LibraryFileAlias
            SET content=NULL
            WHERE id IN (
                SELECT id FROM LibraryFileAlias
                WHERE
                    content IS NOT NULL
                    AND expires < CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
                        - interval '1 week'
                LIMIT %d)
            """ % limit)
        expired = cursor.rowcount
        self.total_expired += expired
        if expired != 0:
            log.debug("Expired %d LibraryFileAlias records." % expired)
        else:
            # An empty chunk means there is nothing left to expire.
            self._done = True
        self.con.commit()
265
266
267
def expire_aliases(con):
    """Run an ExpireAliases loop over `con` through a DBLoopTuner.

    :param con: Database connection handed to ExpireAliases.
    """
    # NOTE(review): the meaning of the literal 5 is defined by DBLoopTuner's
    # second positional parameter -- confirm against lp.services.looptuner.
    loop_tuner = DBLoopTuner(ExpireAliases(con), 5, log=log)
    loop_tuner.run()
271
272
7675.415.6 by Abel Deuring
update the librarian GC for the schema change. LibraryFileAlias records are now deleted, if they are not referenced by any other table and if they are sufficiently old, or if LFA.content is null.
273
class UnreferencedLibraryFileAliasPruner:
    """Delete unreferenced LibraryFileAliases.

    The LibraryFileContent records are left untouched for the code that
    knows how to delete them and the corresponding files on disk.

    This is the second step in a full garbage collection sweep. A
    LibraryFileAlias is selected for deletion when it is not referenced
    from any column reported by listReferences() (references from the
    libraryfiledownloadcount table are ignored) and either:

    * its content is NULL, or
    * its expires is NULL or more than one week in the past, and its
      last_accessed is more than one week in the past, and its
      date_created is more than one week in the past.

    The selection is computed once, in the constructor, into the temporary
    table UnreferencedLibraryFileAlias. Each call then deletes one slice of
    that table, addressed by its serial id.
    """
    implements(ITunableLoop)

    def __init__(self, con):
        """Build the UnreferencedLibraryFileAlias work table.

        :param con: Database connection. Several commits are issued on it
            while the work table is being built.
        :raises AssertionError: if listReferences() reports ten or fewer
            referencing columns, which is taken to mean the database
            introspection went wrong.
        """
        self.con = con  # Database connection to use
        self.total_deleted = 0  # Running total
        self.index = 1  # First work-table id of the next chunk

        log.info("Deleting unreferenced LibraryFileAliases")

        cur = con.cursor()

        drop_tables(cur, "ReferencedLibraryFileAlias")
        cur.execute("""
            CREATE TEMPORARY TABLE ReferencedLibraryFileAlias (
                alias integer)
            """)

        # Determine what columns link to LibraryFileAlias
        # references = [(table, column), ...]
        # NOTE(review): libraryfiledownloadcount is skipped, presumably so
        # that download statistics alone do not keep an alias alive --
        # confirm.
        references = [
            tuple(ref[:2])
            for ref in listReferences(cur, 'libraryfilealias', 'id')
            if ref[0] != 'libraryfiledownloadcount'
            ]
        # Raised explicitly rather than via `assert` so that the sanity
        # check still guards the deletion when Python runs with -O.
        if len(references) <= 10:
            raise AssertionError('Database introspection returned nonsense')
        log.debug(
            "Found %d columns referencing LibraryFileAlias", len(references))

        # Find all relevant LibraryFileAlias references and fill in
        # ReferencedLibraryFileAlias
        for table, column in references:
            cur.execute("""
                INSERT INTO ReferencedLibraryFileAlias
                SELECT LibraryFileAlias.id
                FROM LibraryFileAlias, %(table)s
                WHERE LibraryFileAlias.id = %(table)s.%(column)s
                """ % {
                    'table': quoteIdentifier(table),
                    'column': quoteIdentifier(column)})
            log.debug("%s.%s references %d LibraryFileAlias rows." % (
                table, column, cur.rowcount))
            con.commit()

        log.debug("Calculating unreferenced LibraryFileAlias set.")
        drop_tables(cur, "UnreferencedLibraryFileAlias")
        cur.execute("""
            CREATE TEMPORARY TABLE UnreferencedLibraryFileAlias (
                id serial PRIMARY KEY,
                alias integer UNIQUE)
            """)
        # Calculate the set of unreferenced LibraryFileAlias.
        # We also exclude all unexpired and recently accessed
        # records - we don't remove them even if they are unlinked. We
        # currently don't remove stuff until it has been expired for
        # more than one week, but we will change this if disk space
        # becomes short and it actually will make a noticeable
        # difference. We handle excluding recently created content
        # here rather than earlier when creating the
        # ReferencedLibraryFileAlias table to handle uploads going on
        # while this script is running.
        cur.execute("""
            INSERT INTO UnreferencedLibraryFileAlias (alias)
            SELECT id AS alias FROM LibraryFileAlias
            WHERE
                content IS NULL
                OR ((expires IS NULL OR
                     expires <
                         CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
                             - interval '1 week'
                    )
                    AND last_accessed <
                        CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
                            - interval '1 week'
                    AND date_created <
                        CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
                            - interval '1 week'
                   )
            EXCEPT
            SELECT alias FROM ReferencedLibraryFileAlias
            """)
        con.commit()
        drop_tables(cur, "ReferencedLibraryFileAlias")
        # The highest serial id in the work table bounds the chunked loop.
        cur.execute(
            "SELECT COALESCE(max(id),0) FROM UnreferencedLibraryFileAlias")
        self.max_id = cur.fetchone()[0]
        log.debug(
            "%d unreferenced LibraryFileAlias to remove." % self.max_id)
        con.commit()

    def isDone(self):
        """Return True once every work-table id has been processed.

        The total number of deleted rows is logged when returning True.
        """
        if self.index > self.max_id:
            log.info(
                "Deleted %d LibraryFileAlias records." % self.total_deleted)
            return True
        else:
            return False

    def __call__(self, chunksize):
        """Delete the aliases listed in the next chunk, then commit.

        :param chunksize: Number of work-table ids to cover; converted
            with int().
        """
        chunksize = int(chunksize)
        cur = self.con.cursor()
        # BETWEEN is inclusive, hence the - 1 on the upper bound.
        cur.execute("""
            DELETE FROM LibraryFileAlias
            WHERE id IN
                (SELECT alias FROM UnreferencedLibraryFileAlias
                WHERE id BETWEEN %s AND %s)
            """, (self.index, self.index + chunksize - 1))
        deleted_rows = cur.rowcount
        self.total_deleted += deleted_rows
        log.debug("Deleted %d LibraryFileAlias records." % deleted_rows)
        self.con.commit()
        self.index += chunksize
8814.2.1 by Stuart Bishop
Make delete_unreferenced_aliases() use DBLoopTuner and temporary tables to be database and RAM friendly
397
398
399
def delete_unreferenced_aliases(con):
    """Drive an UnreferencedLibraryFileAliasPruner with a DBLoopTuner."""
    pruner = UnreferencedLibraryFileAliasPruner(con)
    DBLoopTuner(pruner, 5, log=log).run()
404
3322.1.11 by Stuart Bishop
Refactor librarian garbage collection to only use one connection
405
8814.2.4 by Stuart Bishop
Make delete_unreferenced_content use DBLoopTuner and be RAM friendly, and don't emit errors when files are not found on disk and we have an upstream Librarian as this is normal
406
class UnreferencedContentPruner:
    """Delete LibraryFileContent entries and their disk files that are
    not referenced by any LibraryFileAlias entries.

    Note that a LibraryFileContent can only be accessed through a
    LibraryFileAlias, so all entries in this state are garbage no matter
    what their expires flag says.

    The candidates are recorded once, in a temporary table, by __init__.
    Each __call__ then processes the next range of that table's ids, so
    the work can be driven in chunks by a loop tuner.
    """
    implements(ITunableLoop)

    def __init__(self, con):
        """Record every unreferenced LibraryFileContent id.

        :param con: Database connection. Its cursor(), commit() and
            rollback() methods are used by this object.
        """
        self.con = con
        # First temporary-table id of the next chunk to process.
        # Presumably the serial column numbers rows from 1 -- this is
        # what the BETWEEN ranges in __call__ rely on.
        self.index = 1
        self.total_deleted = 0
        cur = con.cursor()
        drop_tables(cur, "UnreferencedLibraryFileContent")
        cur.execute("""
            CREATE TEMPORARY TABLE UnreferencedLibraryFileContent (
                id serial PRIMARY KEY,
                content integer UNIQUE)
            """)
        cur.execute("""
            INSERT INTO UnreferencedLibraryFileContent (content)
            SELECT DISTINCT LibraryFileContent.id
            FROM LibraryFileContent
            LEFT OUTER JOIN LibraryFileAlias
                ON LibraryFileContent.id = LibraryFileAlias.content
            WHERE LibraryFileAlias.content IS NULL
        """)
        cur.execute("""
            SELECT COALESCE(max(id), 0) FROM UnreferencedLibraryFileContent
            """)
        # Highest temporary-table id; 0 when there is nothing to remove.
        self.max_id = cur.fetchone()[0]
        log.debug(
            "%d unreferenced LibraryFileContent rows to remove.",
            self.max_id)

    def isDone(self):
        """Return True once every recorded row has been processed.

        The total number of deleted rows is logged when the end is
        reached.
        """
        if self.index <= self.max_id:
            return False
        log.info("Deleted %d unreferenced files.", self.total_deleted)
        return True

    def __call__(self, chunksize):
        """Delete the next chunk of unreferenced content.

        The database rows are deleted and committed first, then the
        matching files are unlinked from disk.

        :param chunksize: Number of temporary-table ids to process. It
            is truncated to an integer.
        :raises OSError: If a file cannot be removed for any reason
            other than it not existing.
        """
        chunksize = int(chunksize)
        first_id = self.index
        last_id = first_id + chunksize - 1

        cur = self.con.cursor()

        # Delete unreferenced LibraryFileContent entries.
        cur.execute("""
            DELETE FROM LibraryFileContent
            USING (
                SELECT content FROM UnreferencedLibraryFileContent
                WHERE id BETWEEN %s AND %s) AS UnreferencedLibraryFileContent
            WHERE
                LibraryFileContent.id = UnreferencedLibraryFileContent.content
            """, (first_id, last_id))
        self.total_deleted += cur.rowcount
        self.con.commit()

        # Remove files from disk. We do this outside the transaction,
        # as the garbage collector happily deals with files that exist
        # on disk but not in the DB.
        cur.execute("""
            SELECT content FROM UnreferencedLibraryFileContent
            WHERE id BETWEEN %s AND %s
            """, (first_id, last_id))
        for row in cur.fetchall():
            # Remove the file from disk, if it hasn't already been
            # removed.
            path = get_file_path(row[0])
            try:
                os.unlink(path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
                # It is normal to have files in the database that
                # are not on disk if the Librarian has an upstream
                # Librarian, such as on staging. Don't annoy the
                # operator with noise in that case: only report the
                # missing file when there is no upstream.
                if config.librarian_server.upstream_host is None:
                    log.info("%s already deleted", path)
            else:
                log.debug("Deleted %s", path)
        # The SELECT above opened a transaction; don't leave it dangling.
        self.con.rollback()

        self.index += chunksize
494
495
496
def delete_unreferenced_content(con):
    """Invoke UnreferencedContentPruner.

    :param con: Database connection handed to the pruner.
    """
    pruner = UnreferencedContentPruner(con)
    DBLoopTuner(pruner, 5, log=log).run()
2816.1.9 by Stuart Bishop
Work in progress
500
501
3691.357.2 by Stuart Bishop
Clean files from disk that have been flagged deleted too
502
def delete_unwanted_files(con):
    """Remove on-disk files that have no LibraryFileContent row.

    The storage area is walked in sorted order alongside the ordered
    list of LibraryFileContent ids, so neither side has to be held in
    memory in full.

    A file is only removed when its ctime is at least one day old, to
    avoid deleting files that have just been uploaded but have yet to
    have the database records committed.

    :param con: Database connection used to list the wanted ids.
    """
    cur = con.cursor()

    # Every stored LibraryFileContent id is wanted. The results are
    # ordered so they can be consumed one row at a time.
    cur.execute("""
        SELECT id FROM LibraryFileContent ORDER BY id
        """)

    def fetch_next_wanted_id():
        row = cur.fetchone()
        return None if row is None else row[0]

    removed_count = 0
    disk_id = wanted_id = -1

    hex_content_id_re = re.compile('^[0-9a-f]{8}$')
    SECONDS_PER_DAY = 24 * 60 * 60
    harmless_dirs = ('incoming', 'lost+found')
    harmless_files = set(('librarian.pid', 'librarian.log'))

    for dirpath, dirnames, filenames in os.walk(
        get_storage_root(), followlinks=True):

        # Ignore known and harmless noise in the Librarian storage area.
        # dirnames is edited in place so os.walk does not descend into
        # the removed entries.
        for harmless in harmless_dirs:
            if harmless in dirnames:
                dirnames.remove(harmless)

        for dirname in list(dirnames):
            if len(dirname) != 2:
                dirnames.remove(dirname)
                log.warning(
                    "Ignoring directory %s that shouldn't be here" % dirname)
            else:
                try:
                    int(dirname, 16)
                except ValueError:
                    dirnames.remove(dirname)
                    log.warning("Ignoring invalid directory %s" % dirname)

        # We need everything in order to ensure we visit files in the
        # same order we retrieve wanted files from the database.
        dirnames.sort()
        filenames = sorted(set(filenames) - harmless_files)

        # Noise in the storage area, or maybe we are looking at the wrong
        # path? Only this directory's files are skipped; os.walk still
        # descends into the subdirectories left in dirnames.
        if dirnames and filenames:
            log.warning(
                "%s contains both files %r and subdirectories %r. Skipping."
                % (dirpath, filenames, dirnames))
            continue

        for filename in filenames:
            path = os.path.join(dirpath, filename)
            # The last four path components spell the id in hex.
            hex_content_id = ''.join(path.split(os.sep)[-4:])
            if hex_content_id_re.search(hex_content_id) is None:
                log.warning(
                    "Ignoring invalid path %s" % path)
                continue

            disk_id = int(hex_content_id, 16)

            # Advance through the wanted ids until we reach or pass the
            # file we are looking at. Ids that were stepped over have no
            # file on disk.
            while wanted_id is not None and disk_id > wanted_id:
                wanted_id = fetch_next_wanted_id()

                if (config.librarian_server.upstream_host is None
                        and wanted_id is not None
                        and wanted_id < disk_id):
                    log.error(
                        "LibraryFileContent %d exists in the database but "
                        "was not found on disk." % wanted_id)

            if wanted_id is not None and wanted_id == disk_id:
                # The database still knows this file; keep it.
                continue

            if time() - os.path.getctime(path) < SECONDS_PER_DAY:
                log.debug(
                    "File %d not removed - created too recently"
                    % disk_id)
            else:
                # File uploaded a while ago but no longer wanted.
                os.unlink(path)
                log.debug("Deleted %s" % path)
                removed_count += 1

    # Report any remaining LibraryFileContent that the database says
    # should exist but we didn't find on disk.
    # NOTE(review): unlike the check inside the walk, this does not look
    # at config.librarian_server.upstream_host -- confirm whether these
    # errors are wanted when there is an upstream Librarian.
    if wanted_id == disk_id:
        wanted_id = fetch_next_wanted_id()
    while wanted_id is not None:
        log.error(
            "LibraryFileContent %d exists in the database but "
            "was not found on disk." % wanted_id)
        wanted_id = fetch_next_wanted_id()

    log.info(
        "Deleted %d files from disk that where no longer referenced "
        "in the db" % removed_count)
619
620
2816.1.8 by Stuart Bishop
Add tests of existing functionality
621
def get_file_path(content_id):
    """Return the on-disk path for a LibraryFileContent id.

    :param content_id: The LibraryFileContent id, as an int or long.
    :return: The relative path for the id joined onto the storage root.
    """
    assert isinstance(content_id, (int, long)), (
        'Invalid content_id %s' % repr(content_id))
    storage_root = get_storage_root()
    relative_path = relative_file_path(content_id)
    return os.path.join(storage_root, relative_path)
627
628
629
def get_storage_root():
    """Return the configured root of the Librarian storage area.

    As a guard against accidents, the root is expected to contain an
    'incoming' directory.
    """
    root = config.librarian_server.root
    # Basic sanity check: refuse anything without an 'incoming' directory.
    assert os.path.isdir(os.path.join(root, 'incoming')), (
        '%s is not a Librarian storage area' % root)
    return root