8687.15.18
by Karl Fogel
Add the copyright header block to files under lib/canonical/. |
1 |
# Copyright 2009 Canonical Ltd. This software is licensed under the
|
2 |
# GNU Affero General Public License version 3 (see the file LICENSE).
|
|
3 |
||
2816.1.3
by Stuart Bishop
Work in progress |
4 |
"""Librarian garbage collection routines"""
|
5 |
||
6 |
__metaclass__ = type |
|
7 |
||
3691.357.6
by Stuart Bishop
Updates based on review feedback |
8 |
from datetime import datetime, timedelta |
8814.2.6
by Stuart Bishop
Review feedback |
9 |
import errno |
9055.6.6
by Stuart Bishop
Instead of os.stat to discover if files exist, refactor to use os.walk instead for much speed gain. |
10 |
import re |
2816.1.18
by Stuart Bishop
Post review updates |
11 |
import sys |
3691.357.1
by Stuart Bishop
Remove files found on disk with no corresponding record in the database |
12 |
from time import time |
13 |
import os |
|
2816.1.8
by Stuart Bishop
Add tests of existing functionality |
14 |
|
8814.2.1
by Stuart Bishop
Make delete_unreferenced_aliases() use DBLoopTuner and temporary tables to be database and RAM friendly |
15 |
from zope.interface import implements |
16 |
||
2816.1.7
by Stuart Bishop
Work in progress |
17 |
from canonical.config import config |
7675.415.7
by Abel Deuring
implemented reviewer's comments |
18 |
from canonical.database.postgresql import drop_tables, quoteIdentifier |
14560.2.12
by Curtis Hovey
Move looptuner to lp.services. |
19 |
from lp.services.looptuner import ( |
20 |
DBLoopTuner, |
|
21 |
ITunableLoop, |
|
22 |
)
|
|
2816.1.7
by Stuart Bishop
Work in progress |
23 |
from canonical.librarian.storage import _relFileLocation as relative_file_path |
2816.1.18
by Stuart Bishop
Post review updates |
24 |
from canonical.librarian.storage import _sameFile |
2816.1.11
by Stuart Bishop
Work in progress |
25 |
from canonical.database.postgresql import listReferences |
2816.1.3
by Stuart Bishop
Work in progress |
26 |
|
9893.6.52
by Stuart Bishop
delint |
27 |
# Module-wide logger used by the routines in this file.  It stays None
# until the calling script installs a real logger.
log = None  # This is set by cronscripts/librarian-gc.py
3587.1.4
by Stuart Bishop
Output LibraryFileAlias information of duplicated LibraryFileContents |
28 |
# When true, merge_duplicates() additionally logs the LibraryFileAlias
# rows attached to every duplicated LibraryFileContent it finds.
debug = False
2816.1.3
by Stuart Bishop
Work in progress |
29 |
|
9893.6.52
by Stuart Bishop
delint |
30 |
|
3691.357.6
by Stuart Bishop
Updates based on review feedback |
31 |
def confirm_no_clock_skew(con):
    """Refuse to continue when this host and the database disagree on time.

    The database is asked for its current UTC timestamp, which is then
    compared with the local UTC clock.  A difference of five minutes or
    more, in either direction, raises an exception.

    It is theoretically possible to lose data if there is more than
    several hours of skew.

    :param con: An open database connection (DB-API style).
    :raises Exception: when the measured skew is five minutes or more.
    """
    cursor = con.cursor()
    cursor.execute("SELECT CURRENT_TIMESTAMP AT TIME ZONE 'UTC'")
    db_now = cursor.fetchone()[0]

    # Positive when the local clock is ahead of the database.
    skew = datetime.utcnow() - db_now
    tolerance = timedelta(minutes=5)

    if abs(skew) >= tolerance:
        raise Exception(
            "%s clock skew between librarian and database" % (skew,))
|
|
50 |
||
7500.2.1
by Stuart Bishop
When all aliases are expired, flag a LibraryFileContent as deleted |
51 |
|
3691.239.2
by Stuart Bishop
Update to use Librarian as blob storage. Use Bytes types instead of String |
52 |
def delete_expired_blobs(con):
    """Purge expired TemporaryBlobStorage rows and what hangs off them.

    The LibraryFileAlias rows backing the expired blobs are deleted here
    as well, because the default behaviour of the garbage collector could
    leave them hanging around indefinitely.  Finished ApportJob and Job
    records linked to the expired blobs are removed too.

    All statements run on one cursor and are committed together at the
    end.

    :param con: An open database connection.
    """
    # The statements are executed strictly in this order: the two
    # SELECT ... INTO statements build the temporary work lists that the
    # DELETE statements below them consume.
    statements = (
        # Generate the list of expired blobs.
        """
        SELECT file_alias
        INTO TEMPORARY TABLE BlobAliasesToDelete
        FROM LibraryFileAlias, TemporaryBlobStorage
        WHERE file_alias = LibraryFileAlias.id
            AND expires < CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
        """,
        # Generate the list of expired Jobs. We ignore jobs that have not
        # finished.
        """
        SELECT job
        INTO TEMPORARY TABLE JobsToDelete
        FROM Job, ApportJob, TemporaryBlobStorage, LibraryFileAlias
        WHERE
            ApportJob.blob = TemporaryBlobStorage.id
            AND Job.id = ApportJob.job
            AND Job.date_finished < CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
            AND TemporaryBlobStorage.file_alias = LibraryFileAlias.id
            AND expires < CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
        """,
        # Delete expired ApportJob records.
        """
        DELETE FROM ApportJob
        USING JobsToDelete
        WHERE ApportJob.job = JobsToDelete.job
        """,
        # Delete expired Job records.
        """
        DELETE FROM Job
        USING JobsToDelete
        WHERE Job.id = JobsToDelete.job
        """,
        # Delete expired blobs.
        """
        DELETE FROM TemporaryBlobStorage
        USING BlobAliasesToDelete
        WHERE TemporaryBlobStorage.file_alias = BlobAliasesToDelete.file_alias
        """,
        # Delete LibraryFileAliases referencing expired blobs.
        """
        DELETE FROM LibraryFileAlias
        USING BlobAliasesToDelete
        WHERE file_alias = LibraryFileAlias.id
        """,
        )

    cursor = con.cursor()
    for statement in statements:
        cursor.execute(statement)

    # rowcount belongs to the last statement executed, i.e. it is the
    # number of LibraryFileAlias rows that were just deleted.
    log.info("Removed %d expired blobs" % cursor.rowcount)
    con.commit()
|
115 |
||
116 |
||
3322.1.11
by Stuart Bishop
Refactor librarian garbage collection to only use one connection |
117 |
def merge_duplicates(con):
    """Merge duplicate LibraryFileContent rows.

    This is the first step in a full garbage collection run. We assume
    files are identical if their sha1 hashes and filesizes are identical.
    For every group of duplicates detected, all LibraryFileAlias entries
    are repointed at the most recently created LibraryFileContent of the
    group.  This function does not delete anything itself: the content
    rows it leaves without any alias are plain unreferenced content
    afterwards.

    Each group is handled and committed in its own transaction.  A group
    whose chosen content is missing on disk is reported and skipped.  If
    two files with the same sha1 and size turn out not to be
    byte-for-byte identical the problem is logged and the process exits
    with status 1.

    :param con: An open database connection.
    """
    # Get a list of all (sha1, filesize) that are duplicated in
    # LibraryFileContent.
    cur = con.cursor()
    cur.execute("""
        SELECT sha1, filesize
        FROM LibraryFileContent
        GROUP BY sha1, filesize
        HAVING COUNT(*) > 1
        """)
    rows = list(cur.fetchall())

    # Merge the duplicate entries, each one in a separate transaction.
    for sha1, filesize in rows:
        cur = con.cursor()

        sha1 = sha1.encode('US-ASCII')  # Can't pass Unicode to execute (yet)

        # Get a list of our dupes. Where multiple files exist, we return
        # the most recently added one first, because this is the version
        # most likely to exist on the staging server (it should be
        # irrelevant on production).
        cur.execute("""
            SELECT id
            FROM LibraryFileContent
            WHERE sha1=%(sha1)s AND filesize=%(filesize)s
            ORDER BY datecreated DESC
            """, {'sha1': sha1, 'filesize': filesize})
        dupes = [row[0] for row in cur.fetchall()]

        # The rows may have changed since the grouping query ran.  With
        # fewer than two rows left there is nothing to merge, and an
        # empty list would make the dupes[0] lookup below blow up.
        if len(dupes) < 2:
            continue

        if debug:
            log.debug("Found duplicate LibraryFileContents")
            # Spit out more info in case it helps work out where
            # dupes are coming from.
            for dupe_id in dupes:
                cur.execute("""
                    SELECT id, filename, mimetype FROM LibraryFileAlias
                    WHERE content = %(dupe_id)s
                    """, {'dupe_id': dupe_id})
                for alias_id, filename, mimetype in cur.fetchall():
                    log.debug("> %d %s %s", alias_id, filename, mimetype)

        # The first (newest) row is the one every alias will end up
        # pointing at; the rest are the duplicates being merged away.
        prime_id = dupes[0]
        other_ids = dupes[1:]

        # Make sure the first file exists on disk. Don't merge if it
        # doesn't. This shouldn't happen on production, so we don't try
        # and cope - just report and skip. However, on staging this will
        # be more common because database records have been synced from
        # production but the actual librarian contents have not.
        prime_path = get_file_path(prime_id)
        if not os.path.exists(prime_path):
            if config.instance_name == 'staging':
                log.debug(
                    "LibraryFileContent %d data is missing (%s)",
                    prime_id, prime_path)
            else:
                log.warning(
                    "LibraryFileContent %d data is missing (%s)",
                    prime_id, prime_path)
            continue

        # Do a manual check that they really are identical, because we
        # employ paranoids. And we might as well cope with someone breaking
        # SHA1 enough that it becomes possible to create a SHA1 collision
        # with an identical filesize to an existing file. Which is pretty
        # unlikely. Where did I leave my tin foil hat?
        for other_id in other_ids:
            other_path = get_file_path(other_id)
            # Check paths exist, because on staging they may not!
            if (os.path.exists(other_path)
                    and not _sameFile(prime_path, other_path)):
                log.error(
                    "SHA-1 collision found. LibraryFileContent %d and "
                    "%d have the same SHA1 and filesize, but are not "
                    "byte-for-byte identical.",
                    prime_id, other_id)
                sys.exit(1)

        # Update all the LibraryFileAlias entries to point to a single
        # LibraryFileContent.
        log.debug(
            "Making LibraryFileAliases referencing %s reference %s instead",
            ', '.join(str(other_id) for other_id in other_ids), prime_id)
        # One UPDATE per duplicate rather than a single big statement, so
        # a very large number of duplicates cannot break the query.
        for other_id in other_ids:
            cur.execute("""
                UPDATE LibraryFileAlias SET content=%(prime_id)s
                WHERE content = %(other_id)s
                """, {'prime_id': prime_id, 'other_id': other_id})

        log.debug("Committing")
        con.commit()
222 |
||
223 |
||
9572.1.10
by Stuart Bishop
Expire expired LibraryFileAliases |
224 |
class ExpireAliases:
    """Tunable loop that expires LibraryFileAlias records chunk by chunk.

    Expiring a record means setting LibraryFileAlias.content to NULL; it
    is done for records that still have content and whose expiry time
    lies more than one week in the past.  Unreferenced LibraryFileContent
    records are cleaned up elsewhere.
    """
    implements(ITunableLoop)

    def __init__(self, con):
        """Remember the connection and reset the bookkeeping."""
        self._done = False
        self.total_expired = 0  # Running total over all chunks.
        self.con = con

    def isDone(self):
        """Return True once a chunk has expired nothing.

        The grand total is logged every time True is returned.
        """
        if not self._done:
            return False
        log.info(
            "Expired %d LibraryFileAlias records." % self.total_expired)
        return True

    def __call__(self, chunksize):
        """Expire at most `chunksize` records and commit."""
        limit = int(chunksize)
        statement = """
            UPDATE LibraryFileAlias
            SET content=NULL
            WHERE id IN (
                SELECT id FROM LibraryFileAlias
                WHERE
                    content IS NOT NULL
                    AND expires < CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
                        - interval '1 week'
                LIMIT %d)
            """ % limit
        cursor = self.con.cursor()
        cursor.execute(statement)

        expired = cursor.rowcount
        self.total_expired += expired
        if expired:
            log.debug("Expired %d LibraryFileAlias records." % expired)
        else:
            # Nothing left to expire; isDone() will now report completion.
            self._done = True
        self.con.commit()
|
265 |
||
266 |
||
267 |
def expire_aliases(con):
    """Drive an ExpireAliases loop over `con` with a DBLoopTuner.

    :param con: An open database connection.
    """
    # NOTE(review): 5 is DBLoopTuner's second positional argument -
    # presumably the target duration of one chunk; confirm against
    # lp.services.looptuner.
    DBLoopTuner(ExpireAliases(con), 5, log=log).run()
|
271 |
||
272 |
||
7675.415.6
by Abel Deuring
update the librarian GC for the schema change. LibraryFileAlias records are now deleted, if they are not referenced by any other table and if they are sufficiently old, or if LFA.content is null. |
273 |
class UnreferencedLibraryFileAliasPruner:
    """Delete unreferenced LibraryFileAliases.

    The LibraryFileContent records are left untouched for the code that
    knows how to delete them and the corresponding files on disk.

    This is the second step in a full garbage collection sweep. We
    determine which LibraryFileAlias entries are not being referenced by
    other objects in the database and delete them when either:

    * their content is NULL, or
    * their expiry is NULL or more than one week in the past, and they
      were both last accessed and created more than one week ago.

    The candidate set is computed once, in the constructor, into the
    temporary table UnreferencedLibraryFileAlias.  Each call of the
    instance then deletes one slice of that table.
    """
    implements(ITunableLoop)

    def __init__(self, con):
        """Build the temporary table of aliases to delete.

        :param con: An open database connection.
        """
        self.con = con  # Database connection to use
        self.total_deleted = 0  # Running total
        # Next UnreferencedLibraryFileAlias.id to process.  The serial
        # primary key of the temporary table starts at 1.
        self.index = 1

        log.info("Deleting unreferenced LibraryFileAliases")

        cur = con.cursor()

        drop_tables(cur, "ReferencedLibraryFileAlias")
        cur.execute("""
            CREATE TEMPORARY TABLE ReferencedLibraryFileAlias (
                alias integer)
            """)

        # Determine what columns link to LibraryFileAlias
        # references = [(table, column), ...]
        # libraryfiledownloadcount is deliberately left out, so rows in
        # that table do not keep an alias alive.
        references = [
            tuple(ref[:2])
            for ref in listReferences(cur, 'libraryfilealias', 'id')
            if ref[0] != 'libraryfiledownloadcount'
            ]
        # Sanity check: if introspection found this few references,
        # nearly every alias would look unreferenced and be deleted.
        assert len(references) > 10, (
            'Database introspection returned nonsense')
        log.debug(
            "Found %d columns referencing LibraryFileAlias", len(references))

        # Find all relevant LibraryFileAlias references and fill in
        # ReferencedLibraryFileAlias
        for table, column in references:
            cur.execute("""
                INSERT INTO ReferencedLibraryFileAlias
                SELECT LibraryFileAlias.id
                FROM LibraryFileAlias, %(table)s
                WHERE LibraryFileAlias.id = %(table)s.%(column)s
                """ % {
                    'table': quoteIdentifier(table),
                    'column': quoteIdentifier(column)})
            # The rows just inserted are LibraryFileAlias ids, so that
            # is what the message reports.
            log.debug("%s.%s references %d LibraryFileAlias rows." % (
                table, column, cur.rowcount))
            con.commit()

        log.debug("Calculating unreferenced LibraryFileAlias set.")
        drop_tables(cur, "UnreferencedLibraryFileAlias")
        cur.execute("""
            CREATE TEMPORARY TABLE UnreferencedLibraryFileAlias (
                id serial PRIMARY KEY,
                alias integer UNIQUE)
            """)
        # Calculate the set of unreferenced LibraryFileAlias.
        # We also exclude all unexpired and recently accessed
        # records - we don't remove them even if they are unlinked. We
        # currently don't remove stuff until it has been expired for
        # more than one week, but we will change this if disk space
        # becomes short and it actually will make a noticeable
        # difference. We handle excluding recently created content
        # here rather than earlier when creating the
        # ReferencedLibraryFileAlias table to handle uploads going on
        # while this script is running.
        cur.execute("""
            INSERT INTO UnreferencedLibraryFileAlias (alias)
            SELECT id AS alias FROM LibraryFileAlias
            WHERE
                content IS NULL
                OR ((expires IS NULL OR
                     expires <
                         CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
                         - interval '1 week'
                    )
                    AND last_accessed <
                        CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
                        - interval '1 week'
                    AND date_created <
                        CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
                        - interval '1 week'
                   )
            EXCEPT
            SELECT alias FROM ReferencedLibraryFileAlias
            """)
        con.commit()
        drop_tables(cur, "ReferencedLibraryFileAlias")
        cur.execute(
            "SELECT COALESCE(max(id),0) FROM UnreferencedLibraryFileAlias")
        # Highest serial id in the work table; 0 when it is empty, which
        # makes isDone() true straight away.
        self.max_id = cur.fetchone()[0]
        log.debug(
            "%d unreferenced LibraryFileAlias to remove." % self.max_id)
        con.commit()

    def isDone(self):
        """Return True once every slice of the work table was processed.

        The total number of deleted rows is logged every time True is
        returned.
        """
        if self.index > self.max_id:
            log.info(
                "Deleted %d LibraryFileAlias records." % self.total_deleted)
            return True
        else:
            return False

    def __call__(self, chunksize):
        """Delete the next `chunksize` candidate aliases and commit."""
        chunksize = int(chunksize)
        cur = self.con.cursor()
        # BETWEEN is inclusive on both ends, hence the -1.
        cur.execute("""
            DELETE FROM LibraryFileAlias
            WHERE id IN
                (SELECT alias FROM UnreferencedLibraryFileAlias
                WHERE id BETWEEN %s AND %s)
            """, (self.index, self.index + chunksize - 1))
        deleted_rows = cur.rowcount
        self.total_deleted += deleted_rows
        log.debug("Deleted %d LibraryFileAlias records." % deleted_rows)
        self.con.commit()
        self.index += chunksize
8814.2.1
by Stuart Bishop
Make delete_unreferenced_aliases() use DBLoopTuner and temporary tables to be database and RAM friendly |
397 |
|
398 |
||
399 |
def delete_unreferenced_aliases(con):
    """Drive an UnreferencedLibraryFileAliasPruner with a DBLoopTuner.

    :param con: An open database connection.
    """
    pruner = UnreferencedLibraryFileAliasPruner(con)
    DBLoopTuner(pruner, 5, log=log).run()
404 |
||
3322.1.11
by Stuart Bishop
Refactor librarian garbage collection to only use one connection |
405 |
|
8814.2.4
by Stuart Bishop
Make delete_unreferenced_content use DBLoopTuner and be RAM friendly, and don't emit errors when files are not found on disk and we have an upstream Librarian as this is normal |
406 |
class UnreferencedContentPruner:
    """Delete LibraryFileContent entries and their disk files that are
    not referenced by any LibraryFileAlias entries.

    Note that a LibraryFileContent can only be accessed through a
    LibraryFileAlias, so all entries in this state are garbage no matter
    what their expires flag says.

    The candidate ids are snapshotted once, at construction time, into a
    temporary table keyed by a dense serial id. Each call then processes
    one contiguous slice of that serial range, so the work can be chunked
    without rescanning LibraryFileContent.
    """
    implements(ITunableLoop)

    def __init__(self, con):
        """Build the temporary table of unreferenced content ids.

        :param con: DB-API connection used for all queries and commits.
        """
        self.con = con
        # Serial id (in the temporary table) of the first row in the next
        # chunk to process; the serial column starts at 1.
        self.index = 1
        self.total_deleted = 0
        cur = con.cursor()
        drop_tables(cur, "UnreferencedLibraryFileContent")
        cur.execute("""
            CREATE TEMPORARY TABLE UnreferencedLibraryFileContent (
                id serial PRIMARY KEY,
                content integer UNIQUE)
            """)
        cur.execute("""
            INSERT INTO UnreferencedLibraryFileContent (content)
            SELECT DISTINCT LibraryFileContent.id
            FROM LibraryFileContent
            LEFT OUTER JOIN LibraryFileAlias
                ON LibraryFileContent.id = LibraryFileAlias.content
            WHERE LibraryFileAlias.content IS NULL
            """)
        cur.execute("""
            SELECT COALESCE(max(id), 0) FROM UnreferencedLibraryFileContent
            """)
        # 0 when there is nothing to remove, which makes isDone() true
        # straight away.
        self.max_id = cur.fetchone()[0]
        log.debug(
            "%d unreferenced LibraryFileContent rows to remove."
            % self.max_id)

    def isDone(self):
        """Return True once every row of the temporary table was processed.

        The total number of deleted rows is logged whenever this returns
        True.
        """
        if self.index > self.max_id:
            log.info("Deleted %d unreferenced files." % self.total_deleted)
            return True
        else:
            return False

    def __call__(self, chunksize):
        """Delete one chunk of unreferenced rows and their files on disk.

        :param chunksize: Number of temporary-table rows to process; it is
            truncated to an int before use.
        :raises OSError: if removing a file fails for any reason other
            than the file not existing.
        """
        chunksize = int(chunksize)

        # Inclusive serial-id range handled by this call. Computed once so
        # the DELETE and the SELECT below are guaranteed to agree.
        first_id = self.index
        last_id = self.index + chunksize - 1

        cur = self.con.cursor()

        # Delete unreferenced LibraryFileContent entries.
        cur.execute("""
            DELETE FROM LibraryFileContent
            USING (
                SELECT content FROM UnreferencedLibraryFileContent
                WHERE id BETWEEN %s AND %s) AS UnreferencedLibraryFileContent
            WHERE
                LibraryFileContent.id = UnreferencedLibraryFileContent.content
            """, (first_id, last_id))
        rows_deleted = cur.rowcount
        self.total_deleted += rows_deleted
        self.con.commit()

        # Remove files from disk. We do this outside the transaction,
        # as the garbage collector happily deals with files that exist
        # on disk but not in the DB.
        cur.execute("""
            SELECT content FROM UnreferencedLibraryFileContent
            WHERE id BETWEEN %s AND %s
            """, (first_id, last_id))
        for content_id in (row[0] for row in cur.fetchall()):
            # Remove the file from disk, if it hasn't already been
            # removed.
            path = get_file_path(content_id)
            try:
                os.unlink(path)
            except OSError as e:
                # Only a missing file is tolerated; anything else (for
                # example a permissions problem) is a real failure.
                if e.errno != errno.ENOENT:
                    raise
                if config.librarian_server.upstream_host is None:
                    # It is normal to have files in the database that
                    # are not on disk if the Librarian has an upstream
                    # Librarian, such as on staging. Don't annoy the
                    # operator with noise in this case.
                    log.info("%s already deleted", path)
            else:
                log.debug("Deleted %s", path)
        # The SELECT above opened a new transaction; don't leave it
        # dangling.
        self.con.rollback()

        self.index += chunksize
494 |
||
495 |
||
496 |
def delete_unreferenced_content(con):
    """Invoke UnreferencedContentPruner.

    The pruner is built on the given connection and run to completion
    under a DBLoopTuner.
    """
    pruner = UnreferencedContentPruner(con)
    DBLoopTuner(pruner, 5, log=log).run()
|
2816.1.9
by Stuart Bishop
Work in progress |
500 |
|
501 |
||
3691.357.2
by Stuart Bishop
Clean files from disk that have been flagged deleted too |
502 |
def delete_unwanted_files(con):
    """Delete files found on disk that have no corresponding record in the
    database.

    Files will only be deleted if they were created more than one day ago
    to avoid deleting files that have just been uploaded but have yet to have
    the database records committed.

    The storage tree and the LibraryFileContent ids are both visited in
    ascending id order and merged, so neither has to be loaded into memory.
    Ids that are in the database but have no file on disk are logged as
    errors, unless an upstream Librarian is configured, in which case
    locally missing files are expected and nothing is reported.

    :param con: DB-API connection used to read the LibraryFileContent ids.
    """
    cur = con.cursor()

    # Calculate all stored LibraryFileContent ids that we want to keep.
    # Results are ordered so we don't have to suck them all in at once.
    cur.execute("""
        SELECT id FROM LibraryFileContent ORDER BY id
        """)

    def get_next_wanted_content_id():
        """Return the next wanted id, or None when there are no more."""
        result = cur.fetchone()
        if result is None:
            return None
        else:
            return result[0]

    def report_missing(missing_content_id):
        """Log a database row whose file was not found on disk."""
        log.error(
            "LibraryFileContent %d exists in the database but "
            "was not found on disk." % missing_content_id)

    # With an upstream Librarian it is normal for files to be missing
    # locally, so missing files are only reported when there is none.
    report_missing_files = config.librarian_server.upstream_host is None

    removed_count = 0
    content_id = next_wanted_content_id = -1
    # The most recent wanted id that has been accounted for. It starts at
    # the -1 sentinel so the sentinel itself is never reported as missing.
    found_content_id = -1

    hex_content_id_re = re.compile('^[0-9a-f]{8}$')
    ONE_DAY = 24 * 60 * 60

    for dirpath, dirnames, filenames in os.walk(
        get_storage_root(), followlinks=True):

        # Ignore known and harmless noise in the Librarian storage area.
        # dirnames is pruned in place so os.walk does not descend into
        # the removed directories.
        if 'incoming' in dirnames:
            dirnames.remove('incoming')
        if 'lost+found' in dirnames:
            dirnames.remove('lost+found')
        filenames = set(filenames)
        filenames.discard('librarian.pid')
        filenames.discard('librarian.log')

        # Iterate over a copy, as dirnames is modified inside the loop.
        for dirname in dirnames[:]:
            if len(dirname) != 2:
                dirnames.remove(dirname)
                log.warning(
                    "Ignoring directory %s that shouldn't be here" % dirname)
                continue
            try:
                int(dirname, 16)
            except ValueError:
                dirnames.remove(dirname)
                log.warning("Ignoring invalid directory %s" % dirname)

        # We need everything in order to ensure we visit files in the
        # same order we retrieve wanted files from the database.
        dirnames.sort()
        filenames = sorted(filenames)

        # Noise in the storage area, or maybe we are looking at the wrong
        # path?
        if dirnames and filenames:
            log.warning(
                "%s contains both files %r and subdirectories %r. Skipping."
                % (dirpath, filenames, dirnames))
            continue

        for filename in filenames:
            path = os.path.join(dirpath, filename)
            # The content id is spelled out by the last four path
            # components, which together must form 8 hex digits.
            hex_content_id = ''.join(path.split(os.sep)[-4:])
            if hex_content_id_re.search(hex_content_id) is None:
                log.warning(
                    "Ignoring invalid path %s" % path)
                continue

            content_id = int(hex_content_id, 16)

            # Advance through the wanted ids until we reach or pass this
            # file. Every id stepped over that was not matched by a file
            # is missing from disk. That includes an id left over from a
            # previous file that was smaller than it, not only ids
            # fetched inside this loop.
            while (next_wanted_content_id is not None
                   and content_id > next_wanted_content_id):
                if (report_missing_files
                    and next_wanted_content_id != found_content_id):
                    report_missing(next_wanted_content_id)
                next_wanted_content_id = get_next_wanted_content_id()

            file_wanted = (
                next_wanted_content_id is not None
                and next_wanted_content_id == content_id)

            if file_wanted:
                found_content_id = content_id
            else:
                # NOTE(review): os.path.getctime is the inode change time
                # on Unix rather than a true creation time - confirm this
                # is the intended age check.
                if time() - os.path.getctime(path) < ONE_DAY:
                    log.debug(
                        "File %d not removed - created too recently"
                        % content_id)
                else:
                    # File uploaded a while ago but no longer wanted.
                    os.unlink(path)
                    log.debug("Deleted %s" % path)
                    removed_count += 1

    # Report any remaining LibraryFileContent that the database says
    # should exist but we didn't find on disk. Gated the same way as the
    # reports above, so an upstream Librarian silences both.
    if report_missing_files:
        if next_wanted_content_id == found_content_id:
            # The current id was matched by a file (or is still the
            # initial sentinel), so it is not missing; move past it.
            next_wanted_content_id = get_next_wanted_content_id()
        while next_wanted_content_id is not None:
            report_missing(next_wanted_content_id)
            next_wanted_content_id = get_next_wanted_content_id()

    log.info(
        "Deleted %d files from disk that where no longer referenced "
        "in the db" % removed_count
        )
|
619 |
||
620 |
||
2816.1.8
by Stuart Bishop
Add tests of existing functionality |
621 |
def get_file_path(content_id):
    """Return the physical file path to the matching LibraryFileContent id.

    :param content_id: Integer id of the LibraryFileContent row.
    :return: The storage root joined with the id's relative file path.
    """
    is_integer = isinstance(content_id, (int, long))
    assert is_integer, 'Invalid content_id %r' % (content_id,)
    storage_root = get_storage_root()
    return os.path.join(storage_root, relative_file_path(content_id))
|
627 |
||
628 |
||
629 |
def get_storage_root():
    """Return the path to the root of the Librarian storage area.

    Performs some basic sanity checking to avoid accidents.
    """
    storage_root = config.librarian_server.root
    # A real storage area always has an 'incoming' directory directly
    # under its root.
    incoming_dir = os.path.join(storage_root, 'incoming')
    assert os.path.isdir(incoming_dir), (
        '%s is not a Librarian storage area' % storage_root)
    return storage_root