#!/usr/bin/python2.4
#
# Copyright 2009 Canonical Ltd. This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).
"""Parse librarian apache logs to find out download counts for each file.
Thanks to the *huge* number of different LibraryFileAlias objects this script
will fetch when parsing multiple log files from scratch and the fact that we
overwrite storm's cache with something that caches *everything*, this script
may end up eating all your RAM. That shouldn't happen in general as we run
it multiple times a day, but if we ever fail to run it for more than a week,
we may need to add a hack (store._cache.clear()) to clear the cache after
updating the counts of every LFA, in order to get through the backlog.
"""

__metaclass__ = type

import os

# pylint: disable-msg=W0403
import _pythonpath

from zope.component import getUtility

from storm.sqlobject import SQLObjectNotFound

from canonical.config import config
from lp.services.worlddata.interfaces.country import ICountrySet
from canonical.launchpad.interfaces.librarian import ILibraryFileAliasSet
from lp.services.scripts.base import LaunchpadCronScript
from canonical.launchpad.scripts.librarian_apache_log_parser import (
    create_or_update_parsedlog_entry, DBUSER, get_files_to_parse, parse_file)
from canonical.launchpad.webapp.interfaces import NotFoundError


class ParseLibrarianApacheLogs(LaunchpadCronScript):

    def main(self):
        root = config.librarianlogparser.logs_root
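        # get_files_to_parse() maps each log file that still has unparsed
        # content to the byte position at which the previous run stopped,
        # so we only ever parse new log entries.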
        files_to_parse = get_files_to_parse(root, os.listdir(root))
        libraryfilealias_set = getUtility(ILibraryFileAliasSet)
        country_set = getUtility(ICountrySet)
        for fd, position in files_to_parse.items():
            downloads, parsed_bytes = parse_file(fd, position, self.logger)
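            # 'downloads' maps LibraryFileAlias IDs to nested dicts of
            # {day: {country_code: download count}}, as unpacked below.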
            # Use a while loop here because we want to pop items from the
            # dict in order to free some memory as we go along. This is a
            # good thing here because the downloads dict may get really
            # huge.
            while downloads:
                file_id, daily_downloads = downloads.popitem()
                try:
                    lfa = libraryfilealias_set[file_id]
                except SQLObjectNotFound:
                    # This file has been deleted from the librarian, so
                    # don't try to store download counters for it.
                    continue
                for day, country_downloads in daily_downloads.items():
                    for country_code, count in country_downloads.items():
                        try:
                            country = country_set[country_code]
                        except NotFoundError:
                            # We don't know the country for the IP address
                            # where this request originated.
                            country = None
                        lfa.updateDownloadCount(day, country, count)
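
            # The first line of the log file identifies it to the parsed-log
            # bookkeeping, so re-read it and record how many bytes of this
            # file have now been parsed.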
            fd.seek(0)
            first_line = fd.readline()
            fd.close()
            create_or_update_parsedlog_entry(first_line, parsed_bytes)
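            # Commit after each file so that a failure midway through the
            # backlog doesn't lose the progress recorded so far.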
            self.txn.commit()
            self.logger.info('Finished parsing %s' % fd)

        self.logger.info('Done parsing apache log files for librarian')


if __name__ == '__main__':
    script = ParseLibrarianApacheLogs('parse-librarian-apache-logs', DBUSER)
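    # lock_and_run() takes the script's lock before calling main(), so two
    # cron invocations can't parse the same logs concurrently.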
    script.lock_and_run()