9399.3.1
by William Grant
Factor the non-LFA-specific bits out of the parser script itself. |
1 |
# Copyright 2009 Canonical Ltd. This software is licensed under the
|
2 |
# GNU Affero General Public License version 3 (see the file LICENSE).
|
|
3 |
||
4 |
__metaclass__ = type |
|
5 |
__all__ = ['ParseApacheLogs'] |
|
6 |
||
12163.1.3
by William Grant
get_files_to_parse now takes full paths, not a root and a filename. Log parsers can now define a glob that will be used. |
7 |
import glob |
9399.3.1
by William Grant
Factor the non-LFA-specific bits out of the parser script itself. |
8 |
import os |
9 |
||
13931.2.1
by Steve Kowalik
Chip away at canonical.lazr a little more. |
10 |
from lazr.restful.utils import safe_hasattr |
9399.3.1
by William Grant
Factor the non-LFA-specific bits out of the parser script itself. |
11 |
from zope.component import getUtility |
12 |
||
14605.1.1
by Curtis Hovey
Moved canonical.config to lp.services. |
13 |
from lp.services.config import config |
11270.1.3
by Tim Penhey
Changed NotFoundError imports - gee there were a lot of them. |
14 |
from lp.app.errors import NotFoundError |
9399.3.1
by William Grant
Factor the non-LFA-specific bits out of the parser script itself. |
15 |
from lp.services.apachelogparser.base import ( |
11403.1.4
by Henning Eggers
Reformatted imports using format-imports script r32. |
16 |
create_or_update_parsedlog_entry, |
17 |
get_files_to_parse, |
|
18 |
parse_file, |
|
19 |
)
|
|
9399.3.1
by William Grant
Factor the non-LFA-specific bits out of the parser script itself. |
20 |
from lp.services.scripts.base import LaunchpadCronScript |
21 |
from lp.services.worlddata.interfaces.country import ICountrySet |
|
22 |
||
23 |
||
24 |
class ParseApacheLogs(LaunchpadCronScript):
    """An abstract Apache log parser, finding download counts for each file.

    This does the heavy lifting to turn a directory of Apache log files
    into a structure mapping files to days to countries to download counts.

    Subclasses should override root, getDownloadKey, getDownloadCountUpdater,
    and optionally setUpUtilities and log_file_glob.
    """

    # Glob to restrict filenames that are parsed.  It is joined onto `root`
    # in main(), so the default matches every entry directly under `root`.
    log_file_glob = '*'

    def setUpUtilities(self):
        """Prepare any utilities that might be used many times.

        Called once by main() before any log file is processed.  The
        default implementation does nothing.
        """
        pass

    @property
    def root(self):
        """Root directory in which to find the logs.

        :raises NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def getDownloadKey(self, path):
        """Generate a value to use as a key in the download dict.

        This will be called for every log line, so it should be very cheap.
        It's probably best not to return any complex objects, as there will
        be lots and lots and lots of these results sitting around for quite
        some time.

        :param path: The requested path.
        :return: A hashable object identifying the object at the path, or
            None if a request with this path should be ignored.
        :raises NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def getDownloadCountUpdater(self, file_id):
        """Return a function which updates the download count of the object.

        :param file_id: The download key as calculated by getDownloadKey.
        :return: A count updating function, called as f(day, country, count),
            or None if the count should not be updated (eg. target deleted).
        :raises NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def _updateDownloadCounts(self, downloads, country_set):
        """Drain `downloads`, recording every (day, country, count) triple.

        :param downloads: A dict mapping download keys to dicts of days to
            dicts of country codes to counts.  It is emptied in place.
        :param country_set: Mapping from country code to country object;
            looked up with `country_set[code]` and expected to raise
            NotFoundError for unknown codes.
        """
        # Use a while loop here because we want to pop items from the dict
        # in order to free some memory as we go along.  This is a good
        # thing here because the downloads dict may get really huge.
        while downloads:
            file_id, daily_downloads = downloads.popitem()
            update_download_count = self.getDownloadCountUpdater(file_id)

            # The object couldn't be retrieved (maybe it was deleted).
            # Don't bother counting downloads for it.
            if update_download_count is None:
                continue

            for day, country_downloads in daily_downloads.items():
                for country_code, count in country_downloads.items():
                    try:
                        country = country_set[country_code]
                    except NotFoundError:
                        # We don't know the country for the IP address
                        # where this request originated.
                        country = None
                    update_download_count(day, country, count)

    def main(self):
        """Parse every matching log file and record its download counts.

        For each file yielded by get_files_to_parse the downloads are
        extracted with parse_file, pushed through the updater returned by
        getDownloadCountUpdater, the parsed-log entry is created or
        updated, and the transaction is committed, so progress is saved
        file by file.  Processing stops early once the optional
        `logparser_max_parsed_lines` config value has been reached.
        """
        files_to_parse = get_files_to_parse(
            glob.glob(os.path.join(self.root, self.log_file_glob)))

        self.setUpUtilities()
        country_set = getUtility(ICountrySet)
        parsed_lines = 0
        # The limit is optional: a missing config key means "no limit".
        max_parsed_lines = getattr(
            config.launchpad, 'logparser_max_parsed_lines', None)
        max_is_set = max_parsed_lines is not None
        for fd, position in files_to_parse:
            # If we've used up our budget of lines to process, stop.
            if max_is_set and parsed_lines >= max_parsed_lines:
                # This file object has already been handed to us, so
                # close it rather than abandoning it open.
                fd.close()
                break
            try:
                # NOTE(review): parsed_lines is replaced by parse_file's
                # return value and the running total is not passed in, so
                # the cross-file budget relies on how parse_file counts --
                # TODO confirm against parse_file.
                downloads, parsed_bytes, parsed_lines = parse_file(
                    fd, position, self.logger, self.getDownloadKey)
                self._updateDownloadCounts(downloads, country_set)
                # The first line of the file is handed to
                # create_or_update_parsedlog_entry together with the
                # number of bytes parsed.
                fd.seek(0)
                first_line = fd.readline()
            finally:
                # Always release the file, even if parsing or an update
                # callback raised.
                fd.close()
            create_or_update_parsedlog_entry(first_line, parsed_bytes)
            self.txn.commit()
            # Fall back to the object itself when it has no `name`
            # attribute.
            name = getattr(fd, 'name', fd)
            self.logger.info('Finished parsing %s', name)

        self.logger.info('Done parsing apache log files')