~launchpad-pqm/launchpad/devel

9399.3.1 by William Grant
Factor the non-LFA-specific bits out of the parser script itself.
1
# Copyright 2009 Canonical Ltd.  This software is licensed under the
2
# GNU Affero General Public License version 3 (see the file LICENSE).
3
4
__metaclass__ = type
5
__all__ = ['ParseApacheLogs']
6
12163.1.3 by William Grant
get_files_to_parse now takes full paths, not a root and a filename. Log parsers can now define a glob that will be used.
7
import glob
9399.3.1 by William Grant
Factor the non-LFA-specific bits out of the parser script itself.
8
import os
9
13931.2.1 by Steve Kowalik
Chip away at canonical.lazr a little more.
10
from lazr.restful.utils import safe_hasattr
9399.3.1 by William Grant
Factor the non-LFA-specific bits out of the parser script itself.
11
from zope.component import getUtility
12
14605.1.1 by Curtis Hovey
Moved canonical.config to lp.services.
13
from lp.services.config import config
11270.1.3 by Tim Penhey
Changed NotFoundError imports - gee there were a lot of them.
14
from lp.app.errors import NotFoundError
9399.3.1 by William Grant
Factor the non-LFA-specific bits out of the parser script itself.
15
from lp.services.apachelogparser.base import (
11403.1.4 by Henning Eggers
Reformatted imports using format-imports script r32.
16
    create_or_update_parsedlog_entry,
17
    get_files_to_parse,
18
    parse_file,
19
    )
9399.3.1 by William Grant
Factor the non-LFA-specific bits out of the parser script itself.
20
from lp.services.scripts.base import LaunchpadCronScript
21
from lp.services.worlddata.interfaces.country import ICountrySet
22
23
24
class ParseApacheLogs(LaunchpadCronScript):
    """An abstract Apache log parser, finding download counts for each file.

    This does the heavy lifting to turn a directory of Apache log files
    into a structure mapping files to days to countries to download counts.

    Subclasses should override root, getDownloadKey, getDownloadCountUpdater,
    and optionally setUpUtilities and log_file_glob.
    """

    # Glob to restrict filenames that are parsed.
    log_file_glob = '*'

    def setUpUtilities(self):
        """Prepare any utilities that might be used many times."""
        pass

    @property
    def root(self):
        """Root directory in which to find the logs."""
        raise NotImplementedError

    def getDownloadKey(self, path):
        """Generate a value to use as a key in the download dict.

        This will be called for every log line, so it should be very cheap.
        It's probably best not to return any complex objects, as there will
        be lots and lots and lots of these results sitting around for quite
        some time.

        :param path: The requested path.
        :return: A hashable object identifying the object at the path, or
            None if a request with this path should be ignored.
        """
        raise NotImplementedError

    def getDownloadCountUpdater(self, file_id):
        """Return a function which updates the download count of the object.

        :param file_id: The download key as calculated by getDownloadKey.
        :return: A count updating function, called as f(day, country, count),
            or None if the count should not be updated (eg. target deleted).
        """
        raise NotImplementedError

    def main(self):
        """Parse the matching log files and record their download counts.

        Every path matching log_file_glob under root is handed to
        get_files_to_parse, which yields (fd, position) pairs. Each file is
        parsed with parse_file, the resulting counts are pushed through the
        updater returned by getDownloadCountUpdater, and the parsed-log
        entry is updated and committed once per file.

        Parsing stops before the next file once the optional
        config.launchpad.logparser_max_parsed_lines budget is reached.
        Every file handle yielded to this loop is closed here, including
        when the budget is exhausted or an error occurs mid-file.
        """
        files_to_parse = get_files_to_parse(
            glob.glob(os.path.join(self.root, self.log_file_glob)))

        self.setUpUtilities()
        country_set = getUtility(ICountrySet)
        parsed_lines = 0
        max_parsed_lines = getattr(
            config.launchpad, 'logparser_max_parsed_lines', None)
        max_is_set = max_parsed_lines is not None
        for fd, position in files_to_parse:
            # If we've used up our budget of lines to process, stop.
            if max_is_set and parsed_lines >= max_parsed_lines:
                # This handle was already yielded to us; close it rather
                # than abandoning it when we leave the loop.
                fd.close()
                break
            try:
                # NOTE(review): parsed_lines is replaced, not accumulated,
                # by each call; whether the budget really spans all files
                # depends on what parse_file returns -- confirm.
                downloads, parsed_bytes, parsed_lines = parse_file(
                    fd, position, self.logger, self.getDownloadKey)
                # Use a while loop here because we want to pop items from
                # the dict in order to free some memory as we go along.
                # This is a good thing here because the downloads dict may
                # get really huge.
                while downloads:
                    file_id, daily_downloads = downloads.popitem()
                    update_download_count = self.getDownloadCountUpdater(
                        file_id)

                    # The object couldn't be retrieved (maybe it was
                    # deleted). Don't bother counting downloads for it.
                    if update_download_count is None:
                        continue

                    for day, country_downloads in daily_downloads.items():
                        for country_code, count in country_downloads.items():
                            try:
                                country = country_set[country_code]
                            except NotFoundError:
                                # We don't know the country for the IP
                                # address where this request originated.
                                country = None
                            update_download_count(day, country, count)
                # The first line of the file is what identifies it in the
                # parsed-log record.
                fd.seek(0)
                first_line = fd.readline()
            finally:
                # Always release the file, even if parsing or an updater
                # raised.
                fd.close()
            create_or_update_parsedlog_entry(first_line, parsed_bytes)
            self.txn.commit()
            # Not every file-like object yielded has a name attribute;
            # fall back to the object itself for the log message.
            if safe_hasattr(fd, 'name'):
                name = fd.name
            else:
                name = fd
            self.logger.info('Finished parsing %s' % name)

        self.logger.info('Done parsing apache log files')