~launchpad-pqm/launchpad/devel

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Copyright 2009 Canonical Ltd.  This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).

__metaclass__ = type
__all__ = ['ParseApacheLogs']

import glob
import os

from lazr.restful.utils import safe_hasattr
from zope.component import getUtility

from lp.services.config import config
from lp.app.errors import NotFoundError
from lp.services.apachelogparser.base import (
    create_or_update_parsedlog_entry,
    get_files_to_parse,
    parse_file,
    )
from lp.services.scripts.base import LaunchpadCronScript
from lp.services.worlddata.interfaces.country import ICountrySet


class ParseApacheLogs(LaunchpadCronScript):
    """An abstract Apache log parser, finding download counts for each file.

    This does the heavy lifting to turn a directory of Apache log files
    into a structure mapping files to days to countries to download counts.

    Subclasses should override root, getDownloadKey, getDownloadCountUpdater,
    and optionally setUpUtilities.
    """

    # Glob to restrict filenames that are parsed.
    log_file_glob = '*'

    def setUpUtilities(self):
        """Prepare any utilities that might be used many times."""
        pass

    @property
    def root(self):
        """Root directory in which to find the logs."""
        raise NotImplementedError

    def getDownloadKey(self, path):
        """Generate a value to use as a key in the download dict.

        This will be called for every log line, so it should be very cheap.
        It's probably best not to return any complex objects, as there will
        be lots and lots and lots of these results sitting around for quite
        some time.

        :param path: The requested path.
        :return: A hashable object identifying the object at the path, or
            None if a request with this path should be ignored.
        """
        raise NotImplementedError

    def getDownloadCountUpdater(self, file_id):
        """Return a function which updates the download count of the object.

        :param file_id: The download key as calculated by getDownloadKey.
        :return: A count updating function, called as f(day, country, count),
            or None if the count should not be updated (eg. target deleted).
        """
        raise NotImplementedError

    def main(self):
        files_to_parse = get_files_to_parse(
            glob.glob(os.path.join(self.root, self.log_file_glob)))

        self.setUpUtilities()
        country_set = getUtility(ICountrySet)
        parsed_lines = 0
        max_parsed_lines = getattr(
            config.launchpad, 'logparser_max_parsed_lines', None)
        max_is_set = max_parsed_lines is not None
        for fd, position in files_to_parse:
            # If we've used up our budget of lines to process, stop.
            if (max_is_set and parsed_lines >= max_parsed_lines):
                break
            downloads, parsed_bytes, parsed_lines = parse_file(
                fd, position, self.logger, self.getDownloadKey)
            # Use a while loop here because we want to pop items from the dict
            # in order to free some memory as we go along. This is a good
            # thing here because the downloads dict may get really huge.
            while downloads:
                file_id, daily_downloads = downloads.popitem()
                update_download_count = self.getDownloadCountUpdater(file_id)

                # The object couldn't be retrieved (maybe it was deleted).
                # Don't bother counting downloads for it.
                if update_download_count is None:
                    continue

                for day, country_downloads in daily_downloads.items():
                    for country_code, count in country_downloads.items():
                        try:
                            country = country_set[country_code]
                        except NotFoundError:
                            # We don't know the country for the IP address
                            # where this request originated.
                            country = None
                        update_download_count(day, country, count)
            fd.seek(0)
            first_line = fd.readline()
            fd.close()
            create_or_update_parsedlog_entry(first_line, parsed_bytes)
            self.txn.commit()
            if safe_hasattr(fd, 'name'):
                name = fd.name
            else:
                name = fd
            self.logger.info('Finished parsing %s' % name)

        self.logger.info('Done parsing apache log files')