~launchpad-pqm/launchpad/devel

13155.1.17 by Curtis Hovey
Updated the copyrights in the very old branch.
1
# Copyright 2009-2011 Canonical Ltd.  This software is licensed under the
7675.891.2 by Bryce Harrington
Break out BugzillaRemoteComponentFinder to separate script file
2
# GNU Affero General Public License version 3 (see the file LICENSE).
3
4
"""Utilities for the update-bugzilla-remote-components cronscript"""
5
6
__metaclass__ = type
7
__all__ = [
8
    'BugzillaRemoteComponentFinder',
7675.891.25 by Bryce Harrington
Syntactical fixups
9
    'BugzillaRemoteComponentScraper',
7675.891.2 by Bryce Harrington
Break out BugzillaRemoteComponentFinder to separate script file
10
    ]
11
12
import re
7675.891.3 by Bryce Harrington
Move from use of pycurl to straight urllib2
13
from urllib2 import (
7675.891.32 by Bryce Harrington
Store components to database
14
    HTTPError,
15
    urlopen,
16
    )
7675.891.3 by Bryce Harrington
Move from use of pycurl to straight urllib2
17
from BeautifulSoup import BeautifulSoup
7675.891.9 by Bryce Harrington
Drop LaunchpadBugTracker now that we can pull info from LP directly
18
from canonical.launchpad.scripts.logger import log as default_log
7675.891.14 by Bryce Harrington
Finish migrating scraping logic into scripts directory
19
from zope.component import getUtility
20
from lp.bugs.interfaces.bugtracker import (
7675.891.32 by Bryce Harrington
Store components to database
21
    BugTrackerType,
22
    IBugTrackerSet,
23
    )
24
from lp.bugs.model.bugtracker import (
25
    BugTrackerComponent,
26
    )
27
from canonical.launchpad.interfaces.lpstorm import IStore
28
7675.891.3 by Bryce Harrington
Move from use of pycurl to straight urllib2
29
7675.891.16 by Bryce Harrington
Trim trailing / on urls. Move dictFromCSV() to being a top level
30
def dictFromCSV(line):
    """Convert a comma-separated list of names into a dict keyed by name.

    Each item has its surrounding whitespace stripped and then every
    single quote and backslash removed.  Items that end up empty (for
    instance from an empty list or a trailing comma) are skipped so that
    no blank-named entry is produced.

    :param line: The comma-separated text, e.g. "'foo', 'bar'".
    :return: A dict mapping each cleaned name to ``{'name': name}``.
    """
    items_dict = {}
    for item in line.split(","):
        item = item.strip().replace("'", "").replace("\\", "")
        if not item:
            # Nothing left once the quoting is removed; not a real name.
            continue
        items_dict[item] = {
            'name': item,
            }
    return items_dict
7675.891.2 by Bryce Harrington
Break out BugzillaRemoteComponentFinder to separate script file
40
7675.891.25 by Bryce Harrington
Syntactical fixups
41
7675.891.10 by Bryce Harrington
Isolate the scraping code into its own class separate from the finder
42
class BugzillaRemoteComponentScraper:
    """Scrapes Bugzilla query.cgi page for lists of products and components"""

    # The page's <script> text assigns per-product arrays such as
    # "cpts[3] = ['a', 'b']"; group 1 is the product index and group 2
    # the raw comma-separated contents of the array.
    re_cpts = re.compile(r'cpts\[(\d+)\] = \[(.*)\]')
    re_vers = re.compile(r'vers\[(\d+)\] = \[(.*)\]')

    def __init__(self, base_url=None):
        """Record the URL of the tracker's advanced query page.

        :param base_url: Base URL of the Bugzilla instance.  A single
            trailing slash is dropped.
        :raises TypeError: if no base_url is given.
        """
        if base_url is None:
            # re.sub() would raise a TypeError here anyway; fail with a
            # message that names the actual problem.
            raise TypeError("base_url is required")
        self.base_url = re.sub(r'/$', '', base_url)
        self.url = "%s/query.cgi?format=advanced" % (self.base_url)
        self.products = {}

    def getPage(self):
        """Fetch self.url and return the body of the response."""
        response = urlopen(self.url)
        try:
            return response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()

    def parsePage(self, page_text):
        """Extract products, components and versions from the page text.

        The results are stored in self.products, keyed by product name.

        :param page_text: The HTML of the query page.
        :return: True when the page was parsed, or None when it does not
            contain the product selector.
        """
        soup = BeautifulSoup(page_text)
        if soup is None:
            return None

        product_select = soup.find(
            name='select',
            onchange="doOnSelectProduct(2);")
        if product_select is None:
            # Not a recognisable query page; nothing can be extracted.
            return None

        # Load products into a list since Bugzilla references them
        # by index number
        products = []
        for product in product_select.contents:
            if product.string != "\n":
                products.append({
                    'name': product.string,
                    'components': {},
                    'versions': None,
                    })

        patterns = (
            (self.re_cpts, 'components'),
            (self.re_vers, 'versions'),
            )
        for script_text in soup.findAll(name="script"):
            if script_text is None or script_text.string is None:
                continue
            for line in script_text.string.split(";"):
                for regex, key in patterns:
                    m = regex.search(line)
                    if m is None:
                        continue
                    num = int(m.group(1))
                    if num >= len(products):
                        # The script refers to a product that is not in
                        # the selector; there is nothing to attach to.
                        continue
                    products[num][key] = dictFromCSV(m.group(2))

        # Re-map list into dict for easier lookups
        for product in products:
            self.products[product['name']] = product

        return True
94
95
7675.891.10 by Bryce Harrington
Isolate the scraping code into its own class separate from the finder
96
class BugzillaRemoteComponentFinder:
    """Updates remote components for all Bugzillas registered in Launchpad"""

    # Names of bug trackers we should not pull data from
    _BLACKLIST = [
        u"ubuntu-bugzilla",
        u"mozilla.org",
        ]

    def __init__(self, logger=None, static_bugzilla_text=None):
        """Instantiates object, without performing any parsing.

        :param logger: A logger object
        :param static_bugzilla_text: Instead of retrieving the remote
         web page for a bug tracker, act as if this static text was
         returned.  This is intended for testing purposes to avoid
         needing to make remote web connections.
        """
        self.logger = default_log if logger is None else logger
        self.static_bugzilla_text = static_bugzilla_text

    def getRemoteProductsAndComponents(self, bugtracker_name=None):
        """Scrape Bugzilla trackers and store their components.

        Trackers that are not Bugzillas, or whose name is in _BLACKLIST,
        are skipped.  A tracker whose page cannot be fetched or parsed
        is logged and skipped as well.

        :param bugtracker_name: If given, only the tracker with this
            name is processed; otherwise every registered tracker is.
        """
        lp_bugtrackers = getUtility(IBugTrackerSet)
        if bugtracker_name is not None:
            # NOTE(review): assumes getByName() returns None for an
            # unknown name -- confirm against IBugTrackerSet.
            lp_bugtracker = lp_bugtrackers.getByName(bugtracker_name)
            if lp_bugtracker is None:
                self.logger.warning(
                    "Could not find specified bug tracker %s",
                    bugtracker_name)
                return
            lp_bugtrackers = [lp_bugtracker]

        for lp_bugtracker in lp_bugtrackers:
            if lp_bugtracker.bugtrackertype != BugTrackerType.BUGZILLA:
                continue
            if lp_bugtracker.name in self._BLACKLIST:
                continue

            self.logger.info(
                "%s: %s", lp_bugtracker.name, lp_bugtracker.baseurl)
            bz_bugtracker = BugzillaRemoteComponentScraper(
                base_url=lp_bugtracker.baseurl)

            if self.static_bugzilla_text is not None:
                self.logger.debug("Using static bugzilla text")
                page_text = self.static_bugzilla_text
            else:
                try:
                    self.logger.debug("...Fetching page")
                    page_text = bz_bugtracker.getPage()
                except HTTPError as error:
                    self.logger.error(
                        "Error fetching %s: %s",
                        lp_bugtracker.baseurl, error)
                    continue

            self.logger.debug("...Parsing html")
            if not bz_bugtracker.parsePage(page_text):
                self.logger.warning(
                    "Could not parse page for %s", lp_bugtracker.baseurl)
                continue

            self.logger.debug("...Storing new data to Launchpad")
            self.storeRemoteProductsAndComponents(
                bz_bugtracker, lp_bugtracker)

    def storeRemoteProductsAndComponents(self, bz_bugtracker, lp_bugtracker):
        """Bring Launchpad's stored components in line with the scrape.

        For every scraped product the matching component group is looked
        up, or added if missing.  Existing components that Bugzilla no
        longer lists are removed unless they are hidden or custom, and
        components that Launchpad does not have yet are inserted.

        Note that this empties the 'components' dict of each product in
        bz_bugtracker.products of the names that need no insert.

        :param bz_bugtracker: A scraper whose `products` dict has
            already been populated.
        :param lp_bugtracker: The Launchpad bug tracker to update.
        """
        # (name, component group id) pairs still to be inserted.
        rows_to_add = []
        for product in bz_bugtracker.products.values():
            # Look up the component group id from Launchpad for the product
            # if it already exists.  Otherwise, add it.
            lp_component_group = lp_bugtracker.getRemoteComponentGroup(
                product['name'])
            if lp_component_group is None:
                lp_component_group = lp_bugtracker.addRemoteComponentGroup(
                    product['name'])
                if lp_component_group is None:
                    self.logger.warning("Failed to add new component group")
                    continue
            else:
                for component in lp_component_group.components:
                    if (component.name in product['components'] or
                        component.is_visible == False or
                        component.is_custom == True):
                        # We already know something about this component,
                        # or a user has configured it, so ignore it.
                        # A hidden or custom component need not be in
                        # the scraped data, hence pop() and not del.
                        product['components'].pop(component.name, None)
                    else:
                        # Component is now missing from Bugzilla,
                        # so drop it here too
                        component.remove()

            # The remaining components in the collection will need to be
            # added to launchpad.  Record them for now.
            for component in product['components'].values():
                rows_to_add.append(
                    (component['name'], lp_component_group.id))

        if rows_to_add:
            # The names come from a remote web page, so they are passed
            # as statement parameters instead of being pasted into SQL.
            placeholders = ",\n ".join(
                ["(?, ?, 'True', 'False')"] * len(rows_to_add))
            sqltext = """
            INSERT INTO BugTrackerComponent
            (name, component_group, is_visible, is_custom)
            VALUES %s""" % placeholders
            params = [value for row in rows_to_add for value in row]

            self.logger.debug("...Inserting components into database")
            store = IStore(BugTrackerComponent)
            store.execute(sqltext, params)
            store.commit()
            store.flush()
            self.logger.debug("...Done")