13155.1.17
by Curtis Hovey
Updated the copyrights in the very old branch. |
1 |
# Copyright 2009-2011 Canonical Ltd. This software is licensed under the
|
7675.891.2
by Bryce Harrington
Break out BugzillaRemoteComponentFinder to separate script file |
2 |
# GNU Affero General Public License version 3 (see the file LICENSE).
|
3 |
||
4 |
"""Utilities for the update-bugzilla-remote-components cronscript"""
|
|
5 |
||
6 |
# Python 2 idiom: classes declared in this module without explicit bases
# are created with `type` as their metaclass (i.e. as new-style classes).
__metaclass__ = type
# Names exported by a star-import of this module.
__all__ = [
    'BugzillaRemoteComponentFinder',
    'BugzillaRemoteComponentScraper',
    ]
|
11 |
||
12 |
import re |
|
7675.891.3
by Bryce Harrington
Move from use of pycurl to straight urllib2 |
13 |
from urllib2 import ( |
7675.891.32
by Bryce Harrington
Store components to database |
14 |
HTTPError, |
15 |
urlopen, |
|
16 |
)
|
|
7675.891.3
by Bryce Harrington
Move from use of pycurl to straight urllib2 |
17 |
from BeautifulSoup import BeautifulSoup |
7675.891.9
by Bryce Harrington
Drop LaunchpadBugTracker now that we can pull info from LP directly |
18 |
from canonical.launchpad.scripts.logger import log as default_log |
7675.891.14
by Bryce Harrington
Finish migrating scraping logic into scripts directory |
19 |
from zope.component import getUtility |
20 |
from lp.bugs.interfaces.bugtracker import ( |
|
7675.891.32
by Bryce Harrington
Store components to database |
21 |
BugTrackerType, |
22 |
IBugTrackerSet, |
|
23 |
)
|
|
24 |
from lp.bugs.model.bugtracker import ( |
|
25 |
BugTrackerComponent, |
|
26 |
)
|
|
27 |
from canonical.launchpad.interfaces.lpstorm import IStore |
|
28 |
||
7675.891.3
by Bryce Harrington
Move from use of pycurl to straight urllib2 |
29 |
|
7675.891.16
by Bryce Harrington
Trim trailing / on urls. Move dictFromCSV() to being a top level |
30 |
def dictFromCSV(line):
    """Convert a comma-separated list of names into a dict keyed by name.

    Each item has surrounding whitespace stripped and every single quote
    and backslash removed before it is used as a key.

    :param line: Text such as "'a', 'b','c'" (the body of a JavaScript
        array literal).
    :return: A dict mapping each cleaned item to ``{'name': item}``.
        A missing or blank `line` yields an empty dict.
    """
    items_dict = {}
    if not line or not line.strip():
        # "".split(",") returns [""], which would otherwise register a
        # bogus entry with an empty name for an empty list.
        return items_dict
    for item in line.split(","):
        item = item.strip().replace("'", "").replace("\\", "")
        items_dict[item] = {
            'name': item,
            }
    return items_dict
|
7675.891.2
by Bryce Harrington
Break out BugzillaRemoteComponentFinder to separate script file |
40 |
|
7675.891.25
by Bryce Harrington
Syntactical fixups |
41 |
|
7675.891.10
by Bryce Harrington
Isolate the scraping code into its own class separate from the finder |
42 |
class BugzillaRemoteComponentScraper:
    """Scrapes Bugzilla query.cgi page for lists of products and components.

    The advanced query page is expected to contain a product <select>
    element plus inline JavaScript assigning `cpts[N] = [...]` and
    `vers[N] = [...]` arrays, where N is the position of the product in
    that select.  Parsed results accumulate in `self.products`, a dict
    keyed by product name.
    """

    re_cpts = re.compile(r'cpts\[(\d+)\] = \[(.*)\]')
    re_vers = re.compile(r'vers\[(\d+)\] = \[(.*)\]')

    def __init__(self, base_url=None):
        """Prepare a scraper for the Bugzilla instance at `base_url`.

        :param base_url: Root URL of the Bugzilla instance; one trailing
            slash is trimmed.  May be None, in which case the scraper can
            only be used to parse text that was obtained elsewhere.
        """
        if base_url is None:
            # re.sub() rejects None, so the declared default used to
            # make the constructor itself fail.
            self.base_url = None
            self.url = None
        else:
            self.base_url = re.sub(r'/$', '', base_url)
            self.url = "%s/query.cgi?format=advanced" % (self.base_url)
        self.products = {}

    def getPage(self):
        """Fetch and return the text of the advanced query page.

        :raises ValueError: if the scraper was created without a base_url.
        """
        if self.url is None:
            raise ValueError("Cannot fetch page: no base_url was given")
        response = urlopen(self.url)
        try:
            return response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()

    def _parseScriptText(self, script_text, products):
        """Copy component and version arrays from one script into `products`.

        :param script_text: The text of a <script> element.
        :param products: List of product dicts, in page order; updated
            in place.
        """
        patterns = (
            (self.re_cpts, 'components'),
            (self.re_vers, 'versions'),
            )
        for line in script_text.split(";"):
            for pattern, key in patterns:
                m = pattern.search(line)
                if m is None:
                    continue
                num = int(m.group(1))
                if num >= len(products):
                    # The script refers to a product that was not found
                    # in the select; there is nothing to attach it to.
                    continue
                products[num][key] = dictFromCSV(m.group(2))

    def parsePage(self, page_text):
        """Extract products, components and versions from `page_text`.

        Results are merged into `self.products`.

        :param page_text: HTML of a Bugzilla advanced query page.
        :return: True on success, None if the page could not be parsed
            (for instance when it contains no product select).
        """
        soup = BeautifulSoup(page_text)
        if soup is None:
            return None

        product_select = soup.find(
            name='select',
            onchange="doOnSelectProduct(2);")
        if product_select is None:
            # Not the page layout we know how to scrape.
            return None

        # Load products into a list since Bugzilla references them
        # by index number
        products = []
        for product in product_select.contents:
            if product.string != "\n":
                products.append({
                    'name': product.string,
                    'components': {},
                    'versions': None,
                    })

        for script_text in soup.findAll(name="script"):
            if script_text is None or script_text.string is None:
                continue
            self._parseScriptText(script_text.string, products)

        # Re-map list into dict for easier lookups
        for product in products:
            self.products[product['name']] = product

        return True
94 |
||
95 |
||
7675.891.10
by Bryce Harrington
Isolate the scraping code into its own class separate from the finder |
96 |
class BugzillaRemoteComponentFinder:
    """Updates remote components for all Bugzillas registered in Launchpad"""

    # Names of bug trackers we should not pull data from
    _BLACKLIST = [
        u"ubuntu-bugzilla",
        u"mozilla.org",
        ]

    def __init__(self, logger=None, static_bugzilla_text=None):
        """Instantiates object, without performing any parsing.

        :param logger: A logger object; the module's default log is used
            when omitted.
        :param static_bugzilla_text: Instead of retrieving the remote
            web page for a bug tracker, act as if this static text was
            returned.  This is intended for testing purposes to avoid
            needing to make remote web connections.
        """
        self.logger = logger
        if logger is None:
            self.logger = default_log
        self.static_bugzilla_text = static_bugzilla_text

    def getRemoteProductsAndComponents(self, bugtracker_name=None):
        """Scrape and store components for registered Bugzilla trackers.

        Trackers that are not of type BUGZILLA, or whose name is in
        `_BLACKLIST`, are skipped.  A tracker whose page cannot be
        fetched or parsed is logged and skipped.

        :param bugtracker_name: If given, only the bug tracker registered
            under this name is processed; otherwise every tracker
            provided by the IBugTrackerSet utility is considered.
        """
        lp_bugtrackers = getUtility(IBugTrackerSet)
        if bugtracker_name is not None:
            # Test the looked-up tracker itself: wrapping it in a list
            # first made the "not found" check impossible to trigger.
            # NOTE(review): assumes getByName() returns None for an
            # unknown name -- confirm against IBugTrackerSet.
            lp_bugtracker = lp_bugtrackers.getByName(bugtracker_name)
            if lp_bugtracker is None:
                self.logger.warning(
                    "Could not find specified bug tracker %s",
                    bugtracker_name)
                return
            lp_bugtrackers = [lp_bugtracker]

        for lp_bugtracker in lp_bugtrackers:
            if lp_bugtracker.bugtrackertype != BugTrackerType.BUGZILLA:
                continue
            if lp_bugtracker.name in self._BLACKLIST:
                continue

            self.logger.info(
                "%s: %s", lp_bugtracker.name, lp_bugtracker.baseurl)
            bz_bugtracker = BugzillaRemoteComponentScraper(
                base_url=lp_bugtracker.baseurl)

            if self.static_bugzilla_text is not None:
                self.logger.debug("Using static bugzilla text")
                page_text = self.static_bugzilla_text
            else:
                self.logger.debug("...Fetching page")
                try:
                    page_text = bz_bugtracker.getPage()
                except (HTTPError, IOError) as error:
                    # urllib2's URLError (and so HTTPError) derives from
                    # IOError; catching it keeps one unreachable host
                    # from aborting the run for every other tracker.
                    self.logger.error(
                        "Error fetching %s: %s",
                        lp_bugtracker.baseurl, error)
                    continue

            self.logger.debug("...Parsing html")
            if not bz_bugtracker.parsePage(page_text):
                self.logger.warning(
                    "Could not parse page for %s", lp_bugtracker.baseurl)
                continue

            self.logger.debug("...Storing new data to Launchpad")
            self.storeRemoteProductsAndComponents(
                bz_bugtracker, lp_bugtracker)

    def storeRemoteProductsAndComponents(self, bz_bugtracker, lp_bugtracker):
        """Bring Launchpad's component records in line with scraped data.

        For every scraped product the matching component group is looked
        up (or created).  Existing components that are visible, not
        custom and no longer present remotely are removed; scraped
        components not yet known are inserted in a single statement.

        Note that the `components` dict of each scraped product is
        consumed: entries already known to Launchpad are removed from it.

        :param bz_bugtracker: Scraper whose `products` dict has been
            filled in by parsing.
        :param lp_bugtracker: The Launchpad bug tracker to update.
        """
        components_to_add = []
        for product in bz_bugtracker.products.values():
            remote_components = product['components']

            # Look up the component group id from Launchpad for the product
            # if it already exists.  Otherwise, add it.
            lp_component_group = lp_bugtracker.getRemoteComponentGroup(
                product['name'])
            if lp_component_group is None:
                lp_component_group = lp_bugtracker.addRemoteComponentGroup(
                    product['name'])
                if lp_component_group is None:
                    self.logger.warning("Failed to add new component group")
                    continue
            else:
                for component in lp_component_group.components:
                    if (component.name in remote_components or
                        component.is_visible == False or
                        component.is_custom == True):
                        # We already know something about this component,
                        # or a user has configured it, so ignore it.
                        # pop() rather than del: a hidden or custom
                        # component need not exist remotely at all.
                        remote_components.pop(component.name, None)
                    else:
                        # Component is now missing from Bugzilla,
                        # so drop it here too
                        component.remove()

            # The remaining components in the collection will need to be
            # added to launchpad.  Record them for now.
            # NOTE(review): names are interpolated into SQL unescaped;
            # this relies on dictFromCSV() having removed quotes and
            # backslashes.  Consider a parameterized statement.
            for component in remote_components.values():
                components_to_add.append(
                    "('%s', %d, 'True', 'False')" % (
                    component['name'], lp_component_group.id))

        if components_to_add:
            sqltext = """
            INSERT INTO BugTrackerComponent
            (name, component_group, is_visible, is_custom)
            VALUES %s""" % ",\n ".join(components_to_add)

            self.logger.debug("...Inserting components into database")
            store = IStore(BugTrackerComponent)
            store.execute(sqltext)
            store.commit()
            store.flush()
            self.logger.debug("...Done")