"""HTTP and FTP walker.

This module implements classes to walk HTTP and FTP sites to find files.
"""

import os
import ftplib
import httplib
import base64

from urllib import unquote_plus
from urlparse import urlsplit, urljoin

from BeautifulSoup import BeautifulSoup

from hct.util import log
from hct.util.path import as_dir, subdir, under_only


class WalkerError(Exception): pass
class FTPWalkerError(WalkerError): pass
class HTTPWalkerError(WalkerError): pass


class WalkerBase(object):
    """Base class for URL walkers.

    This class is a base class for those wishing to implement protocol
    specific walkers.  Walkers behave much like the os.walk() function,
    but taking a URL and working remotely.

    A typical usage would be:

        for (dirpath, dirnames, filenames) in ProtWalker(url):
            ...

    Sub-classes are required to implement the open(), list() and close()
    methods.
    """

    # URL schemes the walker supports, the first is the default
    URL_SCHEMES = ["ftp", "http", "https"]

    # Whether to ignore or parse fragments in the URL
    FRAGMENTS = False

    def __init__(self, base, log_parent=None):
        self.log = log.get_logger(type(self).__name__, log_parent)

        (scheme, netloc, path, query, fragment) \
                 = urlsplit(base, self.URL_SCHEMES[0], self.FRAGMENTS)
        if scheme not in self.URL_SCHEMES:
            raise WalkerError, "Can't handle %s scheme" % scheme
        self.scheme = scheme
        self.full_netloc = netloc

        # Pull any user:passwd@ prefix out of the network location
        try:
            (user_passwd, host) = netloc.split("@", 1)
            self.host = unquote_plus(host)

            try:
                (user, passwd) = user_passwd.split(":", 1)
                self.user = unquote_plus(user)
                self.passwd = unquote_plus(passwd)
            except ValueError:
                self.user = unquote_plus(user_passwd)
                self.passwd = None
        except ValueError:
            self.host = unquote_plus(netloc)
            self.user = None
            self.passwd = None

        self.fragment = fragment

        self.path = as_dir(path)
"""Walk through the URL.
81
Yields (dirpath, dirnames, filenames) for each path under the base;
82
dirnames can be modified as with os.walk.
88
subdir = subdirs.pop(0)
90
(dirnames, filenames) = self.list(subdir)
91
yield (subdir, dirnames, filenames)
93
for dirname in dirnames:
94
subdirs.append(urljoin(subdir, as_dir(dirname)))
101
"""Open the FTP connection.
103
Must be implemented by sub-classes.
105
raise NotImplementedError
108
"""Close the FTP connection.
110
Must be implemented by sub-classes.
112
raise NotImplementedError
115
"""Return listing of directory.
117
Must be implemented by sub-classes to return two lists, one of
118
directory names and one of file names; both underneath the directory
121
raise NotImplementedError
124
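
# A rough usage sketch: concrete walkers such as FTPWalker and HTTPWalker
# below iterate like os.walk().  The host name here is hypothetical and
# combine_url() is defined at the bottom of this module:
#
#     base = "ftp://ftp.example.com/pub/"
#     for dirpath, dirnames, filenames in FTPWalker(base):
#         for filename in filenames:
#             print combine_url(base, dirpath, filename)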


class FTPWalker(WalkerBase):
    """FTP URL scheme walker.

    This class implements a walker for the FTP URL scheme; it's fairly
    simple and just walks the FTP tree beneath the URL given using CWD
    and LIST.
    """

    # URL schemes the walker supports, the first is the default
    URL_SCHEMES = ["ftp"]

    # Whether to ignore or parse fragments in the URL
    FRAGMENTS = False

    def __init__(self, *args, **kwds):
        super(FTPWalker, self).__init__(*args, **kwds)

        # Default to anonymous FTP if no credentials were given in the URL
        if self.user is None:
            self.user = "anonymous"
        if self.passwd is None:
            self.passwd = ""
"""Open the FTP connection."""
148
self.log.info("Connecting to %s", self.host)
149
self.ftp = ftplib.FTP()
150
self.ftp.connect(self.host)
152
if self.user is not None:
153
self.log.info("Logging in as %s", self.user)
154
self.ftp.login(self.user, self.passwd)
157
self.log.info("Connected, working directory is %s", pwd)
160
"""Close the FTP connection."""
161
self.log.info("Closing connection")
165

    def list(self, subdir):
        """Change directory and return listing.

        Returns two lists, one of directory names and one of file names
        under the path given.
        """
        self.log.info("Changing directory to %s", subdir)
        self.ftp.cwd(subdir)

        listing = []
        self.log.info("Listing remote directory")
        self.ftp.retrlines("LIST", listing.append)

        dirnames = []
        filenames = []
        for line in listing:
            # XXX: Assume UNIX listings for now --keybuk 24jun05
            words = line.split(None, 8)
            if len(words) < 6:
                self.log.debug("Ignoring short line: %s", line)
                continue

            # Drop any " -> target" suffix left by symbolic links
            filename = words[-1].lstrip()
            i = filename.find(" -> ")
            if i != -1:
                filename = filename[:i]

            mode = words[0]
            if mode.startswith("d"):
                if filename not in (".", ".."):
                    dirnames.append(filename)
            elif mode.startswith("-") or mode.startswith("l"):
                filenames.append(filename)

        return (dirnames, filenames)
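
# For reference, the UNIX-style LIST lines parsed above look roughly like
# (sample output only):
#
#     drwxr-xr-x   2 ftp  ftp      4096 Jun 24  2005 dists
#     -rw-r--r--   1 ftp  ftp    123456 Jun 24  2005 Release
#     lrwxrwxrwx   1 ftp  ftp        11 Jun 24  2005 stable -> dists/sarge
#
# which would give dirnames = ["dists"] and filenames = ["Release", "stable"].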


class HTTPWalker(WalkerBase):
    """HTTP URL scheme walker.

    This class implements a walker for the HTTP and HTTPS URL schemes.
    It works by assuming any URL ending with a / is a directory, and
    every other URL a file.  URLs are tested using HEAD to see whether
    they cause a redirect to one ending with a /.

    HTML directory pages are parsed to find all links within them that
    lead to deeper URLs; this way it isn't tied to the Apache directory
    listing format and can actually walk arbitrary trees.
    """

    # URL schemes the walker supports, the first is the default
    URL_SCHEMES = ["http", "https"]

    # Whether to ignore or parse fragments in the URL
    FRAGMENTS = True
"""Open the HTTP connection."""
224
self.log.info("Connecting to %s", self.host)
225
if self.scheme == "https":
226
self.http = httplib.HTTPSConnection(self.host)
228
self.http = httplib.HTTPConnection(self.host)
231
self.log.info("Connected")
234
"""Close the FTP connection."""
235
self.log.info("Closing connection")
239

    def request(self, method, path):
        """Make an HTTP request.

        Returns the HTTPResponse object.
        """
        self.http.putrequest(method, path)
        if self.user is not None:
            # strip() removes the newline encodestring appends, which would
            # otherwise corrupt the header
            auth = base64.encodestring("%s:%s" % (self.user, self.passwd)).strip()
            self.http.putheader("Authorization", "Basic %s" % auth)
        self.http.endheaders()

        try:
            return self.http.getresponse()
        except httplib.BadStatusLine:
            self.log.error("Bad status line (did the server go away?)")
            raise

    def isDirectory(self, path):
        """Return whether the path is a directory.

        Assumes any path ending in a slash is a directory, and any that
        redirects to a location ending in a slash is also a directory.
        """
        if path.endswith("/"):
            return True

        self.log.info("Checking %s" % path)
        response = self.request("HEAD", path)
        if response.status != 301:
            return False

        # Only accept a redirect to the same path on the same server with
        # a trailing slash added
        url = response.getheader("location")
        (scheme, netloc, redirect_path, query, fragment) \
                 = urlsplit(url, self.scheme, self.FRAGMENTS)

        if len(scheme) and scheme != self.scheme:
            return False
        elif len(netloc) and netloc != self.full_netloc:
            return False
        elif redirect_path != as_dir(path):
            return False
        else:
            return True

    def list(self, dirname):
        """Download the HTML index at subdir and scrape for URLs.

        Returns a list of directory names (links ending with /, or
        that result in redirects to themselves ending in /) and
        filenames (everything else) that reside underneath the path.
        """
        self.log.info("Getting %s" % dirname)
        response = self.request("GET", dirname)

        soup = BeautifulSoup()
        soup.feed(response.read())

        dirnames = []
        filenames = []
        for anchor in soup("a"):
            url = urljoin(self.path, anchor.get("href"))
            (scheme, netloc, path, query, fragment) \
                     = urlsplit(url, self.scheme, self.FRAGMENTS)

            # XXX: Only follow URLs that are directly underneath the one
            # we were looking at.  This avoids accidentally walking the
            # entire world-wide-web, but does mean that "download.html"
            # URLs won't work.  Better suggestions accepted. --keybuk 27jun05
            if len(scheme) and scheme != self.scheme:
                continue
            elif len(netloc) and netloc != self.full_netloc:
                continue
            elif not under_only(self.path, path):
                continue

            filename = subdir(self.path, path)
            if self.isDirectory(path):
                dirnames.append(as_dir(filename))
            else:
                filenames.append(filename)

        return (dirnames, filenames)
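
# For illustration (sample HTML, and assuming under_only() and subdir() from
# hct.util.path behave as their names suggest): an index page for /pub/
# containing
#
#     <a href="dists/">dists/</a>
#     <a href="Release">Release</a>
#     <a href="/other/">elsewhere</a>
#
# would give (["dists/"], ["Release"]); the /other/ link falls outside /pub/
# and is skipped by the under_only() check.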
"""Return a walker for the URL given."""
334
(scheme, netloc, path, query, fragment) = urlsplit(url, "file")
335
if scheme in ["ftp"]:
336
return FTPWalker(url)
337
elif scheme in ["http", "https"]:
338
return HTTPWalker(url)
339
elif scheme in ["file"]:
342
raise WalkerError, "Unknown scheme: %s" % scheme
344


def combine_url(base, subdir, filename):
    """Combine a URL from the three parts returned by walk()."""
    subdir_url = urljoin(base, subdir)
    return urljoin(subdir_url, filename)
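

if __name__ == "__main__":
    # Rough demonstration only: the URL below is hypothetical and a real run
    # needs network access; walk() and combine_url() above do all the work.
    base = "http://archive.example.com/dists/"
    for dirpath, dirnames, filenames in walk(base):
        for filename in filenames:
            print combine_url(base, dirpath, filename)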