"""HTTP and FTP walker.

This module implements classes to walk HTTP and FTP sites to find files.
"""

import os
import ftplib
import httplib
import base64

from urllib import unquote_plus
from urlparse import urlsplit, urljoin

from BeautifulSoup import BeautifulSoup

from hct.util import log
from hct.util.path import as_dir, subdir, under_only


class WalkerError(Exception): pass
class FTPWalkerError(WalkerError): pass
class HTTPWalkerError(WalkerError): pass


class WalkerBase(object):
    """Base class for URL walkers.

    This class is a base class for those wishing to implement protocol
    specific walkers.  Walkers behave much like the os.walk() function,
    but taking a URL and working remotely.

    A typical usage would be:

        for (dirpath, dirnames, filenames) in ProtWalker(url):
            ...

    Sub-classes are required to implement the open(), list() and close()
    methods.
    """

    # URL schemes the walker supports, the first is the default
    URL_SCHEMES = ["ftp", "http", "https"]

    # Whether to ignore or parse fragments in the URL
    FRAGMENTS = False

    def __init__(self, base, log_parent=None):
        self.log = log.get_logger(type(self).__name__, log_parent)

        (scheme, netloc, path, query, fragment) \
                 = urlsplit(base, self.URL_SCHEMES[0], self.FRAGMENTS)
        if scheme not in self.URL_SCHEMES:
            raise WalkerError, "Can't handle %s scheme" % scheme
        self.scheme = scheme
        self.full_netloc = netloc

        # Pull any user:passwd@ prefix out of the network location
        try:
            (user_passwd, host) = netloc.split("@", 1)
            self.host = unquote_plus(host)

            try:
                (user, passwd) = user_passwd.split(":", 1)
                self.user = unquote_plus(user)
                self.passwd = unquote_plus(passwd)
            except ValueError:
                self.user = unquote_plus(user_passwd)
                self.passwd = None
        except ValueError:
            self.host = unquote_plus(netloc)
            self.user = None
            self.passwd = None

        self.fragment = fragment

        self.path = as_dir(path)
"""Walk through the URL.
81
Yields (dirpath, dirnames, filenames) for each path under the base;
82
dirnames can be modified as with os.walk.
88
subdir = subdirs.pop(0)
90
(dirnames, filenames) = self.list(subdir)
91
yield (subdir, dirnames, filenames)
93
for dirname in dirnames:
94
subdirs.append(urljoin(subdir, as_dir(dirname)))
101
"""Open the FTP connection.
103
Must be implemented by sub-classes.
105
raise NotImplementedError
108
"""Close the FTP connection.
110
Must be implemented by sub-classes.
112
raise NotImplementedError
115
"""Return listing of directory.
117
Must be implemented by sub-classes to return two lists, one of
118
directory names and one of file names; both underneath the directory
121
raise NotImplementedError
124
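
# A rough usage sketch: concrete walkers such as FTPWalker and HTTPWalker
# below iterate like os.walk().  The host name here is hypothetical and
# combine_url() is defined at the bottom of this module:
#
#     base = "ftp://ftp.example.com/pub/"
#     for dirpath, dirnames, filenames in FTPWalker(base):
#         for filename in filenames:
#             print combine_url(base, dirpath, filename)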


class FTPWalker(WalkerBase):
    """FTP URL scheme walker.

    This class implements a walker for the FTP URL scheme; it's fairly
    simple and just walks the FTP tree beneath the URL given using CWD
    and LIST.
    """

    # URL schemes the walker supports, the first is the default
    URL_SCHEMES = ["ftp"]

    # Whether to ignore or parse fragments in the URL
    FRAGMENTS = False

    def __init__(self, *args, **kwds):
        super(FTPWalker, self).__init__(*args, **kwds)

        # Default to anonymous FTP if no credentials were given in the URL
        if self.user is None:
            self.user = "anonymous"
        if self.passwd is None:
            self.passwd = ""
"""Open the FTP connection."""
148
self.log.info("Connecting to %s", self.host)
149
self.ftp = ftplib.FTP()
150
self.ftp.connect(self.host)
152
if self.user is not None:
153
self.log.info("Logging in as %s", self.user)
154
self.ftp.login(self.user, self.passwd)
157
self.log.info("Connected, working directory is %s", pwd)
160
"""Close the FTP connection."""
161
self.log.info("Closing connection")
165

    def list(self, subdir):
        """Change directory and return listing.

        Returns two lists, one of directory names and one of file names
        under the path given.
        """
        self.log.info("Changing directory to %s", subdir)
        self.ftp.cwd(subdir)

        listing = []
        self.log.info("Listing remote directory")
        self.ftp.retrlines("LIST", listing.append)

        dirnames = []
        filenames = []
        for line in listing:
            # XXX: Assume UNIX listings for now --keybuk 24jun05
            words = line.split(None, 8)
            if len(words) < 6:
                self.log.debug("Ignoring short line: %s", line)
                continue

            # Drop any " -> target" suffix left by symbolic links
            filename = words[-1].lstrip()
            i = filename.find(" -> ")
            if i != -1:
                filename = filename[:i]

            mode = words[0]
            if mode.startswith("d"):
                if filename not in (".", ".."):
                    dirnames.append(filename)
            elif mode.startswith("-") or mode.startswith("l"):
                filenames.append(filename)

        return (dirnames, filenames)
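
# For reference, the UNIX-style LIST lines parsed above look roughly like
# (sample output only):
#
#     drwxr-xr-x   2 ftp  ftp      4096 Jun 24  2005 dists
#     -rw-r--r--   1 ftp  ftp    123456 Jun 24  2005 Release
#     lrwxrwxrwx   1 ftp  ftp        11 Jun 24  2005 stable -> dists/sarge
#
# which would give dirnames = ["dists"] and filenames = ["Release", "stable"].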


class HTTPWalker(WalkerBase):
    """HTTP URL scheme walker.

    This class implements a walker for the HTTP and HTTPS URL schemes.
    It works by assuming any URL ending with a / is a directory, and
    every other URL a file.  URLs are tested using HEAD to see whether
    they cause a redirect to one ending with a /.

    HTML directory pages are parsed to find all links within them that
    lead to deeper URLs; this way it isn't tied to the Apache directory
    listing format and can actually walk arbitrary trees.
    """

    # URL schemes the walker supports, the first is the default
    URL_SCHEMES = ["http", "https"]

    # Whether to ignore or parse fragments in the URL
    FRAGMENTS = True
"""Open the HTTP connection."""
224
self.log.info("Connecting to %s", self.host)
225
if self.scheme == "https":
226
self.http = httplib.HTTPSConnection(self.host)
228
self.http = httplib.HTTPConnection(self.host)
231
self.log.info("Connected")
234
"""Close the FTP connection."""
235
self.log.info("Closing connection")
239

    def request(self, method, path):
        """Make an HTTP request.

        Returns the HTTPResponse object.
        """
        self.http.putrequest(method, path)
        if self.user is not None:
            # strip() removes the newline encodestring appends, which would
            # otherwise corrupt the header
            auth = base64.encodestring("%s:%s" % (self.user, self.passwd)).strip()
            self.http.putheader("Authorization", "Basic %s" % auth)
        self.http.endheaders()

        try:
            return self.http.getresponse()
        except httplib.BadStatusLine:
            self.log.error("Bad status line (did the server go away?)")
            raise

    def isDirectory(self, path):
        """Return whether the path is a directory.

        Assumes any path ending in a slash is a directory, and any that
        redirects to a location ending in a slash is also a directory.
        """
        if path.endswith("/"):
            return True

        self.log.info("Checking %s" % path)
        response = self.request("HEAD", path)
        if response.status != 301:
            return False

        # Only accept a redirect to the same path on the same server with
        # a trailing slash added
        url = response.getheader("location")
        (scheme, netloc, redirect_path, query, fragment) \
                 = urlsplit(url, self.scheme, self.FRAGMENTS)

        if len(scheme) and scheme != self.scheme:
            return False
        elif len(netloc) and netloc != self.full_netloc:
            return False
        elif redirect_path != as_dir(path):
            return False
        else:
            return True

    def list(self, dirname):
        """Download the HTML index at subdir and scrape for URLs.

        Returns a list of directory names (links ending with /, or
        that result in redirects to themselves ending in /) and
        filenames (everything else) that reside underneath the path.
        """
        self.log.info("Getting %s" % dirname)
        response = self.request("GET", dirname)

        soup = BeautifulSoup()
        soup.feed(response.read())

        dirnames = []
        filenames = []
        for anchor in soup("a"):
            url = urljoin(self.path, anchor.get("href"))
            (scheme, netloc, path, query, fragment) \
                     = urlsplit(url, self.scheme, self.FRAGMENTS)

            # XXX: Only follow URLs that are directly underneath the one
            # we were looking at.  This avoids accidentally walking the
            # entire world-wide-web, but does mean that "download.html"
            # URLs won't work.  Better suggestions accepted. --keybuk 27jun05
            if len(scheme) and scheme != self.scheme:
                continue
            elif len(netloc) and netloc != self.full_netloc:
                continue
            elif not under_only(self.path, path):
                continue

            filename = subdir(self.path, path)
            if self.isDirectory(path):
                dirnames.append(as_dir(filename))
            else:
                filenames.append(filename)

        return (dirnames, filenames)
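
# For illustration (sample HTML, and assuming under_only() and subdir() from
# hct.util.path behave as their names suggest): an index page for /pub/
# containing
#
#     <a href="dists/">dists/</a>
#     <a href="Release">Release</a>
#     <a href="/other/">elsewhere</a>
#
# would give (["dists/"], ["Release"]); the /other/ link falls outside /pub/
# and is skipped by the under_only() check.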
"""Return a walker for the URL given."""
334
(scheme, netloc, path, query, fragment) = urlsplit(url, "file")
335
if scheme in ["ftp"]:
336
return FTPWalker(url)
337
elif scheme in ["http", "https"]:
338
return HTTPWalker(url)
339
elif scheme in ["file"]:
342
raise WalkerError, "Unknown scheme: %s" % scheme
344


def combine_url(base, subdir, filename):
    """Combine a URL from the three parts returned by walk()."""
    subdir_url = urljoin(base, subdir)
    return urljoin(subdir_url, filename)
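

if __name__ == "__main__":
    # Rough demonstration only: the URL below is hypothetical and a real run
    # needs network access; walk() and combine_url() above do all the work.
    base = "http://archive.example.com/dists/"
    for dirpath, dirnames, filenames in walk(base):
        for filename in filenames:
            print combine_url(base, dirpath, filename)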