3
# Copyright 2009 Canonical Ltd. This software is licensed under the
4
# GNU Affero General Public License version 3 (see the file LICENSE).
7
Process LinkChecker .csv results for the staging server, stuff them into
8
a database and generate a report suitable for spamming developers with.
13
# pylint: disable-msg=W0403
17
from StringIO import StringIO
18
from optparse import OptionParser
19
from sqlobject import StringCol, IntCol, BoolCol, FloatCol, DatabaseIndex
20
from canonical.database.datetimecol import UtcDateTimeCol
21
from canonical.database.constants import UTC_NOW
22
from canonical.launchpad.scripts import db_options, logger_options, logger
23
from canonical.lp import initZopeless
24
from canonical.database.sqlbase import SQLBase
25
from canonical.config import config
26
from lp.services.mail.sendmail import simple_sendmail
29
class CheckedLink(SQLBase):
    """SQLObject model backed by the ``CheckedLink`` table.

    Each attribute declares one column.  All columns are declared NOT NULL
    except ``brokensince``, which may be NULL and defaults to ``UTC_NOW``.
    """

    _table = 'CheckedLink'

    # --- Columns (declaration order preserved) ---------------------------
    urlname = StringCol(notNull=True)
    recursionlevel = IntCol(notNull=True)
    parentname = StringCol(notNull=True)
    baseref = StringCol(notNull=True)
    result = StringCol(notNull=True)
    resultcode = IntCol(notNull=True)
    warningstring = StringCol(notNull=True)
    infostring = StringCol(notNull=True)
    valid = BoolCol(notNull=True)

    # Unique, and registered as an alternate ID; main() looks rows up with
    # CheckedLink.byUrl().
    url = StringCol(notNull=True, unique=True, alternateID=True)

    line = IntCol(notNull=True)
    # main() fills this from the CSV 'column' field; it is named 'col'
    # because 'column' is a SQL keyword.
    col = IntCol(notNull=True)
    name = StringCol(notNull=True)
    checktime = FloatCol(notNull=True)

    # Nullable; main() sets it to None for rows whose resultcode is < 400.
    brokensince = UtcDateTimeCol(notNull=False, default=UTC_NOW)
    #cached = BoolCol(notNull=True)

    # NOTE(review): main() also passes 'dltime' and 'dlsize' keys to
    # CheckedLink(**row), but no such columns are declared in this view of
    # the class -- confirm against the full file.

    # --- Indexes ----------------------------------------------------------
    resultcode_index = DatabaseIndex('resultcode')
    recursionlevel_index = DatabaseIndex('recursionlevel')
54
def main(csvfile, log):
# Load LinkChecker CSV rows from `csvfile` into CheckedLink records, then
# build a three-section broken-link report and either print it or mail it.
#
# csvfile -- iterable of text lines (the __main__ section passes an open file).
# log     -- logger object created by the __main__ section.
#
# Also reads the module-level `options` object (set in the __main__ section)
# for the `email` and `subject` settings.
#
# NOTE(review): this copy of the function has gaps (the stray numeric lines
# show where the original numbering jumps) and has lost its indentation, so
# loop headers, some if/else arms and call openers/closers are not visible.
# The comments below describe only the statements that are visible.
# NOTE(review): `csv`, `re` and `total` are used below, but their
# import/assignment is not visible in this view -- confirm they exist.
56
# Where we store broken links
59
# Suck in the csv file, updating the database and adding to the broken set
60
# The generator feeding DictReader strips NUL characters from every line and
# drops lines that start with '#'.
reader = csv.DictReader(
61
(line.replace('\0','') for line in csvfile
62
if not line.startswith('#'))
67
row['resultcode'] = 200
68
row['result'] = '200 Ok'
70
# Capture a leading run of digits from the result text; `or ''` keeps
# re.search from being handed a false-y (e.g. None) result.
# NOTE(review): the pattern contains a backslash escape in a non-raw string.
m = re.search('^(\d+)', row['result'] or '')
72
if row['result'] == 'URL is empty':
74
# Read timeouts are recorded under the code 601, prefixed onto the text.
elif 'The read operation timed out' in row['result']:
75
row['result'] = '601 %s' % row['result']
76
row['resultcode'] = 601
78
# Code 602: the condition selecting this arm is not visible in this view.
row['result'] = '602 %s' % row['result']
79
row['resultcode'] = 602
81
# Presumably the arm taken when the regex above matched -- TODO confirm.
row['resultcode'] = int(m.group(1))
83
# Cast input and nuke crap (to avoid confusing SQLObject)
84
row['recursionlevel'] = int(row['recursionlevel'])
85
# Only the exact strings 'True' and 'true' are treated as valid.
row['valid'] = row['valid'] in ('True', 'true')
86
row['line'] = int(row['line'])
87
row['col'] = int(row['column']) # Renamed - column is a SQL keyword
89
row['dltime'] = float(row['dltime'])
90
row['dlsize'] = int(row['dlsize'])
91
row['checktime'] = float(row['checktime'])
93
# Rows with a result code below 400 get no brokensince timestamp.
if row['resultcode'] < 400:
94
row['brokensince'] = None
97
# Look up an existing record by URL / create a new one from the row.  The
# control flow joining these two statements is not visible in this view.
link = CheckedLink.byUrl(row['url'])
100
link = CheckedLink(**row)
105
# Delete any entries that were not spidered
106
# XXX StuartBishop 2005-07-04: Only if older than a threshold.
107
for link in CheckedLink.select():
112
# Section 1: result code 404/500/601 and brokensince within the last
# 1 day 12 hours.
new_broken_links = CheckedLink.select("""
113
resultcode in (404, 500, 601)
114
AND brokensince > CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
115
- '1 day 12 hours'::interval
116
""", orderBy=["recursionlevel", "parentname", "url"])
118
rep = report("New Arrivals", new_broken_links, total, brokensince=False)
120
# Section 2: same result codes, brokensince at least 1 day 12 hours ago.
# A 14-day bound also appears in the query, but its comparison line is not
# visible in this view.
old_broken_links = CheckedLink.select("""
121
resultcode in (404, 500, 601)
122
AND brokensince <= CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
123
- '1 day 12 hours'::interval
125
CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '14 days'::interval
126
""", orderBy=["recursionlevel", "parentname", "url"])
128
rep += report("Old Favorites", old_broken_links, total, brokensince=True)
130
# Section 3: same result codes, compared against a 14-day cutoff (the
# comparison line is not visible); ordered by brokensince first.
antique_broken_links = CheckedLink.select("""
131
resultcode in (404, 500, 601)
133
CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '14 days'::interval
134
""", orderBy=["brokensince", "recursionlevel", "parentname", "url"])
137
"Hall of Shame", antique_broken_links, total, brokensince=True
140
# `options` is the module-level object parsed in the __main__ section.
if not options.email:
141
# Print to stdout in system encoding - might raise UnicodeError on
142
# some systems. Tough.
145
# Override this setting - we are only here if email explicitly
146
# requested on the command line.
147
send_email_data = """
151
config.push('send_email_data', send_email_data)
153
"noreply@canonical.com", [options.email], options.subject,
154
rep, {'Keywords': 'LinkChecker', 'X-Fnord': 'Fnord'}
156
config.pop('send_email_data')
159
def report(title, links, total, brokensince=True):
# Render one titled section of the broken-link report and return it as text.
#
# title       -- section name shown in the heading.
# links       -- result set; must support .count() (main() passes
#                CheckedLink.select(...) results).
# total       -- number shown as the denominator in the heading.
# brokensince -- flag passed by main(); the statement that tests it is not
#                visible in this view (presumably it guards the "Since" row
#                -- TODO confirm).
#
# Returns out.getvalue().  The creation of `out` and the loop that binds
# `link` are not visible in this view; `out` is presumably a StringIO --
# TODO confirm.
163
# Heading reads "<title> (<matching links>/<total>)".
heading = "%s (%d/%d)" % (title, links.count(), total)
164
# Python 2 "print >> file" syntax: write to `out`, not stdout.
print >> out, heading
165
# Underline the heading with '=' characters of the same length.
print >> out, "=" * len(heading)
167
# Helper: write one "label: value" line, label left-justified to 7 columns.
def print_row(title, value):
168
print >> out, "%-7s: %s" % (title, str(value))
171
print_row("Link", link.url)
172
print_row("Parent", link.parentname)
173
print_row("Result", link.result)
174
# The warning row is emitted only when the warning text is non-empty.
if link.warningstring:
175
print_row("Warning", link.warningstring)
177
# NOTE(review): CheckedLink as shown declares `brokensince`, not `since` --
# confirm that a `since` attribute really exists on these objects.
print_row("Since", link.since.strftime('%A %d %B %Y'))
181
return out.getvalue()
184
# Script entry point: parse command-line options, pick the CSV input and
# prepare the CheckedLink table.
# NOTE(review): this view has gaps (see the stray numeric lines) and has lost
# its indentation; the call openers for the option definitions, the stdin
# branch body and the final call into main() are not visible.
if __name__ == '__main__':
185
parser = OptionParser("Usage: %prog [OPTIONS] [input.csv]")
187
logger_options(parser)
190
# -c/--create: boolean flag, stored as options.create, default False.
"-c", "--create", action="store_true", dest="create",
191
default=False, help="Create the database tables"
195
# -s/--subject: stored as options.subject, default "LinkChecker report".
"-s", "--subject", dest="subject", help="Email using SUBJECT",
196
metavar="SUBJECT", default="LinkChecker report"
200
# -t/--to: stored as options.email, default None; main() prints the report
# to stdout instead of mailing it when this is unset.
"-t", "--to", dest="email", help="Email to ADDRESS",
201
metavar="ADDRESS", default=None
204
# `options` is a module-level name here and is read directly by main().
options, args = parser.parse_args()
206
log = logger(options)
208
# No positional argument, or a literal '-', means read the CSV from stdin.
if len(args) == 0 or args[0] == '-':
209
log.debug("Reading from stdin")
212
# Otherwise the first positional argument is opened in binary mode.
csvfile = open(args[0], 'rb')
217
# Create the table if it doesn't exist. Unfortunately, this is broken
218
# so we only create the table if requested on the command line
219
CheckedLink.createTable(ifNotExists=True)