1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
|
#!/usr/bin/python2.4
# Copyright 2004-2005 Canonical Ltd. All rights reserved.
"""
Process LinkChecker .csv results for the staging server, stuff them into
a database and generate a report suitable for spamming developers with.
"""
__metaclass__ = type
import _pythonpath
import csv, re, sys
from StringIO import StringIO
from optparse import OptionParser
from sqlobject import StringCol, IntCol, BoolCol, FloatCol, DatabaseIndex
from canonical.database.datetimecol import UtcDateTimeCol
from canonical.database.constants import UTC_NOW
from canonical.launchpad.scripts import db_options, logger_options, logger
from canonical.lp import initZopeless
from canonical.database.sqlbase import SQLBase
from canonical.config import config
from canonical.launchpad.mail import simple_sendmail
class CheckedLink(SQLBase):
    """Database record for a single URL examined by LinkChecker.

    main() creates and updates these records directly from the parsed
    CSV row dictionaries, so the column names below have to match the
    keys of those dictionaries.
    """

    _table = 'CheckedLink'

    # Where the link was found and what it refers to.
    urlname = StringCol(notNull=True)
    recursionlevel = IntCol(notNull=True)
    parentname = StringCol(notNull=True)
    baseref = StringCol(notNull=True)

    # Outcome of the check.  resultcode is filled in by main(), not read
    # straight from the CSV.
    result = StringCol(notNull=True)
    resultcode = IntCol(notNull=True)
    warningstring = StringCol(notNull=True)
    infostring = StringCol(notNull=True)
    valid = BoolCol(notNull=True)

    # The url is unique; alternateID provides the CheckedLink.byUrl()
    # lookup that main() relies on.
    url = StringCol(notNull=True, unique=True, alternateID=True)

    # Position of the link in its parent.  The CSV calls the second
    # field 'column'; main() renames it to 'col' before storing.
    line = IntCol(notNull=True)
    col = IntCol(notNull=True)
    name = StringCol(notNull=True)

    # Download and check statistics.
    dltime = FloatCol()
    dlsize = IntCol()
    checktime = FloatCol(notNull=True)

    # Defaults to the time the record is created; main() stores None
    # here for result codes below 400.
    brokensince = UtcDateTimeCol(notNull=False, default=UTC_NOW)

    # The CSV 'cached' field is not stored: main() deletes it from each
    # row before saving.

    resultcode_index = DatabaseIndex('resultcode')
    recursionlevel_index = DatabaseIndex('recursionlevel')
def main(csvfile, log):
# Where we store broken links
broken = set()
# Suck in the csv file, updating the database and adding to the broken set
reader = csv.DictReader(
(line.replace('\0','') for line in csvfile
if not line.startswith('#'))
)
for row in reader:
# Get the result code
if row['valid']:
row['resultcode'] = 200
row['result'] = '200 Ok'
else:
m = re.search('^(\d+)', row['result'] or '')
if m is None:
if row['result'] == 'URL is empty':
continue
elif 'The read operation timed out' in row['result']:
row['result'] = '601 %s' % row['result']
row['resultcode'] = 601
else:
row['result'] = '602 %s' % row['result']
row['resultcode'] = 602
else:
row['resultcode'] = int(m.group(1))
# Cast input and nuke crap (to avoid confusing SQLObject)
row['recursionlevel'] = int(row['recursionlevel'])
row['valid'] = row['valid'] in ('True', 'true')
row['line'] = int(row['line'])
row['col'] = int(row['column']) # Renamed - column is a SQL keyword
del row['column']
row['dltime'] = float(row['dltime'])
row['dlsize'] = int(row['dlsize'])
row['checktime'] = float(row['checktime'])
del row['cached']
if row['resultcode'] < 400:
row['brokensince'] = None
try:
link = CheckedLink.byUrl(row['url'])
link.set(**row)
except LookupError:
link = CheckedLink(**row)
broken.add(link)
total = len(broken)
# Delete any entries that were not spidered
# XXX StuartBishop 2005-07-04: Only if older than a threshold.
for link in CheckedLink.select():
if link in broken:
continue
link.destroySelf()
new_broken_links = CheckedLink.select("""
resultcode in (404, 500, 601)
AND brokensince > CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
- '1 day 12 hours'::interval
""", orderBy=["recursionlevel", "parentname", "url"])
rep = report("New Arrivals", new_broken_links, total, brokensince=False)
old_broken_links = CheckedLink.select("""
resultcode in (404, 500, 601)
AND brokensince <= CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
- '1 day 12 hours'::interval
AND brokensince >
CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '14 days'::interval
""", orderBy=["recursionlevel", "parentname", "url"])
rep += report("Old Favorites", old_broken_links, total, brokensince=True)
antique_broken_links = CheckedLink.select("""
resultcode in (404, 500, 601)
AND brokensince <=
CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '14 days'::interval
""", orderBy=["brokensince", "recursionlevel", "parentname", "url"])
rep += report(
"Hall of Shame", antique_broken_links, total, brokensince=True
)
if not options.email:
# Print to stdout in system encoding - might raise UnicodeError on
# some systems. Tough.
print rep
else:
# Override this setting - we are only here if email explicitly
# requested on the command line.
config.zopeless.send_email = True
simple_sendmail(
"noreply@canonical.com", [options.email], options.subject,
rep, {'Keywords': 'LinkChecker', 'X-Fnord': 'Fnord'}
)
def report(title, links, total, brokensince=True):
    """Format one titled section of the link report as text.

    :param title: heading text for the section.
    :param links: the links to list.  Must provide count() and be
        iterable, yielding objects with url, parentname, result,
        warningstring and brokensince attributes.
    :param total: overall number of links, shown in the heading next to
        the number of links in this section.
    :param brokensince: when true, add a "Since" line giving the date
        stored in each link's brokensince attribute.
    :return: the section as a string, terminated by a blank line.
    """
    out = StringIO()
    heading = "%s (%d/%d)" % (title, links.count(), total)
    out.write("%s\n" % heading)
    out.write("%s\n" % ("=" * len(heading)))

    def print_row(title, value):
        # Labels are left-aligned in a seven character column.
        out.write("%-7s: %s\n" % (title, str(value)))

    for link in links:
        print_row("Link", link.url)
        print_row("Parent", link.parentname)
        print_row("Result", link.result)
        if link.warningstring:
            print_row("Warning", link.warningstring)
        # The attribute is `brokensince`; there is no `since` attribute
        # on the link records.  Skip the line if no timestamp is stored.
        if brokensince and link.brokensince is not None:
            print_row("Since", link.brokensince.strftime('%A %d %B %Y'))
        # Blank line between entries.
        out.write("\n")
    # Blank line closing the section.
    out.write("\n")
    return out.getvalue()
if __name__ == '__main__':
    # Build the command line: shared database and logging options first,
    # then the switches specific to this script.
    parser = OptionParser("Usage: %prog [OPTIONS] [input.csv]")
    db_options(parser)
    logger_options(parser)
    parser.add_option(
        "-c", "--create", dest="create", action="store_true",
        default=False, help="Create the database tables"
        )
    parser.add_option(
        "-s", "--subject", dest="subject", metavar="SUBJECT",
        default="LinkChecker report", help="Email using SUBJECT"
        )
    parser.add_option(
        "-t", "--to", dest="email", metavar="ADDRESS",
        default=None, help="Email to ADDRESS"
        )

    # `options` stays a module-level name because main() reads it.
    options, args = parser.parse_args()
    log = logger(options)

    # A named file is opened in binary mode; no argument, or '-', means
    # the CSV is read from stdin.
    if args and args[0] != '-':
        csvfile = open(args[0], 'rb')
    else:
        log.debug("Reading from stdin")
        csvfile = sys.stdin

    ztm = initZopeless(implicitBegin=True)

    # Create the table if it doesn't exist. Unfortunately, this is broken
    # so we only create the table if requested on the command line
    if options.create:
        CheckedLink.createTable(ifNotExists=True)

    main(csvfile, log)
    ztm.commit()
|