~launchpad-pqm/launchpad/devel

1990 by Canonical.com Patch Queue Manager
[r=spiv] Spam launchpad developers with errors picked up by LinkChecker
1
#!/usr/bin/env python
2
# Copyright 2004-2005 Canonical Ltd.  All rights reserved.
3
"""
4
Process LinkChecker .csv results for the staging server, stuff them into
5
a database and generate a report suitable for spamming developers with.
6
"""
7
8
__metaclass__ = type
9
10
import _pythonpath
11
12
import csv, re, sys
13
from StringIO import StringIO
14
from optparse import OptionParser
15
from sqlobject import StringCol, IntCol, BoolCol, FloatCol, DatabaseIndex
16
from canonical.database.datetimecol import UtcDateTimeCol
17
from canonical.database.constants import UTC_NOW
18
from canonical.launchpad.scripts import db_options, logger_options, logger
19
from canonical.lp import initZopeless
20
from canonical.database.sqlbase import SQLBase
21
from canonical.config import config
22
from canonical.launchpad.mail import simple_sendmail
23
24
2027 by Canonical.com Patch Queue Manager
[trivial] linkchecker report tweaks
25
class CheckedLink(SQLBase):
    """One URL checked by LinkChecker, stored one row per distinct URL.

    Rows are created and updated by main() from the LinkChecker CSV
    output, so most columns take their name from the CSV field they are
    filled from. The exceptions are:

    * col         -- filled from the CSV 'column' field (renamed because
                     'column' is a SQL keyword).
    * resultcode  -- numeric code derived from 'result' by main().
    * brokensince -- when the link was first recorded as broken, or NULL
                     for a link that is not broken. It is not part of the
                     CSV; new rows default to the current UTC time.
    """
    _table = 'CheckedLink'

    urlname = StringCol(notNull=True)
    recursionlevel = IntCol(notNull=True)
    parentname = StringCol(notNull=True)
    baseref = StringCol(notNull=True)
    result = StringCol(notNull=True)
    resultcode = IntCol(notNull=True)
    warningstring = StringCol(notNull=True)
    infostring = StringCol(notNull=True)
    valid = BoolCol(notNull=True)
    # alternateID=True is what provides the CheckedLink.byUrl() lookup
    # used by main().
    url = StringCol(notNull=True, unique=True, alternateID=True)
    line = IntCol(notNull=True)
    col = IntCol(notNull=True)
    name = StringCol(notNull=True)
    dltime = FloatCol()
    dlsize = IntCol()
    checktime = FloatCol(notNull=True)
    brokensince = UtcDateTimeCol(notNull=False, default=UTC_NOW)
    # The CSV 'cached' field is deliberately not stored; main() drops it
    # before building the row.
    #cached = BoolCol(notNull=True)

    resultcode_index = DatabaseIndex('resultcode')
    recursionlevel_index = DatabaseIndex('recursionlevel')
48
49
50
def _normalise_row(row):
    """Convert one raw LinkChecker CSV row, in place, into CheckedLink values.

    Every value arrives from csv.DictReader as a string. This derives the
    numeric 'resultcode', casts the typed fields, renames 'column' to
    'col' and drops 'cached'.

    Result codes:
      200       -- link reported as valid ('result' is forced to '200 Ok')
      <number>  -- leading number of the 'result' text, when it has one
      601       -- 'result' mentions 'The read operation timed out'
      602       -- any other result without a leading number

    Returns the same dict, or None when the row must be skipped (the
    result is exactly 'URL is empty').
    """
    # Decide validity from the text *before* branching on it: the raw
    # value is a string, and the string 'False' is truthy.
    valid = row['valid'] in ('True', 'true')
    result = row['result'] or ''

    if valid:
        row['resultcode'] = 200
        row['result'] = '200 Ok'
    else:
        m = re.search(r'^(\d+)', result)
        if m is not None:
            row['resultcode'] = int(m.group(1))
        elif result == 'URL is empty':
            return None
        elif 'The read operation timed out' in result:
            row['resultcode'] = 601
            row['result'] = '601 %s' % result
        else:
            row['resultcode'] = 602
            row['result'] = '602 %s' % result

    # Cast input and remove unwanted keys (to avoid confusing SQLObject)
    row['valid'] = valid
    row['recursionlevel'] = int(row['recursionlevel'])
    row['line'] = int(row['line'])
    row['col'] = int(row.pop('column')) # Renamed - column is a SQL keyword
    row['dltime'] = float(row['dltime'])
    row['dlsize'] = int(row['dlsize'])
    row['checktime'] = float(row['checktime'])
    del row['cached']

    # A healthy link has no 'broken since' date. For broken links the key
    # is left out so an existing date is preserved on update.
    if row['resultcode'] < 400:
        row['brokensince'] = None
    return row


def main(csvfile, log):
    """Load LinkChecker CSV results into the database and emit the report.

    csvfile -- iterable of CSV text lines. NUL characters are stripped and
               lines starting with '#' are ignored.
    log     -- logger object (currently unused here).

    Every CSV row is inserted or updated as a CheckedLink keyed on its
    URL, and CheckedLink rows whose URL was not in this CSV are deleted.
    A three-section report of links with result code 404, 500 or 601 is
    then built, grouped by how long they have been broken. It is emailed
    to options.email if set, otherwise written to stdout.

    NOTE: reads the module-level 'options' created by the __main__ block.
    """
    # Every link present in this CSV run (healthy or broken)
    seen = set()

    # Read the csv file, updating the database and recording what we saw
    reader = csv.DictReader(
            (line.replace('\0', '') for line in csvfile
                if not line.startswith('#'))
            )
    for raw_row in reader:
        row = _normalise_row(raw_row)
        if row is None:
            continue

        try:
            link = CheckedLink.byUrl(row['url'])
        except LookupError:
            # New link: brokensince falls back to the column default
            # unless _normalise_row set it to None.
            link = CheckedLink(**row)
        else:
            # A link recorded as healthy has no brokensince; stamp it the
            # first time it is seen broken, otherwise it would never match
            # any of the date-based report queries below.
            if row['resultcode'] >= 400 and link.brokensince is None:
                row['brokensince'] = UTC_NOW
            link.set(**row)
        seen.add(link)

    total = len(seen)

    # Delete any entries that were not spidered
    # TODO: Only if older than a threshold -- StuartBishop 20050704
    for link in CheckedLink.select():
        if link not in seen:
            link.destroySelf()

    # Broken for less than a day and a half
    new_broken_links = CheckedLink.select("""
        resultcode in (404, 500, 601)
        AND brokensince > CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
            - '1 day 12 hours'::interval
        """, orderBy=["recursionlevel", "parentname", "url"])

    rep = report("New Arrivals", new_broken_links, total, brokensince=False)

    # Broken for at least a day and a half but less than 14 days
    old_broken_links = CheckedLink.select("""
        resultcode in (404, 500, 601)
        AND brokensince <= CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
            - '1 day 12 hours'::interval
        AND brokensince >
            CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '14 days'::interval
        """, orderBy=["recursionlevel", "parentname", "url"])

    rep += report("Old Favorites", old_broken_links, total, brokensince=True)

    # Broken for 14 days or more, longest-broken first
    antique_broken_links = CheckedLink.select("""
        resultcode in (404, 500, 601)
        AND brokensince <=
            CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '14 days'::interval
        """, orderBy=["brokensince", "recursionlevel", "parentname", "url"])

    rep += report(
            "Hall of Shame", antique_broken_links, total, brokensince=True
            )

    if not options.email:
        # Write to stdout in system encoding - might raise UnicodeError on
        # some systems. Tough.
        sys.stdout.write(rep + "\n")
    else:
        # Override this setting - we are only here if email explicitly
        # requested on the command line.
        config.zopeless.send_email = True
        simple_sendmail(
                "noreply@canonical.com", [options.email], options.subject,
                rep, {'Keywords': 'LinkChecker', 'X-Fnord': 'Fnord'}
                )
148
2027 by Canonical.com Patch Queue Manager
[trivial] linkchecker report tweaks
149
def report(title, links, total, brokensince=True):
    """Render one titled section of the link report as plain text.

    title       -- section heading text.
    links       -- result set of link records; must support count() and
                   iteration, and each item must have url, parentname,
                   result, warningstring and brokensince attributes.
    total       -- total number of links checked, shown in the heading as
                   "<title> (<links in section>/<total>)".
    brokensince -- if true, add a "Since" row with the date the link was
                   first recorded as broken.

    Returns the section as a string: the underlined heading, then one
    block of "Label  : value" rows per link, each block followed by a
    blank line, and a final blank line closing the section.
    """
    heading = "%s (%d/%d)" % (title, links.count(), total)
    lines = [heading, "=" * len(heading)]

    def add_row(label, value):
        lines.append("%-7s: %s" % (label, str(value)))

    for link in links:
        add_row("Link", link.url)
        add_row("Parent", link.parentname)
        add_row("Result", link.result)
        if link.warningstring:
            add_row("Warning", link.warningstring)
        # The attribute is 'brokensince' (there is no 'since' attribute).
        # It may be None for a link that is not broken, so guard it.
        if brokensince and link.brokensince is not None:
            add_row("Since", link.brokensince.strftime('%A %d %B %Y'))
        lines.append("")
    lines.append("")

    return "\n".join(lines) + "\n"
172
173
174
# Script entry point: parse the command line, open the CSV input, then run
# main() inside a single database transaction.
if __name__ == '__main__':
    parser = OptionParser("Usage: %prog [OPTIONS] [input.csv]")
    db_options(parser)
    logger_options(parser)

    parser.add_option(
            "-c", "--create", action="store_true", dest="create",
            default=False, help="Create the database tables"
            )

    parser.add_option(
            "-s", "--subject", dest="subject", help="Email using SUBJECT",
            metavar="SUBJECT", default="LinkChecker report"
            )

    parser.add_option(
            "-t", "--to", dest="email", help="Email to ADDRESS",
            metavar="ADDRESS", default=None
            )

    # 'options' must remain a module-level name: main() reads
    # options.email and options.subject from it.
    options, args = parser.parse_args()

    log = logger(options)

    # No argument, or '-', means read the CSV from stdin
    if not args or args[0] == '-':
        log.debug("Reading from stdin")
        csvfile = sys.stdin
    else:
        csvfile = open(args[0], 'rb')

    try:
        ztm = initZopeless(implicitBegin=True)

        if options.create:
            # Create the table if it doesn't exist. Unfortunately, this is
            # broken so we only create the table if requested on the
            # command line
            CheckedLink.createTable(ifNotExists=True)

        main(csvfile, log)
        ztm.commit()
    finally:
        # Close the input file we opened ourselves; leave stdin alone.
        if csvfile is not sys.stdin:
            csvfile.close()
213