~launchpad-pqm/launchpad/devel

10637.3.1 by Guilherme Salgado
Use the default python version instead of a hard-coded version
1
#!/usr/bin/env python
8452.3.3 by Karl Fogel
* utilities/: Add copyright header block to source files that were
2
#
8687.15.2 by Karl Fogel
In files modified by r8688, change "<YEARS>" to "2009", as per
3
# Copyright 2009 Canonical Ltd.  This software is licensed under the
8687.15.3 by Karl Fogel
Shorten the copyright header block to two lines.
4
# GNU Affero General Public License version 3 (see the file LICENSE).
7403.5.13 by Gavin Panella
Restore the sniffer script. There are better ways to get full information about what statuses a remote Roundup tracker *can support*, but this gives us information about what statuses are actually *used*. It will also work with only anonymous access to the remote tracker.
5
6
"""
7
This script is here to help us discover what the text equivalent of a
8
Roundup numeric field is remotely, without access to the Roundup
9
database.
10
11
It does this by downloading all bugs from the remote bug tracker in
12
CSV format, which gives us numeric values for the fields we're
13
interested in (e.g. status and substatus).
14
15
It then discovers all distinct combinations of those fields then
16
downloads an example bug page for each. It scrapes the bug page to
17
find the text that corresponds to the numeric value we already have.
18
19
There is a race condition. Someone can edit the bug page between the
20
CSV download and the bug page download, so be sure to run this more
21
than once and compare the results.
22
23
To complicate matters, downloaded pages are cached. To redownload the
24
CSV or a bug page the cache file must be deleted. It is a completely
25
non-HTTP compliant cache! This is an aid during development when this
26
script is run many times, and also provides a measure of robustness
27
against errors; there's no need to start from the beginning every
28
time.
29
7403.5.17 by Gavin Panella
Typo.
30
Perhaps the best way to make this work for a new Roundup instance is
7403.5.13 by Gavin Panella
Restore the sniffer script. There are better ways to get full information about what statuses a remote Roundup tracker *can support*, but this gives us information about what statuses are actually *used*. It will also work with only anonymous access to the remote tracker.
31
to subclass RoundupSniffer and implement get_text_values() and
32
populate the class-level "fields" variable. See MplayerStatusSniffer
33
for an example.
34
"""
35
36
__metaclass__ = type
37
38
import csv
39
import optparse
40
import sys
41
import urllib2
42
43
from base64 import urlsafe_b64encode
44
from os import mkdir
45
from os.path import join, exists
46
from pprint import pprint
47
from time import sleep
48
from urllib import urlencode
49
50
from BeautifulSoup import BeautifulSoup
51
52
53
class RoundupSniffer:
    """Sniffs the meaning of numeric fields in remote Roundups."""

    # Remote fields (in addition to 'id') to request in the CSV
    # export.  Subclasses override this; see MplayerStatusSniffer.
    fields = ('status',)

    def __init__(self, base_url, cache_dir):
        """Create a sniffer.

        :param base_url: The base URL of the remote Roundup instance.
        :param cache_dir: Directory in which downloaded resources are
            cached; created if it does not already exist.
        """
        self.base_url = base_url
        self.cache_dir = cache_dir
        if not exists(self.cache_dir):
            mkdir(self.cache_dir)

    def fetch(self, url):
        """Fetch the URL, consulting the cache first.

        :return: A file object, opened for reading, on the cached
            copy.  The caller is responsible for closing it.
        """
        # base64-encode the URL to get a filesystem-safe cache key.
        filename = join(self.cache_dir, urlsafe_b64encode(url))
        if not exists(filename):
            # Close the response and the cache file explicitly so a
            # failure part-way through the download does not leak
            # file descriptors.
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                response.close()
            cache_file = open(filename, 'wb')
            try:
                cache_file.write(data)
            finally:
                cache_file.close()
        return open(filename, 'rb')

    def get_all_bugs(self):
        """Download all bugs in CSV form and return a list of dicts.

        Each dict maps a field name ('id' plus self.fields) to the
        raw (numeric) value reported by the remote tracker.
        """
        all_fields = ['id']
        all_fields.extend(self.fields)
        query = [
            ('@action', 'export_csv'),
            ('@columns', ','.join(all_fields)),
            ('@sort', 'activity'),
            ('@group', 'priority'),
            ('@pagesize', '50'),
            ('@startwith', '0'),
            ]
        url = '%s?%s' % (self.base_url, urlencode(query))
        # Consume the reader eagerly so the cached file can be closed
        # instead of being left to the garbage collector.
        csv_file = self.fetch(url)
        try:
            return list(csv.DictReader(csv_file))
        finally:
            csv_file.close()

    def get_text_values(self, bug):
        """Return the human-readable values of self.fields for `bug`.

        Abstract; subclasses must override this.
        """
        # __name__ works on both Python 2 and 3; func_name was a
        # Python-2-only alias and produced the same message.
        raise NotImplementedError(self.get_text_values.__name__)
89
90
91
class MplayerStatusSniffer(RoundupSniffer):
    """Sniffer for the Mplayer/FFMpeg Roundup.

    http://roundup.mplayerhq.hu/roundup/ffmpeg/

    This looks to be a mostly unmodified instance, so this sniffer may
    be useful in general.
    """

    fields = ('status', 'substatus')

    def get_text_values(self, bug):
        """Return the status and substatus texts for the given bug.

        Works by downloading the bug's HTML page and scraping the
        values out of it.
        """
        # Bug pages live directly under the base URL, keyed by id.
        page = self.fetch('%s%s' % (self.base_url, bug['id'])).read()
        status_cell = BeautifulSoup(page).find(
            'th', text='Status').findNext('td')
        return tuple(
            span.string for span in status_cell.findAll('span'))
113
114
115
def get_distinct(things, fields):
    """Identify every distinct combination of fields.

    For each combination also return one example thing.
    """
    distinct = {}
    for thing in things:
        # Things sharing a combination overwrite each other, so the
        # last one seen becomes the example.
        combination = tuple(thing[field] for field in fields)
        distinct[combination] = thing
    return distinct
123
124
125
def gen_mapping(sniffer):
    """Generate a mapping from raw field values to text values.

    Yields one (raw values, text values) pair per distinct
    combination of the sniffer's fields, scraping a single example
    bug for each combination.
    """
    distinct_bugs = get_distinct(sniffer.get_all_bugs(), sniffer.fields)
    for raw_values, example_bug in distinct_bugs.items():
        yield raw_values, sniffer.get_text_values(example_bug)
132
133
134
def parse_args(args):
    """Parse command-line arguments into an options object.

    Calls parser.error() -- which exits -- when no base URL is given
    or when positional arguments are present.
    """
    parser = optparse.OptionParser()
    option_definitions = [
        (("--base-url",), dict(
            dest="base_url", metavar="URL",
            help="The base URL at the remote Roundup instance.")),
        (("--delay",), dict(
            dest="delay", type="int",
            help=("The number of seconds to wait between each page "
                  "load [default: %default]."))),
        (("--cache-dir",), dict(
            dest="cache_dir", metavar="DIR",
            help=("A directory in which to cache fetched resources "
                  "[default: %default]."))),
        (("--sniffer-class",), dict(
            dest="sniffer_class", metavar="CLASSNAME",
            help="The sniffer class to use [default: %default].")),
        ]
    for option_args, option_kwargs in option_definitions:
        parser.add_option(*option_args, **option_kwargs)
    parser.set_defaults(
        delay=0, cache_dir="roundup_sniffer_cache",
        sniffer_class="MplayerStatusSniffer")

    options, extra_args = parser.parse_args(args)

    if not options.base_url:
        parser.error("Please specify a base URL.")
    if extra_args:
        parser.error("Positional arguments are not accepted: %s" %
                     ' '.join(extra_args))

    return options
166
167
168
if __name__ == '__main__':
    options = parse_args(sys.argv[1:])
    # Look the sniffer class up by name in this module instead of
    # eval()ing the option value: eval would execute arbitrary
    # command-line input.
    try:
        sniffer_class = globals()[options.sniffer_class]
    except KeyError:
        sys.exit("Unknown sniffer class: %s" % options.sniffer_class)
    sniffer = sniffer_class(options.base_url, options.cache_dir)
    mapping = {}
    for raw, text in gen_mapping(sniffer):
        mapping[raw] = text
        # Be polite to the remote tracker between page loads.
        sleep(options.delay)
    pprint(mapping)