3
Peter Bengtsson, mail@peterbe.com, 2004
5
slimmer.py is a simple set of functions for compressing/optimizing
6
HTML, XHTML and CSS documents as strings.
7
Ideally used from other modules, something like this::
10
>>> code = open('file.html').read()
11
>>> slimmed = slimmer.xhtml_slimmer(code)
12
>>> print len(code), len(slimmed)
14
You have to estimate yourself if you think it's worth using slimmer
15
on your documents if you're running a dynamic setting such as a
16
web application (e.g. Zope with CheckoutableTemplates).
17
On my PC I slimmed a 1MB .html document in 2.2 seconds and saved
18
100KB. Saved 31KB on a 110KB .css file in 0.063 seconds.
19
And lastly, saved 17% in size in 0.016 seconds for www.python.org.
23
0.1.17 Aug 2005 Fix in css_slimmer() for voice-family: hack (thanks Jens)
25
0.1.16 Jun 2005 Improved js_slimmer() for sloppy function definitions
27
0.1.15 Jun 2005 Improved js_slimmer() for sloppy if|else|else if statements
29
0.1.14 Apr 2005 Added unit test of Holly-hack for CSS
31
0.1.13 Apr 2005 Improved js_slimmer() to make 'y = 123;y = document;' to instead
32
become 'y=123;y=document;'
34
0.1.12 Mar 2005 Fixed css_slimmer() to put a linebreak before //-->
36
0.1.11 Feb 2005 Fixed js_slimmer() for some curly bracket endings
38
0.1.10 Jan 2005 (Major patch by Baruch Even)
39
- Fixed the -t option for testing, it didn't work, --test did work.
40
- Fixed a typo s/whatspace/whitespace/
41
- Fixed a bug where more than one consecutive space turned into nothing,
42
added test 6 for this.
43
- Revamped other code to completely eliminate end of lines. It works in
45
- Changed the test cases to fit
46
- Removed the last ; before } -> s/;}/}/
47
- Changed the test cases to fit
49
0.1.9 Jan 2005 CLI interface can accept URLs
51
0.1.8 Dec 2004 Added an option (UNQUOTE_HTML_ATTRIBUTES) to remove
52
quotes from HTML attributes. (default is off)
54
0.1.7 Dec 2004 Separate out from CheckoutableTemplates and __all__
55
variable fixed for js_slimmer.
57
0.1.6 Dec 2004 Care for MacIE5 CSS Hack (http://www.sam-i-am.com/work/sandbox/css/mac_ie5_hack.html)
59
0.1.5 Nov 2004 Some improvements to js_slimmer()
61
0.1.4 Nov 2004 Added first draft of js_slimmer()
63
0.1.3 Nov 2004 Much improved CLI functions
65
0.1.2 Sep 2004 Added basic CLI functions (see run())
67
0.1.1 Sep 2004 Major speed improvement by removing
68
the unquote_numerical feature.
70
0.1.0 Sep 2004 First version numbering
74
__all__=['acceptableSyntax','slimmer','css_slimmer',
75
'html_slimmer','xhtml_slimmer','js_slimmer',
78
import re, os, sys, getopt
83
# If you're slimming HTML docs and really want to
84
# convert border="0" to border=0, be aware that this
85
# can take 5 times longer than without but compresses
86
# the document at least twice as well.
87
UNQUOTE_HTML_ATTRIBUTES = 0
90
# Define the syntax options we accept
96
OK_SYNTAX = (HTML, XHTML, CSS, JS)
98
def acceptableSyntax(syntax):
99
""" return the syntax as we recognize it or None """
100
syntax = str(syntax).lower().strip().replace(' ','').replace('-','')
101
syntax = syntax.replace('stylesheet','css') # allow for alias
102
if syntax in OK_SYNTAX:
107
def slimmer(code, syntax=XHTML):
108
""" wrap all function we have """
110
return _xhtml_slimmer(code)
112
return _html_slimmer(code)
114
return _css_slimmer(code)
116
return _js_slimmer(code)
119
css_comments = re.compile(r'/\*.*?\*/', re.MULTILINE|re.DOTALL)
120
hex_colour = re.compile(r'#\w{2}\w{2}\w{2}')
122
def _css_slimmer(css):
123
""" remove repeating whitespace ( \t\n) """
125
#css = css_comments.sub('', css) # remove comments
126
remove_next_comment = 1
127
for css_comment in css_comments.findall(css):
128
if css_comment[-3:]=='\*/':
129
remove_next_comment=0
131
if remove_next_comment:
132
css = css.replace(css_comment, '')
134
remove_next_comment = 1
136
css = re.sub(r'\s\s+', ' ', css) # >= 2 whitespace becomes one whitespace
137
css = re.sub(r'\s+\n', '', css) # no whitespace before end of line
138
# Remove space before and after certain chars
139
for char in ('{', '}', ':', ';', ','):
140
css = re.sub(char+r'\s', char, css)
141
css = re.sub(r'\s'+char, char, css)
142
css = re.sub(r'\s+</',r'</', css) # no extraspace before </style>
143
css = re.sub(r'}\s(#|\w)', r'}\1', css)
144
css = re.sub(r';}', r'}', css) # no need for the ; before end of attributes
145
css = re.sub(r'}//-->', r'}\n//-->', css)
146
css = simplifyHexColours(css)
148
# voice-family hack. The declaration: '''voice-family: "\"}\""''' requires
149
# that extra space between the ':' and the first '"' which _css_slimmer()
150
# removed. Put it back (http://real.issuetrackerproduct.com/0168)
151
css = re.sub(r'voice-family:"\\"}\\""', r'voice-family: "\\"}\\""', css)
157
f_IMD = re.I|re.MULTILINE|re.DOTALL
158
f_MD = re.MULTILINE|re.DOTALL
161
html_comments_oneline = re.compile(r'<!--.*?-->', re.I)
163
html_inline_css = re.compile(r'<style.*?>.*?</style>', f_IMD)
164
html_inline_js = re.compile(r'<script.*?>.*?</script>', f_IMD)
166
any_tag = re.compile(r"<\w.*?>", f_IMD)
167
excess_whitespace = re.compile(r' \s+|\s +', f_M)
168
excess_whitespace1 = re.compile(r'\w\s+\w', f_M)
169
excess_whitespace2 = re.compile(r'"\s+>', f_M)
170
excess_whitespace3 = re.compile(r"'\s+>", f_M)
171
excess_whitespace4 = re.compile('"\s\s+\w+="|\'\s\s+\w+=\'|"\s\s+\w+=|\'\s\s+\w+=', f_M)
172
excess_whitespace6 = re.compile(r"\d\s+>", f_M)
174
quotes_in_tag = re.compile('([a-zA-Z]+)="([a-zA-Z0-9-_\.]+)"')
176
def _html_slimmer(html, xml=0):
177
""" Optimize like XHTML but go one step further """
178
# 1. optimize inline CSS
179
for styletag in html_inline_css.findall(html):
180
html = html.replace(styletag, css_slimmer(styletag))
182
# 2. optimize inline Javascript
183
for scripttag in html_inline_js.findall(html):
184
html = html.replace(scripttag, js_slimmer(scripttag))
186
# 2. Remove excessive whitespace between tags
187
html = re.sub(r'>\s+<','><', html)
189
# 3. Remove oneline comments
190
html = html_comments_oneline.sub('', html)
192
# 4. In every tag, remove quotes on numerical attributes and all
193
# excessive whitespace
195
ew1 = excess_whitespace1 # shortcut
196
ew6 = excess_whitespace6 # shortcut
197
ew4 = excess_whitespace4 # shortcut
199
for tag in uniqify(any_tag.findall(html)):
200
# 4a. observe exceptions
201
if tag.startswith('<!') or tag.find('</')>-1:
205
# 4b. remove excess whitespace inside the tag
206
tag= excess_whitespace2.sub('">', tag)
207
tag= excess_whitespace3.sub("'>", tag)
209
for each in ew1.findall(tag)+ew6.findall(tag):
210
tag = tag.replace(each, excess_whitespace.sub(' ',each))
211
for each in ew4.findall(tag):
212
tag = tag.replace(each, each[0]+' '+each[1:].lstrip())
215
if not xml and UNQUOTE_HTML_ATTRIBUTES:
216
tag= quotes_in_tag.sub(r'\1=\2', tag)
218
# has the tag been improved?
220
html = html.replace(original, tag)
226
def _xhtml_slimmer(xhtml):
    """ Slim an XHTML document.

    Currently no different from the HTML slimmer, except that the xml
    flag is switched on so that _html_slimmer() never strips the quotes
    from attribute values (even when UNQUOTE_HTML_ATTRIBUTES is set).
    """
    slimmed = _html_slimmer(xhtml, xml=1)
    return slimmed
231
excess_whitespace_js = re.compile('^\s+(\S)',re.MULTILINE)
232
excess_whitespace_js2 = re.compile('(\S+);\s+(\S+)', re.MULTILINE)
233
whitespaced_func_def = re.compile('(function)\s+(\S+\(.*?\))\s*{\s*(\S+)', f_IMD)
234
whitespaced_func_def2 = re.compile('function\s*\(\)\s*{\s*(\S+)', f_IMD)
235
js_comments_singlelines = re.compile('//.*?$', re.DOTALL|re.MULTILINE|re.I)
236
js_comments_singlelines2 = re.compile('((^|;|\s)//.*?$)', re.DOTALL|re.MULTILINE|re.I)
237
js_comment_end = re.compile('-->')
238
js_comment_start = re.compile('(<!--(.*?))$\s(\w+)', re.MULTILINE)
239
#js_comment_start2 = re.compile('(\<\!--(.*?)(\n+|[\r\n]+)\s*(\w+))', re.DOTALL|re.MULTILINE)
240
whitespaced_controls = re.compile('(for|else if|if|catch|while)\s*\((.*?)\)\s*{\s*(\S+)', f_IMD)
241
single_whitespaced_controls = re.compile('(try|else)\s*{\s*(\S+)', f_IMD)
242
sloppy_conditionals = re.compile('\(\s*(\S+)\s*(==|!=)\s*(\S+)\)')
243
sloppy_ifs = re.compile('}\s*(if|else if|else)\s*({|\()')
244
sloppy_declarations = re.compile('var\s+(\w+)\s*=\s*(\d+|\w+|\"[\w+ ]\")')
245
sloppy_simple_declarations = re.compile('(\w+)\s*=\s*(\d+|\w+|\"[\w+ ]\")')
246
sloppy_increments = re.compile('(\w+)\s*(\+=|-=)\s*(\d*|\"\w+\")')
247
js_multiline_comments = re.compile(r'/\*.*?\*/', re.MULTILINE|re.DOTALL)
248
closing_curly_brackets = re.compile(r'\s*}', re.MULTILINE)
249
opening_curly_brackets = re.compile(r'{\s*', re.MULTILINE)
250
function_space = re.compile(r'(function\s*\w+\((.*?)\)\s*{(.*?)})', re.MULTILINE|re.DOTALL)
251
variable_declaration_singleline = re.compile('(var\s+(\w+)\s*=.*?;)')
254
# 1. remove all whitespace starting every line
255
js = excess_whitespace_js.sub(r'\1',js)
257
# 2. Remove all /* multiline comments */
258
js = js_multiline_comments.sub('',js)
260
# 3. // style comments
261
for comment, start in js_comments_singlelines2.findall(js):
262
# ...except those that contain -->
266
if not js_comment_end.findall(comment):
267
js = js.replace(comment, replacewith)
269
js = js_comment_start.sub(r'<!--\n\3', js)
271
# 3. excessive whitespace after semicolons
272
js = excess_whitespace_js2.sub(r'\1;\2', js)
274
# 4. functions defined with lots of whitespace
275
js = whitespaced_func_def.sub(r'\1 \2{\3', js)
276
js = whitespaced_func_def2.sub(r'function(){\1', js)
278
# 5. control statements with lots of whitespace
279
js = whitespaced_controls.sub(r'\1(\2){\3', js)
281
# 6. control statements without params with lots of whitespace
282
js = single_whitespaced_controls.sub(r'\1{\2', js)
284
# 7. convert '(page == "foo")' to '(page=="foo")'
285
js = sloppy_conditionals.sub(r'(\1\2\3)', js)
287
# 8. convert '} else if {' to '}else if{'
288
js = sloppy_ifs.sub(r'}\1\2', js)
290
# 9. convert 'var x = foo' to 'var x=foo'
291
js = sloppy_declarations.sub(r'var \1=\2',js)
292
js = sloppy_simple_declarations.sub(r'\1=\2', js)
294
# 10. whitespace around closing } curly brackets
295
js = opening_curly_brackets.sub('{', js)
296
js = closing_curly_brackets.sub('}', js)
298
# 11. sloppy increments
299
js = sloppy_increments.sub(r'\1\2\3', js)
301
function_spaces = function_space.findall(js)
303
for wholefunction, arguments, functioncode in function_spaces:
304
arguments = [x.strip() for x in arguments.split(',')]
305
variable_declarations = variable_declaration_singleline.findall(functioncode)
306
print variable_declarations
312
## ----- Some fancier names
315
def css_slimmer(css):
    """ Public name for the CSS slimmer.

    Hands the string straight to _css_slimmer() and returns its result
    untouched.
    """
    slimmed = _css_slimmer(css)
    return slimmed
318
def xhtml_slimmer(xhtml):
    """ Public name for the XHTML slimmer.

    Hands the string straight to _xhtml_slimmer() and returns its result
    untouched.
    """
    slimmed = _xhtml_slimmer(xhtml)
    return slimmed
321
def html_slimmer(html):
    """ Public name for the HTML slimmer.

    Hands the string straight to _html_slimmer() (with its default
    xml=0) and returns its result untouched.
    """
    slimmed = _html_slimmer(html)
    return slimmed
325
return _js_slimmer(js)
328
## ----- Methods related to simplifying HEX colour codes
331
""" borrowed from Tim Peters' algorithm on ASPN Cookbook """
332
# REMEMBER! This will shuffle the order of the list
338
# '#' followed by three *doubled* hexadecimal digits (e.g. #CCEEFF).
# Each backreference forces the second digit of a pair to be the very same
# character as the first, so the comparison is case sensitive ('#Ffffff'
# is left alone). The trailing negative lookahead makes sure we have
# reached the end of the token, so neither 8-digit colours (#aabbccdd)
# nor longer identifiers that merely start with six such characters are
# cut in half.
_COLLAPSIBLE_HEX_COLOUR = re.compile(
    r'#([0-9a-fA-F])\1([0-9a-fA-F])\2([0-9a-fA-F])\3(?![\w-])')


def simplifyHexColours(text):
    """ Replace all colour declarations where pairs repeat.
    I.e. #FFFFFF => #FFF; #CCEEFF => #CEF
    and #EFEFEF, #EFCDI9 avoided

    Only genuine six-digit hexadecimal values are shortened: tokens that
    contain non-hex characters (e.g. '#wwxxyy') or that carry on after the
    sixth digit (e.g. '#aabbccdd') are returned unchanged.

    text -- the string (typically CSS) to scan
    Returns the string with every collapsible colour shortened.
    """
    return _COLLAPSIBLE_HEX_COLOUR.sub(r'#\1\2\3', text)
352
def __init__(self, replacements, delimiter='\t', wholeWords=None, caseInsensitive=None):
353
# Build replacements dictionary - may come in as a mapping or as a file
354
self.replacements = {}
356
# replacements is a mapping
357
self.replacements.update(replacements)
359
# replacements is a file
360
replacementsFile = open(replacements, 'r')
361
for line in replacementsFile.readlines():
362
fromValue, toValue = line.split(delimiter)[:2] # Split line
364
while toValue[-1] in '\r\n': # Strip newlines
365
toValue = toValue[:-1]
367
self.replacements[fromValue] = toValue # Add to dictionary
368
replacementsFile.close()
370
# Build char to char mapping...
373
charMap = map(chr, range(256))
374
for fromValue, toValue in self.replacements.items():
375
if len(fromValue) <> 1 or len(toValue) <> 1:
378
charMap[ord(fromValue.upper())] = toValue
379
charMap[ord(fromValue.lower())] = toValue
381
charMap[ord(fromValue)] = toValue
383
self.charMap = "".join(charMap)
386
# String to string mapping - use a regular expression
387
fromVals = self.replacements.keys()
390
# Build regexp pattern
392
rePattern = '|'.join(map(re.escape, fromVals))
395
+ '|'.join(map(re.escape, fromVals)) + r')\b'
399
self.reObject = re.compile(rePattern, re.I)
401
self.reObject = re.compile(rePattern)
403
def __call__(self, string):
404
# apply replacement to string
406
# Char to char mapping
408
return string.translate(self.charMap)
410
# String to string mapping
411
return self.reObject.sub(self.__replaceMatch, string)
413
def __replaceMatch(self, match):
414
item = match.group(0)
415
return self.replacements.get(item)
419
print "Usage: python slimmer.py /path/to/input.html [xhtml|html|css] /path/to/output.html"
428
def _is_openable_url(path_or_url):
430
if path_or_url.lower().startswith('http'):
431
return _pingable(path_or_url)
435
def __guess_syntax(filepath):
438
if os.path.isfile(filepath) or _is_openable_url(filepath):
439
if filepath.lower().endswith('.css'):
441
elif filepath.lower().endswith('.js'):
444
if os.path.isfile(filepath):
447
f=urllib2.urlopen(filepath)
451
while len(lines) < 50 and line is not None:
457
break # paranoid safety
462
lines = '\n'.join([x for x in lines_list if x.find('!DOCTYPE')>-1])
463
if lines.find('HTML 4.0')>-1:
465
elif lines.find('XHTML 1.0')>-1:
467
elif lines.find('<html>') > -1:
470
lines = '\n'.join(lines_list)
471
if lines.lower().find('<html') > -1:
474
if filepath.lower().endswith('.html') or \
475
filepath.lower().endswith('.htm'):
482
usage="""slimmer.py Compress web files on the command line
483
Peter Bengtsson, <mail@peterbe.com>, Nov 2004
485
USAGE: python slimmer.py [OPTIONS] /path/to/input.html [xhtml|html|css]
488
-t, --test Perform a speed and compression test
489
--output Save result to file
490
--version Prints version and exits
491
-h, --help Prints this message
493
If you don't specify the content type after the input filename,
494
the program will try to guess it by opening the file and looking
495
at the file extension.
498
$ python slimmer.py index.html XHTML --output=index.optimized.html
499
$ python slimmer.py --test screen.css
509
class Usage(Exception):
510
def __init__(self, msg):
518
opts, args = getopt.getopt(argv[1:], "ho:vt",
519
["help", "output=", "version", "test"])
520
except getopt.error, msg:
522
# more code, unchanged
524
print >>sys.stderr, err.msg
525
print >>sys.stderr, "for help use --help"
535
elif o in ('-h', '--help'):
538
elif o in ('-o', '--output'):
540
elif o in ("-t", "--test"):
551
if arg in ('-t', '--test'):
553
elif arg.startswith('--output='):
555
elif acceptableSyntax(arg):
556
syntax = acceptableSyntax(arg)
557
elif os.path.isfile(arg) or _is_openable_url(arg):
560
otherargs.append(arg)
562
if inputfile and syntax is None:
563
syntax = __guess_syntax(inputfile)
565
if inputfile is None:
566
print >>sys.stderr, "No input file"
567
print >>sys.stderr, "for help use --help"
570
if not acceptableSyntax(syntax):
571
print >>sys.stderr, "Unrecognized syntax"
572
print >>sys.stderr, "for help use --help"
576
print >>sys.stderr, "Unrecognized arguments %r"%otherargs
577
print >>sys.stderr, "for help use --help"
582
run(inputfile, syntax, speedtest, outputfile)
587
from time import time
589
def run(inputfile, syntax, speedtest, outputfile):
590
if os.path.isfile(inputfile):
591
contents = open(inputfile).read()
593
contents = urllib2.urlopen(inputfile).read()
595
slimmed = slimmer(contents, syntax)
600
open(outputfile, 'w').write(slimmed)
604
before = len(contents)
606
after_zlibbed = len(slimmed.encode('zlib'))
608
if size_before > 100000:
609
size_before = "%s (%sK)"%(size_before, size_before/1024)
611
if size_after > 100000:
612
size_after = "%s (%sK)"%(size_after, size_after/1024)
613
size_difference = before-after
614
if size_difference > 10000:
615
size_difference = "%s (%sK)"%(size_difference, size_difference/1024)
616
print "Took %s seconds"%round(t, 3)
617
print "Bytes before: %s"%size_before
618
print "Bytes after: %s"%size_after
619
print "Bytes after zlib: %s"%after_zlibbed
620
print "Bytes saved: %s "%size_difference,
621
print "(%s%% of original size)"%(100*round(after/float(before), 2))
623
print >>sys.stdout, slimmed
626
if __name__=='__main__':