~launchpad-pqm/launchpad/devel : revision 3973.1.72

1

"""

2

slimmer.py

3

Peter Bengtsson, mail@peterbe.com, 2004

4

5

slimmer.py is a simple set of functions for compressing/optimizing

6

HTML, XHTML and CSS documents as strings.

7

Ideally used from other modules, something like this::

8

9

>>> import slimmer

10

>>> code = open('file.html').read()

11

>>> slimmed = slimmer.xhtml_slimmer(code)

12

>>> print len(code), len(slimmed)

13

14

You have to estimate yourself if you think it's worth using slimmer

15

on your documents if you're running a dynamic setting such as a

16

web application (e.g. Zope with CheckoutableTemplates).

17

On my PC I slimmed a 1MB .html document in 2.2 seconds and saved

18

100KB. Saved 31KB on a 110KB .css file in 0.063 seconds.

19

And lastly, saved 17% in size in 0.016 seconds for www.python.org.

20

21

22

Changes::

23

0.1.17 Aug 2005 Fix in css_slimmer() for voice-family: hack (thanks Jens)

24

25

0.1.16 Jun 2005 Improved js_slimmer() for sloppy function definitions

26

27

0.1.15 Jun 2005 Improved js_slimmer() for sloppy if|else|else if statements

28

29

0.1.14 Apr 2005 Added unit test of Holly-hack for CSS

30

31

0.1.13 Apr 2005 Improved js_slimmer() to make 'y = 123;y = document;' to instead

32

become 'y=123;y=document;'

33

34

0.1.12 Mar 2005 Fixed css_slimmer() to put a linebreak before //-->

35

36

0.1.11 Feb 2005 Fixed js_slimmer() for some curly bracket endings

37

38

0.1.10 Jan 2005 (Major patch by Baruch Even)

39

- Fixed the -t option for testing, it didn't work, --test did work.

40

- Fixed a typo s/whatspace/whitespace/

41

- Fixed a bug where more than one consecutive space turned into nothing,

42

added test 6 for this.

43

- Revamped other code to completely eliminate end of lines. It works in

44

FireFox 1.0

45

- Changed the test cases to fit

46

- Removed the last ; before } -> s/;}/}/

47

- Changed the test cases to fit

48

49

0.1.9 Jan 2005 CLI interface can accept URLs

50

51

0.1.8 Dec 2004 Added an option (UNQUOTE_HTML_ATTRIBUTES) to remove

52

quotes from HTML attributes. (default is off)

53

54

0.1.7 Dec 2004 Separate out from CheckoutableTemplates and __all__

55

variable fixed for js_slimmer.

56

57

0.1.6 Dec 2004 Care for MacIE5 CSS Hack (http://www.sam-i-am.com/work/sandbox/css/mac_ie5_hack.html)

58

59

0.1.5 Nov 2004 Some improvements to js_slimmer()

60

61

0.1.4 Nov 2004 Added first draft of js_slimmer()

62

63

0.1.3 Nov 2004 Much improved CLI functions

64

65

0.1.2 Sep 2004 Added basic CLI functions (see run())

66

67

0.1.1 Sep 2004 Major speed improvement by removing

68

the unquote_numerical feature.

69

70

0.1.0 Sep 2004 First version numbering

71

"""

72

73

__version__='0.1.17'

74

__all__=['acceptableSyntax','slimmer','css_slimmer',

75

'html_slimmer','xhtml_slimmer','js_slimmer',

76

'__version__']

77

78

import re, os, sys, getopt

79

import urllib2

80

81

## Options
#
# Set to a true value to make the HTML slimmer rewrite attributes such as
# border="0" into border=0.  According to the original author this can
# take about 5 times longer, but compresses the document at least twice
# as well.  It is never applied when slimming XHTML.
UNQUOTE_HTML_ATTRIBUTES = 0


# Identifiers of the syntaxes this module knows how to slim.
HTML = 'html'
XHTML = 'xhtml'
CSS = 'css'
JS = 'js'

# Every identifier acceptableSyntax() may return.
OK_SYNTAX = (HTML, XHTML, CSS, JS)

97

98

def acceptableSyntax(syntax):
    """ Normalise a user supplied syntax name.

    The value is converted with str(), lower-cased, stripped, and has all
    spaces and hyphens removed; 'stylesheet' is accepted as an alias for
    'css'.  Returns the normalised name when it is a member of OK_SYNTAX,
    otherwise None.
    """
    normalized = str(syntax).lower().strip()
    for filler in (' ', '-'):
        normalized = normalized.replace(filler, '')
    normalized = normalized.replace('stylesheet', 'css')  # allow for alias

    if normalized not in OK_SYNTAX:
        return None
    return normalized

106

107

def slimmer(code, syntax=XHTML):
    """ Slim `code` with the slimmer that belongs to `syntax`.

    `syntax` is compared against the XHTML, HTML, CSS and JS constants (in
    that order); the result of the first matching slimmer is returned.
    None is returned when `syntax` equals none of them.
    """
    handlers = (
        (XHTML, _xhtml_slimmer),
        (HTML, _html_slimmer),
        (CSS, _css_slimmer),
        (JS, _js_slimmer),
    )
    for known_syntax, handler in handlers:
        if syntax == known_syntax:
            return handler(code)
    return None

117

118

# CSS

119

# A /* ... */ comment, possibly spanning several lines (non-greedy).
css_comments = re.compile(r'/\*.*?\*/', re.MULTILINE|re.DOTALL)

# A six digit hexadecimal colour such as #CCEEFF.  Only real hex digits are
# accepted, and the negative lookahead rejects matches that are merely the
# start of a longer identifier (e.g. the id selector "#aabbccdd"), so
# selectors are not mistaken for colours.
hex_colour = re.compile(r'#[0-9a-fA-F]{6}(?![\w-])')

121

122

def _css_slimmer(css):
    """ Slim a CSS string and return the stripped result.

    Comments are removed, runs of whitespace are collapsed, whitespace
    around '{', '}', ':', ';' and ',' is dropped, the ';' in front of a
    '}' is removed and hex colours are shortened via simplifyHexColours().
    """
    # Comments are removed one by one instead of with a single
    # css_comments.sub() so the Mac IE5 hack survives: a comment ending in
    # a backslash followed by '*/' is kept, and so is the comment that
    # directly follows it.
    keep_following = False
    for comment in css_comments.findall(css):
        if comment.endswith('\\*/'):
            keep_following = True
            continue
        if keep_following:
            keep_following = False
        else:
            css = css.replace(comment, '')

    css = re.sub(r'\s\s+', ' ', css) # >= 2 whitespace becomes one whitespace
    css = re.sub(r'\s+\n', '', css) # no whitespace before end of line

    # One whitespace character on either side of these characters goes
    for punctuation in '{}:;,':
        css = re.sub(punctuation+r'\s', punctuation, css)
        css = re.sub(r'\s'+punctuation, punctuation, css)

    css = re.sub(r'\s+</',r'</', css) # no extra space before </style>
    css = re.sub(r'}\s(#|\w)', r'}\1', css)
    css = re.sub(r';}', r'}', css) # no need for the ; before end of attributes
    css = re.sub(r'}//-->', r'}\n//-->', css) # keep //--> on a line of its own
    css = simplifyHexColours(css)

    # voice-family hack. The declaration '''voice-family: "\"}\""''' needs
    # the space between the ':' and the first '"', which the loop above has
    # removed. Put it back (http://real.issuetrackerproduct.com/0168)
    css = re.sub(r'voice-family:"\\"}\\""', r'voice-family: "\\"}\\""', css)

    return css.strip()

154

155

156

# HTML

157

# Shorthands for frequently used flag combinations
f_IMD = re.I|re.MULTILINE|re.DOTALL
f_MD = re.MULTILINE|re.DOTALL
f_M = re.MULTILINE

# An HTML comment that starts and ends on the same line ('.' does not match
# a newline here because DOTALL is not set).  The pattern had been reduced
# to an empty string, which made the comment removal step a no-op.
html_comments_oneline = re.compile(r'<!--.*?-->', re.I)

# Whole <style> and <script> elements, including their content
html_inline_css = re.compile(r'<style.*?>.*?</style>', f_IMD)
html_inline_js = re.compile(r'<script.*?>.*?</script>', f_IMD)

# Anything from '<' + word character up to the next '>'
any_tag = re.compile(r"<\w.*?>", f_IMD)

# Whitespace patterns that are looked for inside a tag
excess_whitespace = re.compile(r' \s+|\s +', f_M)
excess_whitespace1 = re.compile(r'\w\s+\w', f_M)
excess_whitespace2 = re.compile(r'"\s+>', f_M)
excess_whitespace3 = re.compile(r"'\s+>", f_M)
# A closing quote, two or more whitespace characters, then the next
# attribute name and its '=' (optionally with its opening quote)
excess_whitespace4 = re.compile(
    r'"\s\s+\w+="'
    r"|'\s\s+\w+='"
    r'|"\s\s+\w+='
    r"|'\s\s+\w+=",
    f_M)
excess_whitespace6 = re.compile(r"\d\s+>", f_M)

# name="value" pairs whose value consists only of letters, digits, '-',
# '_' and '.'
quotes_in_tag = re.compile(r'([a-zA-Z]+)="([a-zA-Z0-9-_\.]+)"')

175

176

def _html_slimmer(html, xml=0):
    """ Slim an HTML document and return the stripped result.

    Inline <style> and <script> elements are passed through css_slimmer()
    and js_slimmer(), whitespace between tags is removed, whatever
    html_comments_oneline matches is deleted, and excess whitespace inside
    opening tags is collapsed.  Attribute quotes are removed only when
    `xml` is false and UNQUOTE_HTML_ATTRIBUTES is true.
    """
    # 1. optimize inline CSS
    for style_element in html_inline_css.findall(html):
        html = html.replace(style_element, css_slimmer(style_element))

    # 2. optimize inline Javascript
    for script_element in html_inline_js.findall(html):
        html = html.replace(script_element, js_slimmer(script_element))

    # 3. Remove excessive whitespace between tags
    html = re.sub(r'>\s+<','><', html)

    # 4. Remove oneline comments
    html = html_comments_oneline.sub('', html)

    # 5. In every tag, collapse excessive whitespace and optionally remove
    #    the quotes around simple attribute values
    unquote = not xml and UNQUOTE_HTML_ATTRIBUTES

    for original in uniqify(any_tag.findall(html)):
        # 5a. observe exceptions
        if original.startswith('<!') or '</' in original:
            continue

        # 5b. remove excess whitespace inside the tag
        tag = excess_whitespace2.sub('">', original)
        tag = excess_whitespace3.sub("'>", tag)

        spaced = excess_whitespace1.findall(tag) + excess_whitespace6.findall(tag)
        for fragment in spaced:
            tag = tag.replace(fragment, excess_whitespace.sub(' ', fragment))
        for fragment in excess_whitespace4.findall(tag):
            # keep the closing quote, then exactly one space
            tag = tag.replace(fragment, fragment[0]+' '+fragment[1:].lstrip())

        # 5c. remove quotes
        if unquote:
            tag = quotes_in_tag.sub(r'\1=\2', tag)

        # only touch the document when the tag actually changed
        if tag != original:
            html = html.replace(original, tag)

    return html.strip()

223

224

225

226

def _xhtml_slimmer(xhtml):
    """ Slim an XHTML document.

    Delegates to _html_slimmer() with xml=1, which keeps the quotes around
    attribute values regardless of UNQUOTE_HTML_ATTRIBUTES.
    """
    slimmed = _html_slimmer(xhtml, xml=1)
    return slimmed

229

230

231

# Patterns used by _js_slimmer().
#
# NOTE: in whitespaced_func_def, whitespaced_func_def2, whitespaced_controls,
# sloppy_conditionals and function_space the escaped parentheses had been
# garbled into '$' (or dropped).  They are restored here; the substitution
# templates in _js_slimmer() (e.g. r'\1(\2){\3', r'function(){\1',
# r'(\1\2\3)') put the parentheses back, so the patterns have to consume
# them.

# leading whitespace on a line, up to the first non-blank character
excess_whitespace_js = re.compile(r'^\s+(\S)', re.MULTILINE)
# whitespace following a semicolon
excess_whitespace_js2 = re.compile(r'(\S+);\s+(\S+)', re.MULTILINE)
# 'function name(args) {' and 'function () {' with loose whitespace
whitespaced_func_def = re.compile(r'(function)\s+(\S+\(.*?\))\s*{\s*(\S+)', f_IMD)
whitespaced_func_def2 = re.compile(r'function\s*\(\)\s*{\s*(\S+)', f_IMD)
# '//' comments; the second form also captures the character in front
js_comments_singlelines = re.compile(r'//.*?$', re.DOTALL|re.MULTILINE|re.I)
js_comments_singlelines2 = re.compile(r'((^|;|\s)//.*?$)', re.DOTALL|re.MULTILINE|re.I)
# HTML comment markers that hide scripts from old browsers
js_comment_end = re.compile('-->')
js_comment_start = re.compile(r'(<!--(.*?))$\s(\w+)', re.MULTILINE)
#js_comment_start2 = re.compile('(\<\!--(.*?)(\n+|[\r\n]+)\s*(\w+))', re.DOTALL|re.MULTILINE)
# control statements with and without a parenthesised head
whitespaced_controls = re.compile(r'(for|else if|if|catch|while)\s*\((.*?)\)\s*{\s*(\S+)', f_IMD)
single_whitespaced_controls = re.compile(r'(try|else)\s*{\s*(\S+)', f_IMD)
# '( a == b )' style conditionals
sloppy_conditionals = re.compile(r'\(\s*(\S+)\s*(==|!=)\s*(\S+)\)')
# '} else if {' style continuations
sloppy_ifs = re.compile(r'}\s*(if|else if|else)\s*({|\()')
# assignments and increments with whitespace around the operator
sloppy_declarations = re.compile(r'var\s+(\w+)\s*=\s*(\d+|\w+|"[\w+ ]")')
sloppy_simple_declarations = re.compile(r'(\w+)\s*=\s*(\d+|\w+|"[\w+ ]")')
sloppy_increments = re.compile(r'(\w+)\s*(\+=|-=)\s*(\d*|"\w+")')
js_multiline_comments = re.compile(r'/\*.*?\*/', re.MULTILINE|re.DOTALL)
# whitespace next to curly brackets
closing_curly_brackets = re.compile(r'\s*}', re.MULTILINE)
opening_curly_brackets = re.compile(r'{\s*', re.MULTILINE)
# a whole named function: (everything, argument list, body)
function_space = re.compile(r'(function\s*\w+\((.*?)\)\s*{(.*?)})', re.MULTILINE|re.DOTALL)
# a 'var name = ...;' statement: (whole statement, name)
variable_declaration_singleline = re.compile(r'(var\s+(\w+)\s*=.*?;)')

252

253

def _js_slimmer(js):
    """ Slim a Javascript string and return the stripped result.

    Applies the module level Javascript patterns in a fixed order: leading
    whitespace, comments, then whitespace around semicolons, function
    definitions, control statements, conditionals, assignments, curly
    brackets and increments.

    The function no longer writes anything to stdout.  It used to end with
    a leftover debugging loop that printed the variable declarations of
    every function found, which also corrupted the slimmed output the
    command line interface prints to stdout.
    """
    # 1. remove all whitespace starting every line
    js = excess_whitespace_js.sub(r'\1', js)

    # 2. Remove all /* multiline comments */
    js = js_multiline_comments.sub('', js)

    # 3. // style comments
    for comment, start in js_comments_singlelines2.findall(js):
        # ...except those that contain -->
        if js_comment_end.search(comment):
            continue
        # `comment` includes the character in front of the '//'; a
        # semicolon there has to survive the removal
        if start == ';':
            js = js.replace(comment, ';')
        else:
            js = js.replace(comment, '')

    # 4. put what follows a <!-- line directly on the next line
    js = js_comment_start.sub(r'<!--\n\3', js)

    # 5. excessive whitespace after semicolons
    js = excess_whitespace_js2.sub(r'\1;\2', js)

    # 6. functions defined with lots of whitespace
    js = whitespaced_func_def.sub(r'\1 \2{\3', js)
    js = whitespaced_func_def2.sub(r'function(){\1', js)

    # 7. control statements with lots of whitespace
    js = whitespaced_controls.sub(r'\1(\2){\3', js)

    # 8. control statements without params with lots of whitespace
    js = single_whitespaced_controls.sub(r'\1{\2', js)

    # 9. convert '(page == "foo")' to '(page=="foo")'
    js = sloppy_conditionals.sub(r'(\1\2\3)', js)

    # 10. convert '} else if {' to '}else if{'
    js = sloppy_ifs.sub(r'}\1\2', js)

    # 11. convert 'var x = foo' to 'var x=foo'
    js = sloppy_declarations.sub(r'var \1=\2', js)
    js = sloppy_simple_declarations.sub(r'\1=\2', js)

    # 12. whitespace around opening and closing curly brackets
    js = opening_curly_brackets.sub('{', js)
    js = closing_curly_brackets.sub('}', js)

    # 13. sloppy increments
    js = sloppy_increments.sub(r'\1\2\3', js)

    return js.strip()

310

311

312

## ----- Some fancier names

313

##

314

315

def css_slimmer(css):
    """ Public name for _css_slimmer(): return the slimmed CSS string. """
    slimmed = _css_slimmer(css)
    return slimmed

317

318

def xhtml_slimmer(xhtml):
    """ Public name for _xhtml_slimmer(): return the slimmed XHTML string. """
    slimmed = _xhtml_slimmer(xhtml)
    return slimmed

320

321

def html_slimmer(html):
    """ Public name for _html_slimmer(): return the slimmed HTML string. """
    slimmed = _html_slimmer(html)
    return slimmed

323

324

def js_slimmer(js):
    """ Public name for _js_slimmer(): return the slimmed Javascript string. """
    slimmed = _js_slimmer(js)
    return slimmed

326

327

328

## ----- Methods related to simplifying HEX colour codes

329

330

def uniqify(all):
    """ Return a list with the distinct items of `all`.

    Every item has to be hashable.  The items come back in the order in
    which they were first seen, so the result is deterministic (the
    previous implementation returned the keys of a dict, in arbitrary
    order).
    """
    # NOTE: the parameter name shadows the builtin all(); it is kept
    # because callers may pass it by keyword.
    seen = {}
    unique = []
    for each in all:
        if each not in seen:
            seen[each] = 1
            unique.append(each)
    return unique

337

338

def simplifyHexColours(text):
    """ Shorten the six digit colours in `text` whose digit pairs repeat.

    I.e. #FFFFFF => #FFF and #CCEEFF => #CEF, while a colour such as
    #EFEFEF is left alone because its pairs do not consist of twice the
    same character.  The comparison is case sensitive.
    """
    shortened = {}
    for colour in uniqify(hex_colour.findall(text)):
        # characters 1, 3, 5 against characters 2, 4, 6
        if colour[1::2] == colour[2::2]:
            shortened[colour] = '#' + colour[1::2]

    return MultiReplacer(shortened)(text)

350

351

class MultiReplacer:
    """ Callable that applies many string replacements in a single pass.

    `replacements` is either a mapping of {fromValue: toValue} (anything
    dict.update() accepts) or the path of a text file that holds one
    `fromValue<delimiter>toValue` pair per line.

    With `wholeWords` set, only matches delimited by word boundaries are
    replaced.  With `caseInsensitive` set, matching ignores case.

    When every key and every value is a single character (and `wholeWords`
    is not set) a translation table is used; otherwise one regular
    expression that alternates over all keys is used.
    """

    def __init__(self, replacements, delimiter='\t', wholeWords=None, caseInsensitive=None):
        # Build replacements dictionary - may come in as a mapping or as a file
        self.replacements = {}
        try:
            # replacements is a mapping
            self.replacements.update(replacements)
        except (TypeError, ValueError):
            # replacements is a file.  dict.update() raises ValueError (not
            # TypeError) when it is handed a path string, so both have to
            # be caught for this branch to be reachable.
            self.replacements = {}
            replacementsFile = open(replacements, 'r')
            try:
                for line in replacementsFile:
                    if not line.strip():
                        continue # blank line
                    fromValue, toValue = line.split(delimiter)[:2]
                    # rstrip() copes with an empty toValue, unlike peeling
                    # off toValue[-1] in a loop
                    self.replacements[fromValue] = toValue.rstrip('\r\n')
            finally:
                replacementsFile.close()

        # Lower-cased keys, so that a case insensitive match whose case
        # differs from the key still finds its replacement
        self._folded = None
        if caseInsensitive:
            self._folded = {}
            for fromValue, toValue in self.replacements.items():
                self._folded[fromValue.lower()] = toValue

        # Build char to char mapping...
        self.charMap = None
        if not wholeWords:
            charMap = list(map(chr, range(256)))
            for fromValue, toValue in self.replacements.items():
                if len(fromValue) != 1 or len(toValue) != 1:
                    break # not a pure char to char mapping; use the regexp
                if caseInsensitive:
                    charMap[ord(fromValue.upper())] = toValue
                    charMap[ord(fromValue.lower())] = toValue
                else:
                    charMap[ord(fromValue)] = toValue
            else:
                # loop was not broken out of: the table covers everything
                self.charMap = "".join(charMap)
                return

        # String to string mapping - use a regular expression.
        # Longest keys go first: the regexp engine takes the first
        # alternative that matches, so with 'ab' in front of 'abc' the
        # longer key could never be matched.  Sorting alphabetically first
        # keeps the order of equally long keys deterministic.
        fromVals = sorted(self.replacements.keys())
        fromVals.sort(key=len, reverse=True)

        # Build regexp pattern
        rePattern = '|'.join(map(re.escape, fromVals))
        if wholeWords:
            rePattern = r'\b(' + rePattern + r')\b'

        # Compile regexp
        if caseInsensitive:
            self.reObject = re.compile(rePattern, re.I)
        else:
            self.reObject = re.compile(rePattern)

    def __call__(self, string):
        """ Return `string` with all replacements applied. """
        # Char to char mapping
        if self.charMap:
            return string.translate(self.charMap)

        # String to string mapping
        return self.reObject.sub(self.__replaceMatch, string)

    def __replaceMatch(self, match):
        """ Return the replacement for the text of `match`. """
        item = match.group(0)
        if item in self.replacements:
            return self.replacements[item]
        if self._folded is not None:
            # matched with a different case than the key was given in
            return self._folded.get(item.lower(), item)
        # no replacement known: leave the text as it is
        return item

416

417

418

def __grr():
    """ Print a one-line usage hint to stdout. """
    hint = "Usage: python slimmer.py /path/to/input.html [xhtml|html|css] /path/to/output.html"
    print(hint)

420

421

def _pingable(url):
    """ Return 1 if `url` can be opened with urllib2.urlopen(), else 0.

    Any Exception raised while opening counts as "not pingable".  A bare
    `except:` is deliberately not used, so that KeyboardInterrupt and
    SystemExit are not swallowed on interpreters where they do not derive
    from Exception.
    """
    try:
        response = urllib2.urlopen(url)
        # only reachability matters; release the connection right away
        response.close()
    except Exception:
        return 0
    return 1

427

428

def _is_openable_url(path_or_url):
    """ Return a true value if the argument looks like a URL that opens.

    Only values starting with 'http' (in any case) are probed with
    _pingable(); everything else yields 0 without any network access.
    """
    looks_like_url = path_or_url.lower().startswith('http')
    if not looks_like_url:
        return 0
    return _pingable(path_or_url)

434

435

def __guess_syntax(filepath):
    """ Guess the syntax of the file or URL `filepath`.

    Returns 'css', 'js', 'html', 'xhtml', or None when no guess can be
    made (this includes the case where `filepath` is neither an existing
    file nor an openable URL).

    The '.css' and '.js' extensions decide immediately.  Otherwise up to
    50 non-blank lines out of the first 101 lines are inspected: first the
    lines containing '!DOCTYPE', then all sampled lines for an '<html'
    tag, and finally the '.html' / '.htm' extension.
    """
    is_file = os.path.isfile(filepath)
    if not (is_file or _is_openable_url(filepath)):
        return None

    lowered = filepath.lower()
    if lowered.endswith('.css'):
        return 'css'
    elif lowered.endswith('.js'):
        return 'js'

    if is_file:
        f = open(filepath)
    else:
        f = urllib2.urlopen(filepath)

    lines = []
    try:
        # Never look at more than 101 lines, however many are blank
        for _ in range(101):
            if len(lines) >= 50:
                break
            line = f.readline()
            if not line:
                # readline() signals end of file with an empty string,
                # never with None
                break
            if line.strip():
                lines.append(line)
    finally:
        f.close()

    doctypes = '\n'.join([x for x in lines if x.find('!DOCTYPE') > -1])
    if doctypes.find('HTML 4.0') > -1:
        return 'html'
    elif doctypes.find('XHTML 1.0') > -1:
        return 'xhtml'
    elif doctypes.find('<html>') > -1:
        return 'html'
    elif '\n'.join(lines).lower().find('<html') > -1:
        return 'html'

    if lowered.endswith('.html') or lowered.endswith('.htm'):
        return 'html'

    return None

480

481

482

# Help text printed by __usage().  It lists 'js' among the syntaxes and the
# short form of --output, both of which main() accepts.
usage = """slimmer.py Compress web files on the command line
Peter Bengtsson, <mail@peterbe.com>, Nov 2004

USAGE: python slimmer.py [OPTIONS] /path/to/input.html [xhtml|html|css|js]

Options:
    -t, --test          Perform a speed and compression test
    -o, --output=FILE   Save result to file
    --version           Prints version and exits
    -h, --help          Prints this message

If you don't specify the content type after the input filename,
the program will try to guess it by opening the file and looking
at the file extension.

Examples:
    $ python slimmer.py index.html XHTML --output=index.optimized.html
    $ python slimmer.py --test screen.css
"""

501

502

503

def __showversion():
    """ Print the module's version number to stdout. """
    print(__version__)

505

506

def __usage():
    """ Print the command line help text to stdout. """
    print(usage)

508

509

class Usage(Exception):
    """ Raised inside main() when the command line cannot be parsed.

    The reason is stored on the `msg` attribute.
    """

    def __init__(self, msg):
        # what was wrong with the command line
        self.msg = msg

512

513

def main(argv=None):
    """ Command line entry point; returns the process exit status.

    `argv` defaults to sys.argv; its first item is skipped.

    Return values: 0 after a successful run; 2 for command line errors
    (and after printing the version); 3 after printing the help text;
    4 when no arguments were given at all.
    """
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "ho:vt",
                                       ["help", "output=", "version", "test"])
        except getopt.error:
            # sys.exc_info() avoids the "except X, e" / "except X as e"
            # syntax, neither of which every Python version accepts
            raise Usage(sys.exc_info()[1])
    except Usage:
        err = sys.exc_info()[1]
        sys.stderr.write("%s\n" % (err.msg,))
        sys.stderr.write("for help use --help\n")
        return 2

    outputfile = None
    speedtest = 0

    for o, a in opts:
        # "-v" is part of the getopt spec above but used to be ignored
        if o in ('-v', '--version'):
            __showversion()
            return 2
        elif o in ('-h', '--help'):
            __usage()
            return 3
        elif o in ('-o', '--output'):
            outputfile = a
        elif o in ("-t", "--test"):
            speedtest = 1

    if not args:
        __usage()
        return 4

    syntax = None
    inputfile = None
    otherargs = []
    for arg in args:
        # getopt stops at the first non-option, so options given after the
        # input file arrive here and are handled by hand
        if arg in ('-t', '--test'):
            speedtest = 1
        elif arg.startswith('--output='):
            outputfile = arg[9:]
        elif acceptableSyntax(arg):
            syntax = acceptableSyntax(arg)
        elif os.path.isfile(arg) or _is_openable_url(arg):
            inputfile = arg
        else:
            otherargs.append(arg)

    if inputfile and syntax is None:
        syntax = __guess_syntax(inputfile)

    if inputfile is None:
        sys.stderr.write("No input file\n")
        sys.stderr.write("for help use --help\n")
        return 2

    if not acceptableSyntax(syntax):
        sys.stderr.write("Unrecognized syntax\n")
        sys.stderr.write("for help use --help\n")
        return 2

    if otherargs:
        sys.stderr.write("Unrecognized arguments %r\n" % (otherargs,))
        sys.stderr.write("for help use --help\n")
        return 2

    run(inputfile, syntax, speedtest, outputfile)

    return 0

585

586

587

from time import time

588

589

def run(inputfile, syntax, speedtest, outputfile):
    """ Slim `inputfile` and deliver the result.

    `inputfile` is read from disk when it is an existing file, otherwise
    it is opened with urllib2.urlopen().  The slimmed text is written to
    `outputfile` when one is given; else, when `speedtest` is true, timing
    and size statistics are printed; else the slimmed text itself is
    printed to stdout.
    """
    if os.path.isfile(inputfile):
        source = open(inputfile)
    else:
        source = urllib2.urlopen(inputfile)
    try:
        contents = source.read()
    finally:
        source.close()

    t0 = time()
    slimmed = slimmer(contents, syntax)
    t = time() - t0

    if outputfile:
        # close explicitly so the data is flushed before returning
        out = open(outputfile, 'w')
        try:
            out.write(slimmed)
        finally:
            out.close()

    elif speedtest:
        before = len(contents)
        after = len(slimmed)
        after_zlibbed = len(slimmed.encode('zlib'))

        size_before = before
        if size_before > 100000:
            size_before = "%s (%sK)" % (size_before, size_before // 1024)
        size_after = after
        if size_after > 100000:
            size_after = "%s (%sK)" % (size_after, size_after // 1024)
        size_difference = before - after
        if size_difference > 10000:
            size_difference = "%s (%sK)" % (size_difference, size_difference // 1024)

        # An empty input cannot shrink; report 100% instead of dividing by
        # zero
        if before:
            ratio = round(after / float(before), 2)
        else:
            ratio = 1.0

        print("Took %s seconds" % round(t, 3))
        print("Bytes before: %s" % size_before)
        print("Bytes after: %s" % size_after)
        print("Bytes after zlib: %s" % after_zlibbed)
        print("Bytes saved: %s  (%s%% of original size)" % (size_difference, 100 * ratio))
    else:
        print(slimmed)

624

625

626

if __name__ == '__main__':
    # Run the command line interface; main()'s return value becomes the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)

628

629