~launchpad-pqm/launchpad/devel

7637.2.1 by Guilherme Salgado
Parsing functions to extract the info we need from apache logs.
1
#!/usr/bin/env python
2
"""Apache Log Parser

Parser for Apache log files. This is a port to python of Peter Hickman's
Apache::LogRegex Perl module:
<http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>

Takes the Apache logging format defined in your httpd.conf and generates
a regular expression which is used to parse a line from the log file and
return it as a dictionary with keys corresponding to the fields defined
in the log format.

Example:

    import apachelog, sys

    # Format copied and pasted from Apache conf - use raw string + single quotes
    format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'

    p = apachelog.parser(format)

    for line in open('/var/apache/access.log'):
        try:
           data = p.parse(line)
        except apachelog.ApacheLogParserError:
           sys.stderr.write("Unable to parse %s" % line)

The return dictionary from the parse method depends on the input format.
For the above example, the returned dictionary would look like;

    {
    '%>s': '200',
    '%b': '2607',
    '%h': '212.74.15.68',
    '%l': '-',
    '%r': 'GET /images/previous.png HTTP/1.1',
    '%t': '[23/Jan/2004:11:36:20 +0000]',
    '%u': '-',
    '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
    '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
    }

...given an access log entry like (split across lines for formatting);

    212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
        200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"

You can also re-map the field names by subclassing (or re-pointing) the
alias method.

Generally you should be able to copy and paste the format string from
your Apache configuration, but remember to place it in a raw string
using single-quotes, so that backslashes are handled correctly.

This module provides three of the most common log formats in the
formats dictionary;

    # Common Log Format (CLF)
    p = apachelog.parser(apachelog.formats['common'])

    # Common Log Format with Virtual Host
    p = apachelog.parser(apachelog.formats['vhcommon'])

    # NCSA extended/combined log format
    p = apachelog.parser(apachelog.formats['extended'])

For notes regarding performance while reading lines from a file
in Python, see <http://effbot.org/zone/readline-performance.htm>.
Further performance boost can be gained by using psyco
<http://psyco.sourceforge.net/>

On my system, using a loop like;

    for line in open('access.log'):
        p.parse(line)

...was able to parse ~60,000 lines / second. Adding psyco to the mix
upped that to ~75,000 lines / second.

The parse_date function is intended as a fast way to convert a log
date into something useful, without incurring a significant date
parsing overhead - good enough for basic stuff but will be a problem
if you need to deal with logs from multiple servers in different
timezones.
"""
87
88
# Module metadata. The license text is kept exactly as upstream wrote it;
# only the scrape residue that had leaked into the literals is removed.
__version__ = "1.1"
__license__ = """Released under the same terms as Perl.
See: http://dev.perl.org/licenses/
"""
__author__ = "Harry Fuecks <hfuecks@gmail.com>"
__contributors__ = [
    "Peter Hickman <peterhi@ntlworld.com>",
    "Loic Dachary <loic@dachary.org>"
    ]
97
    
98
import re
99
100
class ApacheLogParserError(Exception):
    """Error type of this module.

    Raised when the regular expression generated from a log format cannot
    be compiled, and when a log line does not match that expression.
    """
102
103
class parser:
    """Turn an Apache LogFormat string into a line parser.

    The format is converted once, at construction time, into a single
    anchored regular expression with one capture group per format
    element. parse() then maps each captured value to the (possibly
    aliased) name of the element that produced it.
    """

    def __init__(self, format):
        """
        Takes the log format from an Apache configuration file.

        Best just copy and paste directly from the .conf file
        and pass using a Python raw string e.g.

        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
        p = apachelog.parser(format)
        """
        self._names = []
        self._regex = None
        self._pattern = ''
        self._parse_format(format)

    def _parse_format(self, format):
        """
        Converts the input format to a regular expression, as well as
        extracting the field names.

        Populates self._names, self._pattern and self._regex.

        Raises ApacheLogParserError if it couldn't compile the
        generated regex.
        """
        # Normalise whitespace so that a single space separates elements.
        format = format.strip()
        format = re.sub('[ \t]+', ' ', format)

        # An element written as \"...\" in the format is a quoted field.
        leading_quote = re.compile(r'^\\"')
        trailing_quote = re.compile(r'\\"$')
        referer_or_agent = re.compile('Referer|User-Agent', re.I)
        # Any element that starts with % and ends with t (e.g. %t).
        time_field = re.compile('^%.*t$')

        subpatterns = []
        self._names = []

        for element in format.split(' '):
            hasquotes = bool(leading_quote.search(element))
            if hasquotes:
                element = leading_quote.sub('', element)
                element = trailing_quote.sub('', element)

            # alias() is consulted here, once per element, not per line.
            self._names.append(self.alias(element))

            if hasquotes:
                if element == '%r' or referer_or_agent.search(element):
                    # These values may themselves contain backslash-escaped
                    # characters (including \"), so skip over escapes.
                    subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
                else:
                    subpattern = r'\"([^\"]*)\"'
            elif time_field.search(element):
                # Captured with its surrounding square brackets.
                subpattern = r'(\[[^\]]+\])'
            elif element in ('%U', '%u'):
                # Usernames can contain spaces, so they can't be matched
                # with \S; take the shortest run that lets the rest match.
                subpattern = '(.+?)'
            else:
                subpattern = r'(\S*)'

            subpatterns.append(subpattern)

        self._pattern = '^' + ' '.join(subpatterns) + '$'
        try:
            self._regex = re.compile(self._pattern)
        except Exception as e:
            raise ApacheLogParserError(e)

    def parse(self, line):
        """
        Parses a single line from the log file and returns
        a dictionary of its contents, keyed by field name.

        Leading and trailing whitespace on the line is ignored.

        Raises ApacheLogParserError if it couldn't parse the line.
        """
        line = line.strip()
        match = self._regex.match(line)

        if not match:
            raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )

        return dict(zip(self._names, match.groups()))

    def alias(self, name):
        """
        Override / replace this method if you want to map format
        field names to something else. This method is called
        when the parser is constructed, not when actually parsing
        a log file

        Takes and returns a string fieldname
        """
        return name

    def pattern(self):
        """
        Returns the compound regular expression the parser extracted
        from the input format (a string)
        """
        return self._pattern

    def names(self):
        """
        Returns the field names the parser extracted from the
        input format (a list)
        """
        return self._names
215
216
# Three-letter month abbreviation, as it appears in a log timestamp,
# mapped to its two-digit month number.
months = {
    'Jan':'01',
    'Feb':'02',
    'Mar':'03',
    'Apr':'04',
    'May':'05',
    'Jun':'06',
    'Jul':'07',
    'Aug':'08',
    'Sep':'09',
    'Oct':'10',
    'Nov':'11',
    'Dec':'12'
    }


def parse_date(date):
    """
    Takes a date in the format: [05/Dec/2006:10:51:44 +0000]
    (including square brackets) and returns a two element
    tuple containing first a timestamp of the form
    YYYYMMDDHHMMSS e.g. 20061205105144 and second the
    timezone offset as is e.g.;

    parse_date('[05/Dec/2006:10:51:44 +0000]')
    >> ('20061205105144', '+0000')

    The fields are taken by fixed character position, so the input
    must follow the layout above exactly. A month abbreviation that
    is not a key of the months table raises KeyError.

    It does not attempt to adjust the timestamp according
    to the timezone - this is your problem.
    """
    # Drop the enclosing square brackets.
    date = date[1:-1]
    elems = [
        date[7:11],          # year
        months[date[3:6]],   # month name -> month number
        date[0:2],           # day
        date[12:14],         # hour
        date[15:17],         # minute
        date[18:20],         # second
        ]
    return (''.join(elems), date[21:])
255
256
257
"""
258
Frequently used log formats stored here
259
"""
260
# Ready-made LogFormat strings, written as raw strings exactly as they
# would appear in an Apache configuration file (quotes escaped as \").
formats = {
    # Common Log Format (CLF)
    'common':r'%h %l %u %t \"%r\" %>s %b',

    # Common Log Format with Virtual Host
    'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',

    # NCSA extended/combined log format
    'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
    }
270
271
if __name__ == '__main__':
    # Self-test: running this module as a script executes the unit tests.
    import unittest

    class TestApacheLogParser(unittest.TestCase):
        """Checks the parser against the NCSA extended/combined format."""

        def setUp(self):
            self.format = formats['extended']
            self.fields = [
                '%h', '%l', '%u', '%t', '%r', '%>s', '%b',
                '%{Referer}i', '%{User-Agent}i',
                ]
            # Sub-pattern generated for %r, Referer and User-Agent.
            quoted = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
            # %u is matched with (.+?) rather than (\S*) because
            # usernames can contain spaces.
            self.pattern = (
                r'^(\S*) (\S*) (.+?) (\[[^\]]+\]) ' + quoted +
                r' (\S*) (\S*) ' + quoted + ' ' + quoted + '$')
            # A plain request.
            self.line1 = (
                r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '
                r'"GET /images/previous.png HTTP/1.1" 200 2607 '
                r'"http://peterhi.dyndns.org/bandwidth/index.html" '
                r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '
                r'Gecko/20021202"')
            # An escaped quote inside the request field.
            self.line2 = (
                r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '
                r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '
                r'"http://peterhi.dyndns.org/bandwidth/index.html" '
                r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '
                r'Gecko/20021202"')
            # Escaped quotes inside the referer field.
            self.line3 = (
                r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '
                r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'
                r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='
                r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'
                r'bin/search?p=\"grady%20white%20306%20bimini\"" '
                r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '
                r'YPC 3.0.3; yplus 4.0.00d)"')
            self.p = parser(self.format)

        def assertfields(self, data, expected, label):
            """Compare each expected field, naming the field on failure."""
            for field, value in expected.items():
                self.assertEqual(
                    data[field], value, msg='%s %s' % (label, field))

        def testpattern(self):
            self.assertEqual(self.pattern, self.p.pattern())

        def testnames(self):
            self.assertEqual(self.fields, self.p.names())

        def testline1(self):
            data = self.p.parse(self.line1)
            self.assertfields(data, {
                '%h': '212.74.15.68',
                '%l': '-',
                '%u': '-',
                '%t': '[23/Jan/2004:11:36:20 +0000]',
                '%r': 'GET /images/previous.png HTTP/1.1',
                '%>s': '200',
                '%b': '2607',
                '%{Referer}i':
                    'http://peterhi.dyndns.org/bandwidth/index.html',
                '%{User-Agent}i':
                    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '
                    'Gecko/20021202',
                }, 'Line 1')

        def testline2(self):
            data = self.p.parse(self.line2)
            self.assertfields(data, {
                '%h': '212.74.15.68',
                '%l': '-',
                '%u': '-',
                '%t': '[23/Jan/2004:11:36:20 +0000]',
                '%r': r'GET /images/previous.png=\" HTTP/1.1',
                '%>s': '200',
                '%b': '2607',
                '%{Referer}i':
                    'http://peterhi.dyndns.org/bandwidth/index.html',
                '%{User-Agent}i':
                    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '
                    'Gecko/20021202',
                }, 'Line 2')

        def testline3(self):
            data = self.p.parse(self.line3)
            self.assertfields(data, {
                '%h': '4.224.234.46',
                '%l': '-',
                '%u': '-',
                '%t': '[20/Jul/2004:13:18:55 -0700]',
                '%r':
                    r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&'
                    r'checked_boats=1176818&slim=broker&&hosturl='
                    r'giffordmarine&&ywo=giffordmarine& HTTP/1.1',
                '%>s': '200',
                '%b': '2888',
                '%{Referer}i':
                    r'http://search.yahoo.com/bin/search?p=\"grady%20white'
                    r'%20306%20bimini\"',
                '%{User-Agent}i':
                    'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '
                    'YPC 3.0.3; yplus 4.0.00d)',
                }, 'Line 3')

        def testjunkline(self):
            self.assertRaises(ApacheLogParserError, self.p.parse, 'foobar')

        def testhasquotesaltn(self):
            p = parser(r'%a \"%b\" %c')
            line = r'foo "xyz" bar'
            data = p.parse(line)
            self.assertEqual(data['%a'], 'foo', '%a')
            self.assertEqual(data['%b'], 'xyz', '%b')
            self.assertEqual(data['%c'], 'bar', '%c')

        def testusernamewithspaces(self):
            p = parser(formats['common'])
            data = p.parse(
                '212.74.15.68 - john doe [23/Jan/2004:11:36:20 +0000] '
                '"GET / HTTP/1.1" 200 2607')
            self.assertEqual(data['%l'], '-', '%l')
            self.assertEqual(data['%u'], 'john doe', '%u')
            self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', '%t')

        def testparsedate(self):
            date = '[05/Dec/2006:10:51:44 +0000]'
            self.assertEqual(('20061205105144', '+0000'), parse_date(date))

    unittest.main()