7637.2.1
by Guilherme Salgado
Parsing functions to extract the info we need from apache logs. |
1 |
#!/usr/bin/env python
|
2 |
"""Apache Log Parser
|
|
3 |
||
4 |
Parser for Apache log files. This is a port to python of Peter Hickman's
|
|
5 |
Apache::LogEntry Perl module:
|
|
6 |
<http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
|
|
7 |
||
8 |
Takes the Apache logging format defined in your httpd.conf and generates
|
|
9 |
a regular expression which is used to match a line from the log file and
|
|
10 |
return it as a dictionary with keys corresponding to the fields defined
|
|
11 |
in the log format.
|
|
12 |
||
13 |
Example:
|
|
14 |
||
15 |
import apachelog, sys
|
|
16 |
||
17 |
# Format copied and pasted from Apache conf - use raw string + single quotes
|
|
18 |
format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
|
|
19 |
|
|
20 |
p = apachelog.parser(format)
|
|
21 |
||
22 |
for line in open('/var/apache/access.log'):
|
|
23 |
try:
|
|
24 |
data = p.parse(line)
|
|
25 |
except:
|
|
26 |
sys.stderr.write("Unable to parse %s" % line)
|
|
27 |
||
28 |
The return dictionary from the parse method depends on the input format.
|
|
29 |
For the above example, the returned dictionary would look like;
|
|
30 |
||
31 |
{
|
|
32 |
'%>s': '200',
|
|
33 |
'%b': '2607',
|
|
34 |
'%h': '212.74.15.68',
|
|
35 |
'%l': '-',
|
|
36 |
'%r': 'GET /images/previous.png HTTP/1.1',
|
|
37 |
'%t': '[23/Jan/2004:11:36:20 +0000]',
|
|
38 |
'%u': '-',
|
|
39 |
'%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
|
|
40 |
'%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
|
|
41 |
}
|
|
42 |
||
43 |
...given an access log entry like (split across lines for formatting);
|
|
44 |
||
45 |
212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
|
|
46 |
200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
|
|
47 |
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
|
|
48 |
||
49 |
You can also re-map the field names by subclassing (or re-pointing) the
|
|
50 |
alias method.
|
|
51 |
||
52 |
Generally you should be able to copy and paste the format string from
|
|
53 |
your Apache configuration, but remember to place it in a raw string
|
|
54 |
using single-quotes, so that backslashes are handled correctly.
|
|
55 |
||
56 |
This module provides three of the most common log formats in the
|
|
57 |
formats dictionary;
|
|
58 |
||
59 |
# Common Log Format (CLF)
|
|
60 |
p = apachelog.parser(apachelog.formats['common'])
|
|
61 |
||
62 |
# Common Log Format with Virtual Host
|
|
63 |
p = apachelog.parser(apachelog.formats['vhcommon'])
|
|
64 |
||
65 |
# NCSA extended/combined log format
|
|
66 |
p = apachelog.parser(apachelog.formats['extended'])
|
|
67 |
||
68 |
For notes regarding performance while reading lines from a file
|
|
69 |
in Python, see <http://effbot.org/zone/readline-performance.htm>.
|
|
70 |
Further performance boost can be gained by using psyco
|
|
71 |
<http://psyco.sourceforge.net/>
|
|
72 |
||
73 |
On my system, using a loop like;
|
|
74 |
||
75 |
for line in open('access.log'):
|
|
76 |
p.parse(line)
|
|
77 |
||
78 |
...was able to parse ~60,000 lines / second. Adding psyco to the mix,
|
|
79 |
upped that to ~75,000 lines / second.
|
|
80 |
||
81 |
The parse_date function is intended as a fast way to convert a log
|
|
82 |
date into something useful, without incurring a significant date
|
|
83 |
parsing overhead - good enough for basic stuff but will be a problem
|
|
84 |
if you need to deal with logs from multiple servers in different
|
|
85 |
timezones.
|
|
86 |
"""
|
|
87 |
||
88 |
__version__ = "1.1" |
|
89 |
__license__ = """Released under the same terms as Perl. |
|
90 |
See: http://dev.perl.org/licenses/
|
|
91 |
"""
|
|
92 |
__author__ = "Harry Fuecks <hfuecks@gmail.com>" |
|
93 |
__contributors__ = [ |
|
94 |
"Peter Hickman <peterhi@ntlworld.com>", |
|
95 |
"Loic Dachary <loic@dachary.org>"
|
|
96 |
]
|
|
97 |
||
98 |
import re |
|
99 |
||
100 |
class ApacheLogParserError(Exception):
    """Error raised by the parser class in this module.

    Raised when the regular expression generated from a log format cannot
    be compiled, and when a log line does not match that expression.
    """
|
|
102 |
||
103 |
class parser:
    """Parse Apache log lines according to an Apache log format string.

    The format string is split on spaces. Each element becomes a field
    name (after being passed through alias()) and contributes one capturing
    sub-pattern; the sub-patterns are joined with single spaces into one
    anchored regular expression that parse() applies to each log line.
    """

    def __init__(self, format):
        """
        Takes the log format from an Apache configuration file.

        Best just copy and paste directly from the .conf file
        and pass using a Python raw string e.g.

        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
        p = apachelog.parser(format)

        Raises ApacheLogParserError if the regular expression generated
        from the format cannot be compiled.
        """
        self._names = []
        self._regex = None
        self._pattern = ''
        self._parse_format(format)

    def _parse_format(self, format):
        """
        Converts the input format to a regular expression, as well as
        extracting the field names.

        Stores the results in self._names, self._pattern and self._regex.

        Raises ApacheLogParserError if it couldn't compile the generated
        regex.
        """
        # Normalise whitespace so that splitting on a single space yields
        # exactly one element per field.
        format = re.sub('[ \t]+', ' ', format.strip())

        # In the Apache configuration a quoted field is written \"...\";
        # these match the escaped quote at either end of an element.
        leading_quote = re.compile(r'^\\"')
        trailing_quote = re.compile(r'\\"$')
        escapable_field = re.compile('Referer|User-Agent', re.I)
        bracketed_field = re.compile('^%.*t$')

        self._names = []
        subpatterns = []

        for element in format.split(' '):
            hasquotes = leading_quote.search(element) is not None
            if hasquotes:
                element = leading_quote.sub('', element)
                element = trailing_quote.sub('', element)

            self._names.append(self.alias(element))

            if hasquotes:
                if element == '%r' or escapable_field.search(element):
                    # Quoted value that may itself contain
                    # backslash-escaped characters (e.g. \").
                    subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
                else:
                    # Quoted value: anything up to the next double quote.
                    subpattern = r'\"([^\"]*)\"'
            elif bracketed_field.search(element):
                # Elements starting with % and ending with t (such as %t)
                # are captured together with their square brackets.
                subpattern = r'(\[[^\]]+\])'
            elif element in ('%U', '%u'):
                # These values may contain spaces, so match lazily up to
                # whatever the next sub-pattern requires.
                subpattern = '(.+?)'
            else:
                # Default: a run of non-whitespace characters.
                subpattern = r'(\S*)'

            subpatterns.append(subpattern)

        self._pattern = '^' + ' '.join(subpatterns) + '$'
        try:
            self._regex = re.compile(self._pattern)
        except Exception as e:
            raise ApacheLogParserError(e)

    def parse(self, line):
        """
        Parses a single line from the log file and returns
        a dictionary of its contents, keyed by field name.

        Leading and trailing whitespace is stripped from the line first.

        Raises an ApacheLogParserError if it couldn't parse the line.
        """
        line = line.strip()
        match = self._regex.match(line)

        if match is None:
            raise ApacheLogParserError(
                "Unable to parse: %s with the %s regular expression"
                % (line, self._pattern))

        return dict(zip(self._names, match.groups()))

    def alias(self, name):
        """
        Override / replace this method if you want to map format
        field names to something else. This method is called
        when the parser is constructed, not when actually parsing
        a log file.

        Takes and returns a string fieldname.
        """
        return name

    def pattern(self):
        """
        Returns the compound regular expression the parser extracted
        from the input format (a string).
        """
        return self._pattern

    def names(self):
        """
        Returns the field names the parser extracted from the
        input format (a list).
        """
        return self._names
|
215 |
||
216 |
# Maps the three-letter month abbreviation found in a log date to its
# zero-padded two-digit month number.
months = dict(
    (abbreviation, '%02d' % (index + 1))
    for index, abbreviation in enumerate(
        'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split())
)


def parse_date(date):
    """
    Convert a log date such as [05/Dec/2006:10:51:44 +0000] (square
    brackets included) into a two element tuple.

    The first element is a timestamp string of the form YYYYMMDDHH24IISS,
    the second is whatever follows the time (the timezone offset),
    returned as is e.g.;

    parse_date('[05/Dec/2006:10:51:44 +0000]')
    >> ('20061205105144', '+0000')

    The fields are read from fixed character positions and the timestamp
    is not adjusted for the timezone - that is left to the caller.
    """
    # Drop the surrounding square brackets.
    inner = date[1:-1]

    day = inner[0:2]
    month = months[inner[3:6]]
    year = inner[7:11]
    hour = inner[12:14]
    minute = inner[15:17]
    second = inner[18:20]

    timestamp = year + month + day + hour + minute + second
    return (timestamp, inner[21:])
|
255 |
||
256 |
||
257 |
"""
|
|
258 |
Frequently used log formats stored here
|
|
259 |
"""
|
|
260 |
# Ready-made Apache log format strings, keyed by a short name, suitable
# for passing straight to parser().
formats = dict(
    # Common Log Format (CLF)
    common=r'%h %l %u %t \"%r\" %>s %b',

    # Common Log Format with Virtual Host
    vhcommon=r'%v %h %l %u %t \"%r\" %>s %b',

    # NCSA extended/combined log format
    extended=r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
)
|
270 |
||
271 |
if __name__ == '__main__':
    import unittest

    class TestApacheLogParser(unittest.TestCase):
        """Self-tests for this module, run when it is executed as a script."""

        def setUp(self):
            self.format = formats['extended']
            self.fields = ('%h %l %u %t %r %>s %b %{Referer}i '
                           '%{User-Agent}i').split(' ')
            # The third group is (.+?) rather than (\S*) because the
            # parser matches %u lazily.
            self.pattern = ('^(\\S*) (\\S*) (.+?) (\\[[^\\]]+\\]) '
                            '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '
                            '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '
                            '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$')
            # A plain request line.
            self.line1 = (
                r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '
                r'"GET /images/previous.png HTTP/1.1" 200 2607 '
                r'"http://peterhi.dyndns.org/bandwidth/index.html" '
                r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '
                r'Gecko/20021202"')
            # The request contains a backslash-escaped double quote.
            self.line2 = (
                r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '
                r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '
                r'"http://peterhi.dyndns.org/bandwidth/index.html" '
                r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '
                r'Gecko/20021202"')
            # The referer contains backslash-escaped double quotes.
            self.line3 = (
                r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '
                r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'
                r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='
                r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'
                r'bin/search?p=\"grady%20white%20306%20bimini\"" '
                r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '
                r'YPC 3.0.3; yplus 4.0.00d)"')
            self.p = parser(self.format)

        def testpattern(self):
            self.assertEqual(self.pattern, self.p.pattern())

        def testnames(self):
            self.assertEqual(self.fields, self.p.names())

        def testline1(self):
            data = self.p.parse(self.line1)
            self.assertEqual(data['%h'], '212.74.15.68', msg='Line 1 %h')
            self.assertEqual(data['%l'], '-', msg='Line 1 %l')
            self.assertEqual(data['%u'], '-', msg='Line 1 %u')
            self.assertEqual(
                data['%t'],
                '[23/Jan/2004:11:36:20 +0000]',
                msg='Line 1 %t'
                )
            self.assertEqual(
                data['%r'],
                'GET /images/previous.png HTTP/1.1',
                msg='Line 1 %r'
                )
            self.assertEqual(data['%>s'], '200', msg='Line 1 %>s')
            self.assertEqual(data['%b'], '2607', msg='Line 1 %b')
            self.assertEqual(
                data['%{Referer}i'],
                'http://peterhi.dyndns.org/bandwidth/index.html',
                msg='Line 1 %{Referer}i'
                )
            self.assertEqual(
                data['%{User-Agent}i'],
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
                msg='Line 1 %{User-Agent}i'
                )

        def testline2(self):
            data = self.p.parse(self.line2)
            self.assertEqual(data['%h'], '212.74.15.68', msg='Line 2 %h')
            self.assertEqual(data['%l'], '-', msg='Line 2 %l')
            self.assertEqual(data['%u'], '-', msg='Line 2 %u')
            self.assertEqual(
                data['%t'],
                '[23/Jan/2004:11:36:20 +0000]',
                msg='Line 2 %t'
                )
            self.assertEqual(
                data['%r'],
                r'GET /images/previous.png=\" HTTP/1.1',
                msg='Line 2 %r'
                )
            self.assertEqual(data['%>s'], '200', msg='Line 2 %>s')
            self.assertEqual(data['%b'], '2607', msg='Line 2 %b')
            self.assertEqual(
                data['%{Referer}i'],
                'http://peterhi.dyndns.org/bandwidth/index.html',
                msg='Line 2 %{Referer}i'
                )
            self.assertEqual(
                data['%{User-Agent}i'],
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
                msg='Line 2 %{User-Agent}i'
                )

        def testline3(self):
            data = self.p.parse(self.line3)
            self.assertEqual(data['%h'], '4.224.234.46', msg='Line 3 %h')
            self.assertEqual(data['%l'], '-', msg='Line 3 %l')
            self.assertEqual(data['%u'], '-', msg='Line 3 %u')
            self.assertEqual(
                data['%t'],
                '[20/Jul/2004:13:18:55 -0700]',
                msg='Line 3 %t'
                )
            self.assertEqual(
                data['%r'],
                r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='
                r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '
                r'HTTP/1.1',
                msg='Line 3 %r'
                )
            self.assertEqual(data['%>s'], '200', msg='Line 3 %>s')
            self.assertEqual(data['%b'], '2888', msg='Line 3 %b')
            self.assertEqual(
                data['%{Referer}i'],
                r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'
                r'%20bimini\"',
                msg='Line 3 %{Referer}i'
                )
            self.assertEqual(
                data['%{User-Agent}i'],
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '
                'yplus 4.0.00d)',
                msg='Line 3 %{User-Agent}i'
                )

        def testjunkline(self):
            self.assertRaises(ApacheLogParserError, self.p.parse, 'foobar')

        def testhasquotesaltn(self):
            p = parser(r'%a \"%b\" %c')
            line = r'foo "xyz" bar'
            data = p.parse(line)
            self.assertEqual(data['%a'], 'foo', '%a')
            self.assertEqual(data['%b'], 'xyz', '%b')
            self.assertEqual(data['%c'], 'bar', '%c')

        def testusernamewithspaces(self):
            # %u is matched lazily so that a user name containing spaces
            # is kept as a single field.
            p = parser(formats['common'])
            line = (r'1.2.3.4 - John Doe [23/Jan/2004:11:36:20 +0000] '
                    r'"GET / HTTP/1.1" 200 5')
            data = p.parse(line)
            self.assertEqual(data['%u'], 'John Doe', msg='%u with spaces')
            self.assertEqual(
                data['%t'],
                '[23/Jan/2004:11:36:20 +0000]',
                msg='%t after %u with spaces'
                )

        def testparsedate(self):
            date = '[05/Dec/2006:10:51:44 +0000]'
            self.assertEqual(('20061205105144', '+0000'), parse_date(date))

    unittest.main()