~loggerhead-team/loggerhead/trunk-rich

« back to all changes in this revision

Viewing changes to loggerhead/textindex.py

Committer: Launchpad Patch Queue Manager
Date: 2008-06-25 01:19:01 UTC
mfrom: (164 trunk)
mto: (157.1.4 loggerhead) (422.1.1 merge_pqm_updates)
mto: This revision was merged to the branch mainline in revision 423.
Revision ID: launchpad@pqm.canonical.com-20080625011901-0o0zq7ad282aih5b

[rs=mwhudson] many many improvements to loggerhead -- faster
templating, leaner HTTP server, less caching, cleaner urls

files added:
loggerhead/apps

loggerhead/apps/__init__.py

loggerhead/apps/branch.py

loggerhead/apps/config.py

loggerhead/apps/filesystem.py

loggerhead/static/css/zptstyle.css

loggerhead/templatefunctions.py

loggerhead/templates/annotate.pt

loggerhead/templates/atom.pt

loggerhead/templates/browse.pt

loggerhead/templates/changelog.pt

loggerhead/templates/collapse-all-button.pt

loggerhead/templates/collapse-button.pt

loggerhead/templates/inventory.pt

loggerhead/templates/macros.pt

loggerhead/templates/modified-file-link-log.pt

loggerhead/templates/modified-file-link-rev.pt

loggerhead/templates/revision.pt

loggerhead/templates/revisionfilechanges.pt

loggerhead/templates/revisioninfo.pt

loggerhead/tests/simple.pt

loggerhead/tests/test_templating.py

loggerhead/zptsupport.py

serve-branches.py

files removed:
dev.cfg

loggerhead/config

loggerhead/config/__init__.py

loggerhead/config/app.cfg

loggerhead/config/log.cfg

loggerhead/sqlobject-history

loggerhead/static/css/style.css

loggerhead/templates/annotate.kid

loggerhead/templates/atom.kid

loggerhead/templates/browse.kid

loggerhead/templates/changelog.kid

loggerhead/templates/inventory.kid

loggerhead/templates/master.kid

loggerhead/templates/revision.kid

loggerhead/textindex.py

files modified:
README.txt

loggerhead.conf.example

loggerhead/branchview.py

loggerhead/changecache.py

loggerhead/controllers/__init__.py

loggerhead/controllers/annotate_ui.py

loggerhead/controllers/atom_ui.py

loggerhead/controllers/bundle_ui.py

loggerhead/controllers/changelog_ui.py

loggerhead/controllers/download_ui.py

loggerhead/controllers/inventory_ui.py

loggerhead/controllers/revision_ui.py

loggerhead/history.py

loggerhead/static/javascript/collapse.js

loggerhead/tests/test_corners.py

loggerhead/tests/test_simple.py

loggerhead/util.py

start-loggerhead.py

stop-loggerhead.py

Show diffs side-by-side

added added

removed removed

loggerhead/textindex.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""

indexing of the comment text of revisions, for fast searching.

two separate database files are created:

- recorded: revid -> 1 (if the revid is indexed)

- index: 3-letter substring -> list(revids)

"""

import os

import re

import time

from loggerhead import util

from loggerhead.lockfile import LockFile

from loggerhead.changecache import FakeShelf

# if the revid set stored for any 3-letter substring grows beyond this many
# revids, the entry is replaced with the ALL marker -- a substring that
# common is not worth an explicit index.
ALL_THRESHOLD = 1000

# sentinel stored in the index in place of a revid set once that set has
# grown past ALL_THRESHOLD.
ALL = 'ALL'

# decorator applied to the public TextIndex methods.
# NOTE(review): presumably runs the wrapped method while holding the
# instance attribute named '_lock' -- confirm against loggerhead.util.
with_lock = util.with_lock('_lock')

def normalize_string(s):
    """
    reduce C{s} to lowercase words separated by single spaces.

    the input is passed through C{util.to_utf8} and lowercased; apostrophes
    are deleted outright, every other character that is not a word character
    or digit becomes a space, runs of whitespace collapse to one space, and
    leading/trailing whitespace is stripped.
    """
    text = util.to_utf8(s).lower()
    # applied strictly in this order; each step relies on the previous one.
    substitutions = (
        (r"'", ''),          # apostrophes vanish completely
        (r'[^\w\d]', ' '),   # any other garbage turns into a space
        (r'\s{2,}', ' '),    # squeeze multiple spaces into one
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

class TextIndex (object):
    """
    3-letter-substring index over revision comments, for fast searching.

    two FakeShelf stores inside C{cache_path} back the index:

      - 'textindex-recorded.sql': revid -> True for every processed revision
      - 'textindex.sql': 3-letter substring -> set of revids, or the ALL
        marker once a set has grown past ALL_THRESHOLD

    each store is opened and closed per operation.  public methods are
    wrapped in C{with_lock}; the underscore-prefixed helpers are not, and are
    meant to be called from a method that already is.
    """

    def __init__(self, history, cache_path):
        """
        @param history: object providing C{log}, C{last_revid},
            C{get_revision_history()} and C{get_changes()}.
        @param cache_path: directory holding the index files; created if it
            does not exist yet.
        """
        self.history = history
        self.log = history.log

        if not os.path.exists(cache_path):
            os.mkdir(cache_path)

        self._recorded_filename = os.path.join(cache_path, 'textindex-recorded.sql')
        self._index_filename = os.path.join(cache_path, 'textindex.sql')

        # use a lockfile since the cache folder could be shared across different processes.
        self._lock = LockFile(os.path.join(cache_path, 'index-lock'))
        self._closed = False

        self.log.info('Using search index; %d entries.', len(self))

    def _index(self):
        """
        open the substring -> revids store.  the caller must close it.
        """
        return FakeShelf(self._index_filename)

    def _recorded(self):
        """
        open the revid -> True store.  the caller must close it.
        """
        return FakeShelf(self._recorded_filename)

    def _is_indexed(self, revid, recorded):
        """
        return True if C{revid} has an entry in the open C{recorded} store.
        """
        return recorded.get(util.to_utf8(revid)) is not None

    @with_lock
    def is_indexed(self, revid):
        """
        return True if C{revid} has already been processed by the indexer.
        """
        recorded = self._recorded()
        try:
            return self._is_indexed(revid, recorded)
        finally:
            recorded.close()

    @with_lock
    def __len__(self):
        """
        return the number of revisions recorded as indexed.
        """
        recorded = self._recorded()
        try:
            return recorded.count()
        finally:
            recorded.close()

    @with_lock
    def close(self):
        """
        mark the index as closed; C{check_rebuild} stops once it sees this.
        """
        self._closed = True

    @with_lock
    def closed(self):
        """
        return True once C{close} has been called.
        """
        return self._closed

    @with_lock
    def flush(self):
        """
        no-op; there is nothing buffered to flush.
        """
        pass

    @with_lock
    def full(self):
        """
        return True if the index appears to cover the whole history: at least
        as many revisions are recorded as the history contains, and the
        history's last revision is among them.
        """
        # computed before the store is opened, so a failure here cannot leak
        # an open store.
        last_revid = util.to_utf8(self.history.last_revid)
        recorded = self._recorded()
        try:
            return (recorded.count() >= len(self.history.get_revision_history())
                    and recorded.get(last_revid) is not None)
        finally:
            recorded.close()

    def _index_change(self, change, recorded, index):
        """
        currently, only indexes the 'comment' field.

        every 3-letter substring of the normalized comment gets
        C{change.revid} added to its revid set.  nothing is committed here;
        the caller commits when it closes the stores.
        """
        comment = normalize_string(change.comment)
        # a comment shorter than 3 characters has no substrings to index, so
        # the loop below simply does not run for it.  the revision is still
        # recorded at the end: otherwise it would look un-indexed forever and
        # full() could never become true.
        for i in xrange(len(comment) - 2):
            sub = comment[i:i + 3]
            orig = revid_set = index.get(sub)
            if revid_set is None:
                revid_set = set()
            elif revid_set == ALL:
                # this entry got too big
                continue
            revid_set.add(change.revid)
            if len(revid_set) > ALL_THRESHOLD:
                revid_set = ALL
            if orig is not None:
                index.update([(sub, revid_set)], commit=False)
            else:
                index.add([(sub, revid_set)], commit=False)

        recorded.add([(util.to_utf8(change.revid), True)], commit=False)

    @with_lock
    def index_changes(self, revid_list):
        """
        index every revision in C{revid_list} that is not yet recorded.

        both stores are committed and closed on the way out, even when
        indexing raised part-way through.
        """
        recorded = self._recorded()
        try:
            index = self._index()
        except Exception:
            # don't leave the first store open if the second fails to open.
            recorded.close()
            raise
        try:
            revid_list = [r for r in revid_list if not self._is_indexed(r, recorded)]
            change_list = self.history.get_changes(revid_list)
            for change in change_list:
                self._index_change(change, recorded, index)
        finally:
            # nested so that a failure closing the index cannot skip closing
            # the recorded store.
            try:
                index.close(commit=True)
            finally:
                recorded.close(commit=True)

    @with_lock
    def find(self, text, revid_list=None):
        """
        return the revids whose comment may contain C{text}.

        @param text: search text; normalized the same way as comments.
        @param revid_list: optional iterable of revids to restrict the
            search to.
        @return: an empty list when there can be no match (text shorter than
            3 characters after normalizing, an unknown substring, or an empty
            intersection); otherwise the set of candidate revids.  this is
            None when C{revid_list} is None and every substring of the text
            is marked ALL.
        """
        index = self._index()
        try:
            text = normalize_string(text)
            if len(text) < 3:
                return []

            total_set = None
            if revid_list is not None:
                total_set = set(revid_list)
            seen_all = False

            for i in xrange(len(text) - 2):
                sub = text[i:i + 3]
                revid_set = index.get(sub)
                if revid_set is None:
                    # zero matches, stop here.
                    return []
                if revid_set == ALL:
                    # skip
                    seen_all = True
                    continue
                if total_set is None:
                    total_set = revid_set
                else:
                    total_set.intersection_update(revid_set)
                if len(total_set) == 0:
                    return []
        finally:
            index.close()

        # tricky: if seen_all is True, one of the substring indices was ALL
        # (in other words, unindexed), so our results are actually a superset
        # of the exact answer.

        # if we cared, we could do a direct match on the result set and cull
        # out any that aren't actually matches.  for now, i'm gonna say that
        # we DON'T care, and if one of the substrings hit ALL, there's a small
        # chance that we'll give a few false positives.
        return total_set

    def check_rebuild(self, max_time=3600):
        """
        check if there are any un-indexed revisions, and if so, index them.
        but don't spend longer than C{max_time} on it.

        @param max_time: time budget in seconds.  it is checked after each
            batch of 100 revisions, so a run can overshoot by one batch.
        """
        if self.closed() or self.full():
            # all done
            return

        self.log.info('Building search index...')
        work = list(self.history.get_revision_history())
        start_time = time.time()
        last_update = time.time()
        count = 0

        jump = 100
        for i in xrange(0, len(work), jump):
            r = work[i:i + jump]
            self.index_changes(r)
            if self.closed():
                return

            count += jump
            now = time.time()
            if now - start_time > max_time:
                # there's no point working for hours.  eventually we might even
                # hit the next re-index interval, which would suck mightily.
                self.log.info('Search indexing has exceeded its %d second '
                              'time limit; giving up for now.', max_time)
                return
            if now - last_update > 60:
                self.log.info('Search indexing continues: %d/%d',
                              min(count, len(work)), len(work))
                last_update = time.time()
            # give someone else a chance at the lock
            time.sleep(1)
        self.log.info('Search index completed.')
        self.flush()

Older »