~loggerhead-team/loggerhead/trunk-rich

« back to all changes in this revision

Viewing changes to loggerhead/textindex.py

Committer: Robey Pointer
Date: 2007-05-21 06:19:12 UTC
Revision ID: robey@lag.net-20070521061912-doe5d89zirkvd22r

yes, of course, the first release was in december *2006* :)

files added:
dev.cfg

homepage

homepage/download

homepage/download/loggerhead-1.0-0.13.tar.gz

homepage/download/loggerhead-1.1-0.13.tar.gz

homepage/download/loggerhead-1.1.1-0.13.tar.gz

homepage/index.html

homepage/loggerhead-background.png

homepage/loggerhead-logo-medium.png

homepage/loggerhead.css

homepage/sphere.png

loggerhead/branchview.py

loggerhead/config

loggerhead/config/__init__.py

loggerhead/config/app.cfg

loggerhead/config/log.cfg

loggerhead/controllers/bundle_ui.py

loggerhead/json.py

loggerhead/model.py

loggerhead/sqlobject-history

loggerhead/static/css/style.css

loggerhead/templates/annotate.kid

loggerhead/templates/atom.kid

loggerhead/templates/browse.kid

loggerhead/templates/changelog.kid

loggerhead/templates/inventory.kid

loggerhead/templates/master.kid

loggerhead/templates/revision.kid

loggerhead/tests/test_controllers.py

loggerhead/tests/test_model.py

loggerhead/textindex.py

push-website

files removed:
loggerhead/apps

loggerhead/apps/__init__.py

loggerhead/apps/branch.py

loggerhead/apps/config.py

loggerhead/apps/filesystem.py

loggerhead/static/css/zptstyle.css

loggerhead/templatefunctions.py

loggerhead/templates/annotate.pt

loggerhead/templates/atom.pt

loggerhead/templates/browse.pt

loggerhead/templates/changelog.pt

loggerhead/templates/collapse-all-button.pt

loggerhead/templates/collapse-button.pt

loggerhead/templates/inventory.pt

loggerhead/templates/macros.pt

loggerhead/templates/modified-file-link-log.pt

loggerhead/templates/modified-file-link-rev.pt

loggerhead/templates/revision.pt

loggerhead/templates/revisionfilechanges.pt

loggerhead/templates/revisioninfo.pt

loggerhead/tests/simple.pt

loggerhead/tests/test_corners.py

loggerhead/tests/test_filechangecache.py

loggerhead/tests/test_simple.py

loggerhead/tests/test_templating.py

loggerhead/wholehistory.py

loggerhead/zptsupport.py

serve-branches.py

files modified:
MANIFEST.in

Makefile

NEWS

README.txt

loggerhead.conf.example

loggerhead/TODO

loggerhead/changecache.py

loggerhead/controllers/__init__.py

loggerhead/controllers/annotate_ui.py

loggerhead/controllers/atom_ui.py

loggerhead/controllers/changelog_ui.py

loggerhead/controllers/download_ui.py

loggerhead/controllers/inventory_ui.py

loggerhead/controllers/revision_ui.py

loggerhead/daemon.py

loggerhead/history.py

loggerhead/lockfile.py

loggerhead/lsprof.py

loggerhead/release.py

loggerhead/static/javascript/collapse.js

loggerhead/util.py

setup.py

start-loggerhead.py

stop-loggerhead.py

Show diffs side-by-side

added added

removed removed

loggerhead/textindex.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""

indexing of the comment text of revisions, for fast searching.

two separate 'shelve' files are created:

- recorded: revid -> 1 (if the revid is indexed)

- index: 3-letter substring -> list(revids)

"""

import logging

import os

import re

import shelve

import threading

import time

from loggerhead import util

from loggerhead.util import decorator

from loggerhead.lockfile import LockFile

# if any substring index reaches this many revids, replace the entry with

# an ALL marker -- it's not worth an explicit index.

ALL_THRESHOLD = 1000

ALL = 'ALL'

with_lock = util.with_lock('_lock')

def normalize_string(s):

"""

remove any punctuation and normalize all whitespace to a single space.

"""

s = util.to_utf8(s).lower()

# remove apostrophes completely.

s = re.sub(r"'", '', s)

# convert other garbage into space

s = re.sub(r'[^\w\d]', ' ', s)

# compress multiple spaces into one.

s = re.sub(r'\s{2,}', ' ', s)

# and finally remove leading/trailing whitespace

s = s.strip()

return s

class TextIndex (object):

def __init__(self, history, cache_path):

self.history = history

self.log = history.log

if not os.path.exists(cache_path):

os.mkdir(cache_path)

self._recorded_filename = os.path.join(cache_path, 'textindex-recorded')

self._index_filename = os.path.join(cache_path, 'textindex')

# use a lockfile since the cache folder could be shared across different processes.

self._lock = LockFile(os.path.join(cache_path, 'index-lock'))

self._closed = False

self.log.info('Using search index; %d entries.', len(self))

def _is_indexed(self, revid, recorded):

return recorded.get(util.to_utf8(revid), None) is not None

@with_lock

def is_indexed(self, revid):

recorded = shelve.open(self._recorded_filename, 'c', protocol=2)

try:

return self._is_indexed(revid, recorded)

finally:

recorded.close()

@with_lock

def __len__(self):

recorded = shelve.open(self._recorded_filename, 'c', protocol=2)

try:

return len(recorded)

finally:

recorded.close()

100

@with_lock

101

def close(self):

102

self._closed = True

103

104

@with_lock

105

def closed(self):

106

return self._closed

107

108

@with_lock

109

def flush(self):

110

pass

111

112

@with_lock

113

def full(self):

114

recorded = shelve.open(self._recorded_filename, 'c', protocol=2)

115

try:

116

return (len(recorded) >= len(self.history.get_revision_history())) and (util.to_utf8(self.history.last_revid) in recorded)

117

finally:

118

recorded.close()

119

120

def _index_change(self, change, recorded, index):

121

"""

122

currently, only indexes the 'comment' field.

123

"""

124

comment = normalize_string(change.comment)

125

if len(comment) < 3:

126

return

127

for i in xrange(len(comment) - 2):

128

sub = comment[i:i + 3]

129

revid_set = index.get(sub, None)

130

if revid_set is None:

131

revid_set = set()

132

elif revid_set == ALL:

133

# this entry got too big

134

continue

135

revid_set.add(change.revid)

136

if len(revid_set) > ALL_THRESHOLD:

137

revid_set = ALL

138

index[sub] = revid_set

139

140

recorded[util.to_utf8(change.revid)] = True

141

142

@with_lock

143

def index_changes(self, revid_list):

144

recorded = shelve.open(self._recorded_filename, 'c', protocol=2)

145

index = shelve.open(self._index_filename, 'c', protocol=2)

146

try:

147

revid_list = [r for r in revid_list if not self._is_indexed(r, recorded)]

148

change_list = self.history.get_changes(revid_list)

149

for change in change_list:

150

self._index_change(change, recorded, index)

151

finally:

152

index.close()

153

recorded.close()

154

155

@with_lock

156

def find(self, text, revid_list=None):

157

index = shelve.open(self._index_filename, 'c', protocol=2)

158

try:

159

text = normalize_string(text)

160

if len(text) < 3:

161

return []

162

163

total_set = None

164

if revid_list is not None:

165

total_set = set(revid_list)

166

seen_all = False

167

168

for i in xrange(len(text) - 2):

169

sub = text[i:i + 3]

170

revid_set = index.get(sub, None)

171

if revid_set is None:

172

# zero matches, stop here.

173

return []

174

if revid_set == ALL:

175

# skip

176

seen_all = True

177

continue

178

if total_set is None:

179

total_set = revid_set

180

else:

181

total_set.intersection_update(revid_set)

182

if len(total_set) == 0:

183

return []

184

finally:

185

index.close()

186

187

# tricky: if seen_all is True, one of the substring indices was ALL

188

# (in other words, unindexed), so our results are actually a superset

189

# of the exact answer.

190

191

# if we cared, we could do a direct match on the result set and cull

192

# out any that aren't actually matches. for now, i'm gonna say that

193

# we DON'T care, and if one of the substrings hit ALL, there's a small

194

# chance that we'll give a few false positives.

195

return total_set

196

197

def check_rebuild(self, max_time=3600):

198

"""

199

check if there are any un-indexed revisions, and if so, index them.

200

but don't spend longer than C{max_time} on it.

201

"""

202

if self.closed() or self.full():

203

# all done

204

return

205

206

self.log.info('Building search index...')

207

work = list(self.history.get_revision_history())

208

start_time = time.time()

209

last_update = time.time()

210

count = 0

211

212

jump = 100

213

for i in xrange(0, len(work), jump):

214

r = work[i:i + jump]

215

self.index_changes(r)

216

if self.closed():

217

return

218

219

count += jump

220

now = time.time()

221

if now - start_time > 3600:

222

# there's no point working for hours. eventually we might even

223

# hit the next re-index interval, which would suck mightily.

224

self.log.info('Search indexing has worked for an hour; giving up for now.')

225

return

226

if now - last_update > 60:

227

self.log.info('Search indexing continues: %d/%d' % (min(count, len(work)), len(work)))

228

last_update = time.time()

229

# give someone else a chance at the lock

230

time.sleep(1)

231

self.log.info('Search index completed.')

232

self.flush()

233

Older »