~loggerhead-team/loggerhead/trunk-rich

« back to all changes in this revision

Viewing changes to loggerhead/textindex.py

Committer: Martin Albisetti
Date: 2008-08-06 04:20:07 UTC
Revision ID: argentina@gmail.com-20080806042007-wulv8gkbxyj349qb

Don't filter something we don't filter :)

files added:
COPYING.txt

loggerhead/apps

loggerhead/apps/__init__.py

loggerhead/apps/branch.py

loggerhead/apps/config.py

loggerhead/apps/filesystem.py

loggerhead/controllers/directory_ui.py

loggerhead/controllers/search_ui.py

loggerhead/search.py

loggerhead/static/css/annotate.css

loggerhead/static/css/diff.css

loggerhead/static/css/files.css

loggerhead/static/css/global.css

loggerhead/static/images/bg_Tabs.gif

loggerhead/static/images/bg_infobox.gif

loggerhead/static/images/bg_menuTabs.gif

loggerhead/static/images/bg_search_input.gif

loggerhead/static/images/bg_submenuTabs.gif

loggerhead/static/images/deleteCode.gif

loggerhead/static/images/favicon.ico

loggerhead/static/images/ico_branch.gif

loggerhead/static/images/ico_committer.gif

loggerhead/static/images/ico_description.gif

loggerhead/static/images/ico_diff.gif

loggerhead/static/images/ico_file.gif

loggerhead/static/images/ico_file_download.gif

loggerhead/static/images/ico_file_flecha.gif

loggerhead/static/images/ico_file_modify.gif

loggerhead/static/images/ico_folder.gif

loggerhead/static/images/ico_folder_up.gif

loggerhead/static/images/ico_link.gif

loggerhead/static/images/ico_mergefrom.gif

loggerhead/static/images/ico_mergeto.gif

loggerhead/static/images/ico_planilla.gif

loggerhead/static/images/ico_rss.gif

loggerhead/static/images/ico_time.gif

loggerhead/static/images/newCode.gif

loggerhead/static/images/treeCollapsed.png

loggerhead/static/images/treeDiff.png

loggerhead/static/images/treeExpanded.png

loggerhead/static/javascript/custom.js

loggerhead/static/javascript/mootools-1.2-core.js

loggerhead/static/javascript/mootools-1.2-more.js

loggerhead/templatefunctions.py

loggerhead/templates/annotate.pt

loggerhead/templates/atom.pt

loggerhead/templates/browse.pt

loggerhead/templates/changelog.pt

loggerhead/templates/collapse-all-button.pt

loggerhead/templates/collapse-button.pt

loggerhead/templates/directory.pt

loggerhead/templates/inventory.pt

loggerhead/templates/macros.pt

loggerhead/templates/modified-file-link-rev.pt

loggerhead/templates/revision.pt

loggerhead/templates/revisionfilechanges.pt

loggerhead/templates/revisioninfo.pt

loggerhead/templates/search.pt

loggerhead/tests/simple.pt

loggerhead/tests/test_templating.py

loggerhead/wholehistory.py

loggerhead/zptsupport.py

serve-branches

serve-branches.1

start-loggerhead.1

stop-loggerhead.1

files removed:
.project

dev.cfg

homepage

homepage/download

homepage/download/loggerhead-1.0-0.13.tar.gz

homepage/download/loggerhead-1.1-0.13.tar.gz

homepage/download/loggerhead-1.1.1-0.13.tar.gz

homepage/index.html

homepage/loggerhead-background.png

homepage/loggerhead-logo-medium.png

homepage/loggerhead.css

homepage/sphere.png

loggerhead/branchview.py

loggerhead/config

loggerhead/config/__init__.py

loggerhead/config/app.cfg

loggerhead/config/log.cfg

loggerhead/controllers/bundle_ui.py

loggerhead/history2.py

loggerhead/json.py

loggerhead/model.py

loggerhead/release.py

loggerhead/sqlobject-history

loggerhead/static/css/style.css

loggerhead/static/images/bazaar-banner.png

loggerhead/static/images/favicon.ico

loggerhead/static/images/feed-icon-16x16.gif

loggerhead/static/images/header_inner.png

loggerhead/static/images/info.png

loggerhead/static/images/loggerhead-banner.png

loggerhead/static/images/nav-small-down.gif

loggerhead/static/images/nav-small-in.gif

loggerhead/static/images/nav-small-out.gif

loggerhead/static/images/nav-small-right.gif

loggerhead/static/images/ok.png

loggerhead/static/images/tg_under_the_hood.png

loggerhead/static/images/under_the_hood_blue.png

loggerhead/static/javascript/collapse.js

loggerhead/templates/annotate.kid

loggerhead/templates/atom.kid

loggerhead/templates/browse.kid

loggerhead/templates/changelog.kid

loggerhead/templates/inventory.kid

loggerhead/templates/master.kid

loggerhead/templates/revision.kid

loggerhead/textindex.py

push-website

files renamed:
start-loggerhead.py => start-loggerhead

stop-loggerhead.py => stop-loggerhead

files modified:
.bzrignore

MANIFEST.in

NEWS

README.txt

loggerhead.conf.example

loggerhead/__init__.py

loggerhead/changecache.py

loggerhead/controllers/__init__.py

loggerhead/controllers/annotate_ui.py

loggerhead/controllers/atom_ui.py

loggerhead/controllers/changelog_ui.py

loggerhead/controllers/download_ui.py

loggerhead/controllers/inventory_ui.py

loggerhead/controllers/revision_ui.py

loggerhead/daemon.py

loggerhead/history.py

loggerhead/lockfile.py

loggerhead/lsprof.py

loggerhead/tests/test_corners.py

loggerhead/tests/test_filechangecache.py

loggerhead/tests/test_simple.py

loggerhead/util.py

setup.py

Show diffs side-by-side

added added

removed removed

loggerhead/textindex.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""

indexing of the comment text of revisions, for fast searching.

two separate database files are created:

- recorded: revid -> 1 (if the revid is indexed)

- index: 3-letter substring -> list(revids)

"""

import logging

import os

import re

import threading

import time

from loggerhead import util

from loggerhead.util import decorator

from loggerhead.lockfile import LockFile

from loggerhead.changecache import FakeShelf

# if the revid set for any 3-letter substring grows beyond this many

# revids, the entry is replaced with the ALL marker -- it's not worth an

# explicit index.

ALL_THRESHOLD = 1000

# sentinel stored in the index in place of a revid set that got too big.
ALL = 'ALL'

# method decorator built by util.with_lock for the '_lock' attribute.
# NOTE(review): presumably it holds self._lock for the duration of the
# decorated call -- confirm against loggerhead.util.
with_lock = util.with_lock('_lock')

def normalize_string(s):
    """
    lower-case the text, drop punctuation, and collapse every run of
    whitespace into one space, with no leading or trailing space.
    """
    lowered = util.to_utf8(s).lower()
    # apostrophes vanish entirely, so "don't" becomes "dont".
    without_apostrophes = re.sub(r"'", '', lowered)
    # every other non-word character turns into a space...
    spaced_out = re.sub(r'[^\w\d]', ' ', without_apostrophes)
    # ...and runs of whitespace shrink to a single space.
    collapsed = re.sub(r'\s{2,}', ' ', spaced_out)
    return collapsed.strip()

class TextIndex (object):
    """
    substring index over the comment text of a branch's revisions.

    two FakeShelf databases live inside C{cache_path}:
      - 'textindex-recorded.sql': utf8 revid -> True for every revision
        that has been processed.
      - 'textindex.sql': 3-letter substring -> set of revids whose
        normalized comment contains it, or the ALL marker once that set
        has grown past ALL_THRESHOLD.

    the shelves are opened and closed around each operation; public
    methods are wrapped in C{with_lock}, which is built for the
    C{_lock} attribute.
    """

    def __init__(self, history, cache_path):
        """
        @param history: object providing C{log}, C{last_revid},
            C{get_revision_history()} and C{get_changes(revid_list)}.
        @param cache_path: folder holding the index files; created if
            it does not exist (parent folders are not created).
        """
        self.history = history
        self.log = history.log

        if not os.path.exists(cache_path):
            try:
                os.mkdir(cache_path)
            except OSError:
                # the cache folder can be shared across processes, so
                # somebody else may have created it since the check
                # above. only give up if it really isn't there.
                if not os.path.isdir(cache_path):
                    raise

        self._recorded_filename = os.path.join(cache_path, 'textindex-recorded.sql')
        self._index_filename = os.path.join(cache_path, 'textindex.sql')

        # use a lockfile since the cache folder could be shared across different processes.
        self._lock = LockFile(os.path.join(cache_path, 'index-lock'))
        self._closed = False

        self.log.info('Using search index; %d entries.', len(self))

    def _index(self):
        """open the substring -> revids shelf. the caller must close it."""
        return FakeShelf(self._index_filename)

    def _recorded(self):
        """open the revid -> True shelf. the caller must close it."""
        return FakeShelf(self._recorded_filename)

    def _is_indexed(self, revid, recorded):
        """true if C{revid} has an entry in the open C{recorded} shelf."""
        return recorded.get(util.to_utf8(revid)) is not None

    @with_lock
    def is_indexed(self, revid):
        """true if C{revid} has already been processed by the indexer."""
        recorded = self._recorded()
        try:
            return self._is_indexed(revid, recorded)
        finally:
            recorded.close()

    @with_lock
    def __len__(self):
        """number of revisions recorded as indexed."""
        recorded = self._recorded()
        try:
            return recorded.count()
        finally:
            recorded.close()

    @with_lock
    def close(self):
        """
        mark the index closed. a running C{check_rebuild} notices this
        after its current batch and stops.
        """
        self._closed = True

    @with_lock
    def closed(self):
        """true once C{close} has been called."""
        return self._closed

    @with_lock
    def flush(self):
        """nothing to do: each operation closes its own shelves."""
        pass

    @with_lock
    def full(self):
        """
        true if at least as many revisions are recorded as the history
        contains, and the history's last revision is one of them.
        """
        recorded = self._recorded()
        last_revid = util.to_utf8(self.history.last_revid)
        try:
            return (recorded.count() >= len(self.history.get_revision_history())
                    and recorded.get(last_revid) is not None)
        finally:
            recorded.close()

    def _index_change(self, change, recorded, index):
        """
        currently, only indexes the 'comment' field.

        every 3-letter substring of the normalized comment gets
        C{change.revid} added to its revid set in C{index}, and the
        revision is then marked in C{recorded}. nothing is committed
        here; the caller commits when it closes the shelves.
        """
        comment = normalize_string(change.comment)
        # a comment shorter than 3 characters has no substrings to index
        # (the range below is empty), but the revision must still be
        # recorded: otherwise it stays "un-indexed" forever, full() can
        # never become true, and every rebuild visits it again.
        for i in xrange(len(comment) - 2):
            sub = comment[i:i + 3]
            orig = revid_set = index.get(sub)
            if revid_set is None:
                revid_set = set()
            elif revid_set == ALL:
                # this entry got too big
                continue
            revid_set.add(change.revid)
            if len(revid_set) > ALL_THRESHOLD:
                revid_set = ALL
            # 'orig' remembers whether the key already existed, which
            # decides between updating the entry and adding a new one.
            if orig is not None:
                index.update([(sub, revid_set)], commit=False)
            else:
                index.add([(sub, revid_set)], commit=False)

        recorded.add([(util.to_utf8(change.revid), True)], commit=False)

    @with_lock
    def index_changes(self, revid_list):
        """
        index every revision in C{revid_list} that is not recorded yet,
        then commit and close both shelves.
        """
        recorded = self._recorded()
        try:
            index = self._index()
            # nested so that 'recorded' is still closed if opening or
            # closing 'index' fails. close order is index, then recorded.
            try:
                revid_list = [r for r in revid_list if not self._is_indexed(r, recorded)]
                change_list = self.history.get_changes(revid_list)
                for change in change_list:
                    self._index_change(change, recorded, index)
            finally:
                index.close(commit=True)
        finally:
            recorded.close(commit=True)

    @with_lock
    def find(self, text, revid_list=None):
        """
        look up revisions whose comment may contain C{text}.

        @param text: search text; normalized the same way as comments.
        @param revid_list: optional revids to restrict the result to.
        @return: an empty list if the normalized text is shorter than 3
            characters or nothing can match; otherwise a set of revids.
            NOTE(review): if every substring of the text is marked ALL
            and no C{revid_list} was given, this returns None -- confirm
            that callers expect that.
        """
        index = self._index()
        try:
            text = normalize_string(text)
            if len(text) < 3:
                return []

            total_set = None
            if revid_list is not None:
                total_set = set(revid_list)
            seen_all = False

            for i in xrange(len(text) - 2):
                sub = text[i:i + 3]
                revid_set = index.get(sub)
                if revid_set is None:
                    # zero matches, stop here.
                    return []
                if revid_set == ALL:
                    # skip
                    seen_all = True
                    continue
                if total_set is None:
                    total_set = revid_set
                else:
                    total_set.intersection_update(revid_set)
                if len(total_set) == 0:
                    return []
        finally:
            index.close()

        # tricky: if seen_all is True, one of the substring indices was ALL
        # (in other words, unindexed), so our results are actually a superset
        # of the exact answer.

        # if we cared, we could do a direct match on the result set and cull
        # out any that aren't actually matches. for now, i'm gonna say that
        # we DON'T care, and if one of the substrings hit ALL, there's a small
        # chance that we'll give a few false positives.
        return total_set

    def check_rebuild(self, max_time=3600):
        """
        check if there are any un-indexed revisions, and if so, index them.
        but don't spend longer than C{max_time} on it.

        @param max_time: time budget in seconds. it is checked after each
            batch of 100 revisions, so a run can overshoot by one batch.
        """
        if self.closed() or self.full():
            # all done
            return

        self.log.info('Building search index...')
        work = list(self.history.get_revision_history())
        start_time = time.time()
        last_update = start_time
        count = 0

        jump = 100
        for i in xrange(0, len(work), jump):
            r = work[i:i + jump]
            self.index_changes(r)
            if self.closed():
                return

            # may run past len(work) on the last batch; clamped when logged.
            count += jump
            now = time.time()
            if now - start_time > max_time:
                # there's no point working for hours. eventually we might even
                # hit the next re-index interval, which would suck mightily.
                self.log.info('Search indexing has worked for %d seconds; giving up for now.', max_time)
                return
            if now - last_update > 60:
                self.log.info('Search indexing continues: %d/%d' % (min(count, len(work)), len(work)))
                last_update = time.time()
            # give someone else a chance at the lock
            time.sleep(1)
        self.log.info('Search index completed.')
        self.flush()

244

Older »