~loggerhead-team/loggerhead/trunk-rich

« back to all changes in this revision

Viewing changes to loggerhead/textindex.py

  • Committer: Robey Pointer
  • Date: 2007-05-21 06:19:12 UTC
  • Revision ID: robey@lag.net-20070521061912-doe5d89zirkvd22r
yes, of course, the first release was in December *2006* :)

Show diffs side-by-side

added

removed

Lines of Context:
19
19
"""
20
20
indexing of the comment text of revisions, for fast searching.
21
21
 
22
 
two separate database files are created:
 
22
two separate 'shelve' files are created:
23
23
 
24
24
    - recorded: revid -> 1 (if the revid is indexed)
25
25
    - index: 3-letter substring -> list(revids)
28
28
import logging
29
29
import os
30
30
import re
 
31
import shelve
31
32
import threading
32
33
import time
33
34
 
34
35
from loggerhead import util
35
36
from loggerhead.util import decorator
36
37
from loggerhead.lockfile import LockFile
37
 
from loggerhead.changecache import FakeShelf
38
38
 
39
39
# if any substring index reaches this many revids, replace the entry with
40
40
# an ALL marker -- it's not worth an explicit index.
69
69
        if not os.path.exists(cache_path):
70
70
            os.mkdir(cache_path)
71
71
        
72
 
        self._recorded_filename = os.path.join(cache_path, 'textindex-recorded.sql')
73
 
        self._index_filename = os.path.join(cache_path, 'textindex.sql')
 
72
        self._recorded_filename = os.path.join(cache_path, 'textindex-recorded')
 
73
        self._index_filename = os.path.join(cache_path, 'textindex')
74
74
        
75
75
        # use a lockfile since the cache folder could be shared across different processes.
76
76
        self._lock = LockFile(os.path.join(cache_path, 'index-lock'))
77
77
        self._closed = False
78
78
        
79
79
        self.log.info('Using search index; %d entries.', len(self))
80
 
 
81
 
    def _index(self):
82
 
        return FakeShelf(self._index_filename)
83
 
 
84
 
    def _recorded(self):
85
 
        return FakeShelf(self._recorded_filename)
86
80
    
87
81
    def _is_indexed(self, revid, recorded):
88
 
        return recorded.get(util.to_utf8(revid)) is not None
 
82
        return recorded.get(util.to_utf8(revid), None) is not None
89
83
        
90
84
    @with_lock
91
85
    def is_indexed(self, revid):
92
 
        recorded = self._recorded()
 
86
        recorded = shelve.open(self._recorded_filename, 'c', protocol=2)
93
87
        try:
94
88
            return self._is_indexed(revid, recorded)
95
89
        finally:
97
91
    
98
92
    @with_lock
99
93
    def __len__(self):
100
 
        recorded = self._recorded()
 
94
        recorded = shelve.open(self._recorded_filename, 'c', protocol=2)
101
95
        try:
102
 
            return recorded.count()
 
96
            return len(recorded)
103
97
        finally:
104
98
            recorded.close()
105
99
 
117
111
    
118
112
    @with_lock
119
113
    def full(self):
120
 
        recorded = self._recorded()
121
 
        last_revid = util.to_utf8(self.history.last_revid)
 
114
        recorded = shelve.open(self._recorded_filename, 'c', protocol=2)
122
115
        try:
123
 
            return (recorded.count() >= len(self.history.get_revision_history())
124
 
                    and recorded.get(last_revid) is not None)
 
116
            return (len(recorded) >= len(self.history.get_revision_history())) and (util.to_utf8(self.history.last_revid) in recorded)
125
117
        finally:
126
118
            recorded.close()
127
119
 
134
126
            return
135
127
        for i in xrange(len(comment) - 2):
136
128
            sub = comment[i:i + 3]
137
 
            orig = revid_set = index.get(sub)
 
129
            revid_set = index.get(sub, None)
138
130
            if revid_set is None:
139
131
                revid_set = set()
140
132
            elif revid_set == ALL:
143
135
            revid_set.add(change.revid)
144
136
            if len(revid_set) > ALL_THRESHOLD:
145
137
                revid_set = ALL
146
 
            if orig is not None:
147
 
                index.update([(sub, revid_set)], commit=False)
148
 
            else:
149
 
                index.add([(sub, revid_set)], commit=False)
150
 
 
151
 
        recorded.add([(util.to_utf8(change.revid), True)], commit=False)
 
138
            index[sub] = revid_set
 
139
        
 
140
        recorded[util.to_utf8(change.revid)] = True
152
141
 
153
142
    @with_lock
154
143
    def index_changes(self, revid_list):
155
 
        recorded = self._recorded()
156
 
        index = self._index()
 
144
        recorded = shelve.open(self._recorded_filename, 'c', protocol=2)
 
145
        index = shelve.open(self._index_filename, 'c', protocol=2)
157
146
        try:
158
147
            revid_list = [r for r in revid_list if not self._is_indexed(r, recorded)]
159
148
            change_list = self.history.get_changes(revid_list)
160
149
            for change in change_list:
161
150
                self._index_change(change, recorded, index)
162
151
        finally:
163
 
            index.close(commit=True)
164
 
            recorded.close(commit=True)
 
152
            index.close()
 
153
            recorded.close()
165
154
    
166
155
    @with_lock
167
156
    def find(self, text, revid_list=None):
168
 
        index = self._index()
 
157
        index = shelve.open(self._index_filename, 'c', protocol=2)
169
158
        try:
170
159
            text = normalize_string(text)
171
160
            if len(text) < 3:
178
167
            
179
168
            for i in xrange(len(text) - 2):
180
169
                sub = text[i:i + 3]
181
 
                revid_set = index.get(sub)
 
170
                revid_set = index.get(sub, None)
182
171
                if revid_set is None:
183
172
                    # zero matches, stop here.
184
173
                    return []