~loggerhead-team/loggerhead/trunk-rich

« back to all changes in this revision

Viewing changes to loggerhead/textindex.py

  • Committer: Michael Hudson
  • Date: 2007-12-18 10:10:43 UTC
  • mfrom: (128.5.1 mwhudson.dev)
  • mto: This revision was merged to the branch mainline in revision 144.
  • Revision ID: michael.hudson@canonical.com-20071218101043-rayfbdl4kxdl3jdu
Merge Aaron's changes

Show diffs side-by-side

added added

removed removed

Lines of Context:
19
19
"""
20
20
indexing of the comment text of revisions, for fast searching.
21
21
 
22
 
two separate 'shelve' files are created:
 
22
two separate database files are created:
23
23
 
24
24
    - recorded: revid -> 1 (if the revid is indexed)
25
25
    - index: 3-letter substring -> list(revids)
28
28
import logging
29
29
import os
30
30
import re
31
 
import shelve
32
31
import threading
33
32
import time
34
33
 
35
34
from loggerhead import util
36
35
from loggerhead.util import decorator
37
36
from loggerhead.lockfile import LockFile
 
37
from loggerhead.changecache import FakeShelf
38
38
 
39
39
# if any substring index reaches this many revids, replace the entry with
40
40
# an ALL marker -- it's not worth an explicit index.
69
69
        if not os.path.exists(cache_path):
70
70
            os.mkdir(cache_path)
71
71
        
72
 
        self._recorded_filename = os.path.join(cache_path, 'textindex-recorded')
73
 
        self._index_filename = os.path.join(cache_path, 'textindex')
 
72
        self._recorded_filename = os.path.join(cache_path, 'textindex-recorded.sql')
 
73
        self._index_filename = os.path.join(cache_path, 'textindex.sql')
74
74
        
75
75
        # use a lockfile since the cache folder could be shared across different processes.
76
76
        self._lock = LockFile(os.path.join(cache_path, 'index-lock'))
77
77
        self._closed = False
78
78
        
79
79
        self.log.info('Using search index; %d entries.', len(self))
 
80
 
 
81
    def _index(self):
 
82
        return FakeShelf(self._index_filename)
 
83
 
 
84
    def _recorded(self):
 
85
        return FakeShelf(self._recorded_filename)
80
86
    
81
87
    def _is_indexed(self, revid, recorded):
82
 
        return recorded.get(util.to_utf8(revid), None) is not None
 
88
        return recorded.get(util.to_utf8(revid)) is not None
83
89
        
84
90
    @with_lock
85
91
    def is_indexed(self, revid):
86
 
        recorded = shelve.open(self._recorded_filename, 'c', protocol=2)
 
92
        recorded = self._recorded()
87
93
        try:
88
94
            return self._is_indexed(revid, recorded)
89
95
        finally:
91
97
    
92
98
    @with_lock
93
99
    def __len__(self):
94
 
        recorded = shelve.open(self._recorded_filename, 'c', protocol=2)
 
100
        recorded = self._recorded()
95
101
        try:
96
 
            return len(recorded)
 
102
            return recorded.count()
97
103
        finally:
98
104
            recorded.close()
99
105
 
111
117
    
112
118
    @with_lock
113
119
    def full(self):
114
 
        recorded = shelve.open(self._recorded_filename, 'c', protocol=2)
 
120
        recorded = self._recorded()
 
121
        last_revid = util.to_utf8(self.history.last_revid)
115
122
        try:
116
 
            return (len(recorded) >= len(self.history.get_revision_history())) and (util.to_utf8(self.history.last_revid) in recorded)
 
123
            return (recorded.count() >= len(self.history.get_revision_history())
 
124
                    and recorded.get(last_revid) is not None)
117
125
        finally:
118
126
            recorded.close()
119
127
 
126
134
            return
127
135
        for i in xrange(len(comment) - 2):
128
136
            sub = comment[i:i + 3]
129
 
            revid_set = index.get(sub, None)
 
137
            orig = revid_set = index.get(sub)
130
138
            if revid_set is None:
131
139
                revid_set = set()
132
140
            elif revid_set == ALL:
135
143
            revid_set.add(change.revid)
136
144
            if len(revid_set) > ALL_THRESHOLD:
137
145
                revid_set = ALL
138
 
            index[sub] = revid_set
139
 
        
140
 
        recorded[util.to_utf8(change.revid)] = True
 
146
            if orig is not None:
 
147
                index.update([(sub, revid_set)], commit=False)
 
148
            else:
 
149
                index.add([(sub, revid_set)], commit=False)
 
150
 
 
151
        recorded.add([(util.to_utf8(change.revid), True)], commit=False)
141
152
 
142
153
    @with_lock
143
154
    def index_changes(self, revid_list):
144
 
        recorded = shelve.open(self._recorded_filename, 'c', protocol=2)
145
 
        index = shelve.open(self._index_filename, 'c', protocol=2)
 
155
        recorded = self._recorded()
 
156
        index = self._index()
146
157
        try:
147
158
            revid_list = [r for r in revid_list if not self._is_indexed(r, recorded)]
148
159
            change_list = self.history.get_changes(revid_list)
149
160
            for change in change_list:
150
161
                self._index_change(change, recorded, index)
151
162
        finally:
152
 
            index.close()
153
 
            recorded.close()
 
163
            index.close(commit=True)
 
164
            recorded.close(commit=True)
154
165
    
155
166
    @with_lock
156
167
    def find(self, text, revid_list=None):
157
 
        index = shelve.open(self._index_filename, 'c', protocol=2)
 
168
        index = self._index()
158
169
        try:
159
170
            text = normalize_string(text)
160
171
            if len(text) < 3:
167
178
            
168
179
            for i in xrange(len(text) - 2):
169
180
                sub = text[i:i + 3]
170
 
                revid_set = index.get(sub, None)
 
181
                revid_set = index.get(sub)
171
182
                if revid_set is None:
172
183
                    # zero matches, stop here.
173
184
                    return []