# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 11:31:44 2016
@author: Chris
"""
import textwrap
import pickle
import nltk
from gensim import corpora
from gensim.models import TfidfModel
# I lazily made this a global constant so that I wouldn't have to include
# it in the save and load features.
enc_format = 'utf-8'
class KeySearch(object):
"""
    KeySearch, which is short for "keyword search", stores a completed gensim
    tf-idf corpus. Whereas SimSearch stores an LSI model and only understands
conceptual relationships between documents, KeySearch actually knows what
words occur in each document.
It has several key functions:
      1. It converts new text sources (that is, texts not already in the
         corpus) into tf-idf vectors.
2. It stores the corpus vocabulary in the form of a gensim dictionary.
3. It supports boolean keyword search (though this is NOT indexed!).
4. It stores the document metadata:
- Title
- Text source file
- Line numbers in source file
- Tags
Saving & Loading
================
The KeySearch object can be saved to and loaded from a directory
    using `save` and `load`. The typical usage, however, is simply to save and
load the SimSearch object (which also saves the underlying KeySearch).
    When saving the KeySearch, only the dictionary, feature vectors, and
    document metadata are saved. The original text is not saved in any
form.
"""
def __init__(self, dictionary, tfidf_model, corpus_tfidf, titles,
tagsToDocs={}, docsToTags={}, files=[], doc_line_nums=[]):
"""
        KeySearch requires a completed gensim corpus, along with some
        additional metadata.
Parameters:
dictionary - gensim dictionary
tfidf_model - gensim TfidfModel
          corpus_tfidf - The tf-idf weighted corpus (one sparse vector per document).
titles - List of string titles.
tagsToDocs - Mapping of tags to doc ids
docsToTags - List of tags for each doc
files - Unique files in the corpus
          doc_line_nums - For each document, a tuple of (file ID, start line,
                          end line) giving its location in the source file.
"""
self.dictionary = dictionary
self.tfidf_model = tfidf_model
self.corpus_tfidf = corpus_tfidf
self.titles = titles
# Create mappings for the entry tags.
self.tagsToDocs = tagsToDocs
self.docsToTags = docsToTags
self.files = files
self.doc_line_nums = doc_line_nums
def printTags(self):
"""
Print all of the tags present in the corpus, plus the number of docs
tagged with each.
"""
print 'All tags in corpus (# of documents):'
# Get all the tags and sort them alphabetically.
tags = self.tagsToDocs.keys()
tags.sort()
# Print each tag followed by the number of documents.
for tag in tags:
print '%20s %3d' % (tag, len(self.tagsToDocs[tag]))
def getTfidfForText(self, text):
"""
This function takes new input `text` (not part of the original corpus),
and processes it into a tf-idf vector.
The input text should be a single string.
"""
        # If the string is not already unicode, decode the string into unicode
        # so that NLTK can handle it.
if isinstance(text, str):
try:
text = text.decode(enc_format)
except:
print '======== Failed to decode input text! ========'
print 'Make sure text is encoded in', enc_format
print 'Input text:'
print text
return []
        # Replace any newlines in the text with spaces.
text = text.replace('\n', ' ')
# Convert everything to lowercase, then use NLTK to tokenize.
tokens = nltk.word_tokenize(text.lower())
# We don't need to do any special filtering of tokens here (stopwords,
# infrequent words, etc.). If a token is not in the dictionary, it is
# simply ignored. So the dictionary effectively does the token
# filtering for us.
# Convert the tokenized text into a bag of words representation.
bow_vec = self.dictionary.doc2bow(tokens)
# Convert the bag-of-words representation to tf-idf
return self.tfidf_model[bow_vec]
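    # Illustrative sketch (not part of the original class): the vector returned
    # by getTfidfForText is gensim's sparse format, a list of
    # (word_id, tf-idf weight) tuples whose IDs come from self.dictionary.
    # Assuming `ksearch` is an already-built KeySearch instance:
    #
    #   vec = ksearch.getTfidfForText("Some new passage of text to score.")
    #   # e.g. [(12, 0.41), (87, 0.23), ...]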
def getTfidfForFile(self, filename):
"""
Convert the text in the provided file to a tf-idf vector.
"""
# Open the file and read all lines.
with open(filename) as f:
text = f.readlines()
# Combine the lines into a single string.
text = " ".join(text)
# Pass the text down.
return self.getTfidfForText(text)
def getTfidfForDoc(self, doc_id):
"""
Return the tf-idf vector for the specified document.
"""
return self.corpus_tfidf[doc_id]
def keywordSearch(self, includes=[], excludes=[], docs=[]):
"""
Performs a boolean keyword search over the corpus.
All words in the dictionary are lower case. This function will convert
all supplied keywords to lower case.
Parameters:
includes A list of words (as strings) that the documents
*must include*.
excludes A list of words (as strings) that the documents
*must not include*.
docs The list of documents to search in, represented by
                     doc_ids. If this list is empty, the entire corpus
is searched.
"""
# If no doc ids were supplied, search the entire corpus.
if not docs:
docs = range(0, len(self.corpus_tfidf))
# Convert all the keywords to their IDs.
# Force them to lower case in the process.
include_ids = []
exclude_ids = []
for word in includes:
# Lookup the ID for the word.
word_id = self.getIDForWord(word.lower())
# Verify the word exists in the dictionary.
if word_id == -1:
                print 'WARNING: Word \'' + word.lower() + '\' not in dictionary!'
continue
# Add the word id to the list.
include_ids.append(word_id)
for word in excludes:
exclude_ids.append(self.getIDForWord(word.lower()))
results = []
# For each of the documents to search...
for doc_id in docs:
# Get the sparse tf-idf vector for the next document.
vec_tfidf = self.corpus_tfidf[doc_id]
# Create a list of the word ids in this document.
doc_words = [tfidf[0] for tfidf in vec_tfidf]
match = True
# Check for words that must be present.
for word_id in include_ids:
if not word_id in doc_words:
match = False
break
# If we failed the 'includes' test, skip to the next document.
if not match:
continue
# Check for words that must not be present.
for word_id in exclude_ids:
if word_id in doc_words:
match = False
break
# If we passed the 'excludes' test, this is a valid result.
if match:
results.append(doc_id)
return results
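    # Illustrative sketch (not part of the original class): a typical call,
    # assuming `ksearch` is an already-built KeySearch instance and the
    # keywords are placeholders.
    #
    #   doc_ids = ksearch.keywordSearch(includes=['parrot'], excludes=['pirate'])
    #   for doc_id in doc_ids:
    #       print ksearch.titles[doc_id]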
def printTopNWords(self, topn=10):
"""
Print the 'topn' most frequent words in the corpus.
This is useful for checking to see if you have any common, bogus tokens
that need to be filtered out of the corpus.
"""
        # Get the dictionary's document frequencies as a list of tuples.
        # Each tuple is (word_id, number of documents containing the word).
        word_counts = [(key, value) for (key, value) in self.dictionary.dfs.iteritems()]
# Sort the list by the 'value' of the tuple (incidence count)
from operator import itemgetter
word_counts = sorted(word_counts, key=itemgetter(1))
# Print the most common words.
# The list is sorted smallest to biggest, so...
print 'Top', topn, 'most frequent words'
        for i in range(-1, -topn - 1, -1):
print ' %s %d' % (self.dictionary[word_counts[i][0]].ljust(10), word_counts[i][1])
def getVocabSize(self):
"""
Returns the number of unique words in the final vocabulary (after all
filtering).
"""
return len(self.dictionary.keys())
def getIDForWord(self, input_word):
"""
Lookup the ID for a specific word.
Returns -1 if the word isn't in the dictionary.
"""
# All words in dictionary are lower case.
input_word = input_word.lower()
        # Check the dictionary's token-to-ID map directly.
        if input_word not in self.dictionary.token2id:
            return -1
        # If the word is present, look up its ID.
        else:
            return self.dictionary.token2id[input_word]
def getDocLocation(self, doc_id):
"""
Return the filename and line numbers that 'doc_id' came from.
"""
line_nums = self.doc_line_nums[doc_id]
filename = self.files[line_nums[0]]
return filename, line_nums[1], line_nums[2]
def readDocSource(self, doc_id):
"""
Reads the original source file for the document 'doc_id' and retrieves
the source lines.
"""
# Lookup the source for the doc.
line_nums = self.doc_line_nums[doc_id]
filename = self.files[line_nums[0]]
line_start = line_nums[1]
line_end = line_nums[2]
results = []
# Open the file and read just the specified lines.
with open(filename) as fp:
for i, line in enumerate(fp):
# 'i' starts at 0 but line numbers start at 1.
line_num = i + 1
if line_num > line_end:
break
if line_num >= line_start:
results.append(line)
return results
def printDocSourcePretty(self, doc_id, max_lines=8, indent=' '):
"""
Prints the original source lines for the document 'doc_id'.
This function leverages the 'textwrap' Python module to limit the
print output to 80 columns.
"""
# Read in the document.
lines = self.readDocSource(doc_id)
# Limit the result to 'max_lines'.
truncated = False
if len(lines) > max_lines:
truncated = True
lines = lines[0:max_lines]
# Convert the list of strings to a single string.
lines = '\n'.join(lines)
# Remove indentations in the source text.
dedented_text = textwrap.dedent(lines).strip()
# Add an ellipsis to the end to show we truncated the doc.
if truncated:
dedented_text = dedented_text + ' ...'
# Wrap the text so it prints nicely--within 80 columns.
# Print the text indented slightly.
pretty_text = textwrap.fill(dedented_text, initial_indent=indent, subsequent_indent=indent, width=80)
print pretty_text
def save(self, save_dir='./'):
"""
Write out the built corpus to a save directory.
"""
# Store the tag tables.
pickle.dump((self.tagsToDocs, self.docsToTags), open(save_dir + 'tag-tables.pickle', 'wb'))
# Store the document titles.
pickle.dump(self.titles, open(save_dir + 'titles.pickle', 'wb'))
# Write out the tfidf model.
self.tfidf_model.save(save_dir + 'documents.tfidf_model')
# Write out the tfidf corpus.
corpora.MmCorpus.serialize(save_dir + 'documents_tfidf.mm', self.corpus_tfidf)
# Write out the dictionary.
self.dictionary.save(save_dir + 'documents.dict')
# Save the filenames.
pickle.dump(self.files, open(save_dir + 'files.pickle', 'wb'))
# Save the file ID and line numbers for each document.
pickle.dump(self.doc_line_nums, open(save_dir + 'doc_line_nums.pickle', 'wb'))
# Objects that are not saved:
# - stop_list - You don't need to filter stop words for new input
# text, they simply aren't found in the dictionary.
# - frequency - This preliminary word count object is only used for
# removing infrequent words. Final word counts are in
# the `dictionary` object.
@classmethod
def load(cls, save_dir='./'):
"""
Load the corpus from a save directory.
"""
tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
tagsToDocs = tables[0]
docsToTags = tables[1]
titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))
ksearch = KeySearch(dictionary, tfidf_model,
corpus_tfidf, titles, tagsToDocs,
docsToTags, files, doc_line_nums)
return ksearch
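
# ---------------------------------------------------------------------------
# Example usage (a minimal sketch, not part of the original module).
# It assumes a corpus was previously built and saved to the placeholder
# directory './saved/' (typically via SimSearch, which also saves the
# underlying KeySearch), and that the original source text files still exist
# so that printDocSourcePretty can read them.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Load the previously saved KeySearch.
    ksearch = KeySearch.load(save_dir='./saved/')

    # Show the tags and the most frequent words in the corpus.
    ksearch.printTags()
    ksearch.printTopNWords(topn=10)

    # Run a boolean keyword search ('keyword' is a placeholder) and
    # pretty-print the matching documents.
    for doc_id in ksearch.keywordSearch(includes=['keyword']):
        print ksearch.titles[doc_id]
        ksearch.printDocSourcePretty(doc_id, max_lines=4)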