1:- module(bc_search, [
2 bc_search/4, 3 bc_index/1, 4 bc_index_remove/1, 5 bc_index_remove/0, 6 bc_index_all/0,
7 bc_index_clean/0,
8 bc_cosine_similarity/3, 9 bc_add_stopword/1, 10 bc_terms/1 11]).
15:- use_module(library(error)). 16:- use_module(library(debug)). 17:- use_module(library(docstore)). 18:- use_module(library(sort_dict)). 19:- use_module(library(dcg/basics)). 20:- use_module(library(porter_stem)). 21
22:- use_module(bc_excerpt). 23
26
% Dynamic state of the search index:
%
%   content_index(Term, Id, Count, RelFreq)  posting of Term in entry Id
%   indexed(Id)                              entry Id has been indexed
%   term(Term)                               every term seen so far
%   term_idf(Term, Idf)                      IDF factor of Term
%   entry_tfidf(Id, Vector, Norm, NonZero)   cached TF*IDF vector of entry Id
%   stopword(Word)                           tokens ignored by split/3

:- dynamic
    content_index/4,
    indexed/1,
    term/1,
    term_idf/2,
    entry_tfidf/4,
    stopword/1.
%! bc_index_clean
%
% Wipes the search index. The actual work is done by
% index_clean_unsafe/0, which is run while holding the
% bc_index mutex.

bc_index_clean:-
    with_mutex(bc_index, index_clean_unsafe).
43
% Drops every fact that makes up the search index:
% postings, the set of indexed entries, the known terms,
% the IDF factors and the cached per-entry TF*IDF
% vectors. Stopwords are not touched. Takes no lock
% itself; bc_index_clean/0 calls it under the bc_index
% mutex.

index_clean_unsafe:-
    debug(bc_index, 'cleaning index', []),
    retractall(content_index(_, _, _, _)),
    retractall(indexed(_)),
    retractall(term(_)),
    retractall(term_idf(_, _)),
    % The vector cache is entry_tfidf/4. entry_term_tfidf/3
    % is the helper computing one vector component and
    % stores no facts, so retracting it cleared nothing.
    retractall(entry_tfidf(_, _, _, _)).
51
%! bc_terms(-List)
%
% List holds an Idf-Term pair for every term that
% currently has an IDF factor. The pairs are sorted in
% standard order of terms (so by Idf first) and
% duplicates are removed.

bc_terms(List):-
    findall(Weight-Name, term_idf(Name, Weight), Unsorted),
    sort(Unsorted, List).
55
%! bc_add_stopword(+Word)
%
% Registers Word as a stopword. Word must be an atom
% (checked with must_be/2). Adding a word that is
% already registered changes nothing.

bc_add_stopword(Word):-
    must_be(atom, Word),
    (   \+ stopword(Word)
    ->  assertz(stopword(Word))
    ;   true
    ).
61
% Default stopwords, registered at load time in the
% order listed.

:- maplist(bc_add_stopword, [
    the, to, for, in, (is), it, about, have, many,
    other, some, that, them, this, when
]).
82
%! bc_cosine_similarity(+Id1, +Id2, -Cosine)
%
% Cosine similarity between the cached TF*IDF vectors
% of the entries Id1 and Id2. Both identifiers must be
% atoms. Fails when Id1 has no cached vector.

bc_cosine_similarity(Id1, Id2, Cosine):-
    maplist(must_be(atom), [Id1, Id2]),
    entry_tfidf(Id1, VectorA, NormA, NonZeroA),
    score_vector(VectorA, NormA, NonZeroA, Id2, Cosine).
88
% Cosine similarity between the given vector (with its
% norm and the sorted list of its non-zero indices) and
% the cached TF*IDF vector of the entry Id. Fails when
% Id has no cached vector.
%
% When either norm is zero (a vector without any
% non-zero component) the similarity is defined as 0.0
% instead of dividing by zero, which would raise an
% arithmetic evaluation error.

score_vector(Vector1, Norm1, NonZero1, Id, Cosine):-
    entry_tfidf(Id, Vector2, Norm2, NonZero2),
    Denominator is Norm1 * Norm2,
    (   Denominator =:= 0
    ->  Cosine = 0.0
    ;   % Only indices that are non-zero in both vectors
        % contribute to the dot product.
        ord_intersection(NonZero1, NonZero2, NonZero),
        dot_product(NonZero, 0, Vector1, Vector2, Product),
        Cosine is Product / Denominator
    ).
94
97
% Accumulates the dot product of two vectors (compound
% terms) over the given list of argument indices. Acc
% is the running sum; Product is the final value.

dot_product([], Sum, _, _, Sum).

dot_product([I|Is], Sum0, Vector1, Vector2, Product):-
    arg(I, Vector1, A),
    arg(I, Vector2, B),
    Sum1 is A * B + Sum0,
    dot_product(Is, Sum1, Vector1, Vector2, Product).
105
% Frequency of Term in the entry Id as stored in the
% fourth argument of content_index/4, or 0 when the
% entry has no posting for the term.

term_freq(Id, Term, Freq):-
    content_index(Term, Id, _, Freq), !.

term_freq(_, _, 0).
110
112
% Recomputes the IDF factor of every known term. All
% term_idf/2 facts are dropped and re-created, one per
% term/1 fact, using the number of indexed entries as
% the document count.

term_idf_rebuild:-
    debug(bc_search, 'rebuilding IDF factor index', []),
    retractall(term_idf(_, _)),
    findall(Doc, indexed(Doc), Indexed),
    length(Indexed, Total),
    findall(Known, term(Known), AllTerms),
    maplist(add_term_idf(Total), AllTerms).
120
123
% Stores the IDF factor of Term as a term_idf/2 fact.
% Count is the total number of documents. The factor
% is 1 + log(Count/NumDocs), where NumDocs is the
% number of postings of the term, or plain 1 when the
% term has no postings at all.

add_term_idf(Count, Term):-
    findall(x, content_index(Term, _, _, _), Postings),
    length(Postings, NumDocs),
    (   NumDocs > 0
    ->  Idf is 1 + log(Count/NumDocs)
    ;   Idf = 1
    ),
    assertz(term_idf(Term, Idf)).
131
133
% Rebuilds the cached TF*IDF vector of every indexed
% entry. The list of entries is collected up front.

entry_tfidf_rebuild_all:-
    debug(bc_search, 'rebuilding all TF*IDF vectors', []),
    findall(Indexed, indexed(Indexed), AllIndexed),
    forall(member(Id, AllIndexed),
        entry_tfidf_rebuild(Id)).
138
140
% Rebuilds the cached TF*IDF vector of the entry Id.
% The old entry_tfidf/4 fact is dropped first. The new
% vector has one component per term_idf/2 fact, in the
% order in which those facts are stored.

entry_tfidf_rebuild(Id):-
    debug(bc_search, 'rebuilding TF*IDF vector for ~w', [Id]),
    retractall(entry_tfidf(Id, _, _, _)),
    findall(Name-Weight, term_idf(Name, Weight), Weighted),
    maplist(entry_term_tfidf(Id), Weighted, Components),
    items_tfidf_vector_norm(Components, Vector, Norm, NonZero),
    assertz(entry_tfidf(Id, Vector, Norm, NonZero)).
148
% Packs the list of TF*IDF values into a vector (a
% compound term with functor tfid) and computes:
%
%   NonZero  ascending list of the argument indices
%            whose value is greater than zero;
%   Norm     Euclidean norm of the vector.
%
% compound_name_arguments/3 is used instead of =..
% because an empty Items list (no terms indexed yet)
% must still give a compound term: with =.. it became
% the atom tfid and arg/3 raised a type error. With
% the zero-argument compound arg/3 simply finds
% nothing, so NonZero is [] and Norm is 0.0.

items_tfidf_vector_norm(Items, Vector, Norm, NonZero):-
    compound_name_arguments(Vector, tfid, Items),
    findall(Index, (
        arg(Index, Vector, Value), Value > 0),
        NonZero),
    tfidf_vector_norm(NonZero, 0, Vector, Norm).
155
158
% One component of an entry's TF*IDF vector: the
% frequency of Term in the entry Id multiplied by the
% IDF factor of the term.

entry_term_tfidf(Id, Term-Idf, TfIdf):-
    term_freq(Id, Term, Tf),
    TfIdf is Tf * Idf.
162
164
% Euclidean norm of a vector, computed over the given
% list of argument indices only. The accumulator holds
% the running sum of squares.

tfidf_vector_norm([], SumOfSquares, _, Norm):-
    Norm is sqrt(SumOfSquares).

tfidf_vector_norm([I|Is], Sum0, Vector, Norm):-
    arg(I, Vector, Value),
    Sum1 is Value * Value + Sum0,
    tfidf_vector_norm(Is, Sum1, Vector, Norm).
%! bc_search(+Type, +Language, +Query, -Results)
%
% Runs the search Query. The query is tokenized with
% split/3 for the given Language and turned into a
% TF*IDF vector. Candidate entries are fetched with
% ds_find/4 using the condition type=Type,
% published=true. When no query token matches a known
% term, Results is []. Otherwise every candidate is
% scored, candidates with a score above zero are kept,
% sorted by search_score in descending order and
% extended with the excerpt and author keys.

bc_search(Type, Language, Query, Results):-
    split(Query, Language, QueryTerms),
    ds_find(entry, (type=Type, published=true),
        [slug, tags, title, author, date_published,
        date_updated, description, language], Candidates),
    query_tfidf_norm(QueryTerms, QVector, QNorm, QNonZero),
    (   QNonZero = []
    ->  Results = []
    ;   maplist(score_entry(QVector, QNorm, QNonZero),
            Candidates, Scored),
        include(above_zero, Scored, Matching),
        sort_dict(search_score, desc, Matching, Ranked),
        maplist(add_excerpt, Ranked, Excerpted),
        maplist(add_author, Excerpted, Results)
    ).
193
% Builds the TF*IDF vector of a query given as a list
% of tokens, together with its norm and the list of
% its non-zero indices. The vector has one component
% per term_idf/2 fact.

query_tfidf_norm(Tokens, Vector, Norm, NonZero):-
    findall(Name-Weight, term_idf(Name, Weight), Weighted),
    maplist(query_term_tfidf(Tokens), Weighted, Components),
    items_tfidf_vector_norm(Components, Vector, Norm, NonZero).
198
201
% One component of the query vector: the IDF factor of
% Term when the term occurs among the query tokens and
% 0 otherwise.

query_term_tfidf(Tokens, Term-Idf, TfIdf):-
    (   \+ memberchk(Term, Tokens)
    ->  TfIdf = 0
    ;   TfIdf = Idf
    ).
206
% Succeeds when the search_score of the entry dict is
% greater than zero.

above_zero(Entry):-
    Score = Entry.search_score,
    Score > 0.
209
% Extends the entry dict with the key excerpt. The
% value is produced by bc_excerpt/3 from the html
% field of the stored entry, called with the limit 200.

add_excerpt(Entry, WithExcerpt):-
    ds_id(Entry, Id),
    ds_col_get(entry, Id, [html], Stored),
    Html = Stored.html,
    bc_excerpt(Html, 200, Text),
    put_dict(excerpt, Entry, Text, WithExcerpt).
215
% Replaces the author key of the entry dict (a user
% identifier) with the user document fetched from the
% user collection, restricted to the fields fullname
% and link.

add_author(Entry, WithAuthor):-
    AuthorId = Entry.author,
    ds_col_get(user, AuthorId, [fullname, link], Author),
    put_dict(author, Entry, Author, WithAuthor).
219
223
% Extends the entry dict with the key search_score,
% the cosine similarity between the query vector and
% the cached vector of the entry.
%
% score_vector/5 fails for an entry that has no cached
% entry_tfidf/4 vector (an entry that has not been
% indexed). Such an entry gets the score 0 so that it
% is filtered out by above_zero/1 instead of making
% the maplist/3 in bc_search/4, and thereby the whole
% search, fail.

score_entry(Vector, Norm, NonZero, Entry, WithScore):-
    ds_id(Entry, Id),
    (   score_vector(Vector, Norm, NonZero, Id, Found)
    ->  Score = Found
    ;   Score = 0
    ),
    WithScore = Entry.put(search_score, Score).
%! bc_index_all
%
% Indexes every document of the entry collection, then
% rebuilds all IDF factors and all cached TF*IDF
% vectors. Everything after the debug message runs
% while holding the bc_index mutex.

bc_index_all:-
    debug(bc_search, 'indexing all entries', []),
    with_mutex(bc_index,
        (   ds_all(entry, [], All),
            maplist(index_entry, All),
            term_idf_rebuild,
            entry_tfidf_rebuild_all
        )).
240
% Indexes a single entry given as a document; only the
% identifier of the document is used.

index_entry(Doc):-
    ds_id(Doc, EntryId),
    index_unsafe(EntryId).
%! bc_index(+Id)
%
% (Re)indexes the entry Id, which must be an atom.
% Under the bc_index mutex the postings of the entry
% are rebuilt, all IDF factors are recomputed and the
% cached TF*IDF vector of this entry is rebuilt.
% Cached vectors of other entries are not rebuilt
% here.

bc_index(Id):-
    must_be(atom, Id),
    with_mutex(bc_index,
        (   index_unsafe(Id),
            term_idf_rebuild,
            entry_tfidf_rebuild(Id)
        )).
257
% Rebuilds the postings of the entry Id without taking
% any lock. Old postings and the indexed/1 mark are
% dropped first. The entry is then read from the entry
% collection and its content, tags, title and slug are
% indexed in that order. The entry is marked as
% indexed at the end.

index_unsafe(Id):-
    debug(bc_search, 'indexing ~w', [Id]),
    retractall(content_index(_, Id, _, _)),
    retractall(indexed(Id)),
    ds_col_get(entry, Id,
        [content, tags, title, slug, language], Doc),
    % split/3 requires the language to be an atom.
    atom_string(Lang, Doc.language),
    index_content(Id, Doc.content, Lang),
    index_tags(Id, Doc.tags, Lang),
    index_title(Id, Doc.title, Lang),
    index_slug(Id, Doc.slug, Lang),
    assertz(indexed(Id)).
270
% Indexes the content text of the entry Id. Every
% token is added with add_content_token/3, which also
% receives the total number of tokens.

index_content(Id, Content, Language):-
    split(Content, Language, Words),
    length(Words, Total),
    maplist(add_content_token(Id, Total), Words).
275
% Indexes the tags of the entry Id. The tags are
% joined with spaces, tokenized like any other text
% and added with add_tag_token/2.

index_tags(Id, Tags, Language):-
    atomic_list_concat(Tags, ' ', Joined),
    split(Joined, Language, Words),
    maplist(add_tag_token(Id), Words).
280
% Indexes the title of the entry Id. Title tokens are
% added with add_tag_token/2, the same way as tags.

index_title(Id, Title, Language):-
    split(Title, Language, Words),
    maplist(add_tag_token(Id), Words).
284
% Indexes the slug of the entry Id. Slug tokens are
% added with add_tag_token/2, the same way as tags.

index_slug(Id, Slug, Language):-
    split(Slug, Language, Words),
    maplist(add_tag_token(Id), Words).
288
291
% Adds a token coming from the tags, title or slug of
% the entry Id. Any existing posting of the token for
% this entry is replaced by one with count 1 and
% frequency 1.

add_tag_token(Id, Word):-
    add_term(Word),
    retractall(content_index(Word, Id, _, _)),
    assertz(content_index(Word, Id, 1, 1)).
%! bc_index_remove(+Id)
%
% Removes the entry Id (an atom) from the index. Under
% the bc_index mutex the postings of the entry are
% dropped and so is its indexed/1 mark. Without the
% latter the removed entry would still be counted as a
% document by term_idf_rebuild/0 and be revisited by
% entry_tfidf_rebuild_all/0.
%
% The cached entry_tfidf/4 vector is left in place:
% score_vector/5 fails for an entry without one.

bc_index_remove(Id):-
    must_be(atom, Id),
    with_mutex(bc_index,
        (   retractall(content_index(_, Id, _, _)),
            retractall(indexed(Id))
        )).
%! bc_index_remove
%
% Removes all entries from the index. Under the
% bc_index mutex all postings are dropped together
% with all indexed/1 marks, so that entries that are
% no longer indexed do not inflate the document count
% used by term_idf_rebuild/0. Known terms, IDF factors
% and cached vectors are kept; use bc_index_clean/0 to
% drop those as well.

bc_index_remove:-
    with_mutex(bc_index,
        (   retractall(content_index(_, _, _, _)),
            retractall(indexed(_))
        )).
314
317
% Adds one occurrence of a content token of the entry
% Id. Length is the total number of content tokens of
% the entry. The posting stores the number of
% occurrences so far and that number divided by
% Length.

add_content_token(Id, Length, Token):-
    add_term(Token),
    (   content_index(Token, Id, Previous, _)
    ->  retractall(content_index(Token, Id, _, _)),
        Occurrences is Previous + 1
    ;   Occurrences = 1
    ),
    Relative is Occurrences / Length,
    assertz(content_index(Token, Id, Occurrences, Relative)).
327
330
% Records the token as a known term unless it is
% already known.

add_term(Token):-
    (   \+ term(Token)
    ->  assertz(term(Token))
    ;   true
    ).
335
338
% Stems a term for the given language. English terms
% go through porter_stem/2; when the stemmer throws,
% the term is used unchanged. Terms of any other
% language are returned as they are.

stem_term(en, Term, Stemmed):- !,
    catch(porter_stem(Term, Stemmed), _, Stemmed = Term).

stem_term(_, Term, Term).
346
% Turns a text into the list of its index terms.
% Language must be an atom. The text is cut into
% lowercase tokens, unused tokens (see unused_token/1)
% are dropped and the rest is stemmed for Language.

split(Text, Language, Stemmed):-
    must_be(atom, Language),
    atom_codes(Text, Codes),
    split(Codes, [], [], Raw),
    exclude(unused_token, Raw, Kept),
    maplist(stem_term(Language), Kept, Stemmed).
353
356
% Succeeds for tokens that are not indexed: tokens
% shorter than two characters and stopwords.

unused_token(Token):-
    (   atom_length(Token, Length),
        Length < 2
    ->  true
    ;   stopword(Token)
    ).
363
366
% Cuts a code list into lowercase tokens. Context
% holds the codes consumed so far (most recent first)
% and Acc the codes of the current token in reverse
% order. split_at/2 decides whether the current code
% ends the token; that code itself is dropped. The
% token in progress at the end of input is always
% emitted, so empty tokens can occur.

split([], _, Acc, [Last]):-
    split_token_atom(Acc, Last).

split([Code|Codes], Context, Acc, Tokens):-
    Seen = [Code|Context],
    (   split_at([Code|Codes], Context)
    ->  split_token_atom(Acc, Token),
        Tokens = [Token|Rest],
        split(Codes, Seen, [], Rest)
    ;   split(Codes, Seen, [Code|Acc], Tokens)
    ).

% Turns the reversed code list of a token into a
% lowercase atom.

split_token_atom(Reversed, Lower):-
    reverse(Reversed, Codes),
    atom_codes(Atom, Codes),
    downcase_atom(Atom, Lower).
380
382
384
385split_at([46,Code|_], _):-
386 code_type(Code, digit), !,
387 fail.
388
389split_at([Code|_], _):-
390 \+ code_type(Code, alnum)
/* Search support */