source: trunk/NewView/SearchUnit.pas@ 43

Last change on this file since 43 was 43, checked in by RBRi, 19 years ago

% use new debug unit

  • Property svn:eol-style set to native
File size: 16.4 KB
Line 
1Unit SearchUnit;
2
3// NewView - a new OS/2 Help Viewer
4// Copyright 2003 Aaron Lawrence (aaronl at consultant dot com)
5// This software is released under the Gnu Public License - see readme.txt
6
7Interface
8
9// Contains code to search help files.
10
11uses
12 Classes,
13 HelpFile,
14 TextSearchQuery,
15 IPFFileFormatUnit;
16
17const
18 // match weightings
// Title weightings: applied by SearchTopicTitles to the relevance of
// each title word that matches the search word.
19 mwOnlyTitleWord = 200; // the matching word is the title's only word
20 mwFirstTitleWord = 50; // the matching word is the first of several title words
21 mwTitleWord = 20; // the matching word is any later title word
22
// Index weightings: applied by SearchIndex in the same way, per index entry.
23 mwOnlyIndexWord = 100; // the matching word is the entry's only word
24 mwFirstIndexWord = 20; // the matching word is the first of several entry words
25 mwIndexWord = 10; // the matching word is any later entry word
// NOTE(review): mwTopicTextWord is not referenced in the visible part of this unit.
26 mwTopicTextWord = 1;
27
28 // best case match weighting of a word
// (value stored for an exact dictionary match, and the scale factor
// used by MatchedWordRelevance for partial matches)
29 mwExactWord = 20;
30
31
32// note on weightings. The title/index weightings
33// are multiplied by word weightings.
34// Topic text matches are equal to word weighting
35// times word weighting.
36
// Searches HelpFile for the terms of Query (topic text, topic titles and
// index entries). Topics with a relevance above zero that were not
// excluded are appended to Results, with their SearchRelevance set.
// WordSequences may be nil; when it is not nil, one TList of dictionary
// relevance arrays is appended to it for each non-excluded term, and the
// caller is expected to release them with ClearWordSequences.
37procedure SearchHelpFile( HelpFile: THelpFile;
38 Query: TTextSearchQuery;
39 Results: TList;
40 WordSequences: TList );
41
42// clear a list of word sequences (as produced by above)
// Each contained TList is emptied (its arrays freed using DictionaryCount
// as the array size), destroyed, and then WordSequences itself is cleared.
43procedure ClearWordSequences( WordSequences: TList;
44 DictionaryCount: longint );
45
46Implementation
47
48uses
49 SysUtils,
50 DebugUnit,
51 ACLStringUtility,
52 HelpTopic;
53
54type
// NOTE(review): TSearchType is not referenced anywhere in the visible part
// of this unit. Its members presumably mirror the four dictionary search
// routines below (general / starts / exact / ends) - confirm before relying
// on it or removing it.
55 TSearchType = ( stGeneral, stStarts, stExactMatch, stEnds );
56
// Frees every dictionary relevance array stored in WordSequence (passing
// DictionaryCount to FreeUInt32Array as the array size) and then empties
// the list. The list object itself is not destroyed.
procedure ClearWordSequence( WordSequence: TList;
                             DictionaryCount: longint );
var
  ItemIndex: longint;
  Relevances: UInt32ArrayPointer;
begin
  ItemIndex := 0;
  while ItemIndex < WordSequence.Count do
  begin
    // copy into a local first: FreeUInt32Array is handed a variable,
    // exactly as the list entry is never modified in place
    Relevances := WordSequence[ ItemIndex ];
    FreeUInt32Array( Relevances, DictionaryCount );
    inc( ItemIndex );
  end;

  WordSequence.Clear;
end;
70
// Releases a list of word sequences as produced by SearchHelpFile.
//
// WordSequences   - list whose items are TLists of dictionary relevance
//                   arrays; may be nil, in which case nothing is done.
// DictionaryCount - array size handed on to ClearWordSequence when
//                   freeing each relevance array.
//
// Every contained list is emptied and destroyed, then WordSequences itself
// is cleared (but not destroyed - it still belongs to the caller).
//
// The parameter is named WordSequences to match the interface declaration;
// the previous spelling (WordSequence) clashed with the local variable of
// the same name.
procedure ClearWordSequences( WordSequences: TList;
                              DictionaryCount: longint );
var
  SequenceIndex: longint;
  WordSequence: TList;
begin
  if WordSequences = nil then
    exit;

  for SequenceIndex := 0 to WordSequences.Count - 1 do
  begin
    WordSequence := WordSequences[ SequenceIndex ];
    ClearWordSequence( WordSequence,
                       DictionaryCount );
    WordSequence.Destroy;
  end;
  WordSequences.Clear;
end;
86
87
// Relevance of a search word that is already known to match ReferenceWord.
// The result is mwExactWord scaled by Length( SearchWord ) over
// Length( ReferenceWord ) using integer arithmetic, and is never allowed
// to drop to zero (a match always scores at least 1).
function MatchedWordRelevance( const SearchWord: string;
                               const ReferenceWord: string ): longint;
var
  Scaled: longint;
begin
  Scaled := ( mwExactWord * Length( SearchWord ) )
            div Length( ReferenceWord );

  if Scaled = 0 then
    Scaled := 1;

  Result := Scaled;
end;
99
// Scores how well SearchWord matches ReferenceWord.
// Returns 0 when CaseInsensitivePos does not find SearchWord inside
// ReferenceWord; otherwise returns MatchedWordRelevance for the pair.
function CompareWord( const SearchWord: string;
                      const ReferenceWord: string ): longint;
begin
  if CaseInsensitivePos( SearchWord, ReferenceWord ) <> 0 then
    Result := MatchedWordRelevance( SearchWord, ReferenceWord )
  else
    // no match
    Result := 0;
end;
116
// General dictionary search: partial matches count.
// For every dictionary word of HelpFile, Results[ index ] receives the
// CompareWord score of SearchWord against that word (0 = no match).
// Every entry of Results is written, so no prior clearing is needed.
procedure SearchDictionary( HelpFile: THelpFile;
                            SearchWord: string;
                            Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  pCandidate: pstring;
begin
  WordCount := HelpFile.DictionaryCount;
  WordIndex := 0;
  while WordIndex < WordCount do
  begin
    pCandidate := HelpFile.DictionaryWordPtrs[ WordIndex ];
    Results[ WordIndex ] := CompareWord( SearchWord, pCandidate^ );
    inc( WordIndex );
  end;
end;
134
// Exact dictionary search (case is ignored by StringsSame, per the
// original note on this routine).
// Results is first zeroed; every dictionary word that StringsSame reports
// as equal to SearchWord then gets the fixed score mwExactWord.
procedure SearchDictionaryExact( HelpFile: THelpFile;
                                 SearchWord: string;
                                 Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  pCandidate: pstring;
begin
  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  WordIndex := 0;
  while WordIndex < WordCount do
  begin
    pCandidate := HelpFile.DictionaryWordPtrs[ WordIndex ];
    if StringsSame( SearchWord, pCandidate^ ) then
      Results[ WordIndex ] := mwExactWord;
    inc( WordIndex );
  end;
end;
153
// Prefix dictionary search: finds dictionary words that start with
// SearchWord (as decided by StrStarts).
// Results is first zeroed; each matching word is scored with
// MatchedWordRelevance.
procedure SearchDictionaryStarts( HelpFile: THelpFile;
                                  SearchWord: string;
                                  Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  Candidate: string;
begin
  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  WordIndex := 0;
  while WordIndex < WordCount do
  begin
    Candidate := HelpFile.DictionaryWords[ WordIndex ];
    if StrStarts( SearchWord, Candidate ) then
      Results[ WordIndex ] := MatchedWordRelevance( SearchWord, Candidate );
    inc( WordIndex );
  end;
end;
172
// Suffix dictionary search: finds dictionary words that end with
// SearchWord (as decided by StrEnds).
// Results is first zeroed; each matching word is scored with
// MatchedWordRelevance.
procedure SearchDictionaryEnds( HelpFile: THelpFile;
                                SearchWord: string;
                                Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  Candidate: string;
begin
  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  WordIndex := 0;
  while WordIndex < WordCount do
  begin
    Candidate := HelpFile.DictionaryWords[ WordIndex ];
    if StrEnds( SearchWord, Candidate ) then
      Results[ WordIndex ] := MatchedWordRelevance( SearchWord, Candidate );
    inc( WordIndex );
  end;
end;
191
// Searches the title of every topic in HelpFile for SearchWord.
// Each title is split on spaces with GetNextValue and every word is scored
// with CompareWord. A matching word adds (weighting * word relevance) to
// Results[ topic index ], where the weighting is
//   mwOnlyTitleWord  - first word, and the scan position has reached the
//                      end of the title (it is the only word)
//   mwFirstTitleWord - first word of a longer title
//   mwTitleWord      - any later word
// Results is accumulated into, not cleared.
// NOTE(review): the scan loop runs while P < Length( title ), so a title
// of length 1 is never examined - confirm this is intended given
// GetNextValue's contract.
procedure SearchTopicTitles( HelpFile: THelpFile;
                             SearchWord: string;
                             Results: UInt32ArrayPointer );
var
  TopicIndex: longint;
  CurrentTopic: TTopic;
  pTitle: pstring;
  TitleWord: string;
  WordNumber: longint;
  WordRelevance: longint;
  Weighting: longint;
  ScanPos: longint;
begin
  for TopicIndex := 0 to HelpFile.TopicCount - 1 do
  begin
    CurrentTopic := HelpFile.Topics[ TopicIndex ];
    pTitle := CurrentTopic.TitlePtr;

    WordNumber := 0;
    ScanPos := 1;
    while ScanPos < Length( pTitle^ ) do
    begin
      GetNextValue( pTitle^, ScanPos, TitleWord, ' ' );
      WordRelevance := CompareWord( SearchWord, TitleWord );

      if WordRelevance > 0 then
      begin
        if WordNumber > 0 then
          Weighting := mwTitleWord
        else if ScanPos < Length( pTitle^ ) then
          // first word, with more of the title still to come
          Weighting := mwFirstTitleWord
        else
          // first word and nothing left: it is the only word
          Weighting := mwOnlyTitleWord;

        inc( Results[ TopicIndex ], Weighting * WordRelevance );
      end;

      inc( WordNumber );
    end;
  end;
end;
243
// Searches every index entry of HelpFile for SearchWord.
// Each entry is split on spaces with GetNextValue and every word is scored
// with CompareWord. A matching word adds (weighting * word relevance) to
// Results[ Topic.Index ] of the topic attached to that index entry, where
// the weighting is
//   mwOnlyIndexWord  - first word, and the scan position has reached the
//                      end of the entry (it is the only word)
//   mwFirstIndexWord - first word of a longer entry
//   mwIndexWord      - any later word
// Results is accumulated into, not cleared.
// NOTE(review): the scan loop runs while P < Length( entry ), so an entry
// of length 1 is never examined - confirm this is intended given
// GetNextValue's contract.
procedure SearchIndex( HelpFile: THelpFile;
                       SearchWord: string;
                       Results: UInt32ArrayPointer );
var
  EntryIndex: longint;
  EntryTopic: TTopic;
  pEntry: pstring;
  EntryWord: string;
  WordNumber: longint;
  WordRelevance: longint;
  Weighting: longint;
  ScanPos: longint;
begin
  for EntryIndex := 0 to HelpFile.Index.Count - 1 do
  begin
    EntryTopic := HelpFile.Index.Objects[ EntryIndex ] as TTopic;
    pEntry := HelpFile.IndexEntryPtr[ EntryIndex ];

    WordNumber := 0;
    ScanPos := 1;
    while ScanPos < Length( pEntry^ ) do
    begin
      GetNextValue( pEntry^, ScanPos, EntryWord, ' ' );
      WordRelevance := CompareWord( SearchWord, EntryWord );

      if WordRelevance > 0 then
      begin
        if WordNumber > 0 then
          Weighting := mwIndexWord
        else if ScanPos < Length( pEntry^ ) then
          // first word, with more of the entry still to come
          Weighting := mwFirstIndexWord
        else
          // first word and nothing left: it is the only word
          Weighting := mwOnlyIndexWord;

        inc( Results[ EntryTopic.Index ], Weighting * WordRelevance );
      end;

      inc( WordNumber );
    end;
  end;
end;
294
295// ------------------------------------------------------
296
// Master search function. Given a search query, searches topic text,
// titles and index entries of HelpFile.
//
// HelpFile      - file to search; nothing is done if it has no SearchTable.
// Query         - the terms to look for. Each term has one or more parts
//                 (e.g. CAKE_SAUSAGE) and a combine method.
// Results       - matching topics (relevance > 0 and not excluded) are
//                 appended here, with Topic.SearchRelevance set.
// WordSequences - may be nil. When not nil, the per-term list of dictionary
//                 relevance arrays is appended for every term that is not
//                 cmExcluded; the caller then owns those lists and should
//                 release them with ClearWordSequences.
//
// Changes from the previous version:
//  - the per-term word sequence is now freed whenever it is not handed to
//    the caller (previously it leaked for cmExcluded terms when
//    WordSequences was supplied);
//  - TopicsMatchingTerm is cleared at the start of every term, so a term
//    with no parts no longer reads uninitialised memory or divides by a
//    zero part count;
//  - an unused debug-only local was removed.
procedure SearchHelpFile( HelpFile: THelpFile;
                          Query: TTextSearchQuery;
                          Results: TList;
                          WordSequences: TList );
var
  TopicCount: longint;
  Topic: TTopic;
  TopicIndex: longint;
  TermIndex: longint;
  Term: TSearchTerm;

  DictionaryRelevances: UInt32ArrayPointer;

  TopicsMatchingDictWord: UInt32ArrayPointer; // flags
  TopicsMatchingTermPart: UInt32ArrayPointer; // flags
  TopicsMatchingTerm: UInt32ArrayPointer;     // flag then relevances
  TopicRelevances: UInt32ArrayPointer;
  TopicsExcluded: UInt32ArrayPointer;

  TopicRelevanceForTerm: longint;

  WordRelevance: longint;
  DictIndex: longint;

  TermPartIndex: longint;
  TermPart: string;

  TermWordSequence: TList;
  KeepWordSequence: boolean; // true when the caller takes ownership
begin
  if HelpFile.SearchTable = nil then
  begin
    exit;
  end;

  // Reset flags per topic
  TopicCount := HelpFile.TopicCount;

  // Get memory for topic relevance arrays

  AllocUInt32Array( TopicsMatchingDictWord,
                    TopicCount );
  AllocUInt32Array( TopicsMatchingTermPart,
                    TopicCount );
  AllocUInt32Array( TopicsMatchingTerm,
                    TopicCount );
  AllocUInt32Array( TopicRelevances, // functions as a flag and a cumulative relevance
                    TopicCount );
  AllocUInt32Array( TopicsExcluded,  // Exclusions are treated as boolean only
                    TopicCount );

  ClearUInt32Array( TopicRelevances,
                    TopicCount );
  ClearUInt32Array( TopicsExcluded,
                    TopicCount );

  for TermIndex := 0 to Query.TermCount - 1 do
  begin
    Term := Query.Term[ TermIndex ];

    LogEvent(LogSearch, 'Searching for term "'
                        + Term.Text
                        + '", '
                        + IntToStr( Term.Parts.Count )
                        + ' parts' );

    // Start every term from "no topic matches". The first term part
    // overwrites this; it only matters for a term that has no parts.
    ClearUInt32Array( TopicsMatchingTerm,
                      TopicCount );

    // look thru all parts of the term. eg. CAKE_SAUSAGE

    TermWordSequence := TList.Create;

    // An inclusive term's matches are remembered for the caller,
    // provided the caller asked for them at all.
    KeepWordSequence := false;
    if WordSequences <> nil then
      if Term.CombineMethod <> cmExcluded then
        KeepWordSequence := true;

    if KeepWordSequence then
      WordSequences.Add( TermWordSequence );

    for TermPartIndex := 0 to Term.Parts.Count - 1 do
    begin
      TermPart := Term.Parts[ TermPartIndex ];

      LogEvent(LogSearch, ' Searching for [' + TermPart + ']' );

      AllocUInt32Array( DictionaryRelevances,
                        HelpFile.DictionaryCount );

      // the array now belongs to the word sequence
      TermWordSequence.Add( DictionaryRelevances );

      // Search the dictionary for matches.
      // alpha numeric match

      if Term.Parts.Count = 1 then
        // general match allowing all kinds of partial matches
        SearchDictionary( HelpFile,
                          TermPart,
                          DictionaryRelevances )

      else if TermPartIndex = 0 then
        // first term part: word must match end of a topic word e.g. must end in "cake"
        SearchDictionaryEnds( HelpFile,
                              TermPart,
                              DictionaryRelevances )

      else if TermPartIndex = Term.Parts.Count - 1 then
        // last term part: word must match start of a topic word e.g. must start with "sausage"
        SearchDictionaryStarts( HelpFile,
                                TermPart,
                                DictionaryRelevances )

      else
        // intermediate term part: word must match exactly e.g. must be "_"
        SearchDictionaryExact( HelpFile,
                               TermPart,
                               DictionaryRelevances );

      // For each word in the dictionary that matches
      // this search term part, search topic texts

      LogEvent(LogSearch, ' Dictionary search done' );
      ClearUInt32Array( TopicsMatchingTermPart,
                        TopicCount );

      for DictIndex := 0 to HelpFile.DictionaryCount - 1 do
      begin
        WordRelevance := DictionaryRelevances[ DictIndex ];
        if WordRelevance > 0 then
        begin
          // Search for occurrences of this word
          // within the text of topics
          HelpFile.SearchTable.Search( DictIndex,
                                       TopicsMatchingDictWord );

          // TopicsMatchingDictWord now flags the topics that
          // contain this word; merge them into the term part flags.
          OrUInt32Array( TopicsMatchingDictWord,
                         TopicsMatchingTermPart,
                         TopicCount );
        end
      end;

      LogEvent(LogSearch, 'Topic searches done' );

      if TermPartIndex = 0 then
        // first part, just copy
        CopyUInt32Array( TopicsMatchingTermPart,
                         TopicsMatchingTerm,
                         TopicCount )
      else
        // and with previous term part results
        AndUInt32Array( TopicsMatchingTermPart,
                        TopicsMatchingTerm,
                        TopicCount );

      // loop for next term part (IPF word)
    end;

    // Now we have searched the dictionary and worked out matching topics
    // for all parts of the term. Now combine all together

    LogEvent(LogSearch, 'Checking for sequences' );
    for TopicIndex := 0 to TopicCount - 1 do
    begin
      if TopicsMatchingTerm[ TopicIndex ] > 0 then
      begin
        Topic := HelpFile.Topics[ TopicIndex ];
        // Topic text contained a match for the all the parts
        // of the term.
        // Now we need to:
        // - verify that they actually occur all in a sequence (if it's a multi-part term)
        // - count occurrences for relevance.

        TopicRelevanceForTerm :=
          Topic.SearchForWordSequences( TermWordSequence,
                                        false ); // don't stop at first match

        TopicRelevanceForTerm :=
          TopicRelevanceForTerm div Term.Parts.Count; // divide to bring back into scale

        TopicsMatchingTerm[ TopicIndex ] := TopicRelevanceForTerm;

      end;
    end;

    if not KeepWordSequence then
    begin
      // nobody else holds the sequence (caller did not ask for it, or
      // this is an excluded term), so release it here
      ClearWordSequence( TermWordSequence,
                         HelpFile.DictionaryCount );
      TermWordSequence.Destroy;
    end;

    // Search titles and index

    LogEvent(LogSearch, ' Searching titles' );
    SearchTopicTitles( HelpFile, Term.Text, TopicsMatchingTerm );

    LogEvent(LogSearch, ' Searching index' );
    SearchIndex( HelpFile, Term.Text, TopicsMatchingTerm );

    LogEvent(LogSearch, ' Combining' );
    case Term.CombineMethod of
      cmOptional:
        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        TopicCount );

      cmRequired:
      begin
        // if zero then add to exclusions
        NotOrUInt32Array( TopicsMatchingTerm,
                          TopicsExcluded,
                          TopicCount );

        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        TopicCount );
      end;

      cmExcluded:
        OrUInt32Array( TopicsMatchingTerm,
                       TopicsExcluded,
                       TopicCount );
    end;

//    Term.ClearMatches;

    // loop for next term...
  end;

  LogEvent(LogSearch, 'Search completed, converting to list' );

  // Now convert to list form.

  for TopicIndex := 0 to TopicCount - 1 do
  begin
    if TopicsExcluded[ TopicIndex ] = 0 then
    begin
      Topic := HelpFile.Topics[ TopicIndex ];
      Topic.SearchRelevance := TopicRelevances[ TopicIndex ];
      if Topic.SearchRelevance > 0 then
      begin
        Results.Add( Topic );
      end;
    end;
  end;

  LogEvent(LogSearch, 'Freeing arrays' );
  FreeUInt32Array( TopicRelevances, TopicCount );
  FreeUInt32Array( TopicsExcluded, TopicCount );
  FreeUInt32Array( TopicsMatchingTerm, TopicCount );
  FreeUInt32Array( TopicsMatchingTermPart, TopicCount );
  FreeUInt32Array( TopicsMatchingDictWord, TopicCount );

  LogEvent(LogSearch, 'Done' );
end;
558
559Initialization
560End.
Note: See TracBrowser for help on using the repository browser.