source: trunk/NewView/SearchUnit.pas@ 400

Last change on this file since 400 was 333, checked in by RBRi, 17 years ago

search bug fix; more logging

  • Property svn:eol-style set to native
File size: 18.1 KB
RevLine 
[18]1Unit SearchUnit;
2
3// NewView - a new OS/2 Help Viewer
4// Copyright 2003 Aaron Lawrence (aaronl at consultant dot com)
5// This software is released under the Gnu Public License - see readme.txt
6
7Interface
8
9// Contains code to search help files.
10
11uses
12 Classes,
[33]13 HelpFile,
14 TextSearchQuery,
15 IPFFileFormatUnit;
[18]16
17const
18 // match weightings (multipliers applied in SearchTopicTitles / SearchIndex)
19 mwOnlyTitleWord = 200; // matched word is the only word of the title
20 mwFirstTitleWord = 50; // matched word is the first of several title words
21 mwTitleWord = 20; // matched word is a later title word
22
23 mwOnlyIndexWord = 100; // matched word is the only word of the index entry
24 mwFirstIndexWord = 20; // matched word is the first of several index entry words
25 mwIndexWord = 10; // matched word is a later index entry word
26 mwTopicTextWord = 1; // NOTE(review): not referenced in the visible part of this unit
27
28 // best case match weighting of a word: the value given to an exact match,
29 mwExactWord = 20; // and the scale used by MatchedWordRelevance for partial matches
30
31
32// note on weightings. The title/index weightings
33// are multiplied by word weightings.
34// Topic text matches are equal to word weighting
35// times word weighting.
36
37procedure SearchHelpFile( HelpFile: THelpFile;
38 Query: TTextSearchQuery;
39 Results: TList; // matching topics are appended to this list
40 WordSequences: TList ); // may be nil; otherwise receives one word sequence list per non-excluded term
41
42// clear a list of word sequences (as produced by above), destroying each contained list
43procedure ClearWordSequences( WordSequences: TList;
44 DictionaryCount: longint );
45
46Implementation
47
48uses
49 SysUtils,
[43]50 DebugUnit,
[115]51 StringUtilsUnit,
[18]52 HelpTopic;
53
54type
55 TSearchType = ( stGeneral, stStarts, stExactMatch, stEnds ); // NOTE(review): not referenced in the visible part of this unit - confirm before removing
56
// Releases every dictionary relevance array stored in WordSequence
// (each one is freed with DictionaryCount as its size) and then
// empties the list. The list object itself is left alive.
procedure ClearWordSequence( WordSequence: TList;
                             DictionaryCount: longint );
var
  i: longint;
  pRelevances: UInt32ArrayPointer;
begin
  i := 0;
  while i < WordSequence.Count do
  begin
    // copy into a local first: the free routine is handed a variable,
    // not the list slot
    pRelevances := WordSequence[ i ];
    FreeUInt32Array( pRelevances, DictionaryCount );
    inc( i );
  end;

  WordSequence.Clear;
end;
70
// Frees a list of word sequences as filled in by SearchHelpFile.
// For each contained sequence the relevance arrays are released
// (see ClearWordSequence) and the sequence list is destroyed;
// finally WordSequences itself is emptied, but not destroyed.
//
// WordSequences   - list of TList, one per search term
// DictionaryCount - size used when freeing each relevance array
//
// The parameter is named WordSequences to match the interface
// declaration and the body; the local WordSequence holds the
// sequence currently being released.
procedure ClearWordSequences( WordSequences: TList;
                              DictionaryCount: longint );
var
  SequenceIndex: longint;
  WordSequence: TList;
begin
  for SequenceIndex := 0 to WordSequences.Count - 1 do
  begin
    WordSequence := WordSequences[ SequenceIndex ];
    ClearWordSequence( WordSequence,
                       DictionaryCount );
    WordSequence.Destroy;
  end;
  WordSequences.Clear;
end;
86
87
// Given a search word which is known to match ReferenceWord,
// return the relevance: mwExactWord scaled by the fraction of the
// reference word that the search word covers (integer division),
// but never less than 1.
function MatchedWordRelevance( const SearchWord: string;
                               const ReferenceWord: string ): longint;
var
  Scaled: longint;
begin
  Scaled := ( mwExactWord * Length( SearchWord ) )
            div Length( ReferenceWord );

  if Scaled <> 0 then
    Result := Scaled
  else
    // a match always counts for something
    Result := 1;
end;
99
// Compares the given search word against the given reference word,
// ignoring case. The search word matches when it occurs anywhere
// inside the reference word.
// Returns a value indicating how well the search word matches
// (see MatchedWordRelevance); 0 = not at all.
function CompareWord( const SearchWord: string;
                      const ReferenceWord: string ): longint;
begin
  if CaseInsensitivePos( SearchWord, ReferenceWord ) <> 0 then
    Result := MatchedWordRelevance( SearchWord, ReferenceWord )
  else
    // no match
    Result := 0;
end;
118
// Search the help file dictionary for words that match
// the given search word. Partial matches are considered.
// On return Results holds, for every dictionary index, the
// relevance reported by CompareWord (0 where the word does not match).
// Every entry is written, so Results need not be cleared beforehand.
procedure SearchDictionary( HelpFile: THelpFile;
                            SearchWord: string;
                            Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  LastIndex: longint;
  pWord: pstring;
begin
  LastIndex := HelpFile.DictionaryCount - 1;

  for WordIndex := 0 to LastIndex do
  begin
    pWord := HelpFile.DictionaryWordPtrs[ WordIndex ];
    Results[ WordIndex ] := CompareWord( SearchWord, pWord^ );
  end;
end;
135
// Search the help file dictionary for words that are equal to
// the given search word, ignoring case.
// Results is zeroed first; each matching dictionary index is then
// set to mwExactWord.
procedure SearchDictionaryExact( HelpFile: THelpFile;
                                 SearchWord: string;
                                 Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  pWord: pstring;
begin
  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  for WordIndex := 0 to WordCount - 1 do
  begin
    pWord := HelpFile.DictionaryWordPtrs[ WordIndex ];
    if StrEqualIgnoringCase( SearchWord, pWord^ ) then
    begin
      Results[ WordIndex ] := mwExactWord;
    end;
  end;
end;
154
// Search the help file dictionary for words that
// start with the given word (ignoring case).
// Results is zeroed first; each matching dictionary index then
// receives the relevance computed by MatchedWordRelevance.
procedure SearchDictionaryStarts( HelpFile: THelpFile;
                                  SearchWord: string;
                                  Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  Candidate: string;
begin
  if IsLogAspectsEnabled(LogSearch) then
    LogEvent(LogSearch, ' calling SearchDictionaryStarts "' + SearchWord + '"');

  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  for WordIndex := 0 to WordCount - 1 do
  begin
    Candidate := HelpFile.DictionaryWords[ WordIndex ];
    if StrStartsWithIgnoringCase( Candidate, SearchWord ) then
      Results[ WordIndex ] := MatchedWordRelevance( SearchWord, Candidate );
  end;
end;
180
[333]181
// Search the help file dictionary for words that
// end with the given word (ignoring case).
// Results is zeroed first; each matching dictionary index then
// receives the relevance computed by MatchedWordRelevance.
procedure SearchDictionaryEnds( HelpFile: THelpFile;
                                SearchWord: string;
                                Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  Candidate: string;
begin
  if IsLogAspectsEnabled(LogSearch) then
    LogEvent(LogSearch, ' calling SearchDictionaryEnds for "' + SearchWord + '"');

  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  for WordIndex := 0 to WordCount - 1 do
  begin
    Candidate := HelpFile.DictionaryWords[ WordIndex ];
    if StrEndsWithIgnoringCase( Candidate, SearchWord ) then
      Results[ WordIndex ] := MatchedWordRelevance( SearchWord, Candidate );
  end;
end;
206
// Search titles of topics for given searchword.
//
// Each topic title is split into words with StrExtractStringsQuoted
// and every word is compared with SearchWord (CompareWord). For each
// matching word the word relevance is multiplied by
//   mwOnlyTitleWord  - the title consists of that single word
//   mwFirstTitleWord - it is the first of several words
//   mwTitleWord      - it is any later word
// and the product is ADDED to Results[ topic index ]; Results is
// accumulated into, not cleared.
procedure SearchTopicTitles( HelpFile: THelpFile;
                             SearchWord: string;
                             Results: UInt32ArrayPointer );
var
  TopicIndex: longint;
  pTitle: pstring;
  TitleWord: string;
  Topic: TTopic;
  WordRelevance: longint;
  TitleWordRelevance: longint;
  tmpTitleWords : TStringList;
  i : integer;
begin
  if IsLogAspectsEnabled(LogSearch) then
  begin
    LogEvent(LogSearch, ' calling SearchTopicTitles for "' + SearchWord + '"');
  end;

  tmpTitleWords := TStringList.Create;
  try
    // Search topic titles
    for TopicIndex := 0 to HelpFile.TopicCount - 1 do
    begin
      Topic := HelpFile.Topics[ TopicIndex ];
      pTitle := Topic.TitlePtr;

      tmpTitleWords.Clear;
      StrExtractStringsQuoted( tmpTitleWords, pTitle^ );

      for i := 0 to tmpTitleWords.Count - 1 do
      begin
        TitleWord := tmpTitleWords[ i ];

        WordRelevance := CompareWord( SearchWord, TitleWord );
        if WordRelevance > 0 then
        begin
          if i = 0 then
          begin
            // matching the first word is best
            if tmpTitleWords.Count = 1 then
              // in fact it's the only word
              TitleWordRelevance := mwOnlyTitleWord * WordRelevance
            else
              TitleWordRelevance := mwFirstTitleWord * WordRelevance;
          end
          else
          begin
            TitleWordRelevance := mwTitleWord * WordRelevance;
          end;

          inc( Results[ TopicIndex ],
               TitleWordRelevance );
        end;
      end;
    end;
  finally
    // release the scratch list even if a title lookup raises
    tmpTitleWords.Destroy;
  end;
end;
269
// Search index entries for given searchword.
//
// Each index entry is split into words with StrExtractStringsQuoted
// and every word is compared with SearchWord (CompareWord). For each
// matching word the word relevance is multiplied by
//   mwOnlyIndexWord  - the entry consists of that single word
//   mwFirstIndexWord - it is the first of several words
//   mwIndexWord      - it is any later word
// and the product is ADDED to Results[ Topic.Index ], where Topic is
// the object attached to the index entry; Results is accumulated
// into, not cleared.
procedure SearchIndex( HelpFile: THelpFile;
                       SearchWord: string;
                       Results: UInt32ArrayPointer );
var
  IndexIndex: longint;
  pIndexEntry: pstring;
  IndexEntryWord: string;
  Topic: TTopic;
  WordRelevance: longint;
  IndexEntryWordRelevance: longint;
  tmpIndexWords : TStringList;
  i : integer;
begin
  if IsLogAspectsEnabled(LogSearch) then
  begin
    LogEvent(LogSearch, ' calling SearchIndex for "' + SearchWord + '"');
  end;

  tmpIndexWords := TStringList.Create;
  try
    for IndexIndex := 0 to HelpFile.Index.Count - 1 do
    begin
      Topic := HelpFile.Index.Objects[ IndexIndex ] as TTopic;
      pIndexEntry := HelpFile.IndexEntryPtr[ IndexIndex ];

      tmpIndexWords.Clear;
      StrExtractStringsQuoted( tmpIndexWords, pIndexEntry^ );

      for i := 0 to tmpIndexWords.Count - 1 do
      begin
        IndexEntryWord := tmpIndexWords[ i ];

        WordRelevance := CompareWord( SearchWord, IndexEntryWord );
        if WordRelevance > 0 then
        begin
          if i = 0 then
          begin
            // matching the first word is best
            if tmpIndexWords.Count = 1 then
              // in fact it's the only word
              IndexEntryWordRelevance := mwOnlyIndexWord * WordRelevance
            else
              IndexEntryWordRelevance := mwFirstIndexWord * WordRelevance;
          end
          else
          begin
            IndexEntryWordRelevance := mwIndexWord * WordRelevance;
          end;

          inc( Results[ Topic.Index ],
               IndexEntryWordRelevance );
        end;
      end;
    end;
  finally
    // release the scratch list even if an index lookup raises
    tmpIndexWords.Destroy;
  end;
end;
332
333// ------------------------------------------------------
334
// Master search function. Given a search query,
// searches topic text, titles, index entries.
// Matching topics are added to Results, with their
// SearchRelevance set appropriately.
//
// HelpFile      - file to search. Nothing is searched (and Results is
//                 left untouched) when HelpFile.SearchTable is nil.
// Query         - the query; its terms are processed one at a time and
//                 combined according to each term's CombineMethod.
// Results       - every topic that is not excluded and ends up with a
//                 relevance > 0 is appended.
// WordSequences - may be nil. Otherwise, for every term that is not
//                 cmExcluded, a TList holding one dictionary relevance
//                 array per term part is appended; the caller releases
//                 them with ClearWordSequences.
procedure SearchHelpFile( HelpFile: THelpFile;
                          Query: TTextSearchQuery;
                          Results: TList;
                          WordSequences: TList );
var
  tmpTopicCount: longint;
  tmpTopic: TTopic;
  tmpTopicIndex: longint;
  tmpTermIndex: longint;
  tmpTerm: TSearchTerm;

  DictionaryRelevances: UInt32ArrayPointer;

  TopicsMatchingDictWord: UInt32ArrayPointer; // flags
  TopicsMatchingTermPart: UInt32ArrayPointer; // flags
  TopicsMatchingTerm: UInt32ArrayPointer; // flag then relevances
  TopicRelevances: UInt32ArrayPointer;
  TopicsExcluded: UInt32ArrayPointer;

  TopicRelevanceForTerm: longint;

  WordRelevance: longint;
  DictIndex: longint;

  TermPartIndex: longint;
  TermPart: string;

  TermWordSequence: TList;
begin
  LogEvent(LogSearch, 'SearchHelpFile');
  Query.Log;

  if HelpFile.SearchTable = nil then
  begin
    exit;
  end;

  // Reset flags per topic
  tmpTopicCount := HelpFile.TopicCount;

  // Get memory for topic relevance arrays

  AllocUInt32Array( TopicsMatchingDictWord, tmpTopicCount );
  AllocUInt32Array( TopicsMatchingTermPart, tmpTopicCount );
  AllocUInt32Array( TopicsMatchingTerm, tmpTopicCount );
  AllocUInt32Array( TopicRelevances, tmpTopicCount ); // functions as a flag and a cumulative relevance

  AllocUInt32Array( TopicsExcluded, tmpTopicCount ); // Exclusions are treated as boolean only


  ClearUInt32Array( TopicRelevances, tmpTopicCount );
  ClearUInt32Array( TopicsExcluded, tmpTopicCount );

  for tmpTermIndex := 0 to Query.TermCount - 1 do
  begin
    tmpTerm := Query.Term[ tmpTermIndex ];

    if IsLogAspectsEnabled(LogSearch) then
    begin
      LogEvent(LogSearch, 'Searching for term "'
                          + tmpTerm.Text
                          + '", '
                          + IntToStr( tmpTerm.Parts.Count )
                          + ' parts' );
    end;

    // look thru all parts of the term. eg. CAKE_SAUSAGE

    TermWordSequence := TList.Create;

    if WordSequences <> nil then
      if tmpTerm.CombineMethod <> cmExcluded then
      begin
        // this term is an inclusive one, so we want to remember the matches
        WordSequences.Add( TermWordSequence );
      end;

    // Start from "no topic matches". The first term part overwrites
    // this below; a term without any parts would otherwise leave the
    // array holding stale or uninitialised values.
    ClearUInt32Array( TopicsMatchingTerm, tmpTopicCount );

    for TermPartIndex := 0 to tmpTerm.Parts.Count - 1 do
    begin
      TermPart := tmpTerm.Parts[ TermPartIndex ];

      if IsLogAspectsEnabled(LogSearch) then
      begin
        LogEvent(LogSearch, ' Searching for TermPart [' + TermPart + ']' );
      end;

      AllocUInt32Array( DictionaryRelevances, HelpFile.DictionaryCount );

      // the sequence list now owns the array (freed by ClearWordSequence)
      TermWordSequence.Add( DictionaryRelevances );

      // Search the dictionary for matches.
      // alpha numeric match

      if tmpTerm.Parts.Count = 1 then
      begin
        if IsLogAspectsEnabled(LogSearch) then
        begin
          LogEvent(LogSearch, ' Term has only one part...' );
          LogEvent(LogSearch, ' SearchDictionary [' + TermPart + ']' );
        end;

        // general match allowing all kinds of partial matches
        SearchDictionary( HelpFile, TermPart, DictionaryRelevances )
      end

      else if TermPartIndex = 0 then
      begin
        if IsLogAspectsEnabled(LogSearch) then
        begin
          LogEvent(LogSearch, ' Term has more then one part... we are at first' );
          LogEvent(LogSearch, ' SearchDictionaryEnd [' + TermPart + ']' );
        end;

        // first term part: word must match end of a topic word e.g. must end in "cake"
        SearchDictionaryEnds( HelpFile, TermPart, DictionaryRelevances )
      end

      else if TermPartIndex = tmpTerm.Parts.Count - 1 then
      begin
        if IsLogAspectsEnabled(LogSearch) then
        begin
          LogEvent(LogSearch, ' Term has more then one part... we are at last' );
          // log the routine that is really called for the last part
          LogEvent(LogSearch, ' SearchDictionaryStarts [' + TermPart + ']' );
        end;

        // last term part: word must match start of a topic word e.g. must start with "sausage"
        SearchDictionaryStarts( HelpFile, TermPart, DictionaryRelevances )
      end

      else
      begin
        if IsLogAspectsEnabled(LogSearch) then
        begin
          LogEvent(LogSearch, ' Term has more then one part... we are inside' );
          // log the routine that is really called for a middle part
          LogEvent(LogSearch, ' SearchDictionaryExact [' + TermPart + ']' );
        end;

        // intermediate term part: word must match exactly e.g. must be "_"
        SearchDictionaryExact( HelpFile, TermPart, DictionaryRelevances )
      end;

      // For each word in the dictionary that matches
      // this search term part, search topic texts

      LogEvent(LogSearch, ' Dictionary search done' );
      ClearUInt32Array( TopicsMatchingTermPart, tmpTopicCount );

      for DictIndex := 0 to HelpFile.DictionaryCount - 1 do
      begin
        WordRelevance := DictionaryRelevances[ DictIndex ];
        if WordRelevance > 0 then
        begin
          // Search for occurrences of this word
          // within the text of topics
          HelpFile.SearchTable.Search( DictIndex,
                                       TopicsMatchingDictWord );

          // TopicsMatchingDictWord now flags the topics
          // that contain this word; merge them into the
          // set of topics matching this term part.
          OrUInt32Array( TopicsMatchingDictWord,
                         TopicsMatchingTermPart,
                         tmpTopicCount );
        end
      end;

      LogEvent(LogSearch, ' Topic searches done' );

      if TermPartIndex = 0 then
        // first part, just copy
        CopyUInt32Array( TopicsMatchingTermPart,
                         TopicsMatchingTerm,
                         tmpTopicCount )
      else
        // and with previous term part results
        AndUInt32Array( TopicsMatchingTermPart,
                        TopicsMatchingTerm,
                        tmpTopicCount );

      // loop for next term part (IPF word)
    end;

    // Now we have searched the dictionary and worked out matching topics
    // for all parts of the term. Now combine all together

    LogEvent(LogSearch, 'Checking for sequences' );
    for tmpTopicIndex := 0 to tmpTopicCount - 1 do
    begin
      if TopicsMatchingTerm[ tmpTopicIndex ] > 0 then
      begin
        tmpTopic := HelpFile.Topics[ tmpTopicIndex ];
        // Topic text contained a match for the all the parts
        // of the term.
        // Now we need to:
        // - verify that they actually occur all in a sequence (if it's a multi-part term)
        // - count occurrences for relevance.

        TopicRelevanceForTerm := tmpTopic.SearchForWordSequences( TermWordSequence, false ); // don't stop at first match

        TopicRelevanceForTerm := TopicRelevanceForTerm div tmpTerm.Parts.Count; // divide to bring back into scale

        // the flag is replaced by the relevance from here on
        TopicsMatchingTerm[ tmpTopicIndex ] := TopicRelevanceForTerm;

      end;
    end;

    // The sequence was handed to the caller only when WordSequences is
    // assigned AND the term is not an excluded one. In every other case
    // it is still ours and must be released here, otherwise the list
    // and its relevance arrays leak.
    if ( WordSequences = nil )
       or ( tmpTerm.CombineMethod = cmExcluded ) then
    begin
      // we don't need to keep the sequence
      ClearWordSequence( TermWordSequence,
                         HelpFile.DictionaryCount );
      TermWordSequence.Destroy;
    end;

    // Search titles and index
    // (both add their relevances on top of the topic text relevances)

    LogEvent(LogSearch, ' Searching titles' );
    SearchTopicTitles( HelpFile, tmpTerm.Text, TopicsMatchingTerm );

    LogEvent(LogSearch, ' Searching index' );
    SearchIndex( HelpFile, tmpTerm.Text, TopicsMatchingTerm );

    LogEvent(LogSearch, ' Combining' );
    case tmpTerm.CombineMethod of
      cmOptional:
      begin
        LogEvent(LogSearch, ' Combining optional');
        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        tmpTopicCount );
      end;

      cmRequired:
      begin
        LogEvent(LogSearch, ' Combining required');
        // if zero then add to exclusions
        NotOrUInt32Array( TopicsMatchingTerm,
                          TopicsExcluded,
                          tmpTopicCount );

        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        tmpTopicCount );
      end;

      cmExcluded:
      begin
        LogEvent(LogSearch, ' Combining excluded');
        OrUInt32Array( TopicsMatchingTerm,
                       TopicsExcluded,
                       tmpTopicCount );
      end;
    end;

//    Term.ClearMatches;

    // loop for next term...
  end;

  LogEvent(LogSearch, 'Search completed, converting to list' );

  // Now convert to list form.

  for tmpTopicIndex := 0 to tmpTopicCount - 1 do
  begin
    if TopicsExcluded[ tmpTopicIndex ] = 0 then
    begin
      tmpTopic := HelpFile.Topics[ tmpTopicIndex ];
      tmpTopic.SearchRelevance := TopicRelevances[ tmpTopicIndex ];
      if tmpTopic.SearchRelevance > 0 then
      begin
        Results.Add( tmpTopic );
      end;
    end;
  end;

  LogEvent(LogSearch, 'Freeing arrays' );
  FreeUInt32Array( TopicRelevances, tmpTopicCount );
  FreeUInt32Array( TopicsExcluded, tmpTopicCount );
  FreeUInt32Array( TopicsMatchingTerm, tmpTopicCount );
  FreeUInt32Array( TopicsMatchingTermPart, tmpTopicCount );
  FreeUInt32Array( TopicsMatchingDictWord, tmpTopicCount );

  LogEvent(LogSearch, 'Done' );
end;
628
629Initialization
[333]630End.
Note: See TracBrowser for help on using the repository browser.