source: trunk/NewView/SearchUnit.pas@ 345

Last change on this file since 345 was 345, checked in by RBRi, 16 years ago

use the same index; reformated

  • Property svn:eol-style set to native
File size: 16.5 KB
RevLine 
[18]1Unit SearchUnit;
2
3// NewView - a new OS/2 Help Viewer
4// Copyright 2003 Aaron Lawrence (aaronl at consultant dot com)
5// This software is released under the Gnu Public License - see readme.txt
6
7Interface
8
9// Contains code to search help files.
10
11uses
12 Classes,
[33]13 HelpFile,
14 TextSearchQuery,
15 IPFFileFormatUnit;
[18]16
const
  // Best-case relevance of a single matched word (see MatchedWordRelevance).
  mwExactWord      = 20;

  // Match weightings.
  //
  // Note on weightings: the title/index weightings below are
  // multiplied by the word weighting of the matched word.
  // Topic text matches are equal to word weighting
  // times word weighting.

  // Topic title matches
  mwOnlyTitleWord  = 200; // matched word is the only word of the title
  mwFirstTitleWord = 50;  // matched word is the first of several title words
  mwTitleWord      = 20;  // matched word is any later title word

  // Index entry matches
  mwOnlyIndexWord  = 100; // matched word is the only word of the entry
  mwFirstIndexWord = 20;  // matched word is the first of several entry words
  mwIndexWord      = 10;  // matched word is any later entry word

  // Topic text matches
  mwTopicTextWord  = 1;

// Runs Query against HelpFile. Matching topics are appended to Results.
// If WordSequences is not nil, the word sequences matched for each
// non-excluded term are appended to it; release them afterwards with
// ClearWordSequences.
procedure SearchHelpFile(
  HelpFile: THelpFile;
  Query: TTextSearchQuery;
  Results: TList;
  WordSequences: TList );

// Clear a list of word sequences (as produced by SearchHelpFile).
procedure ClearWordSequences(
  WordSequences: TList;
  DictionaryCount: longint );
45
46Implementation
47
48uses
49 SysUtils,
[43]50 DebugUnit,
[115]51 StringUtilsUnit,
[18]52 HelpTopic;
53
type
  // Kinds of dictionary word match.
  // NOTE(review): not referenced anywhere else in the visible part of
  // this unit - confirm whether it is still needed.
  TSearchType =
  (
    stGeneral,
    stStarts,
    stExactMatch,
    stEnds
  );
56
// Releases every per-step dictionary relevance array stored in
// WordSequence and then empties the list.
// The list object itself is NOT destroyed; that is left to the caller.
// DictionaryCount is passed through to FreeUInt32Array as the element
// count of each array.
procedure ClearWordSequence( WordSequence: TList;
                             DictionaryCount: longint );
var
  i: longint;
  pRelevances: UInt32ArrayPointer;
begin
  i := 0;
  while i < WordSequence.Count do
  begin
    // copy into a local first: the list item cannot be handed
    // to FreeUInt32Array directly
    pRelevances := WordSequence[ i ];
    FreeUInt32Array( pRelevances, DictionaryCount );
    inc( i );
  end;

  WordSequence.Clear;
end;
70
// Clears a list of word sequences (as produced by SearchHelpFile).
//
// WordSequences   - list whose items are TList word sequences; each
//                   sequence has its relevance arrays freed (via
//                   ClearWordSequence) and is then destroyed.
// DictionaryCount - element count of each relevance array, needed to
//                   free it.
//
// On return WordSequences is empty; the WordSequences list object
// itself is not destroyed.
//
// The parameter is named WordSequences (plural) to match the interface
// declaration and to avoid clashing with the local WordSequence.
procedure ClearWordSequences( WordSequences: TList;
                              DictionaryCount: longint );
var
  SequenceIndex: longint;
  WordSequence: TList;
begin
  for SequenceIndex := 0 to WordSequences.Count - 1 do
  begin
    WordSequence := WordSequences[ SequenceIndex ];
    ClearWordSequence( WordSequence,
                       DictionaryCount );
    WordSequence.Destroy;
  end;
  WordSequences.Clear;
end;
86
87
// Given a search word which is already known to match ReferenceWord,
// returns the relevance of that match: mwExactWord scaled by the
// fraction of the reference word that the search word covers
// (integer arithmetic), but never less than 1.
function MatchedWordRelevance( const SearchWord: string;
                               const ReferenceWord: string ): longint;
var
  Scaled: longint;
begin
  Scaled := ( mwExactWord * Length( SearchWord ) )
            div Length( ReferenceWord );

  // a match always counts for something, even if the
  // integer division rounded down to nothing
  if Scaled = 0 then
    Scaled := 1;

  Result := Scaled;
end;
99
// Compares the given search word against the given reference word.
// Returns 0 if SearchWord does not occur anywhere in ReferenceWord
// (compared case-insensitively); otherwise returns the relevance
// computed by MatchedWordRelevance.
function CompareWord( const SearchWord: string;
                      const ReferenceWord: string ): longint;
begin
  if CaseInsensitivePos( SearchWord, ReferenceWord ) = 0 then
    // no match
    Result := 0
  else
    Result := MatchedWordRelevance( SearchWord, ReferenceWord );
end;
118
// Searches the help file dictionary for words that match the given
// search word. Partial matches are considered (see CompareWord).
// On return, Results[ n ] holds the relevance of dictionary word n,
// 0 meaning no match. Every entry from 0 to DictionaryCount - 1
// is overwritten.
procedure SearchDictionary( HelpFile: THelpFile;
                            SearchWord: string;
                            Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  LastIndex: integer;
  pWord: pstring;
begin
  LastIndex := HelpFile.DictionaryCount - 1;

  for WordIndex := 0 to LastIndex do
  begin
    pWord := HelpFile.DictionaryWordPtrs[ WordIndex ];
    Results[ WordIndex ] := CompareWord( SearchWord, pWord^ );
  end;
end;
136
// Searches the help file dictionary for words that are equal to the
// given search word, ignoring case.
// Results is first filled with 0 for DictionaryCount entries; each
// dictionary word that is equal then gets mwExactWord.
procedure SearchDictionaryExact( HelpFile: THelpFile;
                                 SearchWord: string;
                                 Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: integer;
  pWord: pstring;
begin
  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  WordIndex := 0;
  while WordIndex < WordCount do
  begin
    pWord := HelpFile.DictionaryWordPtrs[ WordIndex ];
    if StrEqualIgnoringCase( SearchWord, pWord^ ) then
      Results[ WordIndex ] := mwExactWord;
    inc( WordIndex );
  end;
end;
155
// Searches the help file dictionary for words that start with the
// given search word (ignoring case).
// Results is first filled with 0 for DictionaryCount entries; each
// dictionary word that qualifies gets the relevance computed by
// MatchedWordRelevance.
procedure SearchDictionaryStarts( HelpFile: THelpFile;
                                  SearchWord: string;
                                  Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: integer;
  Candidate: string;
begin
  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  WordIndex := 0;
  while WordIndex < WordCount do
  begin
    Candidate := HelpFile.DictionaryWords[ WordIndex ];
    if StrStartsWithIgnoringCase( Candidate, SearchWord ) then
      Results[ WordIndex ] := MatchedWordRelevance( SearchWord, Candidate );
    inc( WordIndex );
  end;
end;
174
// Searches the help file dictionary for words that end with the
// given search word (ignoring case).
//
// HelpFile   - help file whose dictionary is searched.
// SearchWord - the ending to look for.
// Results    - array of at least DictionaryCount entries; it is first
//              filled with 0, then each dictionary word that ends with
//              SearchWord gets the relevance computed by
//              MatchedWordRelevance.
procedure SearchDictionaryEnds( HelpFile: THelpFile;
                                SearchWord: string;
                                Results: UInt32ArrayPointer );
var
  DictIndex: integer;
  DictWord: string;
begin
  FillUInt32Array( Results, HelpFile.DictionaryCount, 0 );

  for DictIndex := 0 to HelpFile.DictionaryCount - 1 do
  begin
    DictWord := HelpFile.DictionaryWords[ DictIndex ];
    // The dictionary word is the string being tested and the search
    // word is the ending, mirroring the argument order used with
    // StrStartsWithIgnoringCase in SearchDictionaryStarts. The previous
    // order tested whether the search word ended with the dictionary
    // word.
    // NOTE(review): assumes StrEndsWithIgnoringCase takes
    // ( receiver, ending ) - confirm against StringUtilsUnit.
    if StrEndsWithIgnoringCase( DictWord, SearchWord ) then
      Results[ DictIndex ] := MatchedWordRelevance( SearchWord, DictWord );
  end;
end;
193
// Searches the titles of all topics for the given search word.
//
// HelpFile   - help file whose topics are examined.
// SearchWord - word to look for; matched against each title word with
//              CompareWord, so partial matches count.
// Results    - array indexed by Topic.Index. Relevances are ADDED to
//              the existing values; the array is not cleared here.
//
// Each matching title word contributes its word relevance multiplied by
//   mwOnlyTitleWord   if it is the only word of the title,
//   mwFirstTitleWord  if it is the first of several words,
//   mwTitleWord       otherwise.
procedure SearchTopicTitles( HelpFile: THelpFile;
                             SearchWord: string;
                             Results: UInt32ArrayPointer );
var
  TopicIndex: longint;
  pTitle: pstring;
  Topic: TTopic;
  WordRelevance: longint;
  TitleWordRelevance: longint;
  tmpTitleWords: TStringList;
  i: integer;
begin
  tmpTitleWords := TStringList.Create;
  try
    for TopicIndex := 0 to HelpFile.TopicCount - 1 do
    begin
      Topic := HelpFile.Topics[ TopicIndex ];
      pTitle := Topic.TitlePtr;

      // split the title into its words
      tmpTitleWords.Clear;
      StrExtractStringsQuoted( tmpTitleWords, pTitle^ );

      for i := 0 to tmpTitleWords.Count - 1 do
      begin
        WordRelevance := CompareWord( SearchWord, tmpTitleWords[ i ] );
        if WordRelevance > 0 then
        begin
          if i > 0 then
            TitleWordRelevance := mwTitleWord * WordRelevance
          else if tmpTitleWords.Count = 1 then
            // matching the first word is best;
            // in fact it's the only word
            TitleWordRelevance := mwOnlyTitleWord * WordRelevance
          else
            // matching the first word is best
            TitleWordRelevance := mwFirstTitleWord * WordRelevance;

          inc( Results[ Topic.Index ], TitleWordRelevance );
        end;
      end;
    end;
  finally
    // release the word list even if a search step raised an exception
    tmpTitleWords.Destroy;
  end;
end;
251
// Searches the help file's index entries for the given search word.
//
// HelpFile   - help file whose index is examined.
// SearchWord - word to look for; matched against each word of each
//              index entry with CompareWord, so partial matches count.
// Results    - array indexed by the Index of the topic that an index
//              entry refers to. Relevances are ADDED to the existing
//              values; the array is not cleared here.
//
// Each matching entry word contributes its word relevance multiplied by
//   mwOnlyIndexWord   if it is the only word of the entry,
//   mwFirstIndexWord  if it is the first of several words,
//   mwIndexWord       otherwise.
procedure SearchIndex( HelpFile: THelpFile;
                       SearchWord: string;
                       Results: UInt32ArrayPointer );
var
  IndexIndex: longint;
  pIndexEntry: pstring;
  tmpTopic: TTopic;
  WordRelevance: longint;
  IndexEntryWordRelevance: longint;
  tmpIndexWords: TStringList;
  i: integer;
begin
  tmpIndexWords := TStringList.Create;
  try
    for IndexIndex := 0 to HelpFile.Index.Count - 1 do
    begin
      pIndexEntry := HelpFile.Index.GetLabels.ValuePtrs[ IndexIndex ];

      // split the index entry into its words
      tmpIndexWords.Clear;
      StrExtractStringsQuoted( tmpIndexWords, pIndexEntry^ );

      for i := 0 to tmpIndexWords.Count - 1 do
      begin
        WordRelevance := CompareWord( SearchWord, tmpIndexWords[ i ] );
        if WordRelevance > 0 then
        begin
          if i > 0 then
            IndexEntryWordRelevance := mwIndexWord * WordRelevance
          else if tmpIndexWords.Count = 1 then
            // matching the first word is best;
            // in fact it's the only word
            IndexEntryWordRelevance := mwOnlyIndexWord * WordRelevance
          else
            // matching the first word is best
            IndexEntryWordRelevance := mwFirstIndexWord * WordRelevance;

          // credit the topic this index entry points at
          tmpTopic := HelpFile.Index.getTopic( IndexIndex );
          inc( Results[ tmpTopic.Index ], IndexEntryWordRelevance );
        end;
      end;
    end;
  finally
    // release the word list even if a search step raised an exception
    tmpIndexWords.Destroy;
  end;
end;
308
309// ------------------------------------------------------
310
// Master search function. Given a search query,
// searches topic text, titles and index entries.
//
// HelpFile      - help file to search. If it has no search table
//                 the procedure returns without doing anything.
// Query         - the parsed query; each of its terms is searched in turn.
// Results       - matching topics are appended to this list, with their
//                 SearchRelevance set to the accumulated relevance.
//                 Topics flagged as excluded, or with a relevance of 0,
//                 are not added.
// WordSequences - may be nil. If not nil, the word sequence (a TList of
//                 dictionary relevance arrays, one per term part) of
//                 every term that is not cmExcluded is appended; the
//                 caller then owns them and should release them with
//                 ClearWordSequences. Word sequences that are not handed
//                 over are freed here.
procedure SearchHelpFile( HelpFile: THelpFile;
                          Query: TTextSearchQuery;
                          Results: TList;
                          WordSequences: TList );
var
  TopicCount: longint;
  Topic: TTopic;
  TopicIndex: longint;
  TermIndex: longint;
  Term: TSearchTerm;

  DictionaryRelevances: UInt32ArrayPointer;

  TopicsMatchingDictWord: UInt32ArrayPointer; // flags
  TopicsMatchingTermPart: UInt32ArrayPointer; // flags
  TopicsMatchingTerm: UInt32ArrayPointer; // flag then relevances
  TopicRelevances: UInt32ArrayPointer;
  TopicsExcluded: UInt32ArrayPointer;

  TopicRelevanceForTerm: longint;

  WordRelevance: longint;
  DictIndex: longint;

  TermPartIndex: longint;
  TermPart: string;

  TermWordSequence: TList;
  KeepWordSequence: boolean; // true if ownership passed to WordSequences
begin
  if HelpFile.SearchTable = nil then
  begin
    exit;
  end;

  // Reset flags per topic
  TopicCount := HelpFile.TopicCount;

  // Get memory for topic relevance arrays

  AllocUInt32Array( TopicsMatchingDictWord,
                    TopicCount );
  AllocUInt32Array( TopicsMatchingTermPart,
                    TopicCount );
  AllocUInt32Array( TopicsMatchingTerm,
                    TopicCount );
  AllocUInt32Array( TopicRelevances, // functions as a flag and a cumulative relevance
                    TopicCount );
  AllocUInt32Array( TopicsExcluded, // Exclusions are treated as boolean only
                    TopicCount );

  ClearUInt32Array( TopicRelevances,
                    TopicCount );
  ClearUInt32Array( TopicsExcluded,
                    TopicCount );

  for TermIndex := 0 to Query.TermCount - 1 do
  begin
    Term := Query.Term[ TermIndex ];

    LogEvent(LogSearch, 'Searching for term "'
                  + Term.Text
                  + '", '
                  + IntToStr( Term.Parts.Count )
                  + ' parts' );

    // Start every term from a clean slate. Without this, a term with
    // no parts would leave TopicsMatchingTerm holding whatever the
    // previous term (or the allocator) left in it.
    ClearUInt32Array( TopicsMatchingTerm,
                      TopicCount );

    // look thru all parts of the term. eg. CAKE_SAUSAGE

    TermWordSequence := TList.Create;

    // Only inclusive terms are remembered for the caller, and only if
    // the caller asked for them. In every other case the sequence is
    // freed below once this term has been processed.
    KeepWordSequence := ( WordSequences <> nil )
                        and ( Term.CombineMethod <> cmExcluded );
    if KeepWordSequence then
      WordSequences.Add( TermWordSequence );

    for TermPartIndex := 0 to Term.Parts.Count - 1 do
    begin
      TermPart := Term.Parts[ TermPartIndex ];

      LogEvent(LogSearch, ' Searching for [' + TermPart + ']' );

      AllocUInt32Array( DictionaryRelevances,
                        HelpFile.DictionaryCount );

      // the sequence now owns the array
      TermWordSequence.Add( DictionaryRelevances );

      // Search the dictionary for matches.
      // alpha numeric match

      if Term.Parts.Count = 1 then
        // general match allowing all kinds of partial matches
        SearchDictionary( HelpFile,
                          TermPart,
                          DictionaryRelevances )

      else if TermPartIndex = 0 then
        // first term part: word must match end of a topic word e.g. must end in "cake"
        SearchDictionaryEnds( HelpFile,
                              TermPart,
                              DictionaryRelevances )

      else if TermPartIndex = Term.Parts.Count - 1 then
        // last term part: word must match start of a topic word e.g. must start with "sausage"
        SearchDictionaryStarts( HelpFile,
                                TermPart,
                                DictionaryRelevances )

      else
        // intermediate term part: word must match exactly e.g. must be "_"
        SearchDictionaryExact( HelpFile,
                               TermPart,
                               DictionaryRelevances );

      // For each word in the dictionary that matches
      // this search term part, search topic texts

      LogEvent(LogSearch, ' Dictionary search done' );
      ClearUInt32Array( TopicsMatchingTermPart,
                        TopicCount );

      for DictIndex := 0 to HelpFile.DictionaryCount - 1 do
      begin
        WordRelevance := DictionaryRelevances[ DictIndex ];
        if WordRelevance > 0 then
        begin
          // Search for occurrences of this word
          // within the text of topics
          HelpFile.SearchTable.Search( DictIndex,
                                       TopicsMatchingDictWord );

          // TopicsMatchingDictWord is now expected to flag the topics
          // that contain this word (presumably set up by the Search
          // call above - TODO confirm).
          // Merge those flags into the flags for this term part.
          OrUInt32Array( TopicsMatchingDictWord,
                         TopicsMatchingTermPart,
                         TopicCount );
        end
      end;

      LogEvent(LogSearch, 'Topic searches done' );

      if TermPartIndex = 0 then
        // first part, just copy
        CopyUInt32Array( TopicsMatchingTermPart,
                         TopicsMatchingTerm,
                         TopicCount )
      else
        // and with previous term part results
        AndUInt32Array( TopicsMatchingTermPart,
                        TopicsMatchingTerm,
                        TopicCount );

      // loop for next term part (IPF word)
    end;

    // Now we have searched the dictionary and worked out matching topics
    // for all parts of the term. Now combine all together

    LogEvent(LogSearch, 'Checking for sequences' );
    for TopicIndex := 0 to TopicCount - 1 do
    begin
      if TopicsMatchingTerm[ TopicIndex ] > 0 then
      begin
        Topic := HelpFile.Topics[ TopicIndex ];
        // Topic text contained a match for all the parts
        // of the term.
        // Now we need to:
        // - verify that they actually occur all in a sequence (if it's a multi-part term)
        // - count occurrences for relevance.

        TopicRelevanceForTerm :=
          Topic.SearchForWordSequences( TermWordSequence,
                                        false ); // don't stop at first match

        // A flag can only be set here if at least one term part was
        // searched, so Term.Parts.Count is never 0 at this point.
        TopicRelevanceForTerm :=
          TopicRelevanceForTerm div Term.Parts.Count; // divide to bring back into scale

        // replace the flag with the actual relevance
        TopicsMatchingTerm[ TopicIndex ] := TopicRelevanceForTerm;

      end;
    end;

    if not KeepWordSequence then
    begin
      // Nobody else holds this sequence (either the caller does not
      // want sequences at all, or this is an excluded term), so
      // release it here.
      ClearWordSequence( TermWordSequence,
                         HelpFile.DictionaryCount );
      TermWordSequence.Destroy;
    end;

    // Search titles and index

    LogEvent(LogSearch, ' Searching titles' );
    SearchTopicTitles( HelpFile, Term.Text, TopicsMatchingTerm );

    LogEvent(LogSearch, ' Searching index' );
    SearchIndex( HelpFile, Term.Text, TopicsMatchingTerm );

    LogEvent(LogSearch, ' Combining' );
    case Term.CombineMethod of
      cmOptional:
        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        TopicCount );

      cmRequired:
      begin
        // if zero then add to exclusions
        NotOrUInt32Array( TopicsMatchingTerm,
                          TopicsExcluded,
                          TopicCount );

        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        TopicCount );
      end;

      cmExcluded:
        // any topic matching an excluded term is itself excluded
        OrUInt32Array( TopicsMatchingTerm,
                       TopicsExcluded,
                       TopicCount );
    end;

//    Term.ClearMatches;

    // loop for next term...
  end;

  LogEvent(LogSearch, 'Search completed, converting to list' );

  // Now convert to list form.

  for TopicIndex := 0 to TopicCount - 1 do
  begin
    if TopicsExcluded[ TopicIndex ] = 0 then
    begin
      Topic := HelpFile.Topics[ TopicIndex ];
      Topic.SearchRelevance := TopicRelevances[ TopicIndex ];
      if Topic.SearchRelevance > 0 then
      begin
        Results.Add( Topic );
      end;
    end;
  end;

  LogEvent(LogSearch, 'Freeing arrays' );
  FreeUInt32Array( TopicRelevances, TopicCount );
  FreeUInt32Array( TopicsExcluded, TopicCount );
  FreeUInt32Array( TopicsMatchingTerm, TopicCount );
  FreeUInt32Array( TopicsMatchingTermPart, TopicCount );
  FreeUInt32Array( TopicsMatchingDictWord, TopicCount );

  LogEvent(LogSearch, 'Done' );
end;
572
573Initialization
574End.
Note: See TracBrowser for help on using the repository browser.