source: branches/2.19_branch/NewView/SearchUnit.pas@ 275

Last change on this file since 275 was 115, checked in by RBRi, 18 years ago

changed to use StringUtilsUnit

  • Property svn:eol-style set to native
File size: 16.8 KB
Line 
1Unit SearchUnit;
2
3// NewView - a new OS/2 Help Viewer
4// Copyright 2003 Aaron Lawrence (aaronl at consultant dot com)
5// This software is released under the Gnu Public License - see readme.txt
6
7Interface
8
9// Contains code to search help files.
10
11uses
12 Classes,
13 HelpFile,
14 TextSearchQuery,
15 IPFFileFormatUnit;
16
17const
18 // match weightings
19 mwOnlyTitleWord = 200;
20 mwFirstTitleWord = 50;
21 mwTitleWord = 20;
22
23 mwOnlyIndexWord = 100;
24 mwFirstIndexWord = 20;
25 mwIndexWord = 10;
26 mwTopicTextWord = 1;
27
28 // best case match weighting of a word
29 mwExactWord = 20;
30
31
32// Note on weightings: the title/index weightings
33// are multiplied by word weightings.
34// Topic text matches are equal to word weighting
35// times word weighting.
36
37procedure SearchHelpFile( HelpFile: THelpFile;
38 Query: TTextSearchQuery;
39 Results: TList;
40 WordSequences: TList );
41
42// Clear a list of word sequences (as produced by SearchHelpFile above)
43procedure ClearWordSequences( WordSequences: TList;
44 DictionaryCount: longint );
45
46Implementation
47
48uses
49 SysUtils,
50 DebugUnit,
51 StringUtilsUnit,
52 HelpTopic;
53
54type
55 TSearchType = ( stGeneral, stStarts, stExactMatch, stEnds );
56
// Releases every dictionary relevance array stored in WordSequence
// (one entry per step of the sequence) and then empties the list.
// The list object itself is left alive; the caller owns it.
//   WordSequence    - list whose items are UInt32ArrayPointer values
//   DictionaryCount - element count passed on to FreeUInt32Array
procedure ClearWordSequence( WordSequence: TList;
                             DictionaryCount: longint );
var
  i: longint;
  pRelevances: UInt32ArrayPointer;
begin
  i := 0;
  while i < WordSequence.Count do
  begin
    // go through a local so the array helper receives a variable
    pRelevances := WordSequence[ i ];
    FreeUInt32Array( pRelevances, DictionaryCount );
    inc( i );
  end;
  WordSequence.Clear;
end;
70
// Clears a list of word sequences (as filled in by SearchHelpFile).
// Each item of WordSequences is itself a TList of dictionary relevance
// arrays: the arrays are freed, each inner list is destroyed, and the
// outer list is emptied. The outer list object is not destroyed.
//   WordSequences   - list of TList; nil is accepted and ignored
//   DictionaryCount - element count of each relevance array
procedure ClearWordSequences( WordSequences: TList;
                              DictionaryCount: longint );
var
  SequenceIndex: longint;
  WordSequence: TList;
begin
  // SearchHelpFile treats a nil list as "sequences not wanted",
  // so there may be nothing to clear.
  if WordSequences = nil then
    exit;

  for SequenceIndex := 0 to WordSequences.Count - 1 do
  begin
    WordSequence := WordSequences[ SequenceIndex ];
    ClearWordSequence( WordSequence,
                       DictionaryCount );
    WordSequence.Destroy;
  end;
  WordSequences.Clear;
end;
86
87
// Given a search word that is already known to match ReferenceWord,
// returns the relevance of the match: mwExactWord scaled by the ratio
// of the two word lengths (integer arithmetic), never less than 1.
function MatchedWordRelevance( const SearchWord: string;
                               const ReferenceWord: string ): longint;
var
  Scaled: longint;
begin
  Scaled := ( mwExactWord * Length( SearchWord ) )
            div Length( ReferenceWord );
  // a match always counts for something, even if the ratio rounds to 0
  if Scaled = 0 then
    Scaled := 1;
  Result := Scaled;
end;
99
// Compares SearchWord against ReferenceWord using a case-insensitive
// position lookup. Returns 0 when the lookup reports no occurrence,
// otherwise the relevance computed by MatchedWordRelevance.
function CompareWord( const SearchWord: string;
                      const ReferenceWord: string ): longint;
begin
  if CaseInsensitivePos( SearchWord, ReferenceWord ) = 0 then
    Result := 0 // no match
  else
    Result := MatchedWordRelevance( SearchWord, ReferenceWord );
end;
118
// Scores every word of the help file dictionary against SearchWord.
// Partial matches count (see CompareWord). On return, Results holds
// one relevance per dictionary index, 0 meaning no match; every entry
// is written, so the array needs no prior clearing.
procedure SearchDictionary( HelpFile: THelpFile;
                            SearchWord: string;
                            Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  LastIndex: integer;
  pWord: pstring;
begin
  LastIndex := HelpFile.DictionaryCount - 1;
  for WordIndex := 0 to LastIndex do
  begin
    pWord := HelpFile.DictionaryWordPtrs[ WordIndex ];
    Results[ WordIndex ] := CompareWord( SearchWord, pWord^ );
  end;
end;
136
// Flags the dictionary words that equal SearchWord, ignoring case.
// Results is first filled with 0; each matching dictionary index is
// then set to mwExactWord.
procedure SearchDictionaryExact( HelpFile: THelpFile;
                                 SearchWord: string;
                                 Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  LastIndex: integer;
  pWord: pstring;
begin
  FillUInt32Array( Results, HelpFile.DictionaryCount, 0 );

  LastIndex := HelpFile.DictionaryCount - 1;
  for WordIndex := 0 to LastIndex do
  begin
    pWord := HelpFile.DictionaryWordPtrs[ WordIndex ];
    if not StrEqualIgnoringCase( SearchWord, pWord^ ) then
      continue;
    Results[ WordIndex ] := mwExactWord;
  end;
end;
155
// Scores the dictionary words that pass the "starts with" test against
// SearchWord (case-insensitive). Results is first filled with 0; each
// accepted dictionary index receives MatchedWordRelevance.
// NOTE(review): the intent is "dictionary word starts with SearchWord";
// the argument order passed to StrStartsWithIgnoringCase is kept exactly
// as before - confirm it against StringUtilsUnit.
procedure SearchDictionaryStarts( HelpFile: THelpFile;
                                  SearchWord: string;
                                  Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  Candidate: string;
begin
  FillUInt32Array( Results, HelpFile.DictionaryCount, 0 );

  for WordIndex := 0 to HelpFile.DictionaryCount - 1 do
  begin
    Candidate := HelpFile.DictionaryWords[ WordIndex ];
    if not StrStartsWithIgnoringCase( SearchWord, Candidate ) then
      continue;
    Results[ WordIndex ] := MatchedWordRelevance( SearchWord, Candidate );
  end;
end;
174
// Scores the dictionary words that pass the "ends with" test against
// SearchWord (case-insensitive). Results is first filled with 0; each
// accepted dictionary index receives MatchedWordRelevance.
// NOTE(review): the intent is "dictionary word ends with SearchWord";
// the argument order passed to StrEndsWithIgnoringCase is kept exactly
// as before - confirm it against StringUtilsUnit.
procedure SearchDictionaryEnds( HelpFile: THelpFile;
                                SearchWord: string;
                                Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  Candidate: string;
begin
  FillUInt32Array( Results, HelpFile.DictionaryCount, 0 );

  for WordIndex := 0 to HelpFile.DictionaryCount - 1 do
  begin
    Candidate := HelpFile.DictionaryWords[ WordIndex ];
    if not StrEndsWithIgnoringCase( SearchWord, Candidate ) then
      continue;
    Results[ WordIndex ] := MatchedWordRelevance( SearchWord, Candidate );
  end;
end;
193
// Searches the titles of all topics for SearchWord.
// Each title is split into words with StrExtractStringsQuoted and every
// word is scored with CompareWord. A matching word adds to
// Results[ topic index ]:
//   mwOnlyTitleWord  * relevance  - the title consists of that one word
//   mwFirstTitleWord * relevance  - first word of a longer title
//   mwTitleWord      * relevance  - any later word
// Results is accumulated into (not cleared), so the caller decides what
// the starting values are.
procedure SearchTopicTitles( HelpFile: THelpFile;
                             SearchWord: string;
                             Results: UInt32ArrayPointer );
var
  TopicIndex: longint;
  pTitle: pstring;
  TitleWord: string;
  Topic: TTopic;
  WordRelevance: longint;
  TitleWordRelevance: longint;
  tmpTitleWords: TStringList;
  i: integer;
begin
  tmpTitleWords := TStringList.Create;
  try
    for TopicIndex := 0 to HelpFile.TopicCount - 1 do
    begin
      Topic := HelpFile.Topics[ TopicIndex ];
      pTitle := Topic.TitlePtr;

      tmpTitleWords.Clear;
      StrExtractStringsQuoted( tmpTitleWords, pTitle^ );

      for i := 0 to tmpTitleWords.Count - 1 do
      begin
        TitleWord := tmpTitleWords[ i ];

        WordRelevance := CompareWord( SearchWord,
                                      TitleWord );
        if WordRelevance > 0 then
        begin
          if i > 0 then
            TitleWordRelevance := mwTitleWord
                                  * WordRelevance
          else if tmpTitleWords.Count = 1 then
            // matching the first word is best;
            // in fact it's the only word
            TitleWordRelevance := mwOnlyTitleWord
                                  * WordRelevance
          else
            TitleWordRelevance := mwFirstTitleWord
                                  * WordRelevance;

          inc( Results[ TopicIndex ],
               TitleWordRelevance );
        end;
      end;
    end;
  finally
    // release the scratch list even if a helper raised
    tmpTitleWords.Destroy;
  end;
end;
255
// Searches the help file's index entries for SearchWord.
// Each entry is split into words with StrExtractStringsQuoted and every
// word is scored with CompareWord. A matching word adds to
// Results[ Topic.Index ], where Topic is the object attached to the
// index entry:
//   mwOnlyIndexWord  * relevance  - the entry consists of that one word
//   mwFirstIndexWord * relevance  - first word of a longer entry
//   mwIndexWord      * relevance  - any later word
// Results is accumulated into (not cleared).
procedure SearchIndex( HelpFile: THelpFile;
                       SearchWord: string;
                       Results: UInt32ArrayPointer );
var
  IndexIndex: longint;
  pIndexEntry: pstring;
  IndexEntryWord: string;
  Topic: TTopic;
  WordRelevance: longint;
  IndexEntryWordRelevance: longint;
  tmpIndexWords: TStringList;
  i: integer;
begin
  tmpIndexWords := TStringList.Create;
  try
    for IndexIndex := 0 to HelpFile.Index.Count - 1 do
    begin
      Topic := HelpFile.Index.Objects[ IndexIndex ] as TTopic;
      pIndexEntry := HelpFile.IndexEntryPtr[ IndexIndex ];

      tmpIndexWords.Clear;
      StrExtractStringsQuoted( tmpIndexWords, pIndexEntry^ );

      for i := 0 to tmpIndexWords.Count - 1 do
      begin
        IndexEntryWord := tmpIndexWords[ i ];

        WordRelevance := CompareWord( SearchWord,
                                      IndexEntryWord );
        if WordRelevance > 0 then
        begin
          if i > 0 then
            IndexEntryWordRelevance := mwIndexWord
                                       * WordRelevance
          else if tmpIndexWords.Count = 1 then
            // matching the first word is best;
            // in fact it's the only word
            IndexEntryWordRelevance := mwOnlyIndexWord
                                       * WordRelevance
          else
            IndexEntryWordRelevance := mwFirstIndexWord
                                       * WordRelevance;

          inc( Results[ Topic.Index ],
               IndexEntryWordRelevance );
        end;
      end;
    end;
  finally
    // release the scratch list even if a helper raised
    tmpIndexWords.Destroy;
  end;
end;
317
318// ------------------------------------------------------
319
// Master search function. Given a search query, searches topic text,
// titles and index entries of HelpFile.
//
//   HelpFile      - file to search; nothing happens if it has no
//                   SearchTable
//   Query         - the terms to look for; each term has one or more
//                   parts and a combine method (optional / required /
//                   excluded)
//   Results       - receives every non-excluded topic whose accumulated
//                   relevance is > 0; Topic.SearchRelevance is set
//   WordSequences - may be nil. If not nil, one TList of dictionary
//                   relevance arrays is appended for each term that is
//                   not excluded; the caller frees them with
//                   ClearWordSequences.
//
// Word sequences that are not handed to the caller (WordSequences is
// nil, or the term is an excluded one) are freed here.
procedure SearchHelpFile( HelpFile: THelpFile;
                          Query: TTextSearchQuery;
                          Results: TList;
                          WordSequences: TList );
var
  TopicCount: longint;
  Topic: TTopic;
  TopicIndex: longint;
  TermIndex: longint;
  Term: TSearchTerm;

  DictionaryRelevances: UInt32ArrayPointer;

  TopicsMatchingDictWord: UInt32ArrayPointer; // flags
  TopicsMatchingTermPart: UInt32ArrayPointer; // flags
  TopicsMatchingTerm: UInt32ArrayPointer;     // flag then relevances
  TopicRelevances: UInt32ArrayPointer;
  TopicsExcluded: UInt32ArrayPointer;

  TopicRelevanceForTerm: longint;

  WordRelevance: longint;
  DictIndex: longint;

  TermPartIndex: longint;
  TermPart: string;

  TermWordSequence: TList;
  KeepWordSequence: boolean; // true when the caller takes ownership
begin
  if HelpFile.SearchTable = nil then
  begin
    exit;
  end;

  // Reset flags per topic
  TopicCount := HelpFile.TopicCount;

  // Get memory for topic relevance arrays

  AllocUInt32Array( TopicsMatchingDictWord,
                    TopicCount );
  AllocUInt32Array( TopicsMatchingTermPart,
                    TopicCount );
  AllocUInt32Array( TopicsMatchingTerm,
                    TopicCount );
  AllocUInt32Array( TopicRelevances, // functions as a flag and a cumulative relevance
                    TopicCount );
  AllocUInt32Array( TopicsExcluded, // Exclusions are treated as boolean only
                    TopicCount );

  ClearUInt32Array( TopicRelevances,
                    TopicCount );
  ClearUInt32Array( TopicsExcluded,
                    TopicCount );

  for TermIndex := 0 to Query.TermCount - 1 do
  begin
    Term := Query.Term[ TermIndex ];

    LogEvent(LogSearch, 'Searching for term "'
                        + Term.Text
                        + '", '
                        + IntToStr( Term.Parts.Count )
                        + ' parts' );

    // Start every term from a clean slate. The array is only filled by
    // the first term part below, so a term without parts would
    // otherwise be evaluated against stale or uninitialised values
    // (and then divide by a part count of zero).
    ClearUInt32Array( TopicsMatchingTerm,
                      TopicCount );

    // look thru all parts of the term. eg. CAKE_SAUSAGE

    TermWordSequence := TList.Create;

    // Inclusive terms are remembered for the caller (if it asked for
    // them); everything else is released again further down.
    KeepWordSequence := ( WordSequences <> nil )
                        and ( Term.CombineMethod <> cmExcluded );
    if KeepWordSequence then
      WordSequences.Add( TermWordSequence );

    for TermPartIndex := 0 to Term.Parts.Count - 1 do
    begin
      TermPart := Term.Parts[ TermPartIndex ];

      LogEvent(LogSearch, ' Searching for [' + TermPart + ']' );

      AllocUInt32Array( DictionaryRelevances,
                        HelpFile.DictionaryCount );

      TermWordSequence.Add( DictionaryRelevances );

      // Search the dictionary for matches.
      // alpha numeric match

      if Term.Parts.Count = 1 then
        // general match allowing all kinds of partial matches
        SearchDictionary( HelpFile,
                          TermPart,
                          DictionaryRelevances )

      else if TermPartIndex = 0 then
        // first term part: word must match end of a topic word e.g. must end in "cake"
        SearchDictionaryEnds( HelpFile,
                              TermPart,
                              DictionaryRelevances )

      else if TermPartIndex = Term.Parts.Count - 1 then
        // last term part: word must match start of a topic word e.g. must start with "sausage"
        SearchDictionaryStarts( HelpFile,
                                TermPart,
                                DictionaryRelevances )

      else
        // intermediate term part: word must match exactly e.g. must be "_"
        SearchDictionaryExact( HelpFile,
                               TermPart,
                               DictionaryRelevances );

      // For each word in the dictionary that matches
      // this search term part, search topic texts

      LogEvent(LogSearch, ' Dictionary search done' );
      ClearUInt32Array( TopicsMatchingTermPart,
                        TopicCount );

      for DictIndex := 0 to HelpFile.DictionaryCount - 1 do
      begin
        WordRelevance := DictionaryRelevances[ DictIndex ];
        if WordRelevance > 0 then
        begin
          // Search for occurrences of this word
          // within the text of topics
          HelpFile.SearchTable.Search( DictIndex,
                                       TopicsMatchingDictWord );

          // Merge the topics found for this word into the
          // set of topics matching this term part.
          OrUInt32Array( TopicsMatchingDictWord,
                         TopicsMatchingTermPart,
                         TopicCount );
        end
      end;

      LogEvent(LogSearch, 'Topic searches done' );

      if TermPartIndex = 0 then
        // first part, just copy
        CopyUInt32Array( TopicsMatchingTermPart,
                         TopicsMatchingTerm,
                         TopicCount )
      else
        // and with previous term part results
        AndUInt32Array( TopicsMatchingTermPart,
                        TopicsMatchingTerm,
                        TopicCount );

      // loop for next term part (IPF word)
    end;

    // Now we have searched the dictionary and worked out matching topics
    // for all parts of the term. Now combine all together

    LogEvent(LogSearch, 'Checking for sequences' );
    for TopicIndex := 0 to TopicCount - 1 do
    begin
      if TopicsMatchingTerm[ TopicIndex ] > 0 then
      begin
        Topic := HelpFile.Topics[ TopicIndex ];
        // Topic text contained a match for all the parts
        // of the term.
        // Now we need to:
        // - verify that they actually occur all in a sequence (if it's a multi-part term)
        // - count occurrences for relevance.

        TopicRelevanceForTerm :=
          Topic.SearchForWordSequences( TermWordSequence,
                                        false ); // don't stop at first match

        TopicRelevanceForTerm :=
          TopicRelevanceForTerm div Term.Parts.Count; // divide to bring back into scale

        TopicsMatchingTerm[ TopicIndex ] := TopicRelevanceForTerm;
      end;
    end;

    if not KeepWordSequence then
    begin
      // nobody else holds the sequence (caller did not ask for it, or
      // this is an excluded term), so release it here
      ClearWordSequence( TermWordSequence,
                         HelpFile.DictionaryCount );
      TermWordSequence.Destroy;
    end;

    // Search titles and index

    LogEvent(LogSearch, ' Searching titles' );
    SearchTopicTitles( HelpFile, Term.Text, TopicsMatchingTerm );

    LogEvent(LogSearch, ' Searching index' );
    SearchIndex( HelpFile, Term.Text, TopicsMatchingTerm );

    LogEvent(LogSearch, ' Combining' );
    case Term.CombineMethod of
      cmOptional:
        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        TopicCount );

      cmRequired:
      begin
        // if zero then add to exclusions
        NotOrUInt32Array( TopicsMatchingTerm,
                          TopicsExcluded,
                          TopicCount );

        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        TopicCount );
      end;

      cmExcluded:
        OrUInt32Array( TopicsMatchingTerm,
                       TopicsExcluded,
                       TopicCount );
    end;

    // loop for next term...
  end;

  LogEvent(LogSearch, 'Search completed, converting to list' );

  // Now convert to list form.

  for TopicIndex := 0 to TopicCount - 1 do
  begin
    if TopicsExcluded[ TopicIndex ] = 0 then
    begin
      Topic := HelpFile.Topics[ TopicIndex ];
      Topic.SearchRelevance := TopicRelevances[ TopicIndex ];
      if Topic.SearchRelevance > 0 then
      begin
        Results.Add( Topic );
      end;
    end;
  end;

  LogEvent(LogSearch, 'Freeing arrays' );
  FreeUInt32Array( TopicRelevances, TopicCount );
  FreeUInt32Array( TopicsExcluded, TopicCount );
  FreeUInt32Array( TopicsMatchingTerm, TopicCount );
  FreeUInt32Array( TopicsMatchingTermPart, TopicCount );
  FreeUInt32Array( TopicsMatchingDictWord, TopicCount );

  LogEvent(LogSearch, 'Done' );
end;
581
582Initialization
583End.
Note: See TracBrowser for help on using the repository browser.