source: trunk/NewView/SearchUnit.pas@ 25

Last change on this file since 25 was 18, checked in by RBRi, 19 years ago

+ newview source

  • Property svn:eol-style set to native
File size: 16.3 KB
RevLine 
[18]1Unit SearchUnit;
2
3// NewView - a new OS/2 Help Viewer
4// Copyright 2003 Aaron Lawrence (aaronl at consultant dot com)
5// This software is released under the Gnu Public License - see readme.txt
6
7Interface
8
9// Contains code to search help files.
10
11uses
12 Classes,
13 HelpFile, TextSearchQuery, IPFFileFormatUnit;
14
const
  // Match weightings: multipliers chosen according to where
  // in a topic the search word was found.

  // Word found in a topic title
  mwOnlyTitleWord  = 200; // the matched word is the whole title
  mwFirstTitleWord = 50;  // first word of a longer title
  mwTitleWord      = 20;  // any later title word

  // Word found in an index entry
  mwOnlyIndexWord  = 100; // the matched word is the whole entry
  mwFirstIndexWord = 20;  // first word of a longer entry
  mwIndexWord      = 10;  // any later entry word

  // Word found in topic text
  // (not referenced by the routines in this unit)
  mwTopicTextWord  = 1;

  // Best case match weighting of a word, i.e. the search
  // word is as long as the word it matched.
  mwExactWord      = 20;


// Note on weightings: the title/index weightings
// are multiplied by word weightings.
// Topic text matches are equal to word weighting
// times word weighting.
34
// Searches HelpFile for the terms of Query. Matching topics are
// added to Results. If WordSequences is not nil, it receives
// one list of dictionary match arrays per non-excluded term.
procedure SearchHelpFile( HelpFile: THelpFile;
                          Query: TTextSearchQuery;
                          Results: TList;
                          WordSequences: TList );

// Clear a list of word sequences (as produced by above)
procedure ClearWordSequences( WordSequences: TList;
                              DictionaryCount: longint );
43
44Implementation
45
46uses
47 SysUtils,
48 ACLUtility, ACLStringUtility, ACLProfile,
49 HelpTopic;
50
type
  // Kinds of word match.
  // NOTE(review): no routine in this unit references this type.
  TSearchType =
  (
    stGeneral,
    stStarts,
    stExactMatch,
    stEnds
  );
53
// Frees every dictionary relevance array stored in WordSequence
// (each one is released as an array of DictionaryCount entries)
// and then empties the list. The list object itself is kept.
procedure ClearWordSequence( WordSequence: TList;
                             DictionaryCount: longint );
var
  i: longint;
  pRelevances: UInt32ArrayPointer;
begin
  i := 0;
  while i < WordSequence.Count do
  begin
    pRelevances := WordSequence[ i ];
    FreeUInt32Array( pRelevances, DictionaryCount );
    inc( i );
  end;
  WordSequence.Clear;
end;
67
// Releases a list of word sequences as filled in by SearchHelpFile.
// WordSequences:   list whose items are TList word sequences; each
//                  sequence has its relevance arrays freed and is
//                  then destroyed. The outer list is emptied but
//                  not destroyed.
// DictionaryCount: element count the relevance arrays are freed with.
procedure ClearWordSequences( WordSequences: TList;
                              DictionaryCount: longint );
var
  SequenceIndex: longint;
  WordSequence: TList;
begin
  for SequenceIndex := 0 to WordSequences.Count - 1 do
  begin
    WordSequence := WordSequences[ SequenceIndex ];
    // free the arrays held by the sequence before the sequence itself
    ClearWordSequence( WordSequence,
                       DictionaryCount );
    WordSequence.Destroy;
  end;
  WordSequences.Clear;
end;
83
84
// Given a search word which is known to match ReferenceWord,
// returns the relevance: mwExactWord scaled by the ratio of the
// search word length to the reference word length (integer
// division), but never less than 1.
function MatchedWordRelevance( const SearchWord: string;
                               const ReferenceWord: string ): longint;
var
  Scaled: longint;
begin
  Scaled := ( mwExactWord * Length( SearchWord ) )
            div Length( ReferenceWord );
  if Scaled = 0 then
    // a match always counts for something
    Scaled := 1;
  Result := Scaled;
end;
96
// Looks for SearchWord inside ReferenceWord, ignoring case.
// Returns 0 when it does not occur at all, otherwise the
// relevance computed by MatchedWordRelevance.
function CompareWord( const SearchWord: string;
                      const ReferenceWord: string ): longint;
begin
  if CaseInsensitivePos( SearchWord, ReferenceWord ) <> 0 then
    Result := MatchedWordRelevance( SearchWord, ReferenceWord )
  else
    // no match
    Result := 0;
end;
113
// Compares SearchWord against every word of the help file
// dictionary; partial matches are considered.
// Every entry of Results (one per dictionary word) is overwritten
// with the CompareWord result, so non-matching words get 0.
procedure SearchDictionary( HelpFile: THelpFile;
                            SearchWord: string;
                            Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  pWord: pstring;
begin
  WordCount := HelpFile.DictionaryCount;
  WordIndex := 0;
  while WordIndex < WordCount do
  begin
    pWord := HelpFile.DictionaryWordPtrs[ WordIndex ];
    Results[ WordIndex ] := CompareWord( SearchWord, pWord^ );
    inc( WordIndex );
  end;
end;
131
// Marks the dictionary words that are the same as SearchWord
// according to StringsSame. Results is first zeroed, then
// each matching word's entry is set to mwExactWord.
procedure SearchDictionaryExact( HelpFile: THelpFile;
                                 SearchWord: string;
                                 Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  pWord: pstring;
begin
  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  for WordIndex := 0 to WordCount - 1 do
  begin
    pWord := HelpFile.DictionaryWordPtrs[ WordIndex ];
    if not StringsSame( SearchWord, pWord^ ) then
      continue;
    Results[ WordIndex ] := mwExactWord;
  end;
end;
150
// Marks the dictionary words that start with SearchWord
// (as decided by StrStarts). Results is first zeroed, then each
// matching word's entry is set to its MatchedWordRelevance.
procedure SearchDictionaryStarts( HelpFile: THelpFile;
                                  SearchWord: string;
                                  Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  Candidate: string;
begin
  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  for WordIndex := 0 to WordCount - 1 do
  begin
    Candidate := HelpFile.DictionaryWords[ WordIndex ];
    if not StrStarts( SearchWord, Candidate ) then
      continue;
    Results[ WordIndex ] := MatchedWordRelevance( SearchWord, Candidate );
  end;
end;
169
// Marks the dictionary words that end with SearchWord
// (as decided by StrEnds). Results is first zeroed, then each
// matching word's entry is set to its MatchedWordRelevance.
procedure SearchDictionaryEnds( HelpFile: THelpFile;
                                SearchWord: string;
                                Results: UInt32ArrayPointer );
var
  WordIndex: integer;
  WordCount: longint;
  Candidate: string;
begin
  WordCount := HelpFile.DictionaryCount;
  FillUInt32Array( Results, WordCount, 0 );

  for WordIndex := 0 to WordCount - 1 do
  begin
    Candidate := HelpFile.DictionaryWords[ WordIndex ];
    if not StrEnds( SearchWord, Candidate ) then
      continue;
    Results[ WordIndex ] := MatchedWordRelevance( SearchWord, Candidate );
  end;
end;
188
// Searches the titles of all topics for SearchWord.
// Each title is split at spaces; every title word that matches
// (see CompareWord) adds to Results[ topic index ] the word's
// relevance multiplied by a weighting that depends on the position
// of the word within the title.
procedure SearchTopicTitles( HelpFile: THelpFile;
                             SearchWord: string;
                             Results: UInt32ArrayPointer );
var
  TopicIndex: longint;
  pTitle: pstring;
  CurrentWord: string;
  WordNumber: longint;
  Relevance: longint;
  Weighting: longint;
  ScanPos: longint;
begin
  for TopicIndex := 0 to HelpFile.TopicCount - 1 do
  begin
    pTitle := HelpFile.Topics[ TopicIndex ].TitlePtr;
    WordNumber := 0;
    ScanPos := 1;
    while ScanPos < Length( pTitle^ ) do
    begin
      GetNextValue( pTitle^, ScanPos, CurrentWord, ' ' );
      Relevance := CompareWord( SearchWord, CurrentWord );
      if Relevance > 0 then
      begin
        if WordNumber > 0 then
          // some later word of the title
          Weighting := mwTitleWord
        else if ScanPos >= Length( pTitle^ ) then
          // first word, and nothing follows: it's the only word
          Weighting := mwOnlyTitleWord
        else
          // matching the first word is best
          Weighting := mwFirstTitleWord;

        inc( Results[ TopicIndex ], Weighting * Relevance );
      end;
      inc( WordNumber );
    end;
  end;
end;
240
// Searches the index entries of the help file for SearchWord.
// Each entry is split at spaces; every entry word that matches
// (see CompareWord) adds to Results[ Topic.Index ], for the topic
// attached to the entry, the word's relevance multiplied by a
// weighting that depends on the position of the word in the entry.
procedure SearchIndex( HelpFile: THelpFile;
                       SearchWord: string;
                       Results: UInt32ArrayPointer );
var
  EntryIndex: longint;
  pEntry: pstring;
  EntryWord: string;
  Topic: TTopic;
  WordNumber: longint;
  Relevance: longint;
  Weighting: longint;
  ScanPos: longint;
begin
  for EntryIndex := 0 to HelpFile.Index.Count - 1 do
  begin
    Topic := HelpFile.Index.Objects[ EntryIndex ] as TTopic;
    pEntry := HelpFile.IndexEntryPtr[ EntryIndex ];
    WordNumber := 0;
    ScanPos := 1;
    while ScanPos < Length( pEntry^ ) do
    begin
      GetNextValue( pEntry^, ScanPos, EntryWord, ' ' );
      Relevance := CompareWord( SearchWord, EntryWord );
      if Relevance > 0 then
      begin
        if WordNumber > 0 then
          // some later word of the entry
          Weighting := mwIndexWord
        else if ScanPos >= Length( pEntry^ ) then
          // first word, and nothing follows: it's the only word
          Weighting := mwOnlyIndexWord
        else
          // matching the first word is best
          Weighting := mwFirstIndexWord;

        inc( Results[ Topic.Index ], Weighting * Relevance );
      end;
      inc( WordNumber );
    end;
  end;
end;
291
// ------------------------------------------------------

// Master search function. Given a search query,
// searches topic text, titles, index entries.
//
// HelpFile:      file to search. If its SearchTable is nil the
//                routine returns at once and changes nothing.
// Query:         the terms to look for; each term has one or more
//                parts and a combine method (optional, required
//                or excluded).
// Results:       every topic that is not excluded and ends up with
//                a relevance above zero is added to this list.
//                SearchRelevance is assigned on all non-excluded
//                topics, including those left at zero.
// WordSequences: may be nil. Otherwise, for each term that is not
//                excluded, a TList holding one dictionary relevance
//                array per term part is added; the caller releases
//                them with ClearWordSequences.
procedure SearchHelpFile( HelpFile: THelpFile;
                          Query: TTextSearchQuery;
                          Results: TList;
                          WordSequences: TList );
var
  TopicCount: longint;
  Topic: TTopic;
  TopicIndex: longint;
  TermIndex: longint;
  Term: TSearchTerm;

  DictionaryRelevances: UInt32ArrayPointer;

  TopicsMatchingDictWord: UInt32ArrayPointer; // flags
  TopicsMatchingTermPart: UInt32ArrayPointer; // flags
  TopicsMatchingTerm: UInt32ArrayPointer; // flag then relevances
  TopicRelevances: UInt32ArrayPointer;
  TopicsExcluded: UInt32ArrayPointer;

  TopicRelevanceForTerm: longint;

  WordRelevance: longint;
  DictIndex: longint;

  TermPartIndex: longint;
  TermPart: string;

  TermWordSequence: TList;
  // true when TermWordSequence has been handed over to WordSequences
  KeepTermWordSequence: boolean;
begin
  if HelpFile.SearchTable = nil then
  begin
    exit;
  end;

  // Reset flags per topic
  TopicCount := HelpFile.TopicCount;

  // Get memory for topic relevance arrays

  AllocUInt32Array( TopicsMatchingDictWord,
                    TopicCount );
  AllocUInt32Array( TopicsMatchingTermPart,
                    TopicCount );
  AllocUInt32Array( TopicsMatchingTerm,
                    TopicCount );
  AllocUInt32Array( TopicRelevances, // functions as a flag and a cumulative relevance
                    TopicCount );
  AllocUInt32Array( TopicsExcluded,  // Exclusions are treated as boolean only
                    TopicCount );

  ClearUInt32Array( TopicRelevances,
                    TopicCount );
  ClearUInt32Array( TopicsExcluded,
                    TopicCount );

  for TermIndex := 0 to Query.TermCount - 1 do
  begin
    Term := Query.Term[ TermIndex ];

    ProfileEvent( 'Searching for term "'
                  + Term.Text
                  + '", '
                  + IntToStr( Term.Parts.Count )
                  + ' parts' );

    // Start each term from a clean slate. The first term part
    // overwrites this array anyway; clearing only matters for a
    // term without parts, which would otherwise see leftovers of
    // the previous term (or uninitialised memory).
    ClearUInt32Array( TopicsMatchingTerm,
                      TopicCount );

    // look thru all parts of the term. eg. CAKE_SAUSAGE

    TermWordSequence := TList.Create;

    // Only sequences of inclusive terms are handed to the caller;
    // all others stay owned by this routine and are freed below.
    KeepTermWordSequence := ( WordSequences <> nil )
                            and ( Term.CombineMethod <> cmExcluded );
    if KeepTermWordSequence then
      // this term is an inclusive one, so we want to remember the matches
      WordSequences.Add( TermWordSequence );

    for TermPartIndex := 0 to Term.Parts.Count - 1 do
    begin
      TermPart := Term.Parts[ TermPartIndex ];

      ProfileEvent( ' Searching for [' + TermPart + ']' );

      AllocUInt32Array( DictionaryRelevances,
                        HelpFile.DictionaryCount );

      // the sequence owns the array from here on
      TermWordSequence.Add( DictionaryRelevances );

      // Search the dictionary for matches.
      // alpha numeric match

      if Term.Parts.Count = 1 then
        // general match allowing all kinds of partial matches
        SearchDictionary( HelpFile,
                          TermPart,
                          DictionaryRelevances )

      else if TermPartIndex = 0 then
        // first term part: word must match end of a topic word e.g. must end in "cake"
        SearchDictionaryEnds( HelpFile,
                              TermPart,
                              DictionaryRelevances )

      else if TermPartIndex = Term.Parts.Count - 1 then
        // last term part: word must match start of a topic word e.g. must start with "sausage"
        SearchDictionaryStarts( HelpFile,
                                TermPart,
                                DictionaryRelevances )

      else
        // intermediate term part: word must match exactly e.g. must be "_"
        SearchDictionaryExact( HelpFile,
                               TermPart,
                               DictionaryRelevances );

      // For each word in the dictionary that matches
      // this search term part, search topic texts

      ProfileEvent( ' Dictionary search done' );
      ClearUInt32Array( TopicsMatchingTermPart,
                        TopicCount );

      for DictIndex := 0 to HelpFile.DictionaryCount - 1 do
      begin
        WordRelevance := DictionaryRelevances[ DictIndex ];
        if WordRelevance > 0 then
        begin
          // Search for occurrences of this word
          // within the text of topics
          HelpFile.SearchTable.Search( DictIndex,
                                       TopicsMatchingDictWord );

          // TopicsMatchingDictWord now flags the topics
          // that contain this word; merge them into the
          // flags for the whole term part.
          OrUInt32Array( TopicsMatchingDictWord,
                         TopicsMatchingTermPart,
                         TopicCount );
        end
      end;

      ProfileEvent( 'TOpic searches done' );

      if TermPartIndex = 0 then
        // first part, just copy
        CopyUInt32Array( TopicsMatchingTermPart,
                         TopicsMatchingTerm,
                         TopicCount )
      else
        // and with previous term part results
        AndUInt32Array( TopicsMatchingTermPart,
                        TopicsMatchingTerm,
                        TopicCount );

      // loop for next term part (IPF word)
    end;

    // Now we have searched the dictionary and worked out matching topics
    // for all parts of the term. Now combine all together

    ProfileEvent( 'Checking for sequences' );
    for TopicIndex := 0 to TopicCount - 1 do
    begin
      if TopicsMatchingTerm[ TopicIndex ] > 0 then
      begin
        Topic := HelpFile.Topics[ TopicIndex ];
        // Topic text contained a match for all the parts
        // of the term.
        // Now we need to:
        // - verify that they actually occur all in a sequence (if it's a multi-part term)
        // - count occurrences for relevance.

        TopicRelevanceForTerm :=
          Topic.SearchForWordSequences( TermWordSequence,
                                        false ); // don't stop at first match

        TopicRelevanceForTerm :=
          TopicRelevanceForTerm div Term.Parts.Count; // divide to bring back into scale

        // the flag is replaced by the relevance from here on
        TopicsMatchingTerm[ TopicIndex ] := TopicRelevanceForTerm;

      end;
    end;

    if not KeepTermWordSequence then
    begin
      // Nobody else holds the sequence (no WordSequences list was
      // given, or the term is an excluded one), so release it here.
      ClearWordSequence( TermWordSequence,
                         HelpFile.DictionaryCount );
      TermWordSequence.Destroy;
    end;

    // Search titles and index

    ProfileEvent( ' Searching titles' );
    SearchTopicTitles( HelpFile, Term.Text, TopicsMatchingTerm );

    ProfileEvent( ' Searching index' );
    SearchIndex( HelpFile, Term.Text, TopicsMatchingTerm );

    ProfileEvent( ' Combining' );
    case Term.CombineMethod of
      cmOptional:
        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        TopicCount );

      cmRequired:
      begin
        // if zero then add to exclusions
        NotOrUInt32Array( TopicsMatchingTerm,
                          TopicsExcluded,
                          TopicCount );

        AddUInt32Array( TopicsMatchingTerm,
                        TopicRelevances,
                        TopicCount );
      end;

      cmExcluded:
        OrUInt32Array( TopicsMatchingTerm,
                       TopicsExcluded,
                       TopicCount );
    end;

//    Term.ClearMatches;

    // loop for next term...
  end;

  ProfileEvent( 'Search completed, converting to list' );

  // Now convert to list form.

  for TopicIndex := 0 to TopicCount - 1 do
  begin
    if TopicsExcluded[ TopicIndex ] = 0 then
    begin
      Topic := HelpFile.Topics[ TopicIndex ];
      Topic.SearchRelevance := TopicRelevances[ TopicIndex ];
      if Topic.SearchRelevance > 0 then
      begin
        Results.Add( Topic );
      end;
    end;
  end;

  ProfileEvent( 'Freeing arrays' );
  FreeUInt32Array( TopicRelevances, TopicCount );
  FreeUInt32Array( TopicsExcluded, TopicCount );
  FreeUInt32Array( TopicsMatchingTerm, TopicCount );
  FreeUInt32Array( TopicsMatchingTermPart, TopicCount );
  FreeUInt32Array( TopicsMatchingDictWord, TopicCount );

  ProfileEvent( 'Done' );
end;
555
556Initialization
557End.
Note: See TracBrowser for help on using the repository browser.