source: trunk/tools/assistant/index.cpp@ 203

Last change on this file since 203 was 191, checked in by rudi, 14 years ago

Qt Assistant added

File size: 12.4 KB
Line 
1/**********************************************************************
2** Copyright (C) 2000-2007 Trolltech ASA. All rights reserved.
3**
4** This file is part of the Qt Assistant.
5**
6** This file may be distributed and/or modified under the terms of the
7** GNU General Public License version 2 as published by the Free Software
8** Foundation and appearing in the file LICENSE.GPL included in the
9** packaging of this file.
10**
11** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
12** licenses may use this file in accordance with the Qt Commercial License
13** Agreement provided with the Software.
14**
15** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
16** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
17**
18** See http://www.trolltech.com/gpl/ for GPL licensing information.
19** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
20** information about Qt Commercial License Agreements.
21**
22** Contact info@trolltech.com if any conditions of this licensing are
23** not clear to you.
24**
25**********************************************************************/
26
27#include "index.h"
28
29#include <qfile.h>
30#include <qdir.h>
31#include <qstringlist.h>
32#include <qdict.h>
33#include <qapplication.h>
34
35#include <ctype.h>
36
37int TermList::compareItems( QPtrCollection::Item i1, QPtrCollection::Item i2 )
38{
39 if( ( (Term*)i1 )->frequency == ( (Term*)i2 )->frequency )
40 return 0;
41 if( ( (Term*)i1 )->frequency < ( (Term*)i2 )->frequency )
42 return -1;
43 return 1;
44}
45
46QDataStream &operator>>( QDataStream &s, Document &l )
47{
48 s >> l.docNumber;
49 s >> l.frequency;
50 return s;
51}
52
53QDataStream &operator<<( QDataStream &s, const Document &l )
54{
55 s << (Q_INT16)l.docNumber;
56 s << (Q_INT16)l.frequency;
57 return s;
58}
59
60Index::Index( const QString &dp, const QString &hp )
61 : QObject( 0, 0 ), dict( 8999 ), docPath( dp )
62{
63 alreadyHaveDocList = FALSE;
64 lastWindowClosed = FALSE;
65 connect( qApp, SIGNAL( lastWindowClosed() ),
66 this, SLOT( setLastWinClosed() ) );
67}
68
69Index::Index( const QStringList &dl, const QString &hp )
70 : QObject( 0, 0 ), dict( 8999 )
71{
72 docList = dl;
73 alreadyHaveDocList = TRUE;
74 lastWindowClosed = FALSE;
75 connect( qApp, SIGNAL( lastWindowClosed() ),
76 this, SLOT( setLastWinClosed() ) );
77}
78
79void Index::setLastWinClosed()
80{
81 lastWindowClosed = TRUE;
82}
83
84void Index::setDictionaryFile( const QString &f )
85{
86 dictFile = f;
87}
88
89void Index::setDocListFile( const QString &f )
90{
91 docListFile = f;
92}
93
94void Index::setDocList( const QStringList &lst )
95{
96 docList = lst;
97}
98
99int Index::makeIndex()
100{
101 if ( !alreadyHaveDocList )
102 setupDocumentList();
103 if ( docList.isEmpty() )
104 return 1;
105 QStringList::Iterator it = docList.begin();
106 int steps = docList.count() / 100;
107 if ( !steps )
108 steps++;
109 int prog = 0;
110 for ( int i = 0; it != docList.end(); ++it, ++i ) {
111 if ( lastWindowClosed ) {
112 return -1;
113 }
114 parseDocument( *it, i );
115 if ( i%steps == 0 ) {
116 prog++;
117 emit indexingProgress( prog );
118 }
119 }
120 return 0;
121}
122
123void Index::setupDocumentList()
124{
125 QDir d( docPath );
126 QStringList lst = d.entryList( "*.html" );
127 QStringList::ConstIterator it = lst.begin();
128 for ( ; it != lst.end(); ++it )
129 docList.append( docPath + "/" + *it );
130}
131
132void Index::insertInDict( const QString &str, int docNum )
133{
134 if ( strcmp( str, "amp" ) == 0 || strcmp( str, "nbsp" ) == 0 )
135 return;
136 Entry *e = 0;
137 if ( dict.count() )
138 e = dict[ str ];
139
140 if ( e ) {
141 if ( e->documents.first().docNumber != docNum )
142 e->documents.prepend( Document( docNum, 1 ) );
143 else
144 e->documents.first().frequency++;
145 } else {
146 dict.insert( str, new Entry( docNum ) );
147 }
148}
149
150void Index::parseDocument( const QString &filename, int docNum )
151{
152 QFile file( filename );
153 if ( !file.open( IO_ReadOnly ) ) {
154 qWarning( "can not open file " + filename );
155 return;
156 }
157
158 QTextStream s( &file );
159 QString text = s.read();
160 if (text.isNull())
161 return;
162
163 bool valid = TRUE;
164 const QChar *buf = text.unicode();
165 QChar str[64];
166 QChar c = buf[0];
167 int j = 0;
168 int i = 0;
169 while ( (uint)j < text.length() ) {
170 if ( c == '<' || c == '&' ) {
171 valid = FALSE;
172 if ( i > 1 )
173 insertInDict( QString(str,i), docNum );
174 i = 0;
175 c = buf[++j];
176 continue;
177 }
178 if ( ( c == '>' || c == ';' ) && !valid ) {
179 valid = TRUE;
180 c = buf[++j];
181 continue;
182 }
183 if ( !valid ) {
184 c = buf[++j];
185 continue;
186 }
187 if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) {
188 str[i] = c.lower();
189 ++i;
190 } else {
191 if ( i > 1 )
192 insertInDict( QString(str,i), docNum );
193 i = 0;
194 }
195 c = buf[++j];
196 }
197 if ( i > 1 )
198 insertInDict( QString(str,i), docNum );
199 file.close();
200}
201
202void Index::writeDict()
203{
204 QDictIterator<Entry> it( dict );
205 QFile f( dictFile );
206 if ( !f.open( IO_WriteOnly ) )
207 return;
208 QDataStream s( &f );
209 for( ; it.current(); ++it ) {
210 Entry *e = it.current();
211 s << it.currentKey();
212 s << e->documents;
213 }
214 f.close();
215 writeDocumentList();
216}
217
218void Index::writeDocumentList()
219{
220 QFile f( docListFile );
221 if ( !f.open( IO_WriteOnly ) )
222 return;
223 QDataStream s( &f );
224 s << docList;
225}
226
227void Index::readDict()
228{
229 QFile f( dictFile );
230 if ( !f.open( IO_ReadOnly ) )
231 return;
232
233 dict.clear();
234 QDataStream s( &f );
235 QString key;
236 QValueList<Document> docs;
237 while ( !s.atEnd() ) {
238 s >> key;
239 s >> docs;
240 dict.insert( key, new Entry( docs ) );
241 }
242 f.close();
243 readDocumentList();
244}
245
246void Index::readDocumentList()
247{
248 QFile f( docListFile );
249 if ( !f.open( IO_ReadOnly ) )
250 return;
251 QDataStream s( &f );
252 s >> docList;
253}
254
255QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords )
256{
257 TermList termList;
258
259 QStringList::ConstIterator it = terms.begin();
260 for ( it = terms.begin(); it != terms.end(); ++it ) {
261 Entry *e = 0;
262 if ( (*it).contains( '*' ) ) {
263 QValueList<Document> wcts = setupDummyTerm( getWildcardTerms( *it ) );
264 termList.append( new Term( "dummy", wcts.count(), wcts ) );
265 } else if ( dict[ *it ] ) {
266 e = dict[ *it ];
267 termList.append( new Term( *it, e->documents.count(), e->documents ) );
268 } else {
269 return QStringList();
270 }
271 }
272 termList.sort();
273
274 Term *minTerm = termList.first();
275 if ( !termList.count() )
276 return QStringList();
277 termList.removeFirst();
278
279 QValueList<Document> minDocs = minTerm->documents;
280 QValueList<Document>::iterator C;
281 QValueList<Document>::ConstIterator It;
282 Term *t = termList.first();
283 for ( ; t; t = termList.next() ) {
284 QValueList<Document> docs = t->documents;
285 C = minDocs.begin();
286 while ( C != minDocs.end() ) {
287 bool found = FALSE;
288 for ( It = docs.begin(); It != docs.end(); ++It ) {
289 if ( (*C).docNumber == (*It).docNumber ) {
290 (*C).frequency += (*It).frequency;
291 found = TRUE;
292 break;
293 }
294 }
295 if ( !found )
296 C = minDocs.remove( C );
297 else
298 ++C;
299 }
300 }
301
302 QStringList results;
303 qHeapSort( minDocs );
304 if ( termSeq.isEmpty() ) {
305 for ( C = minDocs.begin(); C != minDocs.end(); ++C )
306 results << docList[ (int)(*C).docNumber ];
307 return results;
308 }
309
310 QString fileName;
311 for ( C = minDocs.begin(); C != minDocs.end(); ++C ) {
312 fileName = docList[ (int)(*C).docNumber ];
313 if ( searchForPattern( termSeq, seqWords, fileName ) )
314 results << fileName;
315 }
316 return results;
317}
318
319QString Index::getDocumentTitle( const QString &fileName )
320{
321 QFile file( fileName );
322 if ( !file.open( IO_ReadOnly ) ) {
323 qWarning( "cannot open file " + fileName );
324 return fileName;
325 }
326 QTextStream s( &file );
327 QString text = s.read();
328
329 int start = text.find( "<title>", 0, FALSE ) + 7;
330 int end = text.find( "</title>", 0, FALSE );
331
332 QString title = ( end - start <= 0 ? tr("Untitled") : text.mid( start, end - start ) );
333 return title;
334}
335
336QStringList Index::getWildcardTerms( const QString &term )
337{
338 QStringList lst;
339 QStringList terms = split( term );
340 QValueList<QString>::iterator iter;
341
342 QDictIterator<Entry> it( dict );
343 for( ; it.current(); ++it ) {
344 int index = 0;
345 bool found = FALSE;
346 QString text( it.currentKey() );
347 for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
348 if ( *iter == "*" ) {
349 found = TRUE;
350 continue;
351 }
352 if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
353 found = FALSE;
354 break;
355 }
356 index = text.find( *iter, index );
357 if ( *iter == terms.last() && index != (int)text.length()-1 ) {
358 index = text.findRev( *iter );
359 if ( index != (int)text.length() - (int)(*iter).length() ) {
360 found = FALSE;
361 break;
362 }
363 }
364 if ( index != -1 ) {
365 found = TRUE;
366 index += (*iter).length();
367 continue;
368 } else {
369 found = FALSE;
370 break;
371 }
372 }
373 if ( found )
374 lst << text;
375 }
376
377 return lst;
378}
379
380QStringList Index::split( const QString &str )
381{
382 QStringList lst;
383 int j = 0;
384 int i = str.find( '*', j );
385
386 while ( i != -1 ) {
387 if ( i > j && i <= (int)str.length() ) {
388 lst << str.mid( j, i - j );
389 lst << "*";
390 }
391 j = i + 1;
392 i = str.find( '*', j );
393 }
394
395 int l = str.length() - 1;
396 if ( str.mid( j, l - j + 1 ).length() > 0 )
397 lst << str.mid( j, l - j + 1 );
398
399 return lst;
400}
401
402QValueList<Document> Index::setupDummyTerm( const QStringList &terms )
403{
404 TermList termList;
405 QStringList::ConstIterator it = terms.begin();
406 for ( ; it != terms.end(); ++it ) {
407 Entry *e = 0;
408 if ( dict[ *it ] ) {
409 e = dict[ *it ];
410 termList.append( new Term( *it, e->documents.count(), e->documents ) );
411 }
412 }
413 termList.sort();
414
415 QValueList<Document> maxList;
416
417 if ( !termList.count() )
418 return maxList;
419 maxList = termList.last()->documents;
420 termList.removeLast();
421
422 QValueList<Document>::iterator docIt;
423 Term *t = termList.first();
424 while ( t ) {
425 QValueList<Document> docs = t->documents;
426 for ( docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
427 if ( maxList.findIndex( *docIt ) == -1 )
428 maxList.append( *docIt );
429 }
430 t = termList.next();
431 }
432 return maxList;
433}
434
435void Index::buildMiniDict( const QString &str )
436{
437 if ( miniDict[ str ] )
438 miniDict[ str ]->positions.append( wordNum );
439 ++wordNum;
440}
441
442bool Index::searchForPattern( const QStringList &patterns, const QStringList &words, const QString &fileName )
443{
444 QFile file( fileName );
445 if ( !file.open( IO_ReadOnly ) ) {
446 qWarning( "cannot open file " + fileName );
447 return FALSE;
448 }
449
450 wordNum = 3;
451 miniDict.clear();
452 QStringList::ConstIterator cIt = words.begin();
453 for ( ; cIt != words.end(); ++cIt )
454 miniDict.insert( *cIt, new PosEntry( 0 ) );
455
456 QTextStream s( &file );
457 QString text = s.read();
458 bool valid = TRUE;
459 const QChar *buf = text.unicode();
460 QChar str[64];
461 QChar c = buf[0];
462 int j = 0;
463 int i = 0;
464 while ( (uint)j < text.length() ) {
465 if ( c == '<' || c == '&' ) {
466 valid = FALSE;
467 if ( i > 1 )
468 buildMiniDict( QString(str,i) );
469 i = 0;
470 c = buf[++j];
471 continue;
472 }
473 if ( ( c == '>' || c == ';' ) && !valid ) {
474 valid = TRUE;
475 c = buf[++j];
476 continue;
477 }
478 if ( !valid ) {
479 c = buf[++j];
480 continue;
481 }
482 if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) {
483 str[i] = c.lower();
484 ++i;
485 } else {
486 if ( i > 1 )
487 buildMiniDict( QString(str,i) );
488 i = 0;
489 }
490 c = buf[++j];
491 }
492 if ( i > 1 )
493 buildMiniDict( QString(str,i) );
494 file.close();
495
496 QStringList::ConstIterator patIt = patterns.begin();
497 QStringList wordLst;
498 QValueList<uint> a, b;
499 QValueList<uint>::iterator aIt;
500 for ( ; patIt != patterns.end(); ++patIt ) {
501 wordLst = QStringList::split( ' ', *patIt );
502 a = miniDict[ wordLst[0] ]->positions;
503 for ( int j = 1; j < (int)wordLst.count(); ++j ) {
504 b = miniDict[ wordLst[j] ]->positions;
505 aIt = a.begin();
506 while ( aIt != a.end() ) {
507 if ( b.find( *aIt + 1 ) != b.end() ) {
508 (*aIt)++;
509 ++aIt;
510 } else {
511 aIt = a.remove( aIt );
512 }
513 }
514 }
515 }
516 if ( a.count() )
517 return TRUE;
518 return FALSE;
519}
Note: See TracBrowser for help on using the repository browser.