source: trunk/tools/assistant/compat/index.cpp@ 824

Last change on this file since 824 was 651, checked in by Dmitry A. Kuminov, 16 years ago

trunk: Merged in qt 4.6.2 sources.

File size: 16.2 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation (qt-info@nokia.com)
6**
7** This file is part of the Qt Assistant of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at qt-info@nokia.com.
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "index.h"
43
44#include <QFile>
45#include <QDir>
46#include <QStringList>
47#include <QApplication>
48#include <QByteArray>
49#include <QTextStream>
50#include <QtAlgorithms>
51#include <QUrl>
52#include <QTextCodec>
53#include <ctype.h>
54#include <QTextDocument>
55
56QT_BEGIN_NAMESPACE
57
58struct Term {
59 Term() : frequency(-1) {}
60 Term( const QString &t, int f, QVector<Document> l ) : term( t ), frequency( f ), documents( l ) {}
61 QString term;
62 int frequency;
63 QVector<Document>documents;
64 bool operator<( const Term &i2 ) const { return frequency < i2.frequency; }
65};
66
67QDataStream &operator>>( QDataStream &s, Document &l )
68{
69 s >> l.docNumber;
70 s >> l.frequency;
71 return s;
72}
73
74QDataStream &operator<<( QDataStream &s, const Document &l )
75{
76 s << (qint16)l.docNumber;
77 s << (qint16)l.frequency;
78 return s;
79}
80
81Index::Index( const QString &dp, const QString &hp )
82 : QObject( 0 ), docPath( dp )
83{
84 Q_UNUSED(hp);
85
86 alreadyHaveDocList = false;
87 lastWindowClosed = false;
88 connect( qApp, SIGNAL(lastWindowClosed()),
89 this, SLOT(setLastWinClosed()) );
90}
91
92Index::Index( const QStringList &dl, const QString &hp )
93 : QObject( 0 )
94{
95 Q_UNUSED(hp);
96 docList = dl;
97 alreadyHaveDocList = true;
98 lastWindowClosed = false;
99 connect( qApp, SIGNAL(lastWindowClosed()),
100 this, SLOT(setLastWinClosed()) );
101}
102
103void Index::setLastWinClosed()
104{
105 lastWindowClosed = true;
106}
107
108void Index::setDictionaryFile( const QString &f )
109{
110 dictFile = f;
111}
112
113void Index::setDocListFile( const QString &f )
114{
115 docListFile = f;
116}
117
118void Index::setDocList( const QStringList &lst )
119{
120 docList = lst;
121}
122
123int Index::makeIndex()
124{
125 if ( !alreadyHaveDocList )
126 setupDocumentList();
127 if ( docList.isEmpty() )
128 return 1;
129 QStringList::Iterator it = docList.begin();
130 int steps = docList.count() / 100;
131 if ( !steps )
132 steps++;
133 int prog = 0;
134 for ( int i = 0; it != docList.end(); ++it, ++i ) {
135 if ( lastWindowClosed ) {
136 return -1;
137 }
138 QUrl url(*it);
139 parseDocument( url.toLocalFile(), i );
140 if ( i%steps == 0 ) {
141 prog++;
142 emit indexingProgress( prog );
143 }
144 }
145 return 0;
146}
147
148void Index::setupDocumentList()
149{
150 QDir d( docPath );
151 QStringList filters;
152 filters.append(QLatin1String("*.html"));
153 QStringList lst = d.entryList(filters);
154 QStringList::ConstIterator it = lst.constBegin();
155 for ( ; it != lst.constEnd(); ++it )
156 docList.append( QLatin1String("file:") + docPath + QLatin1String("/") + *it );
157}
158
159void Index::insertInDict( const QString &str, int docNum )
160{
161 if ( str == QLatin1String("amp") || str == QLatin1String("nbsp"))
162 return;
163 Entry *e = 0;
164 if ( dict.count() )
165 e = dict[ str ];
166
167 if ( e ) {
168 if ( e->documents.last().docNumber != docNum )
169 e->documents.append( Document(docNum, 1 ) );
170 else
171 e->documents.last().frequency++;
172 } else {
173 dict.insert( str, new Entry( docNum ) );
174 }
175}
176
177QString Index::getCharsetForDocument(QFile *file)
178{
179 QTextStream s(file);
180 QString contents = s.readAll();
181
182 QString encoding;
183 int start = contents.indexOf(QLatin1String("<meta"), 0, Qt::CaseInsensitive);
184 if (start > 0) {
185 int end = contents.indexOf(QLatin1String(">"), start);
186 QString meta = contents.mid(start+5, end-start);
187 meta = meta.toLower();
188 QRegExp r(QLatin1String("charset=([^\"\\s]+)"));
189 if (r.indexIn(meta) != -1) {
190 encoding = r.cap(1);
191 }
192 }
193
194 file->seek(0);
195 if (encoding.isEmpty())
196 return QLatin1String("utf-8");
197 return encoding;
198}
199
200void Index::parseDocument( const QString &filename, int docNum )
201{
202 QFile file( filename );
203 if ( !file.open(QFile::ReadOnly) ) {
204 qWarning( "can not open file %s", qPrintable(filename) );
205 return;
206 }
207
208 QTextStream s(&file);
209 QString en = getCharsetForDocument(&file);
210 s.setCodec(QTextCodec::codecForName(en.toLatin1().constData()));
211
212 QString text = s.readAll();
213 if (text.isNull())
214 return;
215
216 bool valid = true;
217 const QChar *buf = text.unicode();
218 QChar str[64];
219 QChar c = buf[0];
220 int j = 0;
221 int i = 0;
222 while ( j < text.length() ) {
223 if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
224 valid = false;
225 if ( i > 1 )
226 insertInDict( QString(str,i), docNum );
227 i = 0;
228 c = buf[++j];
229 continue;
230 }
231 if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
232 valid = true;
233 c = buf[++j];
234 continue;
235 }
236 if ( !valid ) {
237 c = buf[++j];
238 continue;
239 }
240 if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
241 str[i] = c.toLower();
242 ++i;
243 } else {
244 if ( i > 1 )
245 insertInDict( QString(str,i), docNum );
246 i = 0;
247 }
248 c = buf[++j];
249 }
250 if ( i > 1 )
251 insertInDict( QString(str,i), docNum );
252 file.close();
253}
254
255void Index::writeDict()
256{
257 QFile f( dictFile );
258 if ( !f.open(QFile::WriteOnly ) )
259 return;
260 QDataStream s( &f );
261 for(QHash<QString, Entry *>::Iterator it = dict.begin(); it != dict.end(); ++it) {
262 s << it.key();
263 s << it.value()->documents.count();
264 s << it.value()->documents;
265 }
266 f.close();
267 writeDocumentList();
268}
269
270void Index::writeDocumentList()
271{
272 QFile f( docListFile );
273 if ( !f.open(QFile::WriteOnly ) )
274 return;
275 QDataStream s( &f );
276 s << docList;
277}
278
279void Index::readDict()
280{
281 QFile f( dictFile );
282 if ( !f.open(QFile::ReadOnly ) )
283 return;
284
285 dict.clear();
286 QDataStream s( &f );
287 QString key;
288 int numOfDocs;
289 QVector<Document> docs;
290 while ( !s.atEnd() ) {
291 s >> key;
292 s >> numOfDocs;
293 docs.resize(numOfDocs);
294 s >> docs;
295 dict.insert( key, new Entry( docs ) );
296 }
297 f.close();
298 readDocumentList();
299}
300
301void Index::readDocumentList()
302{
303 QFile f( docListFile );
304 if ( !f.open(QFile::ReadOnly ) )
305 return;
306 QDataStream s( &f );
307 s >> docList;
308}
309
310QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords )
311{
312 QList<Term> termList;
313 for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it ) {
314 Entry *e = 0;
315 if ( (*it).contains(QLatin1Char('*')) ) {
316 QVector<Document> wcts = setupDummyTerm( getWildcardTerms( *it ) );
317 termList.append( Term(QLatin1String("dummy"), wcts.count(), wcts ) );
318 } else if ( dict[ *it ] ) {
319 e = dict[ *it ];
320 termList.append( Term( *it, e->documents.count(), e->documents ) );
321 } else {
322 return QStringList();
323 }
324 }
325 if ( !termList.count() )
326 return QStringList();
327 qSort(termList);
328
329 QVector<Document> minDocs = termList.takeFirst().documents;
330 for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
331 Term *t = &(*it);
332 QVector<Document> docs = t->documents;
333 for(QVector<Document>::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end(); ) {
334 bool found = false;
335 for (QVector<Document>::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it ) {
336 if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
337 (*minDoc_it).frequency += (*doc_it).frequency;
338 found = true;
339 break;
340 }
341 }
342 if ( !found )
343 minDoc_it = minDocs.erase( minDoc_it );
344 else
345 ++minDoc_it;
346 }
347 }
348
349 QStringList results;
350 qSort( minDocs );
351 if ( termSeq.isEmpty() ) {
352 for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it)
353 results << docList.at((int)(*it).docNumber);
354 return results;
355 }
356
357 QString fileName;
358 for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it) {
359 fileName = docList[ (int)(*it).docNumber ];
360 if ( searchForPattern( termSeq, seqWords, fileName ) )
361 results << fileName;
362 }
363 return results;
364}
365
366QString Index::getDocumentTitle( const QString &fullFileName )
367{
368 QUrl url(fullFileName);
369 QString fileName = url.toLocalFile();
370
371 if (documentTitleCache.contains(fileName))
372 return documentTitleCache.value(fileName);
373
374 QFile file( fileName );
375 if ( !file.open( QFile::ReadOnly ) ) {
376 qWarning( "cannot open file %s", qPrintable(fileName) );
377 return fileName;
378 }
379 QTextStream s( &file );
380 QString text = s.readAll();
381
382 int start = text.indexOf(QLatin1String("<title>"), 0, Qt::CaseInsensitive) + 7;
383 int end = text.indexOf(QLatin1String("</title>"), 0, Qt::CaseInsensitive);
384
385 QString title = tr("Untitled");
386 if (end - start > 0) {
387 title = text.mid(start, end - start);
388 if (Qt::mightBeRichText(title)) {
389 QTextDocument doc;
390 doc.setHtml(title);
391 title = doc.toPlainText();
392 }
393 }
394 documentTitleCache.insert(fileName, title);
395 return title;
396}
397
398QStringList Index::getWildcardTerms( const QString &term )
399{
400 QStringList lst;
401 QStringList terms = split( term );
402 QStringList::Iterator iter;
403
404 for(QHash<QString, Entry*>::Iterator it = dict.begin(); it != dict.end(); ++it) {
405 int index = 0;
406 bool found = false;
407 QString text( it.key() );
408 for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
409 if ( *iter == QLatin1String("*") ) {
410 found = true;
411 continue;
412 }
413 if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
414 found = false;
415 break;
416 }
417 index = text.indexOf( *iter, index );
418 if ( *iter == terms.last() && index != (int)text.length()-1 ) {
419 index = text.lastIndexOf( *iter );
420 if ( index != (int)text.length() - (int)(*iter).length() ) {
421 found = false;
422 break;
423 }
424 }
425 if ( index != -1 ) {
426 found = true;
427 index += (*iter).length();
428 continue;
429 } else {
430 found = false;
431 break;
432 }
433 }
434 if ( found )
435 lst << text;
436 }
437
438 return lst;
439}
440
441QStringList Index::split( const QString &str )
442{
443 QStringList lst;
444 int j = 0;
445 int i = str.indexOf(QLatin1Char('*'), j );
446
447 if (str.startsWith(QLatin1String("*")))
448 lst << QLatin1String("*");
449
450 while ( i != -1 ) {
451 if ( i > j && i <= (int)str.length() ) {
452 lst << str.mid( j, i - j );
453 lst << QLatin1String("*");
454 }
455 j = i + 1;
456 i = str.indexOf(QLatin1Char('*'), j );
457 }
458
459 int l = str.length() - 1;
460 if ( str.mid( j, l - j + 1 ).length() > 0 )
461 lst << str.mid( j, l - j + 1 );
462
463 return lst;
464}
465
466QVector<Document> Index::setupDummyTerm( const QStringList &terms )
467{
468 QList<Term> termList;
469 for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it) {
470 Entry *e = 0;
471 if ( dict[ *it ] ) {
472 e = dict[ *it ];
473 termList.append( Term( *it, e->documents.count(), e->documents ) );
474 }
475 }
476 QVector<Document> maxList(0);
477 if ( !termList.count() )
478 return maxList;
479 qSort(termList);
480
481 maxList = termList.takeLast().documents;
482 for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
483 Term *t = &(*it);
484 QVector<Document> docs = t->documents;
485 for (QVector<Document>::iterator docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
486 if ( maxList.indexOf( *docIt ) == -1 )
487 maxList.append( *docIt );
488 }
489 }
490 return maxList;
491}
492
493void Index::buildMiniDict( const QString &str )
494{
495 if ( miniDict[ str ] )
496 miniDict[ str ]->positions.append( wordNum );
497 ++wordNum;
498}
499
500bool Index::searchForPattern( const QStringList &patterns, const QStringList &words, const QString &fileName )
501{
502 QUrl url(fileName);
503 QString fName = url.toLocalFile();
504 QFile file( fName );
505 if ( !file.open( QFile::ReadOnly ) ) {
506 qWarning( "cannot open file %s", qPrintable(fName) );
507 return false;
508 }
509
510 wordNum = 3;
511 miniDict.clear();
512 QStringList::ConstIterator cIt = words.begin();
513 for ( ; cIt != words.end(); ++cIt )
514 miniDict.insert( *cIt, new PosEntry( 0 ) );
515
516 QTextStream s( &file );
517 QString text = s.readAll();
518 bool valid = true;
519 const QChar *buf = text.unicode();
520 QChar str[64];
521 QChar c = buf[0];
522 int j = 0;
523 int i = 0;
524 while ( j < text.length() ) {
525 if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
526 valid = false;
527 if ( i > 1 )
528 buildMiniDict( QString(str,i) );
529 i = 0;
530 c = buf[++j];
531 continue;
532 }
533 if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
534 valid = true;
535 c = buf[++j];
536 continue;
537 }
538 if ( !valid ) {
539 c = buf[++j];
540 continue;
541 }
542 if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
543 str[i] = c.toLower();
544 ++i;
545 } else {
546 if ( i > 1 )
547 buildMiniDict( QString(str,i) );
548 i = 0;
549 }
550 c = buf[++j];
551 }
552 if ( i > 1 )
553 buildMiniDict( QString(str,i) );
554 file.close();
555
556 QStringList::ConstIterator patIt = patterns.begin();
557 QStringList wordLst;
558 QList<uint> a, b;
559 QList<uint>::iterator aIt;
560 for ( ; patIt != patterns.end(); ++patIt ) {
561 wordLst = (*patIt).split(QLatin1Char(' '));
562 a = miniDict[ wordLst[0] ]->positions;
563 for ( int j = 1; j < (int)wordLst.count(); ++j ) {
564 b = miniDict[ wordLst[j] ]->positions;
565 aIt = a.begin();
566 while ( aIt != a.end() ) {
567 if ( b.contains( *aIt + 1 )) {
568 (*aIt)++;
569 ++aIt;
570 } else {
571 aIt = a.erase( aIt );
572 }
573 }
574 }
575 }
576 if ( a.count() )
577 return true;
578 return false;
579}
580
581QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.