diff options
Diffstat (limited to 'src/assistant/lib/qhelpsearchindexreader_clucene.cpp')
-rw-r--r-- | src/assistant/lib/qhelpsearchindexreader_clucene.cpp | 481 |
1 files changed, 481 insertions, 0 deletions
diff --git a/src/assistant/lib/qhelpsearchindexreader_clucene.cpp b/src/assistant/lib/qhelpsearchindexreader_clucene.cpp new file mode 100644 index 000000000..537e1d765 --- /dev/null +++ b/src/assistant/lib/qhelpsearchindexreader_clucene.cpp @@ -0,0 +1,481 @@ +/**************************************************************************** +** +** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the Qt Assistant of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. +** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "fulltextsearch/qindexreader_p.h" +#include "fulltextsearch/qqueryparser_p.h" +#include "fulltextsearch/qsearchable_p.h" +#include "qclucenefieldnames_p.h" +#include "qhelpenginecore.h" + +#include "qhelpsearchindexreader_clucene_p.h" + +#include <QtCore/QDir> +#include <QtCore/QSet> +#include <QtCore/QString> +#include <QtCore/QFileInfo> +#include <QtCore/QSharedPointer> +#include <QtCore/QStringList> +#include <QtCore/QTextStream> +#include <QtCore/QMutexLocker> + +QT_BEGIN_NAMESPACE + +namespace fulltextsearch { +namespace clucene { + +QHelpSearchIndexReaderClucene::QHelpSearchIndexReaderClucene() + : QHelpSearchIndexReader() +{ + // nothing todo +} + +QHelpSearchIndexReaderClucene::~QHelpSearchIndexReaderClucene() +{ +} + + +void QHelpSearchIndexReaderClucene::run() +{ + mutex.lock(); + + if (m_cancel) { + mutex.unlock(); + return; + } + + const QString collectionFile(this->m_collectionFile); + const QList<QHelpSearchQuery> &queryList = this->m_query; + const QString indexPath(m_indexFilesFolder); + + mutex.unlock(); + + QHelpEngineCore engine(collectionFile, 0); + if (!engine.setupData()) + return; + + QFileInfo fInfo(indexPath); + if (fInfo.exists() && !fInfo.isWritable()) { + qWarning("Full Text Search, could not read index (missing permissions)."); + return; + } + + if(QCLuceneIndexReader::indexExists(indexPath)) { + mutex.lock(); + if (m_cancel) { + mutex.unlock(); + return; + } + mutex.unlock(); + + emit searchingStarted(); + +#if !defined(QT_NO_EXCEPTIONS) + try { +#endif + QCLuceneBooleanQuery booleanQueryTitle; + QCLuceneBooleanQuery booleanQueryContent; + QCLuceneStandardAnalyzer analyzer; + const QStringList& attribList = + engine.filterAttributes(engine.currentFilter()); + bool titleQueryIsValid = buildQuery(queryList, TitleTokenizedField, + attribList, booleanQueryTitle, analyzer); + bool contentQueryIsValid = buildQuery(queryList, ContentField, + attribList, booleanQueryContent, analyzer); + if (!titleQueryIsValid && !contentQueryIsValid) { + emit searchingFinished(0); + return; + } + + QCLuceneIndexSearcher indexSearcher(indexPath); + + // QCLuceneHits object must be allocated on the heap, because + // there is no default constructor. + QSharedPointer<QCLuceneHits> titleHits; + QSharedPointer<QCLuceneHits> contentHits; + if (titleQueryIsValid) { + titleHits = QSharedPointer<QCLuceneHits>(new QCLuceneHits( + indexSearcher.search(booleanQueryTitle))); + } + if (contentQueryIsValid) { + contentHits = QSharedPointer<QCLuceneHits>(new QCLuceneHits( + indexSearcher.search(booleanQueryContent))); + } + bool boost = true; + if ((titleHits.isNull() || titleHits->length() == 0) + && (contentHits.isNull() || contentHits->length() == 0)) { + booleanQueryTitle = QCLuceneBooleanQuery(); + booleanQueryContent = QCLuceneBooleanQuery(); + titleQueryIsValid = + buildTryHarderQuery(queryList, TitleTokenizedField, + attribList, booleanQueryTitle, analyzer); + contentQueryIsValid = + buildTryHarderQuery(queryList, ContentField, attribList, + booleanQueryContent, analyzer); + if (!titleQueryIsValid && !contentQueryIsValid) { + emit searchingFinished(0); + return; + } + if (titleQueryIsValid) { + titleHits = QSharedPointer<QCLuceneHits>(new QCLuceneHits( + indexSearcher.search(booleanQueryTitle))); + } + if (contentQueryIsValid) { + contentHits = QSharedPointer<QCLuceneHits>(new QCLuceneHits( + indexSearcher.search(booleanQueryContent))); + } + boost = false; + } + QList<QSharedPointer<QCLuceneHits> > cluceneHitsList; + if (!titleHits.isNull()) + cluceneHitsList.append(titleHits); + if (!contentHits.isNull()) + cluceneHitsList.append(contentHits); + + QSet<QString> pathSet; + QCLuceneDocument document; + const QStringList namespaceList = engine.registeredDocumentations(); + + foreach (const QSharedPointer<QCLuceneHits> &hits, cluceneHitsList) { + for (qint32 i = 0; i < hits->length(); i++) { + document = hits->document(i); + const QString path = document.get(PathField); + if (!pathSet.contains(path) && namespaceList.contains( + document.get(NamespaceField), Qt::CaseInsensitive)) { + pathSet.insert(path); + hitList.append(qMakePair(path, document.get(TitleField))); + } + document.clear(); + + mutex.lock(); + if (m_cancel) { + mutex.unlock(); + emit searchingFinished(0); + return; + } + mutex.unlock(); + } + } + + indexSearcher.close(); + const int count = hitList.count(); + if ((count > 0) && boost) + boostSearchHits(engine, hitList, queryList); + emit searchingFinished(hitList.count()); + +#if !defined(QT_NO_EXCEPTIONS) + } catch(...) { + mutex.lock(); + hitList.clear(); + mutex.unlock(); + emit searchingFinished(0); + } +#endif + } +} + +bool QHelpSearchIndexReaderClucene::buildQuery( + const QList<QHelpSearchQuery> &queries, const QString &fieldName, + const QStringList &filterAttributes, QCLuceneBooleanQuery &booleanQuery, + QCLuceneAnalyzer &analyzer) +{ + bool queryIsValid = false; + foreach (const QHelpSearchQuery &query, queries) { + if (fieldName != ContentField && isNegativeQuery(query)) { + queryIsValid = false; + break; + } + switch (query.fieldName) { + case QHelpSearchQuery::FUZZY: + if (addFuzzyQuery(query, fieldName, booleanQuery, analyzer)) + queryIsValid = true; + break; + case QHelpSearchQuery::WITHOUT: + if (fieldName != ContentField) + return false; + if (addWithoutQuery(query, fieldName, booleanQuery)) + queryIsValid = true; + break; + case QHelpSearchQuery::PHRASE: + if (addPhraseQuery(query, fieldName, booleanQuery)) + queryIsValid = true; + break; + case QHelpSearchQuery::ALL: + if (addAllQuery(query, fieldName, booleanQuery)) + queryIsValid = true; + break; + case QHelpSearchQuery::DEFAULT: + if (addDefaultQuery(query, fieldName, true, booleanQuery, analyzer)) + queryIsValid = true; + break; + case QHelpSearchQuery::ATLEAST: + if (addAtLeastQuery(query, fieldName, booleanQuery, analyzer)) + queryIsValid = true; + break; + default: + Q_ASSERT(!"Invalid field name"); + } + } + + if (queryIsValid && !filterAttributes.isEmpty()) { + queryIsValid = + addAttributesQuery(filterAttributes, booleanQuery, analyzer); + } + + return queryIsValid; +} + +bool QHelpSearchIndexReaderClucene::buildTryHarderQuery( + const QList<QHelpSearchQuery> &queries, const QString &fieldName, + const QStringList &filterAttributes, QCLuceneBooleanQuery &booleanQuery, + QCLuceneAnalyzer &analyzer) +{ + if (queries.isEmpty()) + return false; + const QHelpSearchQuery &query = queries.front(); + if (query.fieldName != QHelpSearchQuery::DEFAULT) + return false; + if (isNegativeQuery(query)) + return false; + if (!addDefaultQuery(query, fieldName, false, booleanQuery, analyzer)) + return false; + if (filterAttributes.isEmpty()) + return true; + return addAttributesQuery(filterAttributes, booleanQuery, analyzer); +} + +bool QHelpSearchIndexReaderClucene::isNegativeQuery(const QHelpSearchQuery &query) const +{ + const QString &search = query.wordList.join(" "); + return search.contains('!') || search.contains('-') + || search.contains(QLatin1String(" NOT ")); +} + +bool QHelpSearchIndexReaderClucene::addFuzzyQuery(const QHelpSearchQuery &query, + const QString &fieldName, QCLuceneBooleanQuery &booleanQuery, + QCLuceneAnalyzer &analyzer) +{ + bool queryIsValid = false; + const QLatin1String fuzzy("~"); + foreach (const QString &term, query.wordList) { + if (!term.isEmpty()) { + QCLuceneQuery *lQuery = + QCLuceneQueryParser::parse(term + fuzzy, fieldName, analyzer); + if (lQuery != 0) { + booleanQuery.add(lQuery, true, false, false); + queryIsValid = true; + } + } + } + return queryIsValid; +} + +bool QHelpSearchIndexReaderClucene::addWithoutQuery(const QHelpSearchQuery &query, + const QString &fieldName, QCLuceneBooleanQuery &booleanQuery) +{ + bool queryIsValid = false; + const QStringList &stopWords = QCLuceneStopAnalyzer().englishStopWords(); + foreach (const QString &term, query.wordList) { + if (stopWords.contains(term, Qt::CaseInsensitive)) + continue; + QCLuceneQuery *lQuery = new QCLuceneTermQuery(QCLuceneTerm( + fieldName, term.toLower())); + booleanQuery.add(lQuery, true, false, true); + queryIsValid = true; + } + return queryIsValid; +} + +bool QHelpSearchIndexReaderClucene::addPhraseQuery(const QHelpSearchQuery &query, + const QString &fieldName, QCLuceneBooleanQuery &booleanQuery) +{ + bool queryIsValid = false; + const QString &term = query.wordList.at(0).toLower(); + if (term.contains(QLatin1Char(' '))) { + const QStringList termList = term.split(QLatin1String(" ")); + QCLucenePhraseQuery *q = new QCLucenePhraseQuery(); + const QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords(); + foreach (const QString &term, termList) { + if (!stopWords.contains(term, Qt::CaseInsensitive)) + q->addTerm(QCLuceneTerm(fieldName, term.toLower())); + } + if (!q->getTerms().isEmpty()) { + booleanQuery.add(q, true, true, false); + queryIsValid = true; + } + } else { + QCLuceneQuery *lQuery = new QCLuceneTermQuery(QCLuceneTerm( + fieldName, term.toLower())); + booleanQuery.add(lQuery, true, true, false); + queryIsValid = true; + } + return queryIsValid; +} + +bool QHelpSearchIndexReaderClucene::addAllQuery(const QHelpSearchQuery &query, + const QString &fieldName, QCLuceneBooleanQuery &booleanQuery) +{ + bool queryIsValid = false; + const QStringList &stopWords = QCLuceneStopAnalyzer().englishStopWords(); + foreach (const QString &term, query.wordList) { + if (stopWords.contains(term, Qt::CaseInsensitive)) + continue; + QCLuceneQuery *lQuery = new QCLuceneTermQuery(QCLuceneTerm( + fieldName, term.toLower())); + booleanQuery.add(lQuery, true, true, false); + queryIsValid = true; + } + return queryIsValid; +} + +bool QHelpSearchIndexReaderClucene::addDefaultQuery(const QHelpSearchQuery &query, + const QString &fieldName, bool allTermsRequired, + QCLuceneBooleanQuery &booleanQuery, + QCLuceneAnalyzer &analyzer) +{ + bool queryIsValid = false; + foreach (const QString &term, query.wordList) { + QCLuceneQuery *lQuery = + QCLuceneQueryParser::parse(term.toLower(), fieldName, analyzer); + if (lQuery) { + booleanQuery.add(lQuery, true, allTermsRequired, false); + queryIsValid = true; + } + } + return queryIsValid; +} + +bool QHelpSearchIndexReaderClucene::addAtLeastQuery( + const QHelpSearchQuery &query, const QString &fieldName, + QCLuceneBooleanQuery &booleanQuery, QCLuceneAnalyzer &analyzer) +{ + bool queryIsValid = false; + foreach (const QString &term, query.wordList) { + if (!term.isEmpty()) { + QCLuceneQuery *lQuery = + QCLuceneQueryParser::parse(term, fieldName, analyzer); + if (lQuery) { + booleanQuery.add(lQuery, true, false, false); + queryIsValid = true; + } + } + } + return queryIsValid; +} + +bool QHelpSearchIndexReaderClucene::addAttributesQuery( + const QStringList &filterAttributes, QCLuceneBooleanQuery &booleanQuery, + QCLuceneAnalyzer &analyzer) +{ + QCLuceneQuery* lQuery = QCLuceneQueryParser::parse(QLatin1String("+") + + filterAttributes.join(QLatin1String(" +")), AttributeField, analyzer); + if (!lQuery) + return false; + booleanQuery.add(lQuery, true, true, false); + return true; +} + +void QHelpSearchIndexReaderClucene::boostSearchHits(const QHelpEngineCore &engine, + QList<QHelpSearchEngine::SearchHit> &hitList, const QList<QHelpSearchQuery> &queryList) +{ + foreach (const QHelpSearchQuery &query, queryList) { + if (query.fieldName != QHelpSearchQuery::DEFAULT) + continue; + + QString joinedQuery = query.wordList.join(QLatin1String(" ")); + + QCLuceneStandardAnalyzer analyzer; + QCLuceneQuery *parsedQuery = QCLuceneQueryParser::parse( + joinedQuery, ContentField, analyzer); + + if (parsedQuery) { + joinedQuery = parsedQuery->toString(); + delete parsedQuery; + } + + const QString contentString(ContentField + QLatin1String(":")); + int length = contentString.length(); + int index = joinedQuery.indexOf(contentString); + + QString term; + int nextIndex = 0; + QStringList searchTerms; + while (index != -1) { + nextIndex = joinedQuery.indexOf(contentString, index + 1); + term = joinedQuery.mid(index + length, nextIndex - (length + index)).simplified(); + if (term.startsWith(QLatin1String("\"")) + && term.endsWith(QLatin1String("\""))) { + searchTerms.append(term.remove(QLatin1String("\""))); + } else { + searchTerms += term.split(QLatin1Char(' ')); + } + index = nextIndex; + } + searchTerms.removeDuplicates(); + + int count = qMin(75, hitList.count()); + QMap<int, QHelpSearchEngine::SearchHit> hitMap; + for (int i = 0; i < count; ++i) { + const QHelpSearchEngine::SearchHit &hit = hitList.at(i); + QString data = QString::fromUtf8(engine.fileData(hit.first)); + + int counter = 0; + foreach (const QString &term, searchTerms) + counter += data.count(term, Qt::CaseInsensitive); + hitMap.insertMulti(counter, hit); + } + + QList<QHelpSearchEngine::SearchHit> boostedList; + QMap<int, QHelpSearchEngine::SearchHit>::const_iterator it = hitMap.constEnd(); + do { + --it; + boostedList.append(it.value()); + } while (it != hitMap.constBegin()); + boostedList += hitList.mid(count, hitList.count()); + mutex.lock(); + hitList = boostedList; + mutex.unlock(); + } +} + +} // namespace clucene +} // namespace fulltextsearch + +QT_END_NAMESPACE |