diff options
author | Qt by Nokia <qt-info@nokia.com> | 2011-04-27 12:05:43 +0200 |
---|---|---|
committer | axis <qt-info@nokia.com> | 2011-04-27 12:05:43 +0200 |
commit | 50123887ba0f33cf47520bee7c419d68742af2d1 (patch) | |
tree | 0eb8679b9e4e4370e59b44bfdcae616816e39aca /3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp |
Initial import from the monolithic Qt.
This is the beginning of revision history for this module. If you
want to look at revision history older than this, please refer to the
Qt Git wiki for how to use Git history grafting. At the time of
writing, this wiki is located here:
http://qt.gitorious.org/qt/pages/GitIntroductionWithQt
If you have already performed the grafting and you don't see any
history beyond this commit, try running "git log" with the "--follow"
argument.
Branched from the monolithic repo, Qt master branch, at commit
896db169ea224deb96c59ce8af800d019de63f12
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp')
-rw-r--r-- | 3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp | 188 |
1 files changed, 188 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp b/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp new file mode 100644 index 000000000..5e9ac3c3b --- /dev/null +++ b/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp @@ -0,0 +1,188 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "TermVector.h" +#include "CLucene/util/StringBuffer.h" + +CL_NS_USE(util) +CL_NS_DEF(index) + +Array<int32_t> SegmentTermPositionVector::EMPTY_TERM_POS; + +SegmentTermVector::SegmentTermVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs) { + this->field = STRDUP_TtoT(field); + this->terms = terms; + this->termsLen = -1; //lazily get the size of the terms + this->termFreqs = termFreqs; +} + +SegmentTermVector::~SegmentTermVector(){ + _CLDELETE_CARRAY(field); + _CLDELETE_CARRAY_ALL(terms); + + _CLDELETE_ARRAY(termFreqs->values); + _CLDELETE(termFreqs); +} +TermPositionVector* SegmentTermVector::__asTermPositionVector(){ + return NULL; +} + +const TCHAR* SegmentTermVector::getField() { +return field; +} + +TCHAR* SegmentTermVector::toString() const{ +StringBuffer sb; +sb.appendChar('{'); +sb.append(field); +sb.append(_T(": ")); + +int32_t i=0; +while ( terms && terms[i] != NULL ){ + if (i>0) + sb.append(_T(", ")); + sb.append(terms[i]); + sb.appendChar('/'); + + sb.appendInt((*termFreqs)[i]); +} +sb.appendChar('}'); +return sb.toString(); +} + +int32_t SegmentTermVector::size() { +if ( terms == NULL ) + return 0; + +if ( termsLen == -1 ){ + termsLen=0; + while ( terms[termsLen] != 0 ) + termsLen++; +} +return termsLen; +} + +const TCHAR** SegmentTermVector::getTerms() { + return (const TCHAR**)terms; +} + +const Array<int32_t>* SegmentTermVector::getTermFrequencies() { + return termFreqs; +} + +int32_t SegmentTermVector::binarySearch(TCHAR** a, const int32_t arraylen, const TCHAR* key) const +{ + int32_t low = 0; + int32_t hi = arraylen - 1; + int32_t mid = 0; + while (low <= hi) + { + mid = (low + hi) >> 1; + + int32_t c = _tcscmp(a[mid],key); + if (c==0) + return mid; + else if (c > 0) + hi = mid - 1; + else // This gets the insertion point right on the last loop. + low = ++mid; + } + return -mid - 1; +} + +int32_t SegmentTermVector::indexOf(const TCHAR* termText) { + if(terms == NULL) + return -1; + int32_t res = binarySearch(terms, size(), termText); + return res >= 0 ? res : -1; +} + +void SegmentTermVector::indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret) { + // TODO: there must be a more efficient way of doing this. + // At least, we could advance the lower bound of the terms array + // as we find valid indexes. Also, it might be possible to leverage + // this even more by starting in the middle of the termNumbers array + // and thus dividing the terms array maybe in half with each found index. + ret.length = len; + ret.values = _CL_NEWARRAY(int32_t,len); + for (int32_t i=0; i<len; ++i) { + ret.values[i] = indexOf(termNumbers[start+ i]); + } +} + + + + +SegmentTermPositionVector::SegmentTermPositionVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs, Array< Array<int32_t> >* positions, Array< Array<TermVectorOffsetInfo> >* offsets): + SegmentTermVector(field,terms,termFreqs) +{ + this->offsets = offsets; + this->positions = positions; +} + +SegmentTermPositionVector::~SegmentTermPositionVector(){ + if ( offsets ){ + for (size_t i=0;i<offsets->length;i++){ + if ( offsets->values != NULL ){ + Array<TermVectorOffsetInfo>& offs = offsets->values[i]; + for ( size_t j=0;j<offs.length;j++ ){ + _CLDELETE_ARRAY(offs.values); + } + } + } + _CLDELETE_ARRAY(offsets->values); + _CLDELETE(offsets); + } + if ( positions ){ + for (size_t i=0;i<positions->length;i++){ + if ( positions->values != NULL ){ + Array<int32_t>& pos = positions->values[i]; + for ( size_t j=0;j<pos.length;j++ ){ + _CLDELETE_ARRAY(pos.values); + } + } + } + _CLDELETE_ARRAY(positions->values); + _CLDELETE(positions); + } +} + +TermPositionVector* SegmentTermPositionVector::__asTermPositionVector(){ + return this; +} +/** +* Returns an array of TermVectorOffsetInfo in which the term is found. +* +* @param index The position in the array to get the offsets from +* @return An array of TermVectorOffsetInfo objects or the empty list +* @see org.apache.lucene.analysis.Token +*/ +Array<TermVectorOffsetInfo>* SegmentTermPositionVector::getOffsets(int32_t index) { + if(offsets == NULL) + return NULL; + if (index >=0 && index < offsets->length) + return &offsets->values[index]; + else + return &TermVectorOffsetInfo::EMPTY_OFFSET_INFO; +} + +/** +* Returns an array of positions in which the term is found. +* Terms are identified by the index at which its number appears in the +* term String array obtained from the <code>indexOf</code> method. +*/ +Array<int32_t>* SegmentTermPositionVector::getTermPositions(int32_t index) { + if(positions == NULL) + return NULL; + + if (index >=0 && index < positions->length) + return &positions->values[index]; + else + return &EMPTY_TERM_POS; +} +CL_NS_END + |