diff options
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp')
-rw-r--r-- | 3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp | 188 |
1 files changed, 188 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp b/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp new file mode 100644 index 000000000..5e9ac3c3b --- /dev/null +++ b/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp @@ -0,0 +1,188 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "TermVector.h" +#include "CLucene/util/StringBuffer.h" + +CL_NS_USE(util) +CL_NS_DEF(index) + +Array<int32_t> SegmentTermPositionVector::EMPTY_TERM_POS; + +SegmentTermVector::SegmentTermVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs) { + this->field = STRDUP_TtoT(field); + this->terms = terms; + this->termsLen = -1; //lazily get the size of the terms + this->termFreqs = termFreqs; +} + +SegmentTermVector::~SegmentTermVector(){ + _CLDELETE_CARRAY(field); + _CLDELETE_CARRAY_ALL(terms); + + _CLDELETE_ARRAY(termFreqs->values); + _CLDELETE(termFreqs); +} +TermPositionVector* SegmentTermVector::__asTermPositionVector(){ + return NULL; +} + +const TCHAR* SegmentTermVector::getField() { +return field; +} + +TCHAR* SegmentTermVector::toString() const{ +StringBuffer sb; +sb.appendChar('{'); +sb.append(field); +sb.append(_T(": ")); + +int32_t i=0; +while ( terms && terms[i] != NULL ){ + if (i>0) + sb.append(_T(", ")); + sb.append(terms[i]); + sb.appendChar('/'); + + sb.appendInt((*termFreqs)[i]); +} +sb.appendChar('}'); +return sb.toString(); +} + +int32_t SegmentTermVector::size() { +if ( terms == NULL ) + return 0; + +if ( termsLen == -1 ){ + termsLen=0; + while ( terms[termsLen] != 0 ) + termsLen++; +} +return termsLen; +} + +const TCHAR** SegmentTermVector::getTerms() { + return (const TCHAR**)terms; +} + +const Array<int32_t>* SegmentTermVector::getTermFrequencies() { + return termFreqs; +} + +int32_t SegmentTermVector::binarySearch(TCHAR** a, const int32_t arraylen, const TCHAR* key) const +{ + int32_t low = 0; + int32_t hi = arraylen - 1; + int32_t mid = 0; + while (low <= hi) + { + mid = (low + hi) >> 1; + + int32_t c = _tcscmp(a[mid],key); + if (c==0) + return mid; + else if (c > 0) + hi = mid - 1; + else // This gets the insertion point right on the last loop. + low = ++mid; + } + return -mid - 1; +} + +int32_t SegmentTermVector::indexOf(const TCHAR* termText) { + if(terms == NULL) + return -1; + int32_t res = binarySearch(terms, size(), termText); + return res >= 0 ? res : -1; +} + +void SegmentTermVector::indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret) { + // TODO: there must be a more efficient way of doing this. + // At least, we could advance the lower bound of the terms array + // as we find valid indexes. Also, it might be possible to leverage + // this even more by starting in the middle of the termNumbers array + // and thus dividing the terms array maybe in half with each found index. + ret.length = len; + ret.values = _CL_NEWARRAY(int32_t,len); + for (int32_t i=0; i<len; ++i) { + ret.values[i] = indexOf(termNumbers[start+ i]); + } +} + + + + +SegmentTermPositionVector::SegmentTermPositionVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs, Array< Array<int32_t> >* positions, Array< Array<TermVectorOffsetInfo> >* offsets): + SegmentTermVector(field,terms,termFreqs) +{ + this->offsets = offsets; + this->positions = positions; +} + +SegmentTermPositionVector::~SegmentTermPositionVector(){ + if ( offsets ){ + for (size_t i=0;i<offsets->length;i++){ + if ( offsets->values != NULL ){ + Array<TermVectorOffsetInfo>& offs = offsets->values[i]; + for ( size_t j=0;j<offs.length;j++ ){ + _CLDELETE_ARRAY(offs.values); + } + } + } + _CLDELETE_ARRAY(offsets->values); + _CLDELETE(offsets); + } + if ( positions ){ + for (size_t i=0;i<positions->length;i++){ + if ( positions->values != NULL ){ + Array<int32_t>& pos = positions->values[i]; + for ( size_t j=0;j<pos.length;j++ ){ + _CLDELETE_ARRAY(pos.values); + } + } + } + _CLDELETE_ARRAY(positions->values); + _CLDELETE(positions); + } +} + +TermPositionVector* SegmentTermPositionVector::__asTermPositionVector(){ + return this; +} +/** +* Returns an array of TermVectorOffsetInfo in which the term is found. +* +* @param index The position in the array to get the offsets from +* @return An array of TermVectorOffsetInfo objects or the empty list +* @see org.apache.lucene.analysis.Token +*/ +Array<TermVectorOffsetInfo>* SegmentTermPositionVector::getOffsets(int32_t index) { + if(offsets == NULL) + return NULL; + if (index >=0 && index < offsets->length) + return &offsets->values[index]; + else + return &TermVectorOffsetInfo::EMPTY_OFFSET_INFO; +} + +/** +* Returns an array of positions in which the term is found. +* Terms are identified by the index at which its number appears in the +* term String array obtained from the <code>indexOf</code> method. +*/ +Array<int32_t>* SegmentTermPositionVector::getTermPositions(int32_t index) { + if(positions == NULL) + return NULL; + + if (index >=0 && index < positions->length) + return &positions->values[index]; + else + return &EMPTY_TERM_POS; +} +CL_NS_END + |