summary | refs | log | tree | commit | diff | stats
path: root/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp')
-rw-r--r-- 3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp 188
1 files changed, 188 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp b/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp
new file mode 100644
index 000000000..5e9ac3c3b
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp
@@ -0,0 +1,188 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "TermVector.h"
+#include "CLucene/util/StringBuffer.h"
+
+CL_NS_USE(util)
+CL_NS_DEF(index)
+
+Array<int32_t> SegmentTermPositionVector::EMPTY_TERM_POS; // Shared empty positions array; getTermPositions() returns its address for an out-of-range index.
+
+/**
+* Builds the term vector of a single field.
+*
+* @param field     field name; a private copy is made with STRDUP_TtoT
+* @param terms     term strings, stored as-is and released by the destructor;
+*                  size() expects the array to end with a NULL entry
+* @param termFreqs per-term frequencies, stored as-is and released by the
+*                  destructor
+*/
+SegmentTermVector::SegmentTermVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs)
+    : field(STRDUP_TtoT(field)),
+      terms(terms),
+      termsLen(-1),          // -1 marks "not counted yet"; size() fills it in on first use
+      termFreqs(termFreqs)
+{
+}
+
+/**
+* Releases everything the constructor stored: the copied field name, the
+* term strings and the frequency array (both its values and the Array itself).
+*/
+SegmentTermVector::~SegmentTermVector(){
+    _CLDELETE_CARRAY(field);
+    _CLDELETE_CARRAY_ALL(terms);
+
+    // termFreqs has to be checked before termFreqs->values is touched. The
+    // rest of the class already tolerates a NULL terms array, so a NULL
+    // frequency array must not crash the destructor either.
+    if ( termFreqs != NULL ){
+        _CLDELETE_ARRAY(termFreqs->values);
+        _CLDELETE(termFreqs);
+    }
+}
+/**
+* Plain term vectors carry no position/offset data, so there is no
+* TermPositionVector view to hand out.
+*
+* @return always NULL
+*/
+TermPositionVector* SegmentTermVector::__asTermPositionVector()
+{
+    return NULL;
+}
+
+/**
+* @return the field name copied by the constructor; the pointer stays owned
+*         by this object and is released in the destructor
+*/
+const TCHAR* SegmentTermVector::getField()
+{
+    return field;
+}
+
+/**
+* Renders the vector as "{field: term/freq, term/freq, ...}".
+*
+* @return the buffer produced by StringBuffer::toString(); presumably a newly
+*         allocated string the caller must free - confirm against StringBuffer
+*/
+TCHAR* SegmentTermVector::toString() const{
+    StringBuffer sb;
+    sb.appendChar('{');
+    sb.append(field);
+    sb.append(_T(": "));
+
+    if ( terms != NULL ){
+        // terms is walked up to its NULL entry. The index must advance on
+        // every pass; without the increment the loop never terminates and
+        // keeps appending the first term.
+        for ( int32_t i=0; terms[i] != NULL; ++i ){
+            if (i>0)
+                sb.append(_T(", "));
+            sb.append(terms[i]);
+            sb.appendChar('/');
+
+            sb.appendInt((*termFreqs)[i]);
+        }
+    }
+    sb.appendChar('}');
+    return sb.toString();
+}
+
+/**
+* Number of terms in the vector.
+*
+* The count is not passed to the constructor; it is obtained on the first
+* call by scanning terms up to its terminating zero entry and is then cached
+* in termsLen.
+*
+* @return 0 when there is no terms array, otherwise the number of entries
+*         before the terminator
+*/
+int32_t SegmentTermVector::size() {
+    if ( terms == NULL )
+        return 0;
+
+    if ( termsLen != -1 )
+        return termsLen;    // counted on an earlier call
+
+    int32_t counted = 0;
+    while ( terms[counted] != 0 )
+        ++counted;
+
+    termsLen = counted;
+    return termsLen;
+}
+
+/**
+* @return the internal term array viewed as read-only strings; nothing is
+*         copied, the array stays owned by this object
+*/
+const TCHAR** SegmentTermVector::getTerms()
+{
+    return const_cast<const TCHAR**>(terms);
+}
+
+/**
+* @return the frequency array given to the constructor; nothing is copied,
+*         the array stays owned by this object
+*/
+const Array<int32_t>* SegmentTermVector::getTermFrequencies()
+{
+    return termFreqs;
+}
+
+/**
+* Binary search for key among the first arraylen entries of a, comparing
+* with _tcscmp. The result is only meaningful when a is sorted in _tcscmp
+* order.
+*
+* @param a        array of strings to search
+* @param arraylen number of entries of a to consider
+* @param key      string to look for
+* @return the index of a matching entry, or (-(insertion point) - 1) when
+*         key is absent, which is always negative
+*/
+int32_t SegmentTermVector::binarySearch(TCHAR** a, const int32_t arraylen, const TCHAR* key) const
+{
+    int32_t first = 0;
+    int32_t last = arraylen - 1;
+    int32_t probe = 0;
+
+    while (first <= last)
+    {
+        probe = (first + last) >> 1;
+
+        const int32_t order = _tcscmp(a[probe],key);
+        if (order == 0)
+            return probe;
+
+        if (order > 0) {
+            // a[probe] sorts after key: continue in the lower half.
+            last = probe - 1;
+        } else {
+            // a[probe] sorts before key: continue above it. probe itself is
+            // bumped so that it holds the insertion point if the loop ends here.
+            ++probe;
+            first = probe;
+        }
+    }
+    return -probe - 1;
+}
+
+/**
+* Looks up a term with binarySearch over the stored terms.
+*
+* @param termText term to find
+* @return the term's index in the terms array, or -1 when it is absent or
+*         there is no terms array
+*/
+int32_t SegmentTermVector::indexOf(const TCHAR* termText) {
+    if ( terms == NULL )
+        return -1;
+
+    const int32_t found = binarySearch(terms, size(), termText);
+    if ( found < 0 )
+        return -1;      // collapse the encoded insertion point to "not found"
+    return found;
+}
+
+/**
+* Resolves len terms, starting at termNumbers[start], to their indexes.
+*
+* @param termNumbers terms to look up
+* @param start       index of the first term in termNumbers to resolve
+* @param len         number of terms to resolve
+* @param ret         receives a newly allocated array of len results, each
+*                    being what indexOf() returns for the matching term; any
+*                    previous ret.values pointer is overwritten, not freed
+*/
+void SegmentTermVector::indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret) {
+    // TODO: there must be a more efficient way of doing this.
+    // At least, we could advance the lower bound of the terms array
+    // as we find valid indexes. Also, it might be possible to leverage
+    // this even more by starting in the middle of the termNumbers array
+    // and thus dividing the terms array maybe in half with each found index.
+    ret.length = len;
+    ret.values = _CL_NEWARRAY(int32_t,len);
+
+    int32_t n = 0;
+    while ( n < len ) {
+        ret.values[n] = indexOf(termNumbers[start + n]);
+        ++n;
+    }
+}
+
+
+
+
+/**
+* Builds a term vector that additionally carries per-term positions and
+* offsets.
+*
+* @param field     forwarded to SegmentTermVector
+* @param terms     forwarded to SegmentTermVector
+* @param termFreqs forwarded to SegmentTermVector
+* @param positions per-term position arrays, may be NULL; stored as-is and
+*                  released by the destructor
+* @param offsets   per-term offset arrays, may be NULL; stored as-is and
+*                  released by the destructor
+*/
+SegmentTermPositionVector::SegmentTermPositionVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs, Array< Array<int32_t> >* positions, Array< Array<TermVectorOffsetInfo> >* offsets)
+    : SegmentTermVector(field,terms,termFreqs)
+{
+    this->positions = positions;
+    this->offsets = offsets;
+}
+
+/**
+* Releases the offsets and positions tables: first the inner array of every
+* term, then the outer array, then the Array object itself. Either table may
+* be NULL.
+*/
+SegmentTermPositionVector::~SegmentTermPositionVector(){
+    if ( offsets != NULL ){
+        // The outer values pointer does not change while iterating, so it
+        // is tested once. Each inner array is released exactly once, no
+        // matter how many elements it holds (including none).
+        if ( offsets->values != NULL ){
+            for ( size_t i=0; i<offsets->length; i++ ){
+                _CLDELETE_ARRAY(offsets->values[i].values);
+            }
+        }
+        _CLDELETE_ARRAY(offsets->values);
+        _CLDELETE(offsets);
+    }
+    if ( positions != NULL ){
+        // Same scheme as for offsets above.
+        if ( positions->values != NULL ){
+            for ( size_t i=0; i<positions->length; i++ ){
+                _CLDELETE_ARRAY(positions->values[i].values);
+            }
+        }
+        _CLDELETE_ARRAY(positions->values);
+        _CLDELETE(positions);
+    }
+}
+
+/**
+* @return this object, viewed through its TermPositionVector interface
+*/
+TermPositionVector* SegmentTermPositionVector::__asTermPositionVector()
+{
+    return this;
+}
+/**
+* Returns the TermVectorOffsetInfo array recorded for one term.
+*
+* @param index The position of the term in the terms array
+* @return NULL when no offsets were stored at all, the shared
+*         TermVectorOffsetInfo::EMPTY_OFFSET_INFO array when index is out of
+*         range, otherwise a pointer to the term's offsets (owned by this
+*         object)
+* @see org.apache.lucene.analysis.Token
+*/
+Array<TermVectorOffsetInfo>* SegmentTermPositionVector::getOffsets(int32_t index) {
+    if ( offsets == NULL )
+        return NULL;
+
+    // Out-of-range requests get the shared empty array.
+    if ( index < 0 || index >= offsets->length )
+        return &TermVectorOffsetInfo::EMPTY_OFFSET_INFO;
+
+    return &offsets->values[index];
+}
+
+/**
+* Returns the positions recorded for one term. A term is identified by its
+* index in the term array, i.e. the value returned by <code>indexOf</code>.
+*
+* @param index The position of the term in the terms array
+* @return NULL when no positions were stored at all, the shared
+*         EMPTY_TERM_POS array when index is out of range, otherwise a
+*         pointer to the term's positions (owned by this object)
+*/
+Array<int32_t>* SegmentTermPositionVector::getTermPositions(int32_t index) {
+    if ( positions == NULL )
+        return NULL;
+
+    // Out-of-range requests get the shared empty array.
+    if ( index < 0 || index >= positions->length )
+        return &EMPTY_TERM_POS;
+
+    return &positions->values[index];
+}
+CL_NS_END
+