summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp
diff options
context:
space:
mode:
authorQt by Nokia <qt-info@nokia.com>2011-04-27 12:05:43 +0200
committeraxis <qt-info@nokia.com>2011-04-27 12:05:43 +0200
commit50123887ba0f33cf47520bee7c419d68742af2d1 (patch)
tree0eb8679b9e4e4370e59b44bfdcae616816e39aca /3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp
Initial import from the monolithic Qt.
This is the beginning of revision history for this module. If you want to look at revision history older than this, please refer to the Qt Git wiki for how to use Git history grafting. At the time of writing, this wiki is located here: http://qt.gitorious.org/qt/pages/GitIntroductionWithQt If you have already performed the grafting and you don't see any history beyond this commit, try running "git log" with the "--follow" argument. Branched from the monolithic repo, Qt master branch, at commit 896db169ea224deb96c59ce8af800d019de63f12
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp')
-rw-r--r--3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp188
1 files changed, 188 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp b/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp
new file mode 100644
index 000000000..5e9ac3c3b
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/index/SegmentTermVector.cpp
@@ -0,0 +1,188 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "TermVector.h"
+#include "CLucene/util/StringBuffer.h"
+
+CL_NS_USE(util)
+CL_NS_DEF(index)
+
+Array<int32_t> SegmentTermPositionVector::EMPTY_TERM_POS;
+
+SegmentTermVector::SegmentTermVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs) {
+ this->field = STRDUP_TtoT(field);
+ this->terms = terms;
+ this->termsLen = -1; //lazily get the size of the terms
+ this->termFreqs = termFreqs;
+}
+
+SegmentTermVector::~SegmentTermVector(){
+ _CLDELETE_CARRAY(field);
+ _CLDELETE_CARRAY_ALL(terms);
+
+ _CLDELETE_ARRAY(termFreqs->values);
+ _CLDELETE(termFreqs);
+}
+TermPositionVector* SegmentTermVector::__asTermPositionVector(){
+ return NULL;
+}
+
+const TCHAR* SegmentTermVector::getField() {
+return field;
+}
+
+TCHAR* SegmentTermVector::toString() const{
+StringBuffer sb;
+sb.appendChar('{');
+sb.append(field);
+sb.append(_T(": "));
+
+int32_t i=0;
+while ( terms && terms[i] != NULL ){
+ if (i>0)
+ sb.append(_T(", "));
+ sb.append(terms[i]);
+ sb.appendChar('/');
+
+ sb.appendInt((*termFreqs)[i]);
+}
+sb.appendChar('}');
+return sb.toString();
+}
+
+int32_t SegmentTermVector::size() {
+if ( terms == NULL )
+ return 0;
+
+if ( termsLen == -1 ){
+ termsLen=0;
+ while ( terms[termsLen] != 0 )
+ termsLen++;
+}
+return termsLen;
+}
+
+const TCHAR** SegmentTermVector::getTerms() {
+ return (const TCHAR**)terms;
+}
+
+const Array<int32_t>* SegmentTermVector::getTermFrequencies() {
+ return termFreqs;
+}
+
+int32_t SegmentTermVector::binarySearch(TCHAR** a, const int32_t arraylen, const TCHAR* key) const
+{
+ int32_t low = 0;
+ int32_t hi = arraylen - 1;
+ int32_t mid = 0;
+ while (low <= hi)
+ {
+ mid = (low + hi) >> 1;
+
+ int32_t c = _tcscmp(a[mid],key);
+ if (c==0)
+ return mid;
+ else if (c > 0)
+ hi = mid - 1;
+ else // This gets the insertion point right on the last loop.
+ low = ++mid;
+ }
+ return -mid - 1;
+}
+
+int32_t SegmentTermVector::indexOf(const TCHAR* termText) {
+ if(terms == NULL)
+ return -1;
+ int32_t res = binarySearch(terms, size(), termText);
+ return res >= 0 ? res : -1;
+}
+
+void SegmentTermVector::indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret) {
+ // TODO: there must be a more efficient way of doing this.
+ // At least, we could advance the lower bound of the terms array
+ // as we find valid indexes. Also, it might be possible to leverage
+ // this even more by starting in the middle of the termNumbers array
+ // and thus dividing the terms array maybe in half with each found index.
+ ret.length = len;
+ ret.values = _CL_NEWARRAY(int32_t,len);
+ for (int32_t i=0; i<len; ++i) {
+ ret.values[i] = indexOf(termNumbers[start+ i]);
+ }
+}
+
+
+
+
+SegmentTermPositionVector::SegmentTermPositionVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs, Array< Array<int32_t> >* positions, Array< Array<TermVectorOffsetInfo> >* offsets):
+ SegmentTermVector(field,terms,termFreqs)
+{
+ this->offsets = offsets;
+ this->positions = positions;
+}
+
+SegmentTermPositionVector::~SegmentTermPositionVector(){
+ if ( offsets ){
+ for (size_t i=0;i<offsets->length;i++){
+ if ( offsets->values != NULL ){
+ Array<TermVectorOffsetInfo>& offs = offsets->values[i];
+ for ( size_t j=0;j<offs.length;j++ ){
+ _CLDELETE_ARRAY(offs.values);
+ }
+ }
+ }
+ _CLDELETE_ARRAY(offsets->values);
+ _CLDELETE(offsets);
+ }
+ if ( positions ){
+ for (size_t i=0;i<positions->length;i++){
+ if ( positions->values != NULL ){
+ Array<int32_t>& pos = positions->values[i];
+ for ( size_t j=0;j<pos.length;j++ ){
+ _CLDELETE_ARRAY(pos.values);
+ }
+ }
+ }
+ _CLDELETE_ARRAY(positions->values);
+ _CLDELETE(positions);
+ }
+}
+
+TermPositionVector* SegmentTermPositionVector::__asTermPositionVector(){
+ return this;
+}
+/**
+* Returns an array of TermVectorOffsetInfo in which the term is found.
+*
+* @param index The position in the array to get the offsets from
+* @return An array of TermVectorOffsetInfo objects or the empty list
+* @see org.apache.lucene.analysis.Token
+*/
+Array<TermVectorOffsetInfo>* SegmentTermPositionVector::getOffsets(int32_t index) {
+ if(offsets == NULL)
+ return NULL;
+ if (index >=0 && index < offsets->length)
+ return &offsets->values[index];
+ else
+ return &TermVectorOffsetInfo::EMPTY_OFFSET_INFO;
+}
+
+/**
+* Returns an array of positions in which the term is found.
+* Terms are identified by the index at which its number appears in the
+* term String array obtained from the <code>indexOf</code> method.
+*/
+Array<int32_t>* SegmentTermPositionVector::getTermPositions(int32_t index) {
+ if(positions == NULL)
+ return NULL;
+
+ if (index >=0 && index < positions->length)
+ return &positions->values[index];
+ else
+ return &EMPTY_TERM_POS;
+}
+CL_NS_END
+