couchdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Apache Wiki <wikidi...@apache.org>
Subject [Couchdb Wiki] Update of "FullTextIndexWithView" by DanielReverri
Date Sun, 20 Jul 2008 22:39:20 GMT
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Couchdb Wiki" for change notification.

The following page has been changed by DanielReverri:
http://wiki.apache.org/couchdb/FullTextIndexWithView

New page:
I wanted to throw this idea out there to see what people thought.

There has been a lot of discussion about integrating full text search into couch and possibly
implementing full text search in Erlang. Would it be worth investigating the use of CouchDB's
!MapReduce functionality to implement a full text indexer? I whipped together a short example
using views. It implements a simple white space tokenizer in !JavaScript, emits each token
with its doc id and position, and reduces each token to a list of doc ids and positions.

Here is the map function:

{{{
function(doc) 
{
    var tokenEmit = function(token) {
        emit([token.value,token.field], [this._id,token.position]);
    }
    
    var whiteSpaceAnalyzer = function(str, field) {
        // Returns tokens split by white space
        // token: { value: tokenString, position: [0,10] }
        var len = str.length;
        var tokenPositions = new Array();
        var startPosition = null;

        var isTokenChar = function(Char) {
            if (Char === ' ' || Char === '\t' || Char === '\n')
                return false;
            return true;
        }

        for(var i=0; i < len; i++)
        {
            if(startPosition == null)
            {
                if(isTokenChar(str[i]))
                {
                    // start of word
                    startPosition = i;
                    if( i+1 == len )
                    {
                        // end of string
                        tokenPositions[tokenPositions.length] = [startPosition, i+1];
                    }
                }
            }
            else
            {
                if(!isTokenChar(str[i]))
                {
                    // end of word
                    tokenPositions[tokenPositions.length] = [startPosition, i];
                    startPosition = null; // reset startPosition
                    continue;
                }
                
                if( i+1 == len )
                {
                    // end of string
                    tokenPositions[tokenPositions.length] = [startPosition, i+1];
                }
            }
        }

        var tokenMap = function(tokenPosition) {
            var token = this.str.substring(tokenPosition[0],tokenPosition[1]);
            return { value: token, field:this.field, position: tokenPosition };
        }
        
        return tokenPositions.map(tokenMap,{str:str,field:field});
    }
    
    var tokens;
    
    for (field in doc) {
        if (typeof(doc[field])=='string') {
            tokens = whiteSpaceAnalyzer(doc[field], field);
            tokens.map(tokenEmit, doc);
        }
    }
}
}}}

Here is the reduce function:

{{{
function(keys,values,combine)
{
    var result = new Array();
    var docHash = new Array();
    if(combine) 
    {
        for(var v in values)
        {
            var docObject = values[v][0];
            var docId = docObject["doc"];
            var positions = docObject["pos"];
            if(docHash[docId] == null)
            {
                docHash[docId]=new Array();
            }
            docHash[docId] = docHash[docId].concat(positions);
        }
        for(var i in docHash){
            result[result.length]={doc:i,pos:docHash[i]};
        }
    }
    else
    {
        for(var j in values)
        {
            var docId = values[j][0];
            var position = values[j][1];
            if(docHash[docId] == null)
            {
            docHash[docId]=new Array();
            }
            docHash[docId] = docHash[docId].concat([position]);
        }
        for(var i in docHash){
            result[result.length]={doc:i,pos:docHash[i]};
        }
    }
    return result;  
}
}}}

The key emitted from the view is {{{["token","field"]}}}. This allows terms to be searched
per field while also allowing the use of group_level=1 to combine the results of all fields.
Combining results of multiple fields currently eliminates the use of positions.

To reduce the amount of information passed during view generation the whiteSpaceAnalyzer function
can be moved to the main.js file.

Is this worth pursuing further?

Mime
View raw message