ponymail-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s...@apache.org
Subject incubator-ponymail git commit: scroll/scan is not supported in ES 5.x
Date Mon, 16 Jan 2017 17:00:43 GMT
Repository: incubator-ponymail
Updated Branches:
  refs/heads/master aa8192c26 -> 2bf51392a


scroll/scan is not supported in ES 5.x

This fixes #344

Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/2bf51392
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/2bf51392
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/2bf51392

Branch: refs/heads/master
Commit: 2bf51392a698fb3ab4a6e00a71e387ccbcbc89ee
Parents: aa8192c
Author: Sebb <sebb@apache.org>
Authored: Mon Jan 16 17:00:29 2017 +0000
Committer: Sebb <sebb@apache.org>
Committed: Mon Jan 16 17:00:29 2017 +0000

----------------------------------------------------------------------
 CHANGELOG.md             |  1 +
 site/api/lib/elastic.lua | 81 ++++++++++++++++++++++++-------------------
 site/api/pminfo.lua      | 24 ++++++-------
 site/api/stats.lua       | 24 ++++++-------
 4 files changed, 68 insertions(+), 62 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/2bf51392/CHANGELOG.md
----------------------------------------------------------------------
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e86af1e..09cae3a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -123,6 +123,7 @@
 - DRY: move leapYear and end of month calculations to utils
 - indicate which months are outside the archive span for a list (#340)
 - default to medium ID generator (#343)
+- scroll/scan is not supported in ES 5.x (#344)
 
 ## CHANGES in 0.9b:
 

http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/2bf51392/site/api/lib/elastic.lua
----------------------------------------------------------------------
diff --git a/site/api/lib/elastic.lua b/site/api/lib/elastic.lua
index 95e8bef..f0c14ac 100644
--- a/site/api/lib/elastic.lua
+++ b/site/api/lib/elastic.lua
@@ -177,48 +177,57 @@ local function raw(query, doctype)
     return json or {}, url
 end
 
--- communicate between scan and scroll
-local scanHasBody = {}
+-- communicate between scroll calls
+local queryHasBody = {}
 
--- Raw query with scroll/scan
-local function scan(query, doctype)
-    doctype = doctype or default_doc
-    local url = config.es_url .. doctype .. "/_search?search_type=scan&scroll=1m"
-    local json = performRequest(url, query)
-    if json and json._scroll_id then
+--[[
+Raw query with scroll
+Parameters:
+   sidOrQuery - if table, then this is the initial query, otherwise it is the sid
+   doctype - optional document type, only relevant for initial query
+
+Returns:
+   json, sid
+]]
+local function scroll(sidOrQuery, doctype)
+    local json
+    local hasBody = false
+    if type(sidOrQuery) == 'table' then
+        local query = sidOrQuery
+        doctype = doctype or default_doc
         if doctype == "mbox" then
             -- Check if the query returns the body attribute
-           if contains(query._source, 'body') then
-               -- save the flag for the scroll function (don't bother saving if false)
-               scanHasBody[json._scroll_id] = true
-           end
+            if contains(query._source, 'body') then
+                hasBody = true
+            end
         end
-        return json._scroll_id
+        -- ensure we sort by _doc
+        query.sort = { '_doc' }
+        local url = config.es_url .. doctype .. "/_search?scroll=1m"
+        -- start off the scroll
+        json = performRequest(url, query)
+    else
+        local sid = sidOrQuery
+        hasBody = queryHasBody[sid]
+        queryHasBody[sid] = nil -- drop old entry (sid may change)
+        -- We have to do some gsubbing here, as ES expects us to be at the root of the ES URL
+        -- But in case we're being proxied, let's just cut off the last part of the URL
+        local url = config.es_url:gsub("[^/]+/?$", "") .. "/_search/scroll?scroll=1m&scroll_id=" .. sid
+        -- continue the scroll
+        json = performRequest(url)
     end
-    return nil
-end
-
-local function scroll(sid)
-    -- We have to do some gsubbing here, as ES expects us to be at the root of the ES URL
-    -- But in case we're being proxied, let's just cut off the last part of the URL
-    local url = config.es_url:gsub("[^/]+/?$", "") .. "/_search/scroll?scroll=1m&scroll_id=" .. sid
-    local json = performRequest(url)
-    if json and json._scroll_id then
-        if scanHasBody[sid] then
-            -- propagate the setting for the next call
-            scanHasBody[sid] = nil -- no longer needed (must be done first in case sid has not changed)
-            scanHasBody[json._scroll_id] = true
-            local dhh = json.hits.hits
-            for k = 1, #dhh do
-                local v = dhh[k]._source
-                if v.body == JSON.null then
-                    v.body = ''
-                end
+    if hasBody then
+        -- propagate the setting for the next call
+        queryHasBody[json._scroll_id] = true
+        local dhh = json.hits.hits
+        for k = 1, #dhh do
+            local v = dhh[k]._source
+            if v.body == JSON.null then
+                v.body = ''
             end
         end
-        return json, json._scroll_id
     end
-    return nil
+    return json, json._scroll_id
 end
 
 -- delete a scroll id after use
@@ -257,6 +266,9 @@ end
 
 -- module defs
 return {
+    -- maximum results that can be returned by a query
+    -- above this number, must use scrolling or search_after (ES 5.x)
+    MAX_RESULT_WINDOW = 10000,
     find = getHits,
     findFast = getHeaders,
     findFastReverse = getHeadersReverse,
@@ -265,7 +277,6 @@ return {
     index = index,
     default = setDefault,
     update = update,
-    scan = scan,
     scroll = scroll,
     scrollrelease = deleteScrollId
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/2bf51392/site/api/pminfo.lua
----------------------------------------------------------------------
diff --git a/site/api/pminfo.lua b/site/api/pminfo.lua
index 47fa0bf..18d27b2 100644
--- a/site/api/pminfo.lua
+++ b/site/api/pminfo.lua
@@ -27,8 +27,6 @@ function handle(r)
     local now = r:clock()
     local tnow = now
     local DD = 14
-    local MAXRESULTS = 10000 -- max value for from + size in a single query
-
     
     local NOWISH = math.floor(os.time() / 1800)
     local PMINFO_CACHE_KEY = "pminfo_cache_" .. r.hostname .. "-" .. NOWISH
@@ -125,23 +123,21 @@ function handle(r)
                 }
             }
         },
-        size = MAXRESULTS
+        size = elastic.MAX_RESULT_WINDOW
     }
     local hits = {}
     -- check if we need to use scrolling
-    if total_docs > MAXRESULTS then
-        local sid = elastic.scan(squery)
-        if sid then
-            doc, sid = elastic.scroll(sid)
-            while doc and doc.hits and doc.hits.hits and #doc.hits.hits > 0 do -- scroll as long as we get new results
-                for k, v in pairs(doc.hits.hits) do
-                    table.insert(hits, v)
-                end
-                doc, sid = elastic.scroll(sid)
+    if total_docs > elastic.MAX_RESULT_WINDOW then
+        local sid
+        doc, sid = elastic.scroll(squery) -- init the scroll
+        while doc and doc.hits and doc.hits.hits and #doc.hits.hits > 0 do -- scroll as long as we get new results
+            for k, v in pairs(doc.hits.hits) do
+                table.insert(hits, v)
             end
-            elastic.scrollrelease(sid) -- we're done with the sid, release it
+            doc, sid = elastic.scroll(sid)
         end
-        -- scroll/scan ignores the sort order!
+        elastic.scrollrelease(sid) -- we're done with the sid, release it
+        -- scroll always sorts by _doc so we need to fix that
         table.sort (hits, function (k1, k2) return k1._source.epoch > k2._source.epoch end )
     else
         local doc = elastic.raw(squery)

http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/2bf51392/site/api/stats.lua
----------------------------------------------------------------------
diff --git a/site/api/stats.lua b/site/api/stats.lua
index 135ded6..9df6f90 100644
--- a/site/api/stats.lua
+++ b/site/api/stats.lua
@@ -478,19 +478,19 @@ function handle(r)
             size = maxresults
         }
     
-    -- If max results limit is beyond 10k, we have to do a scan/scroll to fetch it.
-    if maxresults > 10000 then
-        local sid = elastic.scan(squery) -- get scroll ID
-        if sid then -- if results
-            local js, sid = elastic.scroll(sid)
-            while js and js.hits and js.hits.hits and #js.hits.hits > 0 do -- scroll as long as we get new results
-                for k, v in pairs(js.hits.hits) do
-                    table.insert(dhh, v)
-                end
-                js, sid = elastic.scroll(sid)
+    -- If max results limit is beyond the limit, we have to do a scroll to fetch it.
+    if maxresults > elastic.MAX_RESULT_WINDOW then
+        squery.size = elastic.MAX_RESULT_WINDOW -- limit the maximum batch sizes
+        local js, sid = elastic.scroll(squery)
+        while js and js.hits and js.hits.hits and #js.hits.hits > 0 do -- scroll as long as we get new results
+            for k, v in pairs(js.hits.hits) do
+                table.insert(dhh, v)
             end
-            elastic.scrollrelease(sid) -- we're done with the sid, release it
+            js, sid = elastic.scroll(sid)
         end
+        elastic.scrollrelease(sid) -- we're done with the sid, release it
+		    -- ES scroll uses _doc order for efficiency; we need to sort here
+    		table.sort (dhh, function (k1, k2) return k1._source.epoch > k2._source.epoch end )
     -- otherwise, we can just do a standard raw query
     else
         local doc = elastic.raw(squery)
@@ -503,8 +503,6 @@ function handle(r)
     table.insert(t, r:clock() - tnow)
     tnow = r:clock()
     
-    -- Sometimes ES screws up, so let's sort for it!
-    table.sort (dhh, function (k1, k2) return k1._source.epoch > k2._source.epoch end )
     
     for k = #dhh, 1, -1 do
         local v = dhh[k]


Mime
View raw message