From commits-return-1138-archive-asf-public=cust-asf.ponee.io@ponymail.incubator.apache.org Wed Oct 10 01:20:46 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id E6377180668 for ; Wed, 10 Oct 2018 01:20:45 +0200 (CEST) Received: (qmail 81291 invoked by uid 500); 9 Oct 2018 23:20:45 -0000 Mailing-List: contact commits-help@ponymail.incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ponymail.incubator.apache.org Delivered-To: mailing list commits@ponymail.incubator.apache.org Received: (qmail 81282 invoked by uid 99); 9 Oct 2018 23:20:45 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 09 Oct 2018 23:20:45 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 65E6285065; Tue, 9 Oct 2018 23:20:44 +0000 (UTC) Date: Tue, 09 Oct 2018 23:20:44 +0000 To: "commits@ponymail.apache.org" Subject: [incubator-ponymail] branch master updated: Bug: cannot download more than 10K mails to a mbox file MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <153912724436.12602.4851432514386290328@gitbox.apache.org> From: sebb@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: incubator-ponymail X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 557920c07620c786fa18d026e58396325f40c965 X-Git-Newrev: e6e5d80caa509a803e91488b52c2aced87c97c9f X-Git-Rev: e6e5d80caa509a803e91488b52c2aced87c97c9f X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. sebb pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git The following commit(s) were added to refs/heads/master by this push: new e6e5d80 Bug: cannot download more than 10K mails to a mbox file e6e5d80 is described below commit e6e5d80caa509a803e91488b52c2aced87c97c9f Author: Sebb AuthorDate: Wed Oct 10 00:20:42 2018 +0100 Bug: cannot download more than 10K mails to a mbox file This fixes #475 --- CHANGELOG.md | 1 + site/api/mbox.lua | 166 +++++++++++++++++++++++++++++++++++------------------- 2 files changed, 110 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 658d174..02860be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ ## Changes in 0.11-SNAPSHOT +- Bug: cannot download more than 10K mails to a mbox file (#475) - Bug: no need to sort after scroll (#477) - Enh: Ensure non-printable chars are not lost in source and mbox output (#476) - Enh: display buttons even if no mails are found in a month (#470) diff --git a/site/api/mbox.lua b/site/api/mbox.lua index 780f242..2e94f42 100644 --- a/site/api/mbox.lua +++ b/site/api/mbox.lua @@ -53,6 +53,37 @@ local function getFromLine(r, source) return "From " .. replyTo .. " " .. timeStamp end +local function writeMbox(r, docs) + -- for each email, get the actual source of it to plop into the mbox file + for k, v in pairs(docs.hits.hits) do + v = v._source + local doc = elastic.get('mbox_source', v.mid) + if doc and doc.source then + local checkFirst -- should we check the first line? + if not doc.source:match('^From ') then -- only add the header if there is none + r:puts(getFromLine(r, doc.source)) + r:puts("\n") + checkFirst=true + else + checkFirst=false + end + + -- pick out individual lines (including last which may not have EOL) + -- it's tricky to add the prefix to the output unless the From is at the start of a line + -- so it's easier to just skip the first match if necessary + for line in doc.source:gmatch("[^\r\n]*\r?\n?") do + -- check if 'From ' needs to be escaped + if checkFirst and line:match("^From ") then r:puts(">") end + checkFirst=true + -- TODO consider whether to optionally prefix '>From ', '^>>From ' etc. + -- If so, just change the RE to "^>*From " + r:write(line) -- original line + end + r:puts("\n") + end + end +end + function handle(r) cross.contentType(r, "application/mbox") local get = r:parseargs() @@ -77,75 +108,96 @@ function handle(r) if r.headers_out then r.headers_out['Content-Disposition'] = ("attachment; filename=%s_%04d-%02d.mbox"):format(flid,y,m) end - - -- fetch all results from the list (up to 10k results), make sure to get the 'private' element - local docs = elastic.raw { - _source = {'mid','private'}, + + local DATERANGE = { + range = { + date = { + gte = ("%04d/%02d/%02d 00:00:00"):format(y,m,1), + lte = ("%04d/%02d/%02d 23:59:59"):format(y,m,d) + } + } + } + + local LIST = { + term = { + list_raw = lid + } + } + + -- Pre-process the list to find its size and whether there are any private mails + local squery = { query = { bool = { must = { - { - range = { - date = { - gte = ("%04d/%02d/%02d 00:00:00"):format(y,m,1), - lte = ("%04d/%02d/%02d 23:59:59"):format(y,m,d) - } - } - }, - { - term = { - list_raw = lid - } - } + DATERANGE, + LIST } } }, - sort = { + size = 0, -- no data wanted this time + aggs = { + privacy = { + terms = { + field = "private" + } + } + } + } + + -- find list details + local docs = elastic.raw(squery) + local total_docs = docs.hits.total + + local fetchPrivate = false -- should we try to fetch private messages? + for _, privacy in pairs(docs.aggregations.privacy.buckets) do + -- do we have a private message? + if privacy.key_as_string == "true" and privacy.doc_count > 0 then + -- if so, are we allowed access? + fetchPrivate = aaa.canAccessList(r, lid, user.get(r)) + break + end + end + + -- Now set up the data query + local MUST + if fetchPrivate then + MUST = { + DATERANGE, + LIST + } + else -- either there are no private messages or we don't have access + MUST = { + DATERANGE, + LIST, { - epoch = { - order = "asc" + term = { + private = false } - } + } + } + end + + -- create the actual query + local squery = { + _source = {'mid'}, + query = { + bool = { + must = MUST + } }, - size = 10000 + size = elastic.MAX_RESULT_WINDOW } - local account = user.get(r) - local listAccessible = nil -- not yet initialised - -- for each email, get the actual source of it to plop into the mbox file - for k, v in pairs(docs.hits.hits) do - v = v._source - -- aaa.rights() can be expensive, so only do it once per download - if v.private and listAccessible == nil then - -- we are dealing with a single list here so only need to check once - listAccessible = aaa.canAccessList(r, lid, account) - end - if listAccessible or not v.private then - local doc = elastic.get('mbox_source', v.mid) - if doc and doc.source then - local checkFirst -- should we check the first line? - if not doc.source:match('^From ') then -- only add the header if there is none - r:puts(getFromLine(r, doc.source)) - r:puts("\n") - checkFirst=true - else - checkFirst=false - end - - -- pick out individual lines (including last which may not have EOL) - -- it's tricky to add the prefix to the output unless the From is at the start of a line - -- so it's easier to just skip the first match if necessary - for line in doc.source:gmatch("[^\r\n]*\r?\n?") do - -- check if 'From ' needs to be escaped - if checkFirst and line:match("^From ") then r:puts(">") end - checkFirst=true - -- TODO consider whether to optionally prefix '>From ', '^>>From ' etc. - -- If so, just change the RE to "^>*From " - r:write(line) -- original line - end - r:puts("\n") - end + if total_docs > elastic.MAX_RESULT_WINDOW then + local docs, sid = elastic.scroll(squery) + while docs and docs.hits and docs.hits.hits and #docs.hits.hits > 0 do -- scroll as long as we get new results + writeMbox(r, docs) + docs, sid = elastic.scroll(sid) end + elastic.clear_scroll(sid) -- we're done with the sid, release it + else + local docs = elastic.raw(squery) + writeMbox(r, docs) end else cross.contentType(r, "text/plain")