Return-Path: Delivered-To: apmail-couchdb-user-archive@www.apache.org Received: (qmail 66537 invoked from network); 6 May 2010 15:26:19 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 6 May 2010 15:26:19 -0000 Received: (qmail 26956 invoked by uid 500); 6 May 2010 15:26:18 -0000 Delivered-To: apmail-couchdb-user-archive@couchdb.apache.org Received: (qmail 26922 invoked by uid 500); 6 May 2010 15:26:18 -0000 Mailing-List: contact user-help@couchdb.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: user@couchdb.apache.org Delivered-To: mailing list user@couchdb.apache.org Received: (qmail 26914 invoked by uid 99); 6 May 2010 15:26:18 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 06 May 2010 15:26:18 +0000 X-ASF-Spam-Status: No, hits=-0.0 required=10.0 tests=SPF_PASS X-Spam-Check-By: apache.org Received-SPF: pass (nike.apache.org: local policy) Received: from [200.243.80.130] (HELO mail.loop.com.br) (200.243.80.130) by apache.org (qpsmtpd/0.29) with SMTP; Thu, 06 May 2010 15:26:09 +0000 Received: (qmail 31849 invoked by uid 64014); 6 May 2010 12:25:43 -0300 Received: from 172.17.2.106 (mike@loop.com.br@172.17.2.106) by intranet (envelope-from , uid 64011) with qmail-scanner-2.01st (clamdscan: 0.88/1633. spamassassin: 3.0.3. perlscan: 2.01st. Clear:RC:1(172.17.2.106):. Processed in 0.038324 secs); 06 May 2010 15:25:43 -0000 Received: from unknown (HELO ?172.17.2.106?) 
(mike@loop.com.br@172.17.2.106) by 172.17.3.17 with SMTP; 6 May 2010 12:25:43 -0300 Subject: Re: Having purge problems From: Mike Leddy To: user@couchdb.apache.org In-Reply-To: <1273066935.30663.30.camel@mike> References: <1272902160.30231.70.camel@mike> <24B6CFE1-4663-4CA4-B142-DA80C3B27D17@gmail.com> <1273066935.30663.30.camel@mike> Content-Type: text/plain; charset="UTF-8" Date: Thu, 06 May 2010 12:26:20 -0300 Message-ID: <1273159580.16381.63.camel@mike> Mime-Version: 1.0 X-Mailer: Evolution 2.28.3 Content-Transfer-Encoding: 7bit X-Virus-Checked: Checked by ClamAV on apache.org Just for the record, there was a problem with the patch which affects continuing compactions that fail for whatever reason. $ cat compact_deleted.patch --- couchdb-0.11.0/src/couchdb/couch_db.erl 2010-03-04 02:17:44.000000000 -0300 +++ couchdb-0.11.0.new/src/couchdb/couch_db.erl 2010-05-04 17:18:54.000000000 -0300 @@ -323,7 +323,7 @@ get_revs_limit(#db{revs_limit=Limit}) -> Limit. -set_revs_limit(#db{update_pid=Pid}=Db, Limit) when Limit > 0 -> +set_revs_limit(#db{update_pid=Pid}=Db, Limit) when Limit >= 0 -> check_is_admin(Db), gen_server:call(Pid, {set_revs_limit, Limit}, infinity); set_revs_limit(_Db, _Limit) -> --- couchdb-0.11.0/src/couchdb/couch_key_tree.erl 2009-11-21 10:43:43.000000000 -0300 +++ couchdb-0.11.0.new/src/couchdb/couch_key_tree.erl 2010-05-04 17:40:57.000000000 -0300 @@ -314,7 +314,7 @@ % flatten each branch in a tree into a tree path Paths = get_all_leafs_full(Trees), - Paths2 = [{Pos, lists:sublist(Path, Limit)} || {Pos, Path} <- Paths], + Paths2 = [{Pos, lists:sublist(Path, lists:max([Limit, 1]))} || {Pos, Path} <- Paths], % convert paths back to trees lists:foldl( --- couchdb-0.11.0/src/couchdb/couch_db_updater.erl 2010-02-22 12:20:53.000000000 -0300 +++ couchdb-0.11.0.new/src/couchdb/couch_db_updater.erl 2010-05-05 09:19:50.000000000 -0300 @@ -736,9 +736,16 @@ end, Tree). 
-copy_docs(Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) -> - Ids = [Id || #doc_info{id=Id} <- InfoBySeq], - LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids), +copy_docs(#db{revs_limit=Limit}=Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) -> + if Limit > 0 -> + Ids = [Id || #doc_info{id=Id} <- InfoBySeq], + LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids); + true -> + AllIds = [Id || #doc_info{id=Id} <- InfoBySeq], + BaseResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, AllIds), + LookupResults = [Filtered || {ok, #full_doc_info{deleted=false}}=Filtered <- BaseResults], + Ids = [Id || {ok, #full_doc_info{id=Id}} <- LookupResults] + end, % write out the attachments NewFullDocInfos0 = lists:map( Regards, Mike -copy_docs(Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) -> - Ids = [Id || #doc_info{id=Id} <- InfoBySeq], - LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids), +copy_docs(#db{revs_limit=Limit}=Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) -> + if Limit > 0 -> + Ids = [Id || #doc_info{id=Id} <- InfoBySeq], + LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids); + true -> + AllIds = [Id || #doc_info{id=Id} <- InfoBySeq], + BaseResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, AllIds), + LookupResults = [Filtered || {ok, #full_doc_info{deleted=false}}=Filtered <- BaseResults], + Ids = [Id || {ok, #full_doc_info{id=Id}} <- LookupResults] + end, % write out the attachments NewFullDocInfos0 = lists:map( On Wed, 2010-05-05 at 10:42 -0300, Mike Leddy wrote: > Hi, > > You are welcome. > > I had imagined trying some sort of database swapping but I never liked > the idea of switching where the app looks to the shadow/replacement and > doing it correctly while always available and replicating. > > I decided to bite the bullet and went for my desired solution. 
It > may be completely against the grain of good practice but I decided to > patch couchdb to be able to purge deleted data on compaction. > > Please note I am an Erlang/Couchdb newbie. > > First it seemed reasonable to use revs_limit to trigger deleted document > removal as I also want to clear out older revisions as well. I decided > that a revs_limit = 0 might be appropriate. > > Patching couch_db.erl to allow revs_limit = 0 gave me: > > --- couchdb-0.11.0/src/couchdb/couch_db.erl 2010-03-04 02:17:44.000000000 -0300 > +++ couchdb-0.11.0.new/src/couchdb/couch_db.erl 2010-05-04 17:18:54.000000000 -0300 > @@ -323,7 +323,7 @@ > get_revs_limit(#db{revs_limit=Limit}) -> > Limit. > > -set_revs_limit(#db{update_pid=Pid}=Db, Limit) when Limit > 0 -> > +set_revs_limit(#db{update_pid=Pid}=Db, Limit) when Limit >= 0 -> > check_is_admin(Db), > gen_server:call(Pid, {set_revs_limit, Limit}, infinity); > set_revs_limit(_Db, _Limit) -> > > Making sure that revs_limit = 0 doesn't wipe out the whole database: > > --- couchdb-0.11.0/src/couchdb/couch_key_tree.erl 2009-11-21 10:43:43.000000000 -0300 > +++ couchdb-0.11.0.new/src/couchdb/couch_key_tree.erl 2010-05-04 17:40:57.000000000 -0300 > @@ -314,7 +314,7 @@ > % flatten each branch in a tree into a tree path > Paths = get_all_leafs_full(Trees), > > - Paths2 = [{Pos, lists:sublist(Path, Limit)} || {Pos, Path} <- Paths], > + Paths2 = [{Pos, lists:sublist(Path, lists:max([Limit, 1]))} || {Pos, Path} <- Paths], > > % convert paths back to trees > lists:foldl( > > Now the trickier part, choosing a good place to filter out the deleted > docs.... After several failed attempts I chose this: > > --- couchdb-0.11.0/src/couchdb/couch_db_updater.erl 2010-02-22 12:20:53.000000000 -0300 > +++ couchdb-0.11.0.new/src/couchdb/couch_db_updater.erl 2010-05-05 09:19:50.000000000 -0300 > @@ -736,9 +736,16 @@ > end, Tree). 
> > > -copy_docs(Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) -> > - Ids = [Id || #doc_info{id=Id} <- InfoBySeq], > - LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids), > +copy_docs(#db{revs_limit=Limit}=Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) -> > + if Limit > 0 -> > + Ids = [Id || #doc_info{id=Id} <- InfoBySeq], > + LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids); > + true -> > + AllIds = [Id || #doc_info{id=Id} <- InfoBySeq], > + BaseResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, AllIds), > + LookupResults = [Filtered || {ok, #full_doc_info{deleted=false}}=Filtered <- BaseResults], > + Ids = [Id || #full_doc_info{id=Id} <- LookupResults] > + end, > > % write out the attachments > NewFullDocInfos0 = lists:map( > > Now I can simply: > > curl -X PUT 'localhost:5984/db/_revs_limit' -d '0' > > and then: > > curl -X POST 'localhost:5984/db/_compact' > > Hopefully I haven't broken anything - I am still running some tests but > it looks like I am able to do what I need and stay 100% available without > altering my application. > > I am not suggesting that anyone use this as it is but it might trigger > someone to incorporate something in couchdb that has a similar > functionality. > > Best regards, > > Mike > > > On Tue, 2010-05-04 at 09:09 -0700, J Chris Anderson wrote: > > On May 3, 2010, at 8:56 AM, Mike Leddy wrote: > > > > > Hi, > > > > > > I am currently on couchdb 0.11.0 using official debian packages with > > > erlang 1:13.b.4-dfsg-4 and I am having problems purging old documents. > > > > > > My database is constantly receiving new data and old data (more than > > > six weeks) is being deleted. I have been running like this for several > > > months and the overhead of old deleted document 'stubs' is becoming > > > relevant in day to day operations such as new replications, database > > > compaction etc. 
> > > > > > I decided that it would be best to purge the old deleted documents > > > so that the database would compact better and only contain relevant > > > ie. recent data. > > > > > > [What I would really like would be a compact that does not include > > > documents that match a filter function, then I could do this on each > > > node independently.] > > > > > > Unfortunately I am encountering problems purging the documents. I wrote > > > a script to process all the documents via _changes and purge the old > > > documents but I keep hitting documents that cannot be purged. > > > > > > > Thanks for the bug report. One way I've seen people accomplish this use case that doesn't involve purging, is by storing documents into a new database each week, and then throwing out old database files. > > > > Purging is really designed for removing secret data that was accidentally saved, more than for reclaiming space. Thanks for the bug report - hopefully it will be easy to fix. > > > > Chris > > > > > Here is the start of my changes feed: > > > > > > curl 'localhost:5984/iris/_changes?limit=5&since=0' > > > {"results":[ > > > {"seq":2,"id":"_design/admin","changes":[{"rev":"1-ea95c1898a2c779d664c1d1b71a24f33"}]}, > > > {"seq":22435808,"id":"1259540160F2016","changes":[{"rev":"2-7dcfd742f74c79286c3f3093595a83df"}],"deleted":true}, > > > {"seq":22435809,"id":"1259540640F2016","changes":[{"rev":"2-6bd122eb9f83c0838bc9875a1b73abaf"}],"deleted":true}, > > > {"seq":22435810,"id":"1259616780F2443","changes":[{"rev":"2-53e2311f5de7058fbfd55979816d3efc"}],"deleted":true}, > > > {"seq":22435811,"id":"1259616784F2443","changes":[{"rev":"2-caaff4cd1290f7807c2bcfeb6edc39e0"}],"deleted":true} > > > ], > > > "last_seq":22435811} > > > > > > This is a compacted copy of my main production database which is > > > already on seq 106280009. 
> > > > > > When I try to purge I get a badarity error: > > > > > > curl -X POST 'localhost:5984/iris/_purge' -d > > > '{"1259540160F2016":["2-7dcfd742f74c79286c3f3093595a83df"]}' > > > {"error":"{{badarity,{#Fun,\n > > > [{2,<<124,230,79,165,16,199,208,127,32,211,160,223,180,12,3,28>>},\n > > > {true,4185,19290621}]}},\n [{couch_key_tree,map_leafs_simple,3},\n > > > {couch_key_tree,map_leafs_simple,3},\n {couch_key_tree,map_leafs,2},\n > > > {couch_db_updater,'-handle_call/3-fun-2-',2},\n {lists,mapfoldl,3},\n > > > {couch_db_updater,handle_call,3},\n {gen_server,handle_msg,5},\n > > > {proc_lib,init_p_do_apply,3}]}","reason":"{gen_server,call,\n > > > [<0.28323.1>,\n {purge_docs,[{<<\"1259540160F2016\">>,\n > > > [{2,\n > > > <<125,207,215,66,247,76,121,40,108,63,48,\n > > > 147,89,90,131,223>>}]}]}]}"} > > > > > > This is what appears in my server log: > > > > > > [Mon, 03 May 2010 15:45:03 GMT] [error] [<0.28323.1>] ** Generic server > > > <0.28323.1> terminating > > > ** Last message in was {purge_docs,[{<<"1259540160F2016">>, > > > [{2, > > > > > > <<125,207,215,66,247,76,121,40,108, > > > 63,48,147,89,90,131,223>>}]}]} > > > ** When Server state == {db,<0.28322.1>,<0.28323.1>,nil, > > > > > > <<"1272901503155514">>,<0.28320.1>,<0.28324.1>, > > > {db_header,5,106014165,0, > > > {59608213341,{24516895,36227125}}, > > > {59608203141,60744020}, > > > {59608273696,[]}, > > > 0,nil,nil,1000}, > > > 106014165, > > > {btree,<0.28320.1>, > > > {59608213341,{24516895,36227125}}, > > > #Fun, > > > #Fun, > > > #Fun, > > > #Fun}, > > > {btree,<0.28320.1>, > > > {59608203141,60744020}, > > > #Fun, > > > #Fun, > > > #Fun, > > > #Fun}, > > > {btree,<0.28320.1>, > > > {59608273696,[]}, > > > #Fun, > > > #Fun, > > > #Fun,nil}, > > > 106014165,<<"iris">>, > > > > > > "/var/lib/couchdb/0.11.0/iris.couch",[],[],nil, > > > {user_ctx,null,[],undefined}, > > > nil,1000, > > > [before_header,after_header,on_file_open]} > > > ** Reason for termination == > > > ** {{badarity,{#Fun, > 
> > [{2, > > > > > > <<124,230,79,165,16,199,208,127,32,211,160,223,180,12,3,28>>}, > > > {true,4185,19290621}]}}, > > > [{couch_key_tree,map_leafs_simple,3}, > > > {couch_key_tree,map_leafs_simple,3}, > > > {couch_key_tree,map_leafs,2}, > > > {couch_db_updater,'-handle_call/3-fun-2-',2}, > > > {lists,mapfoldl,3}, > > > {couch_db_updater,handle_call,3}, > > > {gen_server,handle_msg,5}, > > > {proc_lib,init_p_do_apply,3}]} > > > > > > > > > [Mon, 03 May 2010 15:45:03 GMT] [error] [<0.28323.1>] > > > {error_report,<0.31.0>, > > > {<0.28323.1>,crash_report, > > > [[{initial_call,{couch_db_updater,init,['Argument__1']}}, > > > {pid,<0.28323.1>}, > > > {registered_name,[]}, > > > {error_info, > > > {exit, > > > {{badarity, > > > {#Fun, > > > [{2, > > > > > > <<124,230,79,165,16,199,208,127,32,211,160,223,180,12, > > > 3,28>>}, > > > {true,4185,19290621}]}}, > > > [{couch_key_tree,map_leafs_simple,3}, > > > {couch_key_tree,map_leafs_simple,3}, > > > {couch_key_tree,map_leafs,2}, > > > {couch_db_updater,'-handle_call/3-fun-2-',2}, > > > {lists,mapfoldl,3}, > > > {couch_db_updater,handle_call,3}, > > > {gen_server,handle_msg,5}, > > > {proc_lib,init_p_do_apply,3}]}, > > > > > > [{gen_server,terminate,6},{proc_lib,init_p_do_apply,3}]}}, > > > {ancestors, > > > > > > [<0.28322.1>,couch_server,couch_primary_services,couch_server_sup, > > > <0.32.0>]}, > > > {messages,[]}, > > > {links,[<0.28322.1>]}, > > > {dictionary,[]}, > > > {trap_exit,false}, > > > {status,running}, > > > {heap_size,4181}, > > > {stack_size,24}, > > > {reductions,42618}], > > > []]}} > > > > > > [Mon, 03 May 2010 15:45:03 GMT] [error] [<0.28291.1>] Uncaught error in > > > HTTP request: {exit, > > > {{{badarity, > > > > > > {#Fun, > > > [{2, > > > > > > <<124,230,79,165,16,199,208,127,32,211, > > > 160,223,180,12,3,28>>}, > > > {true,4185,19290621}]}}, > > > [{couch_key_tree,map_leafs_simple,3}, > > > {couch_key_tree,map_leafs_simple,3}, > > > {couch_key_tree,map_leafs,2}, > > > {couch_db_updater, > > > 
'-handle_call/3-fun-2-',2}, > > > {lists,mapfoldl,3}, > > > {couch_db_updater,handle_call,3}, > > > {gen_server,handle_msg,5}, > > > {proc_lib,init_p_do_apply,3}]}, > > > {gen_server,call, > > > [<0.28323.1>, > > > {purge_docs, > > > [{<<"1259540160F2016">>, > > > [{2, > > > <<125,207,215,66,247,76,121, > > > 40,108,63,48,147,89,90,131, > > > 223>>}]}]}]}}} > > > > > > [Mon, 03 May 2010 15:45:03 GMT] [info] [<0.28291.1>] Stacktrace: > > > [{gen_server,call,2}, > > > {couch_httpd_db,db_req,2}, > > > {couch_httpd_db,do_db_req,2}, > > > {couch_httpd,handle_request_int,5}, > > > {mochiweb_http,headers,5}, > > > {proc_lib,init_p_do_apply,3}] > > > > > > [Mon, 03 May 2010 15:45:03 GMT] [info] [<0.28291.1>] 127.0.0.1 - - > > > 'POST' /iris/_purge 500 > > > > > > Any suggestions would be greatly appreciated. > > > > > > Thanks, > > > > > > Mike > > > > > > > > > > > > > > > > > > > >