Return-Path: Delivered-To: apmail-incubator-spamassassin-cvs-archive@www.apache.org Received: (qmail 16116 invoked from network); 22 Apr 2004 20:48:37 -0000 Received: from daedalus.apache.org (HELO mail.apache.org) (208.185.179.12) by minotaur-2.apache.org with SMTP; 22 Apr 2004 20:48:37 -0000 Received: (qmail 99391 invoked by uid 500); 22 Apr 2004 20:48:25 -0000 Delivered-To: apmail-incubator-spamassassin-cvs-archive@incubator.apache.org Received: (qmail 99371 invoked by uid 500); 22 Apr 2004 20:48:25 -0000 Mailing-List: contact spamassassin-cvs-help@incubator.apache.org; run by ezmlm Precedence: bulk list-help: list-unsubscribe: list-post: Reply-To: "Spam Assassin Dev" List-Id: "SpamAssassin Commits" Delivered-To: mailing list spamassassin-cvs@incubator.apache.org Received: (qmail 99357 invoked from network); 22 Apr 2004 20:48:25 -0000 Received: from unknown (HELO minotaur.apache.org) (209.237.227.194) by daedalus.apache.org with SMTP; 22 Apr 2004 20:48:25 -0000 Received: (qmail 16109 invoked by uid 65534); 22 Apr 2004 20:48:36 -0000 Date: 22 Apr 2004 20:48:36 -0000 Message-ID: <20040422204836.16106.qmail@minotaur.apache.org> From: felicity@apache.org To: spamassassin-cvs@incubator.apache.org Subject: svn commit: rev 10188 - incubator/spamassassin/trunk/lib/Mail/SpamAssassin X-Spam-Rating: daedalus.apache.org 1.6.2 0/1000/N X-Spam-Rating: minotaur-2.apache.org 1.6.2 0/1000/N Author: felicity Date: Thu Apr 22 13:48:35 2004 New Revision: 10188 Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Log: adding some comments for get_uri_list() Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm ============================================================================== --- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original) +++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Thu Apr 22 13:48:35 2004 @@ -1465,6 +1465,15 @@ # Finally, the address-spec regex (more or less) my 
$Addr_spec_re = qr<$local_part\s*\@\s*$domain>o; +# Returns an array of all URIs found in the message. It takes +# a combination of the URIs found in the rendered body and the +# URIs found when parsing the HTML in the message. The array will +# include the "raw" URI as well as "slightly cooked" versions -- +# i.e.: 'http://%77%77%77.spamassassin.org/' will get turned into: +# ( 'http://%77w%77.spamassassin.org/', 'http://www.spamassassin.org/' ) +# -- this lets us run rules against both the original and "correct" +# versions easily. +# # This really belongs in metadata sub get_uri_list { my ($self) = @_; @@ -1479,6 +1488,8 @@ # to do (note: we know the HTML parsing occurs, because we call for the # rendered text which does HTML parsing...) trying to get URLs out of # HTML w/out parsing causes issues, so let's not do it. + # also, if we allow $textary to be passed in, we need to invalidate + # the cache first. fyi. my $textary = $self->get_decoded_stripped_body_text_array(); my ($rulename, $pat, @uris); @@ -1560,7 +1571,7 @@ } } - # remove duplicates + # remove duplicates, merge nuris and uris my %uris = map { $_ => 1 } @uris, @nuris; @uris = keys %uris;