Return-Path: Delivered-To: apmail-jakarta-commons-dev-archive@www.apache.org Received: (qmail 17098 invoked from network); 29 May 2004 23:25:02 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur-2.apache.org with SMTP; 29 May 2004 23:25:02 -0000 Received: (qmail 34965 invoked by uid 500); 29 May 2004 23:24:55 -0000 Delivered-To: apmail-jakarta-commons-dev-archive@jakarta.apache.org Received: (qmail 34919 invoked by uid 500); 29 May 2004 23:24:54 -0000 Mailing-List: contact commons-dev-help@jakarta.apache.org; run by ezmlm Precedence: bulk List-Unsubscribe: List-Subscribe: List-Help: List-Post: List-Id: "Jakarta Commons Developers List" Reply-To: "Jakarta Commons Developers List" Delivered-To: mailing list commons-dev@jakarta.apache.org Received: (qmail 34900 invoked by uid 500); 29 May 2004 23:24:54 -0000 Received: (qmail 34890 invoked by uid 99); 29 May 2004 23:24:54 -0000 Received: from [209.237.227.194] (HELO minotaur.apache.org) (209.237.227.194) by apache.org (qpsmtpd/0.27.1) with SMTP; Sat, 29 May 2004 16:24:53 -0700 Received: (qmail 17070 invoked by uid 1110); 29 May 2004 23:24:54 -0000 Date: 29 May 2004 23:24:54 -0000 Message-ID: <20040529232454.17069.qmail@minotaur.apache.org> From: burton@apache.org To: jakarta-commons-sandbox-cvs@apache.org Subject: cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser ContentDetector.java X-Virus-Checked: Checked X-Spam-Rating: minotaur-2.apache.org 1.6.2 0/1000/N burton 2004/05/29 16:24:54 Added: feedparser/src/java/org/apache/commons/feedparser ContentDetector.java Log: HTML or RSS content detection Revision Changes Path 1.1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/ContentDetector.java Index: ContentDetector.java =================================================================== /* * Copyright 1999,2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.commons.feedparser; import java.net.URL; /** * Given the RAW content of a URL, determine if we're looking at an RSS file or * an HTML file. We also return the given RSS version or Atom version. * * @author Kevin A. Burton (burtonator) * @version $Id: ContentDetector.java,v 1.1 2004/05/29 23:24:54 burton Exp $ */ public class ContentDetector { /** * Return true if the given content seems to be RSS. This is going to be a * cheat because really we have no way of telling if this is RSS other than if * it is XML and it starts with an RSS 1.0, 2.0, 0.91 or 0.9 decl * * @author Kevin A. Burton */ public static ContentDetectorResult detect( String content ) throws Exception { ContentDetectorResult result = new ContentDetectorResult(); result.isHTML = isHTMLContent( content ); result.isRSS = ( isRSS_1_0_Content( content ) || isRSS_2_0_Content( content ) || isRSS_0_9_0_Content( content ) || isRSS_0_9_1_Content( content ) || isRSS_0_9_2_Content( content ) ); result.isAtom = isAtomContent( content ); result.isFeed = result.isRSS || result.isAtom; return result; } /** * Return true if this is RSS 1.0 content * * @author Kevin A. Burton */ public static boolean isRSS_1_0_Content( String content ) throws Exception { //do a search for the RSS 1.0 namespace. This is a bit of a trick right //now. return content.indexOf( "http://purl.org/rss/1.0/" ) != -1; } /** * Return true if this is RSS 2.0 content * * @author Kevin A. Burton */ public static boolean isRSS_0_9_1_Content( String content ) throws Exception { //look for the beginning of the RSS element return content.indexOf( "Kevin A. Burton */ public static boolean isRSS_0_9_2_Content( String content ) throws Exception { //same check for RSS 0.9.1 return isRSS_0_9_1_Content( content ); } /** * Return true if this is RSS 2.0 content * * @author Kevin A. Burton */ public static boolean isRSS_2_0_Content( String content ) throws Exception { return isRSS_0_9_1_Content( content ); } /** * Return true if this is RSS 2.0 content * * @author Kevin A. Burton */ public static boolean isRSS_0_9_0_Content( String content ) throws Exception { //FIXME: look for the RDF namespace and the RSS DTD namespace return content.indexOf( "http://my.netscape.com/rdf/simple/0.9/" ) != -1; } public static boolean isAtomContent( String content ) throws Exception { return content.indexOf( "http://purl.org/atom/ns#" ) != -1; } /** * Return true if this is RSS 2.0 content * * @author Kevin A. Burton */ public static boolean isHTMLContent( String content ) throws Exception { //look for the beginning of the RSS element return content.indexOf( "