Return-Path: X-Original-To: apmail-manifoldcf-commits-archive@www.apache.org Delivered-To: apmail-manifoldcf-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 1E351E8D6 for ; Thu, 7 Feb 2013 09:59:18 +0000 (UTC) Received: (qmail 6898 invoked by uid 500); 7 Feb 2013 09:59:17 -0000 Delivered-To: apmail-manifoldcf-commits-archive@manifoldcf.apache.org Received: (qmail 6775 invoked by uid 500); 7 Feb 2013 09:59:16 -0000 Mailing-List: contact commits-help@manifoldcf.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@manifoldcf.apache.org Delivered-To: mailing list commits@manifoldcf.apache.org Received: (qmail 6754 invoked by uid 99); 7 Feb 2013 09:59:15 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 07 Feb 2013 09:59:15 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 07 Feb 2013 09:59:11 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id AD30023888CD; Thu, 7 Feb 2013 09:58:50 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1443377 - in /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml: AttrNameValue.java TagParseState.java Date: Thu, 07 Feb 2013 09:58:50 -0000 To: commits@manifoldcf.apache.org From: kwright@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20130207095850.AD30023888CD@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: kwright Date: Thu Feb 7 09:58:50 2013 New Revision: 1443377 URL: http://svn.apache.org/viewvc?rev=1443377&view=rev Log: Turn on case sensitivity, and start coding qtag recognition. Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java (with props) Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java?rev=1443377&view=auto ============================================================================== --- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java (added) +++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java Thu Feb 7 09:58:50 2013 @@ -0,0 +1,48 @@ +/* $Id$ */ + +/** +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.manifoldcf.core.fuzzyml; + +import org.apache.manifoldcf.core.interfaces.*; +import java.util.*; + +/** This class represents a name/value pair from an +* XML/HTML attribute. +*/ +public class AttrNameValue +{ + protected final String name; + protected final String value; + + public AttrNameValue(String name, String value) + { + this.name = name; + this.value = value; + } + + public String getName() + { + return name; + } + + public String getValue() + { + return value; + } + +} Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java ------------------------------------------------------------------------------ svn:keywords = Id Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1443377&r1=1443376&r2=1443377&view=diff ============================================================================== --- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original) +++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Thu Feb 7 09:58:50 2013 @@ -54,7 +54,15 @@ public class TagParseState extends Singl protected static final int TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE = 13; protected static final int TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE = 14; protected static final int TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE = 15; - + protected static final int TAGPARSESTATE_IN_QTAG_NAME = 16; + protected static final int TAGPARSESTATE_IN_QTAG_ATTR_NAME = 17; + protected static final int TAGPARSESTATE_IN_QTAG_SAW_QUESTION = 18; + + // These still need to be added to the case statement + protected static final int TAGPARSESTATE_IN_QTAG_ATTR_VALUE = 19; + protected static final int TAGPARSESTATE_IN_QTAG_ATTR_LOOKING_FOR_VALUE = 20; + protected static final int TAGPARSESTATE_IN_QTAG_SINGLE_QUOTES_ATTR_VALUE = 21; + protected static final int TAGPARSESTATE_IN_QTAG_DOUBLE_QUOTES_ATTR_VALUE = 22; protected int currentState = TAGPARSESTATE_NORMAL; @@ -64,7 +72,7 @@ public class TagParseState extends Singl protected String currentTagName = null; protected String currentAttrName = null; - protected Map currentAttrMap = null; + protected List currentAttrList = null; protected static final Map mapLookup = new HashMap(); static @@ -88,7 +96,7 @@ public class TagParseState extends Singl throws ManifoldCFException { // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it. - char thisCharLower = Character.toLowerCase(thisChar); + // We don't even attempt to map to lower case, that's how naive this is. switch (currentState) { case TAGPARSESTATE_NORMAL: @@ -101,6 +109,11 @@ public class TagParseState extends Singl case TAGPARSESTATE_SAWLEFTBRACKET: if (thisChar == '!') currentState = TAGPARSESTATE_SAWEXCLAMATION; + else if (thisChar == '?') + { + currentState = TAGPARSESTATE_IN_QTAG_NAME; + currentTagNameBuffer = new StringBuilder(); + } else if (thisChar == '/') { currentState = TAGPARSESTATE_IN_END_TAG_NAME; @@ -111,7 +124,7 @@ public class TagParseState extends Singl currentState = TAGPARSESTATE_IN_TAG_NAME; currentTagNameBuffer = new StringBuilder(); if (!isWhitespace(thisChar)) - currentTagNameBuffer.append(thisCharLower); + currentTagNameBuffer.append(thisChar); } break; case TAGPARSESTATE_SAWEXCLAMATION: @@ -143,6 +156,55 @@ public class TagParseState extends Singl else if (thisChar != '-') currentState = TAGPARSESTATE_IN_COMMENT; break; + case TAGPARSESTATE_IN_QTAG_NAME: + if (isWhitespace(thisChar)) + { + if (currentTagNameBuffer.length() > 0) + { + // Done with the tag name! + currentTagName = currentTagNameBuffer.toString(); + currentTagNameBuffer = null; + currentAttrList = new ArrayList(); + currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME; + currentAttrNameBuffer = new StringBuilder(); + } + } + else if (thisChar == '?') + { + if (currentTagNameBuffer.length() > 0) + { + currentTagName = currentTagNameBuffer.toString(); + currentTagNameBuffer = null; + currentAttrList = new ArrayList(); + currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION; + // Wait until we see end > to signal tag end though + } + else + { + currentState = TAGPARSESTATE_NORMAL; + currentTagNameBuffer = null; + } + } + else if (thisChar == '>') + { + if (currentTagNameBuffer.length() > 0) + { + currentTagName = currentTagNameBuffer.toString(); + currentTagNameBuffer = null; + currentAttrList = new ArrayList(); + } + if (currentTagName != null) + { + if (noteQTag(currentTagName,currentAttrList)) + return true; + } + currentState = TAGPARSESTATE_NORMAL; + currentTagName = null; + currentAttrList = null; + } + else + currentTagNameBuffer.append(thisChar); + break; case TAGPARSESTATE_IN_TAG_NAME: if (isWhitespace(thisChar)) { @@ -151,7 +213,7 @@ public class TagParseState extends Singl // Done with the tag name! currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; - currentAttrMap = new HashMap(); + currentAttrList = new ArrayList(); currentState = TAGPARSESTATE_IN_ATTR_NAME; currentAttrNameBuffer = new StringBuilder(); } @@ -162,9 +224,9 @@ public class TagParseState extends Singl { currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; - currentAttrMap = new HashMap(); + currentAttrList = new ArrayList(); currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH; - if (noteTag(currentTagName,currentAttrMap)) + if (noteTag(currentTagName,currentAttrList)) return true; } else @@ -179,19 +241,19 @@ public class TagParseState extends Singl { currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; - currentAttrMap = new HashMap(); + currentAttrList = new ArrayList(); } if (currentTagName != null) { - if (noteTag(currentTagName,currentAttrMap)) + if (noteTag(currentTagName,currentAttrList)) return true; } currentState = TAGPARSESTATE_NORMAL; currentTagName = null; - currentAttrMap = null; + currentAttrList = null; } else - currentTagNameBuffer.append(thisCharLower); + currentTagNameBuffer.append(thisChar); break; case TAGPARSESTATE_IN_ATTR_NAME: if (isWhitespace(thisChar)) @@ -223,10 +285,10 @@ public class TagParseState extends Singl } if (currentAttrName != null) { - currentAttrMap.put(currentAttrName,""); + currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentAttrName = null; } - if (noteTag(currentTagName,currentAttrMap)) + if (noteTag(currentTagName,currentAttrList)) return true; currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH; } @@ -239,17 +301,17 @@ public class TagParseState extends Singl } if (currentAttrName != null) { - currentAttrMap.put(currentAttrName,""); + currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentAttrName = null; } currentState = TAGPARSESTATE_NORMAL; - if (noteTag(currentTagName,currentAttrMap)) + if (noteTag(currentTagName,currentAttrList)) return true; currentTagName = null; - currentAttrMap = null; + currentAttrList = null; } else - currentAttrNameBuffer.append(thisCharLower); + currentAttrNameBuffer.append(thisChar); break; case TAGPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE: if (thisChar == '=') @@ -260,25 +322,25 @@ public class TagParseState extends Singl else if (thisChar == '>') { currentState = TAGPARSESTATE_NORMAL; - if (noteTag(currentTagName,currentAttrMap)) + if (noteTag(currentTagName,currentAttrList)) return true; currentTagName = null; - currentAttrMap = null; + currentAttrList = null; } else if (thisChar == '/') { currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH; - currentAttrMap.put(currentAttrName,""); + currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentAttrName = null; - if (noteTag(currentTagName,currentAttrMap)) + if (noteTag(currentTagName,currentAttrList)) return true; } else if (!isWhitespace(thisChar)) { - currentAttrMap.put(currentAttrName,""); + currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentState = TAGPARSESTATE_IN_ATTR_NAME; currentAttrNameBuffer = new StringBuilder(); - currentAttrNameBuffer.append(thisCharLower); + currentAttrNameBuffer.append(thisChar); currentAttrName = null; } break; @@ -293,6 +355,16 @@ public class TagParseState extends Singl currentValueBuffer.append(thisChar); } break; + case TAGPARSESTATE_IN_QTAG_SAW_QUESTION: + if (thisChar == '>') + { + if (noteQTag(currentTagName,currentAttrList)) + return true; + currentState = TAGPARSESTATE_NORMAL; + currentTagName = null; + currentAttrList = null; + } + break; case TAGPARSESTATE_IN_TAG_SAW_SLASH: if (thisChar == '>') { @@ -300,7 +372,7 @@ public class TagParseState extends Singl return true; currentState = TAGPARSESTATE_NORMAL; currentTagName = null; - currentAttrMap = null; + currentAttrList = null; } break; case TAGPARSESTATE_IN_END_TAG_NAME: @@ -329,12 +401,12 @@ public class TagParseState extends Singl currentState = TAGPARSESTATE_NORMAL; } else if (currentTagNameBuffer != null) - currentTagNameBuffer.append(thisCharLower); + currentTagNameBuffer.append(thisChar); break; case TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE: if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r') { - currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString())); + currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_ATTR_NAME; @@ -346,7 +418,7 @@ public class TagParseState extends Singl case TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE: if (thisChar == '"' || thisChar == '\n' || thisChar == '\r') { - currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString())); + currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_ATTR_NAME; @@ -358,7 +430,7 @@ public class TagParseState extends Singl case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE: if (isWhitespace(thisChar)) { - currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString())); + currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_ATTR_NAME; @@ -366,21 +438,21 @@ public class TagParseState extends Singl } else if (thisChar == '/') { - currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString())); - if (noteTag(currentTagName,currentAttrMap)) + currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); + if (noteTag(currentTagName,currentAttrList)) return true; currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH; } else if (thisChar == '>') { - currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString())); + currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_NORMAL; - if (noteTag(currentTagName,currentAttrMap)) + if (noteTag(currentTagName,currentAttrList)) return true; currentTagName = null; - currentAttrMap = null; + currentAttrList = null; } else currentValueBuffer.append(thisChar); @@ -394,7 +466,7 @@ public class TagParseState extends Singl /** This method gets called for every tag. Override this method to intercept tag begins. *@return true to halt further processing. */ - protected boolean noteTag(String tagName, Map attributes) + protected boolean noteTag(String tagName, List attributes) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) @@ -417,7 +489,7 @@ public class TagParseState extends Singl * Override it to intercept such constructs. *@return true to halt further processing. */ - protected boolean noteQTag(String tagName, Map attributes) + protected boolean noteQTag(String tagName, List attributes) throws ManifoldCFException { if (Logging.misc.isDebugEnabled())