Return-Path: X-Original-To: apmail-manifoldcf-commits-archive@www.apache.org Delivered-To: apmail-manifoldcf-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 1DD9DE196 for ; Wed, 6 Feb 2013 11:38:26 +0000 (UTC) Received: (qmail 66012 invoked by uid 500); 6 Feb 2013 11:38:25 -0000 Delivered-To: apmail-manifoldcf-commits-archive@manifoldcf.apache.org Received: (qmail 65852 invoked by uid 500); 6 Feb 2013 11:38:20 -0000 Mailing-List: contact commits-help@manifoldcf.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@manifoldcf.apache.org Delivered-To: mailing list commits@manifoldcf.apache.org Received: (qmail 65818 invoked by uid 99); 6 Feb 2013 11:38:19 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 06 Feb 2013 11:38:19 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 06 Feb 2013 11:38:15 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id B59EF2388A5E; Wed, 6 Feb 2013 11:37:54 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1442918 - in /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml: BOMParseState.java ByteReceiver.java CharacterReceiver.java EncodingDetector.java TagParseState.java Date: Wed, 06 Feb 2013 11:37:54 -0000 To: commits@manifoldcf.apache.org From: kwright@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20130206113754.B59EF2388A5E@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: kwright Date: Wed Feb 6 11:37:54 2013 New Revision: 1442918 URL: http://svn.apache.org/viewvc?rev=1442918&view=rev Log: More revisions of structure, designed to make a generally useful fuzzy ml parser. Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java (with props) manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java (with props) manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java (with props) manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java (with props) Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java?rev=1442918&view=auto ============================================================================== --- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java (added) +++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java Wed Feb 6 11:37:54 2013 @@ -0,0 +1,80 @@ +/* $Id$ */ + +/** +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.manifoldcf.core.fuzzyml; + +import org.apache.manifoldcf.core.interfaces.*; +import java.util.*; + +/** This class represents the parse state of the BOM (byte order mark) parser. +* The byte order mark parser looks for a byte order mark at the start of a byte sequence, +* and based on whether it finds it or not, and what it finds, selects a preliminary character encoding. +* Once a preliminary character encoding is determined, an EncodingAccepter is notified, +* and further bytes are sent to a provided ByteReceiver. +*/ +public class BOMParseState extends EncodingDetector +{ + protected String encoding = null; + protected final ByteReceiver byteReceiver; + + /** Constructor. Pass in the receiver of all overflow bytes. + * If no receiver is passed in, the detector will stop as soon as the + * BOM is either seen, or not seen. + */ + public BOMParseState(ByteReceiver byteReceiver) + { + super(8); + this.byteReceiver = byteReceiver; + } + + /** Set initial encoding. + */ + @Override + public void setEncoding(String encoding) + { + this.encoding = encoding; + } + + /** Retrieve final encoding determination. + */ + @Override + public String getEncoding() + { + return encoding; + } + + /** Receive a byte. + */ + @Override + public boolean dealWithByte(byte b) + throws ManifoldCFException + { + // MHL + return true; + } + + /** Finish up all processing. + */ + @Override + public void finishUp() + throws ManifoldCFException + { + // MHL + } + +} Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java ------------------------------------------------------------------------------ svn:keywords = Id Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java?rev=1442918&view=auto ============================================================================== --- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java (added) +++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java Wed Feb 6 11:37:54 2013 @@ -0,0 +1,65 @@ +/* $Id$ */ + +/** +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.manifoldcf.core.fuzzyml; + +import org.apache.manifoldcf.core.interfaces.*; +import java.io.*; + +/** This interface represents a receiver for bytes. +* Implementers of this interface will accept documents a byte at a time +*/ +public abstract class ByteReceiver +{ + protected final byte[] byteBuffer; + + /** Constructor */ + public ByteReceiver(int chunkSize) + { + byteBuffer = new byte[chunkSize]; + } + + /** Receive a byte stream and process up to chunksize bytes, + *@return true if end reached. + */ + public boolean dealWithBytes(InputStream is) + throws IOException, ManifoldCFException + { + int amt = is.read(byteBuffer); + if (amt == -1) + return true; + for (int i = 0; i < amt; i++) + { + if (dealWithByte(byteBuffer[i])) + return true; + } + return false; + } + + /** Receive a byte. + *@return true to stop further processing. + */ + public abstract boolean dealWithByte(byte b) + throws ManifoldCFException; + + /** Finish up all processing. + */ + public abstract void finishUp() + throws ManifoldCFException; + +} Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java ------------------------------------------------------------------------------ svn:keywords = Id Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java?rev=1442918&view=auto ============================================================================== --- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java (added) +++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java Wed Feb 6 11:37:54 2013 @@ -0,0 +1,72 @@ +/* $Id$ */ + +/** +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.manifoldcf.core.fuzzyml; + +import org.apache.manifoldcf.core.interfaces.*; +import java.io.*; + +/** This interface represents a receiver for characters. +* Implementers of this interface will accept documents a character at a time. +*/ +public abstract class CharacterReceiver +{ + protected final char[] charBuffer; + + public CharacterReceiver() + { + this(4096); + } + + /** Constructor. + */ + public CharacterReceiver(int chunkSize) + { + charBuffer = new char[chunkSize]; + } + + /** Receive a set of characters; process one + * chunksize worth. + *@return true if done. + */ + public boolean dealWithCharacters(Reader r) + throws IOException, ManifoldCFException + { + int amt = r.read(charBuffer); + if (amt == -1) + return true; + for (int i = 0; i < amt; i++) + { + if (dealWithCharacter(charBuffer[i])) + return true; + } + return false; + } + + /** Receive a byte. + * @return true if done. + */ + public abstract boolean dealWithCharacter(char c) + throws ManifoldCFException; + + /** Finish up all processing. + */ + public abstract void finishUp() + throws ManifoldCFException; + +} Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java ------------------------------------------------------------------------------ svn:keywords = Id Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java?rev=1442918&view=auto ============================================================================== --- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java (added) +++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java Wed Feb 6 11:37:54 2013 @@ -0,0 +1,51 @@ +/* $Id$ */ + +/** +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.manifoldcf.core.fuzzyml; + +import org.apache.manifoldcf.core.interfaces.*; + +/** This interface represents a receiver for bytes. +* Implementers of this interface will accept documents a byte at a time, +* AFTER an encoding has been set. +*/ +public abstract class EncodingDetector extends ByteReceiver +{ + protected String currentEncoding = null; + + /** Constructor */ + public EncodingDetector(int chunkSize) + { + super(chunkSize); + } + + /** Accept a starting encoding value. + */ + public void setEncoding(String encoding) + { + currentEncoding = encoding; + } + + /** Read out the detected encoding, when finished. + */ + public String getEncoding() + { + return currentEncoding; + } + +} Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java ------------------------------------------------------------------------------ svn:keywords = Id Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1442918&r1=1442917&r2=1442918&view=diff ============================================================================== --- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original) +++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Wed Feb 6 11:37:54 2013 @@ -22,8 +22,8 @@ import org.apache.manifoldcf.core.interf import org.apache.manifoldcf.core.system.Logging; import java.util.*; -/** This class represents the basic, outermost parse state. */ -public class TagParseState +/** This class represents the basic, outermost tag parsing state. */ +public class TagParseState extends CharacterReceiver { protected static final int TAGPARSESTATE_NORMAL = 0; protected static final int TAGPARSESTATE_SAWLEFTBRACKET = 1; @@ -67,8 +67,10 @@ public class TagParseState { } - /** Deal with a character. No exceptions are allowed, since those would represent syntax errors, and we don't want those to cause difficulty. */ - public void dealWithCharacter(char thisChar) + /** Deal with a character. No exceptions are allowed, since those would represent + * syntax errors, and we don't want those to cause difficulty. */ + @Override + public boolean dealWithCharacter(char thisChar) throws ManifoldCFException { // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it. @@ -361,6 +363,7 @@ public class TagParseState default: throw new ManifoldCFException("Invalid state: "+Integer.toString(currentState)); } + return false; } protected void noteTag(String tagName, Map attributes) @@ -380,6 +383,7 @@ public class TagParseState { } + @Override public void finishUp() throws ManifoldCFException {