nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From lewi...@apache.org
Subject svn commit: r1497447 - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/api/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/storage/ src/java/org/apache/nutch/util/ src...
Date Thu, 27 Jun 2013 17:04:42 GMT
Author: lewismc
Date: Thu Jun 27 17:04:42 2013
New Revision: 1497447

URL: http://svn.apache.org/r1497447
Log:
NUTCH-1591 Incorrect conversion of ByteBuffer to String

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
    nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
    nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
    nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
    nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
    nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
    nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
    nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
    nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
    nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
    nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
    nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
    nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
    nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Jun 27 17:04:42 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1591 Incorrect conversion of ByteBuffer to String (Jason Howes via lewismc)
+
 * NUTCH-1571 SolrInputSplit doesn't implement Writable and crawl script doesn't pass crawlId to generate and updatedb tasks (yuanyun.cn via lewismc)
 
 * NUTCH-1126 JUnit test for urlfilter-prefix (Talat UYARER via markus)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java Thu Jun 27 17:04:42 2013
@@ -195,7 +195,7 @@ public class DbReader {
             while (iterator.hasNext()) {
               Entry<Utf8, ByteBuffer> entry = iterator.next();
               simpleMeta.put(entry.getKey().toString(), 
-                  Bytes.toStringBinary(entry.getValue().array()));
+                  Bytes.toStringBinary(entry.getValue()));
             }
           }
           res.put(f, simpleMeta);
@@ -207,10 +207,10 @@ public class DbReader {
           res.put(f, ParseStatusUtils.toString(ps));
         } else if ("signature".equals(f)) {
           ByteBuffer bb = page.getSignature();
-          res.put(f, StringUtil.toHexString(bb.array()));
+          res.put(f, StringUtil.toHexString(bb));
         } else if ("content".equals(f)) {
           ByteBuffer bb = page.getContent();
-          res.put(f, Bytes.toStringBinary(bb.array()));
+          res.put(f, Bytes.toStringBinary(bb));
         } else if ("markers".equals(f)) {
           res.put(f, convertMap(page.getMarkers()));
         } else if ("inlinks".equals(f)) {
@@ -221,7 +221,7 @@ public class DbReader {
           if (val instanceof Utf8) {
             val = val.toString();
           } else if (val instanceof ByteBuffer) {
-            val = Bytes.toStringBinary(((ByteBuffer)val).array());
+            val = Bytes.toStringBinary((ByteBuffer)val);
           }
           res.put(f, val);
         }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Thu Jun 27 17:04:42 2013
@@ -118,7 +118,7 @@ extends GoraReducer<UrlWithScore, NutchW
         ByteBuffer prevSig = page.getPrevSignature();
         ByteBuffer signature = page.getSignature();
         if (prevSig != null && signature != null) {
-          if (SignatureComparator.compare(prevSig.array(), signature.array()) != 0) {
+          if (SignatureComparator.compare(prevSig, signature) != 0) {
             modified = FetchSchedule.STATUS_MODIFIED;
           } else {
             modified = FetchSchedule.STATUS_NOTMODIFIED;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java Thu Jun 27 17:04:42 2013
@@ -17,9 +17,11 @@
 
 package org.apache.nutch.crawl;
 
+import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.HashSet;
 
+import org.apache.avro.util.Utf8;
 import org.apache.hadoop.io.MD5Hash;
 import org.apache.nutch.storage.WebPage;
 
@@ -40,9 +42,29 @@ public class MD5Signature extends Signat
 
   @Override
   public byte[] calculate(WebPage page) {
-    byte[] data = page.getContent().array();
-    if (data == null && page.getBaseUrl()!=null) data = page.getBaseUrl().getBytes();
-    return MD5Hash.digest(data).getDigest();
+    ByteBuffer buf = page.getContent();
+    byte[] data;
+    int of;
+    int cb;
+    if (buf == null) {
+      Utf8 baseUrl = page.getBaseUrl();
+      if (baseUrl == null) {
+        data = null;
+        of = 0;
+        cb = 0;
+      }
+      else {
+        data = baseUrl.getBytes();
+        of = 0;
+        cb = baseUrl.getLength();
+      }
+    } else {
+      data = buf.array();
+      of = buf.arrayOffset() + buf.position();
+      cb = buf.remaining();
+    }
+
+    return MD5Hash.digest(data, of, cb).getDigest();
   }
 
   @Override

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java Thu Jun 27 17:04:42 2013
@@ -17,13 +17,23 @@
 
 package org.apache.nutch.crawl;
 
+import java.nio.ByteBuffer;
+
 public class SignatureComparator {
   public static int compare(byte[] data1, byte[] data2) {
     if (data1 == null && data2 == null) return 0;
     if (data1 == null) return -1;
     if (data2 == null) return 1;
-    return _compare(data1, 0, data1.length, data2, 0, data2.length);  }
-  
+    return _compare(data1, 0, data1.length, data2, 0, data2.length);
+  }
+
+  public static int compare(ByteBuffer buf1, ByteBuffer buf2) {
+    if (buf1 == null && buf2 == null) return 0;
+    if (buf1 == null) return -1;
+    if (buf2 == null) return 1;
+    return _compare(buf1.array(), buf1.arrayOffset() + buf1.position(), buf1.remaining(),
+                    buf2.array(), buf2.arrayOffset() + buf2.position(), buf2.remaining());
+  }
   
   public static int _compare(byte[] data1, int s1, int l1, byte[] data2, int s2, int l2) {
     if (l2 > l1) return -1;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java Thu Jun 27 17:04:42 2013
@@ -355,11 +355,11 @@ public class WebTableReader extends Nutc
         ProtocolStatusUtils.toString(page.getProtocolStatus())).append("\n");
     ByteBuffer prevSig = page.getPrevSignature();
         if (prevSig != null) {
-      sb.append("prevSignature:\t" + StringUtil.toHexString(prevSig.array())).append("\n");
+      sb.append("prevSignature:\t" + StringUtil.toHexString(prevSig)).append("\n");
     }
     ByteBuffer sig = page.getSignature();
     if (sig != null) {
-      sb.append("signature:\t" + StringUtil.toHexString(sig.array())).append("\n");
+      sb.append("signature:\t" + StringUtil.toHexString(sig)).append("\n");
     }
     sb.append("parseStatus:\t" +
         ParseStatusUtils.toString(page.getParseStatus())).append("\n");
@@ -380,7 +380,7 @@ public class WebTableReader extends Nutc
       while (iterator.hasNext()) {
         Entry<Utf8, ByteBuffer> entry = iterator.next();
         sb.append("metadata " + entry.getKey().toString()).append(" : \t")
-            .append(Bytes.toString(entry.getValue().array())).append("\n");
+            .append(Bytes.toString(entry.getValue())).append("\n");
       }
     }
     if (dumpLinks) {
@@ -409,7 +409,7 @@ public class WebTableReader extends Nutc
     if (content != null && dumpContent) {
       sb.append("contentType:\t" + page.getContentType()).append("\n");
       sb.append("content:start:\n");
-      sb.append(Bytes.toString(content.array()));
+      sb.append(Bytes.toString(content));
       sb.append("\ncontent:end:\n");
     }
     Utf8 text = page.getText();

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java Thu Jun 27 17:04:42 2013
@@ -62,7 +62,7 @@ public class IndexUtil {
   public NutchDocument index(String key, WebPage page) {
     NutchDocument doc = new NutchDocument();
     doc.add("id", key);
-    doc.add("digest", StringUtil.toHexString(page.getSignature().array()));
+    doc.add("digest", StringUtil.toHexString(page.getSignature()));
     if (page.getBatchId() != null) {
       doc.add("batchId", page.getBatchId().toString());
     }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Thu Jun 27 17:04:42 2013
@@ -171,7 +171,7 @@ public class ParserChecker implements To
       while (iterator.hasNext()) {
         Entry<Utf8, ByteBuffer> entry = iterator.next();
         sb.append(entry.getKey().toString()).append(" : \t")
-            .append(Bytes.toString(entry.getValue().array())).append("\n");
+            .append(Bytes.toString(entry.getValue())).append("\n");
       }
       System.out.print(sb.toString());
     }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java Thu Jun 27 17:04:42 2013
@@ -18,23 +18,14 @@ package org.apache.nutch.storage;
 
 import java.nio.ByteBuffer;
 import java.util.Map;
-import java.util.HashMap;
-import org.apache.avro.Protocol;
 import org.apache.avro.Schema;
 import org.apache.avro.AvroRuntimeException;
-import org.apache.avro.Protocol;
 import org.apache.avro.util.Utf8;
-import org.apache.avro.ipc.AvroRemoteException;
-import org.apache.avro.generic.GenericArray;
-import org.apache.avro.specific.SpecificExceptionBase;
-import org.apache.avro.specific.SpecificRecordBase;
-import org.apache.avro.specific.SpecificRecord;
-import org.apache.avro.specific.SpecificFixed;
 import org.apache.gora.persistency.StateManager;
 import org.apache.gora.persistency.impl.PersistentBase;
 import org.apache.gora.persistency.impl.StateManagerImpl;
 import org.apache.gora.persistency.StatefulHashMap;
-import org.apache.gora.persistency.ListGenericArray;
+import org.apache.nutch.util.Bytes;
 
 @SuppressWarnings("all")
 public class Host extends PersistentBase {
@@ -152,7 +143,7 @@ public class Host extends PersistentBase
   
   public String getValue(String key, String defaultValue) {
     if (!contains(key)) return defaultValue;
-    return new String(metadata.get(new Utf8(key)).array());
+    return Bytes.toString(metadata.get(new Utf8(key)));
   }
   
   public int getInt(String key, int defaultValue) {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java Thu Jun 27 17:04:42 2013
@@ -273,12 +273,27 @@ public class Bytes {
 	 * @return the byte array
 	 */
 	public static byte[] toBytes(ByteBuffer bb) {
-		int length = bb.limit();
+		int length = bb.remaining();
 		byte[] result = new byte[length];
-		System.arraycopy(bb.array(), bb.arrayOffset(), result, 0, length);
+		System.arraycopy(bb.array(), bb.arrayOffset() + bb.position(), result, 0, length);
 		return result;
 	}
 
+    /**
+     * This method will convert utf8 encoded bytes into a string. If an
+     * UnsupportedEncodingException occurs, this method will eat it and return
+     * null instead.
+     *
+     * @param bb
+     *            Presumed UTF-8 encoded ByteBuffer.
+     * @return String made from <code>b</code> or null
+     */
+    public static String toString(ByteBuffer bb) {
+        return bb == null
+               ? null
+               : toString(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
+    }
+
 	/**
 	 * @param b
 	 *            Presumed UTF-8 encoded byte array.
@@ -333,6 +348,20 @@ public class Bytes {
 		}
 	}
 
+    /**
+     * Write a printable representation of a ByteBuffer. Non-printable
+     * characters are hex escaped in the format \\x%02X, eg: \x00 \x05 etc
+     *
+     * @param bb
+     *            ByteBuffer to write out
+     * @return string output
+     */
+    public static String toStringBinary(ByteBuffer bb) {
+        return bb == null
+               ? null
+               : toStringBinary(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
+    }
+
 	/**
 	 * Write a printable representation of a byte array.
 	 * 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java Thu Jun 27 17:04:42 2013
@@ -16,6 +16,7 @@
  */
 package org.apache.nutch.util;
 
+import java.io.ByteArrayInputStream;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
@@ -169,21 +170,20 @@ public class EncodingDetector {
 
   private void autoDetectClues(ByteBuffer dataBuffer, Utf8 typeUtf8,
                                String encoding, boolean filter) {
-    byte[] data = dataBuffer.array();
+    int length = dataBuffer.remaining();
     String type = TableUtil.toString(typeUtf8);
 
     if (minConfidence >= 0 && DETECTABLES.contains(type)
-        && data.length > MIN_LENGTH) {
+        && length > MIN_LENGTH) {
       CharsetMatch[] matches = null;
 
       // do all these in a try/catch; setText and detect/detectAll
       // will sometimes throw exceptions
       try {
         detector.enableInputFilter(filter);
-        if (data.length > MIN_LENGTH) {
-          detector.setText(data);
-          matches = detector.detectAll();
-        }
+        detector.setText(new ByteArrayInputStream(dataBuffer.array(),
+            dataBuffer.arrayOffset() + dataBuffer.position(), length));
+        matches = detector.detectAll();
       } catch (Exception e) {
         LOG.debug("Exception from ICU4J (ignoring): ", e);
       }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java Thu Jun 27 17:04:42 2013
@@ -17,6 +17,8 @@
 
 package org.apache.nutch.util;
 
+import java.nio.ByteBuffer;
+
 /**
  * A collection of String processing utility methods. 
  */
@@ -52,6 +54,28 @@ public class StringUtil {
   {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
 
   /**
+   * Convenience call for {@link #toHexString(ByteBuffer, String, int)}, where
+   * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
+   * @param buf
+   */
+  public static String toHexString(ByteBuffer buf) {
+    return toHexString(buf, null, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Get a text representation of a ByteBuffer as hexadecimal String, where each
+   * pair of hexadecimal digits corresponds to consecutive bytes in the array.
+   * @param buf input data
+   * @param sep separate every pair of hexadecimal digits with this separator, or
+   * null if no separation is needed.
+   * @param lineLen break the output String into lines containing output for lineLen
+   * bytes.
+   */
+  public static String toHexString(ByteBuffer buf, String sep, int lineLen) {
+    return toHexString(buf.array(), buf.arrayOffset() + buf.position(), buf.remaining(), sep, lineLen);
+  }
+
+  /**
    * Convenience call for {@link #toHexString(byte[], String, int)}, where
    * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
    * @param buf
@@ -70,15 +94,30 @@ public class StringUtil {
    * bytes.
    */
   public static String toHexString(byte[] buf, String sep, int lineLen) {
+    return toHexString(buf, 0, buf.length, sep, lineLen);
+  }
+
+  /**
+   * Get a text representation of a byte[] as hexadecimal String, where each
+   * pair of hexadecimal digits corresponds to consecutive bytes in the array.
+   * @param buf input data
+   * @param of the offset into the byte[] to start reading
+   * @param cb the number of bytes to read from the byte[]
+   * @param sep separate every pair of hexadecimal digits with this separator, or
+   * null if no separation is needed.
+   * @param lineLen break the output String into lines containing output for lineLen
+   * bytes.
+   */
+  public static String toHexString(byte[] buf, int of, int cb, String sep, int lineLen) {
     if (buf == null) return null;
     if (lineLen <= 0) lineLen = Integer.MAX_VALUE;
-    StringBuffer res = new StringBuffer(buf.length * 2);
-    for (int i = 0; i < buf.length; i++) {
-      int b = buf[i];
+    StringBuffer res = new StringBuffer(cb * 2);
+    for (int c = 0; c < cb; c++) {
+      int b = buf[of++];
       res.append(HEX_DIGITS[(b >> 4) & 0xf]);
       res.append(HEX_DIGITS[b & 0xf]);
-      if (i > 0 && (i % lineLen) == 0) res.append('\n');
-      else if (sep != null && i < lineLen - 1) res.append(sep); 
+      if (c > 0 && (c % lineLen) == 0) res.append('\n');
+      else if (sep != null && c < lineLen - 1) res.append(sep);
     }
     return res.toString();
   }

Modified: nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Thu Jun 27 17:04:42 2013
@@ -34,6 +34,7 @@ import org.apache.nutch.indexer.NutchDoc
 import org.apache.nutch.metadata.CreativeCommons;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.Bytes;
 
 /** Adds basic searchable fields to a document. */
 public class CCIndexingFilter implements IndexingFilter {
@@ -102,7 +103,7 @@ public class CCIndexingFilter implements
 		ByteBuffer blicense = page.getFromMetadata(new Utf8(
 				CreativeCommons.LICENSE_URL));
 		if (blicense != null) {
-			String licenseUrl = new String(blicense.array());
+			String licenseUrl = Bytes.toString(blicense);
 			if (LOG.isInfoEnabled()) {
 				LOG.info("CC: indexing " + licenseUrl + " for: "
 						+ url.toString());
@@ -119,7 +120,7 @@ public class CCIndexingFilter implements
 		ByteBuffer blicenseloc = page.getFromMetadata(new Utf8(
 				CreativeCommons.LICENSE_LOCATION));
 		if (blicenseloc != null) {
-			String licenseLocation = new String(blicenseloc.array());
+			String licenseLocation = Bytes.toString(blicenseloc);
 			addFeature(doc, "meta=" + licenseLocation);
 		}
 
@@ -127,7 +128,7 @@ public class CCIndexingFilter implements
 		ByteBuffer bworkType = page.getFromMetadata(new Utf8(
 				CreativeCommons.WORK_TYPE));
 		if (bworkType != null) {
-			String workType = new String(bworkType.array());
+			String workType = Bytes.toString(bworkType);
 			addFeature(doc, workType);
 		}
 

Modified: nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Thu Jun 27 17:04:42 2013
@@ -21,6 +21,7 @@ import org.apache.nutch.parse.ParseUtil;
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import java.io.*;
@@ -73,13 +74,10 @@ public class TestCCParseFilter {
 		new ParseUtil(conf).parse(url, page);
 
 		ByteBuffer bb = page.getFromMetadata(new Utf8("License-Url"));
-		assertEquals(license, new String(bb.array()));
+		assertEquals(license, Bytes.toString(bb));
 		bb = page.getFromMetadata(new Utf8("License-Location"));
-		assertEquals(location, new String(bb.array()));
+		assertEquals(location, Bytes.toString(bb));
 		bb = page.getFromMetadata(new Utf8("Work-Type"));
-		if (bb == null)
-			assertEquals(type, null);
-		else
-			assertEquals(type, new String(bb.array()));
+        assertEquals(type, Bytes.toString(bb));
 	}
 }

Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Thu Jun 27 17:04:42 2013
@@ -119,8 +119,7 @@ public class BasicIndexingFilter impleme
     // add cached content/summary display policy, if available
     ByteBuffer cachingRaw = page
         .getFromMetadata(Nutch.CACHING_FORBIDDEN_KEY_UTF8);
-    String caching = (cachingRaw == null ? null : Bytes.toString(cachingRaw
-        .array()));
+    String caching = Bytes.toString(cachingRaw);
     if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
       doc.add("cache", caching);
     }

Modified: nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Thu Jun 27 17:04:42 2013
@@ -132,7 +132,7 @@ public class HTMLLanguageParser implemen
       LanguageParser parser = new LanguageParser(doc);
       lang = parser.getLanguage();
     } else
-      lang = Bytes.toString(blang.array());
+      lang = Bytes.toString(blang);
 
     if (lang != null) {
       return lang;

Modified: nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Thu Jun 27 17:04:42 2013
@@ -61,11 +61,8 @@ public class LanguageIndexingFilter impl
       throws IndexingException {
 
     // check if LANGUAGE found, possibly put there by HTMLLanguageParser
-    String lang = null;
     ByteBuffer blang = page.getFromMetadata(new Utf8(Metadata.LANGUAGE));
-    if (blang != null) {
-      lang = Bytes.toString(blang.array());
-    }
+    String lang = Bytes.toString(blang);
 
     if (lang == null || lang.length() == 0) {
       lang = "unknown";

Modified: nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Thu Jun 27 17:04:42 2013
@@ -60,9 +60,7 @@ public class TestHTMLLanguageParser {
         WebPage page = getPage(docs[t]);
         parser.parse(URL.toString(), page);
         ByteBuffer blang = page.getFromMetadata(new Utf8(Metadata.LANGUAGE));
-        String lang = null;
-        if (blang != null)
-          lang = Bytes.toString(blang.array());
+        String lang = Bytes.toString(blang);
         assertEquals(metalanguages[t], lang);
       }
     } catch (Exception e) {

Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Thu Jun 27 17:04:42 2013
@@ -28,6 +28,7 @@ import org.apache.nutch.indexer.Indexing
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.Bytes;
 
 /**
  * An {@link org.apache.nutch.indexer.IndexingFilter} that adds <code>tag</code>
@@ -87,7 +88,7 @@ public class RelTagIndexingFilter implem
     ByteBuffer bb = page.getFromMetadata(new Utf8(RelTagParser.REL_TAG));
 		
     if (bb != null) {
-      String[] tags = new String(bb.array()).split("\t");
+      String[] tags = Bytes.toString(bb).split("\t");
       for (int i = 0; i < tags.length; i++) {
 	    doc.add("tag", tags[i]);
       }

Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Thu Jun 27 17:04:42 2013
@@ -85,7 +85,7 @@ public class HtmlParser implements Parse
   private String parserImpl;
 
   /**
-   * Given a <code>byte[]</code> representing an html file of an
+   * Given a <code>ByteBuffer</code> representing an html file of an
    * <em>unknown</em> encoding,  read out 'charset' parameter in the meta tag
    * from the first <code>CHUNK_SIZE</code> bytes.
    * If there's no meta tag for Content-Type or no charset is specified,
@@ -97,12 +97,11 @@ public class HtmlParser implements Parse
    * See also http://www.w3.org/TR/REC-xml/#sec-guessing
    * <br />
    *
-   * @param content <code>byte[]</code> representation of an html file
+   * @param content <code>ByteBuffer</code> representation of an html file
    */
 
-  private static String sniffCharacterEncoding(byte[] content) {
-    int length = content.length < CHUNK_SIZE ?
-        content.length : CHUNK_SIZE;
+  private static String sniffCharacterEncoding(ByteBuffer content) {
+    int length = Math.min(content.remaining(), CHUNK_SIZE);
 
     // We don't care about non-ASCII parts so that it's sufficient
     // to just inflate each byte to a 16-bit value by padding.
@@ -110,8 +109,8 @@ public class HtmlParser implements Parse
     // {U+0041, U+0082, U+00B7}.
     String str = "";
     try {
-      str = new String(content, 0, length,
-          Charset.forName("ASCII").toString());
+      str = new String(content.array(), content.arrayOffset() + content.position(),
+          length, Charset.forName("ASCII").toString());
     } catch (UnsupportedEncodingException e) {
       // code should never come here, but just in case...
       return null;
@@ -157,8 +156,9 @@ public class HtmlParser implements Parse
     // parse the content
     DocumentFragment root;
     try {
-      byte[] contentInOctets = page.getContent().array();
-      InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
+      ByteBuffer contentInOctets = page.getContent();
+      InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets.array(),
+          contentInOctets.arrayOffset() + contentInOctets.position(), contentInOctets.remaining()));
 
       EncodingDetector detector = new EncodingDetector(conf);
       detector.autoDetectClues(page, true);

Modified: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Thu Jun 27 17:04:42 2013
@@ -39,6 +39,7 @@ import org.apache.nutch.parse.ParseStatu
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.storage.ParseStatus;
 import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.TableUtil;
 import org.apache.oro.text.regex.MatchResult;
@@ -163,7 +164,7 @@ public class JSParseFilter implements Pa
     if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
       return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_INVALID_FORMAT,
           "Content not JavaScript: '" + type + "'", getConf());
-    String script = new String(page.getContent().array());
+    String script = Bytes.toString(page.getContent());
     Outlink[] outlinks = getJSLinks(script, "", url);
     if (outlinks == null) outlinks = new Outlink[0];
     // Title? use the first line of the script...

Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Thu Jun 27 17:04:42 2013
@@ -51,7 +51,6 @@ import org.apache.nutch.util.NutchConfig
 import org.apache.nutch.util.TableUtil;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MimeType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.w3c.dom.DocumentFragment;
@@ -92,7 +91,7 @@ public class TikaParser implements org.a
     // get the right parser using the mime type as a clue
     String mimeType = page.getContentType().toString();
     Parser parser = tikaConfig.getParser(mimeType);
-    byte[] raw = page.getContent().array();
+    ByteBuffer raw = page.getContent();
 
     if (parser == null) {
       String message = "Can't retrieve Tika parser for mime-type " + mimeType;
@@ -114,7 +113,8 @@ public class TikaParser implements org.a
     // to add once available in Tika
     // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
     try {
-      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
+      parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(),
+          raw.remaining()), domhandler, tikamd, context);
     } catch (Exception e) {
       LOG.error("Error parsing "+url,e);
       return ParseStatusUtils.getEmptyParse(e, getConf());

Modified: nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Thu Jun 27 17:04:42 2013
@@ -111,7 +111,7 @@ public class OPICScoringFilter implement
     ByteBuffer cashRaw = row.getFromMetadata(CASH_KEY);
     float cash = 0.0f;
     if (cashRaw != null) {
-      cash = Bytes.toFloat(cashRaw.array());
+      cash = Bytes.toFloat(cashRaw.array(), cashRaw.arrayOffset() + cashRaw.position());
     }
     row.putToMetadata(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(cash + adjust)));
   }
@@ -125,7 +125,7 @@ public class OPICScoringFilter implement
     if (cashRaw == null) {
       return;
     }
-    float cash = Bytes.toFloat(cashRaw.array());
+    float cash = Bytes.toFloat(cashRaw.array(), cashRaw.arrayOffset() + cashRaw.position());
     if (cash == 0) {
       return;
     }

Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java Thu Jun 27 17:04:42 2013
@@ -25,8 +25,8 @@ import org.apache.avro.util.Utf8;
 import org.apache.hadoop.fs.Path;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.AbstractNutchTest;
+import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.CrawlTestUtil;
-import org.apache.gora.util.ByteUtils;
 import org.junit.Before;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -113,7 +113,7 @@ public class TestInjector extends Abstra
       representation += "\tnutch.score=" + (int)page.getScore();
       ByteBuffer bb = page.getFromMetadata(new Utf8("custom.attribute"));
       if (bb != null) {
-        representation += "\tcustom.attribute=" + ByteUtils.toString(bb.array());
+        representation += "\tcustom.attribute=" + Bytes.toString(bb);
       }
       read.add(representation);
     }

Modified: nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java Thu Jun 27 17:04:42 2013
@@ -27,6 +27,7 @@ import org.apache.nutch.crawl.InjectorJo
 import org.apache.nutch.crawl.URLWebPage;
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.util.AbstractNutchTest;
+import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.CrawlTestUtil;
 import org.mortbay.jetty.Server;
 
@@ -113,7 +114,7 @@ public class TestFetcher extends Abstrac
       if (bb == null) {
         continue;
       }
-      String content = new String(bb.array());
+      String content = Bytes.toString(bb);
       if (content.indexOf("Nutch fetcher test page")!=-1) {
         handledurls.add(up.getUrl());
       }



Mime
View raw message