chukwa-commits mailing list archives

From asrab...@apache.org
Subject svn commit: r796033 - in /hadoop/chukwa/trunk: bin/ src/docs/src/documentation/content/xdocs/ src/java/org/apache/hadoop/chukwa/ src/java/org/apache/hadoop/chukwa/util/ src/test/org/apache/hadoop/chukwa/extraction/archive/ src/test/org/apache/hadoop/ch...
Date Mon, 20 Jul 2009 21:51:33 GMT
Author: asrabkin
Date: Mon Jul 20 21:51:33 2009
New Revision: 796033

URL: http://svn.apache.org/viewvc?rev=796033&view=rev
Log:
CHUKWA-356. More powerful file extractor.

Added:
    hadoop/chukwa/trunk/bin/dump.sh
    hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpChunks.java
    hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/util/TestDumpChunks.java
Removed:
    hadoop/chukwa/trunk/bin/dumpDataType.sh
Modified:
    hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/index.xml
    hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/programming.xml
    hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/Chunk.java
    hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/ChunkImpl.java
    hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpDataType.java
    hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/archive/TestArchive.java
    hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/TestDemux.java
    hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/processor/mapper/TestHadoopLogProcessor.java

Added: hadoop/chukwa/trunk/bin/dump.sh
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/bin/dump.sh?rev=796033&view=auto
==============================================================================
--- hadoop/chukwa/trunk/bin/dump.sh (added)
+++ hadoop/chukwa/trunk/bin/dump.sh Mon Jul 20 21:51:33 2009
@@ -0,0 +1,24 @@
+#!/bin/sh
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+pid=$$
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/chukwa-config.sh
+
+exec ${JAVA_HOME}/bin/java -Xms32M -Xmx64M -DAPP=agent -Dlog4j.configuration=chukwa-log4j.properties -DCHUKWA_HOME=${CHUKWA_HOME} -DCHUKWA_CONF_DIR=${CHUKWA_CONF_DIR} -DCHUKWA_LOG_DIR=${CHUKWA_LOG_DIR} -classpath ${CHUKWA_CONF_DIR}:${CLASSPATH}:${CHUKWA_AGENT}:${CHUKWA_CORE}:${HADOOP_JAR}:${COMMON} org.apache.hadoop.chukwa.util.DumpChunks $@

Modified: hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/index.xml
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/index.xml?rev=796033&r1=796032&r2=796033&view=diff
==============================================================================
--- hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/index.xml (original)
+++ hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/index.xml Mon Jul 20 21:51:33 2009
@@ -24,16 +24,22 @@
   
   <body>
       <p>
-        The Chukwa Documentation provides the information you need to get started using Chukwa.
+        The Chukwa Documentation provides the information you need to get 
+        started using Chukwa.
       </p>
       <p>
-        If you're trying to set up a Chukwa cluster from scratch, you should read the <a href="admin.html"> Chukwa Administration Guide</a> which shows you how to setup and deploy Chukwa. 
+        If you're trying to set up a Chukwa cluster from scratch, you should 
+        read the <a href="admin.html"> Chukwa Administration Guide</a> which
+        shows you how to set up and deploy Chukwa. 
       </p>
-     <p> If you want to configure the Chukwa agent process, to control what's collected, you should read the <a href="agent.html">Agent Guide</a>.
+     <p> If you want to configure the Chukwa agent process, to control what's 
+     collected, you should read the <a href="agent.html">Agent Guide</a>.
      </p>
-     <p>And if you want to use collected data, read the <a href="programming.html">programming guide</a></p>
+     <p>And if you want to use collected data, read the 
+     <a href="programming.html">User and Programming Guide</a></p>
       <p>
-		If you have more questions, you can ask on the <a href="ext:lists">Chukwa Core Mailing Lists</a>.
+		If you have more questions, you can ask on the 
+		<a href="ext:lists">Chukwa mailing lists</a>.
       </p>
   </body>
 </document>

Modified: hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/programming.xml
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/programming.xml?rev=796033&r1=796032&r2=796033&view=diff
==============================================================================
--- hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/programming.xml (original)
+++ hadoop/chukwa/trunk/src/docs/src/documentation/content/xdocs/programming.xml Mon Jul 20 21:51:33 2009
@@ -15,41 +15,123 @@
   See the License for the specific language governing permissions and
   limitations under the License.
 -->
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" 
+"http://forrest.apache.org/dtd/document-v20.dtd">
 
 <document>
   <header>
-    <title>Chukwa Programming Guide</title>
+    <title>Chukwa User and Programming Guide</title>
   </header>
   <body>
 
-<p>This document discusses the Chukwa archive file formats, and the layout of the Chukwa storage directories.</p>
+<p>
+At the core of Chukwa is a flexible system for collecting and processing
+monitoring data, particularly log files. This document describes how to use the
+collected data.  (To control what gets collected, and for an overview of the
+Chukwa data model, see the <a href="agent.html">Agent Guide</a>.)  
+</p>
+
+<p>
+In particular, this document discusses the Chukwa archive file formats, and 
+the layout of the Chukwa storage directories.</p>
 
 <section>
 <title>Sink File Format</title>
-<p>As data is collected, Chukwa dumps it into <em>sink files</em> in HDFS. By default, these are located in <code>/chukwa/logs</code>.  If the file name ends in .chukwa, that means the file is still being written to. Every few minutes, the collector will close the file, and rename the file to '*.done'.  This marks the file as available for processing.</p>
-
-<p>Each sink file is a Hadoop sequence file, containing a succession of key-value pairs, and periodic synch markers to facilitate MapReduce access. They key type is <code>ChukwaArchiveKey</code>; the value type is <code>ChunkImpl</code>. See the Chukwa Javadoc for details about these classes.</p>
+<p>
+As data is collected, Chukwa dumps it into <em>sink files</em> in HDFS. By
+default, these are located in <code>/chukwa/logs</code>.  If the file name ends
+in .chukwa, that means the file is still being written to. Every few minutes, 
+the collector will close the file, and rename the file to '*.done'.  This 
+marks the file as available for processing.</p>
+
+<p>
+Each sink file is a Hadoop sequence file, containing a succession of 
+key-value pairs, and periodic synch markers to facilitate MapReduce access. 
+The key type is <code>ChukwaArchiveKey</code>; the value type is 
+<code>ChunkImpl</code>. See the Chukwa Javadoc for details about these classes.
+</p>
 
 <p>Data in the sink may include duplicate and omitted chunks.</p>
 </section>
 
 <section>
 <title>Demux and Archiving</title>
-<p>It's possible to write MapReduce jobs that directly examine the data sink, but it's not extremely convenient. Data is not organized in a useful way, so jobs will likely discard most of their input. Data quality is imperfect, since duplicates and omissions may exist.  And MapReduce and HDFS are optimized to deal with a modest number of large files, not many small ones.</p> 
-
-<p> Chukwa therefore supplies several MapReduce jobs for organizing collected data and putting it into a more useful form; these jobs are typically run regularly from cron.  Knowing how to use Chukwa-collected data requires understanding how these jobs lay out storage. For now, this document only discusses one such job: the Simple Archiver. </p>
+<p>It's possible to write MapReduce jobs that directly examine the data sink, 
+but it's not extremely convenient. Data is not organized in a useful way, so 
+jobs will likely discard most of their input. Data quality is imperfect, since 
+duplicates and omissions may exist.  And MapReduce and HDFS are optimized to 
+deal with a modest number of large files, not many small ones.</p> 
+
+<p> Chukwa therefore supplies several MapReduce jobs for organizing collected 
+data and putting it into a more useful form; these jobs are typically run 
+regularly from cron.  Knowing how to use Chukwa-collected data requires 
+understanding how these jobs lay out storage. For now, this document only 
+discusses one such job: the Simple Archiver. </p>
 </section>
 
 <section><title>Simple Archiver</title>
-<p>The simple archiver is designed to consolidate a large number of data sink files into a small number of archive files, with the contents grouped in a useful way.  Archive files, like raw sink files, are in Hadoop sequence file format. Unlike the data sink, however, duplicates have been removed.  (Future versions of the Simple Archiver will indicate the presence of gaps.)</p>
+<p>The simple archiver is designed to consolidate a large number of data sink 
+files into a small number of archive files, with the contents grouped in a 
+useful way.  Archive files, like raw sink files, are in Hadoop sequence file 
+format. Unlike the data sink, however, duplicates have been removed.  (Future 
+versions of the Simple Archiver will indicate the presence of gaps.)</p>
+
+<p>The simple archiver moves every <code>.done</code> file out of the sink, and 
+then runs a MapReduce job to group the data. Output Chunks will be placed into 
+files with names of the form 
+<code>/chukwa/archive/clustername/Datatype_date.arc</code>.  
+Date corresponds to when the data was collected; Datatype is the datatype of 
+each Chunk. 
+</p>
+
+<p>If archived data corresponds to an existing filename, a new file will be 
+created with a disambiguating suffix.</p>
+
+<!-- The Simple Archiver is a Java class, stored in <code>chukwa-core-*.jar</code>
+-->
 
-<p>The simple archiver moves every <code>.done</code> file out of the sink, and then runs a MapReduce job to group the data. Output Chunks will be placed into files with names of the form <code>/chukwa/archive/clustername/Datatype_date.arc</code>.  Date corresponds to when the data was collected; Datatype is the datatype of each Chunk.

+</section>
+
+<section>
+<title>Reading data from the sink or the archive</title>
+<p>
+It very often happens that you want to retrieve one or more files that have been
+collected with Chukwa. If the total volume of data to be recovered is not too
+great, you can use <code>dump.sh</code>, a command-line tool that does the job.
+</p>
+
+<p>
+The <code>dump</code> tool takes a search pattern as its first argument, followed by
+a list of files or file-globs.  It will then print the contents of every data
+stream in those files that matches the pattern. (A data stream is a sequence of
+chunks with the same host, source, and datatype.)  Data is printed in order,
+with duplicates removed.  No metadata is printed.  Streams are separated by 
+a row of dashes.  
+</p>
+
+<p>For example, the following command will dump all data from every file that
+matches the glob pattern.  Note the use of single quotes to pass glob patterns
+through to the application, preventing the shell from expanding them.</p>
+<source>
+bin/dump.sh 'datatype=.*' 'hdfs://host:9000/chukwa/archive/*.arc'
+</source>
+
+<p>
+The patterns used by <code>dump.sh</code> are based on normal regular 
+expressions. They are of the form <code>field1=regex&#38;field2=regex</code>.
+That is, they are a sequence of rules, separated by ampersand signs. Each rule
+is of the form <code>metadatafield=regex</code>, where 
+<code>metadatafield</code> is one of the Chukwa metadata fields, and 
+<code>regex</code> is a regular expression.  The valid metadata field names are:
+<code>datatype</code>, <code>host</code>, <code>cluster</code>, and 
+<code>name</code>.  
 </p>
 
-<p>If archived data corresponds to an existing filename, a new file will be created with a disambiguating suffix.</p>
+<p>A stream matches the search pattern only if every rule matches. So to 
+retrieve HadoopLog data from cluster foo, you might search for 
+<code>cluster=foo&#38;datatype=HadoopLog</code>.
+</p>
 
-<!-- The Simple Archiver is a Java class, stored in <code>chukwa-core-*.jar</code>
-->
 
 </section>
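
The new guide text above describes the sink and archive layout; when dump.sh is not enough, the same files can be read with a few lines of Hadoop code. The following sketch is illustrative only and not part of this patch: the class name PrintSinkFile and its command-line argument are made up, but the reader loop mirrors the one used in DumpChunks.java below.

import java.io.IOException;
import org.apache.hadoop.chukwa.ChukwaArchiveKey;
import org.apache.hadoop.chukwa.ChunkImpl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

// Illustrative sketch, not part of this commit: print a summary of every chunk
// in a single sink (.done) or archive (.arc) file named on the command line.
public class PrintSinkFile {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path p = new Path(args[0]);                 // e.g. a *.done or *.arc file (hypothetical argument)
    FileSystem fs = p.getFileSystem(conf);
    SequenceFile.Reader r = new SequenceFile.Reader(fs, p, conf);
    ChukwaArchiveKey key = new ChukwaArchiveKey();
    ChunkImpl chunk = ChunkImpl.getBlankChunk();
    while (r.next(key, chunk)) {                // key/value types as documented above
      System.out.println(chunk.getSource() + " " + chunk.getStreamName()
          + " [" + chunk.getDataType() + "] " + chunk.getData().length + " bytes");
    }
    r.close();
  }
}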
 

Modified: hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/Chunk.java
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/Chunk.java?rev=796033&r1=796032&r2=796033&view=diff
==============================================================================
--- hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/Chunk.java (original)
+++ hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/Chunk.java Mon Jul 20 21:51:33 2009
@@ -85,6 +85,8 @@
    * 
    *         We pick this convention so that subtracting sequence IDs yields
    *         length.
+   *         
+   *         Furthermore, seqID - length = first byte pos.
    */
   public long getSeqID();
 

Modified: hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/ChunkImpl.java
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/ChunkImpl.java?rev=796033&r1=796032&r2=796033&view=diff
==============================================================================
--- hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/ChunkImpl.java (original)
+++ hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/ChunkImpl.java Mon Jul 20 21:51:33 2009
@@ -157,7 +157,6 @@
   }
 
   public int[] getRecordOffsets() {
-
     if (recordEndOffsets == null)
       recordEndOffsets = new int[] { data.length - 1 };
     return recordEndOffsets;
@@ -232,12 +231,8 @@
     for (int i = 0; i < recordEndOffsets.length; ++i)
       out.writeInt(recordEndOffsets[i]);
 
-    out.write(data, 0, recordEndOffsets[recordEndOffsets.length - 1] + 1); // byte
-                                                                           // at
-                                                                           // last
-                                                                           // offset
-                                                                           // is
-                                                                           // valid
+    out.write(data, 0, recordEndOffsets[recordEndOffsets.length - 1] + 1); 
+    // byte at last offset is valid
   }
 
   public static ChunkImpl read(DataInput in) throws IOException {
@@ -257,8 +252,8 @@
    * @see org.apache.hadoop.chukwa.Chunk#getSerializedSizeEstimate()
    */
   public int getSerializedSizeEstimate() {
-    int size = 2 * (source.length() + application.length() + dataType.length() + debuggingInfo
-        .length()); // length of strings (pessimistic)
+    int size = 2 * (source.length() + application.length() + dataType.length() 
+        + debuggingInfo.length()); // length of strings (pessimistic)
     size += data.length + 4;
     if (recordEndOffsets == null)
       size += 8;
@@ -274,5 +269,9 @@
     for (Integer offset : carriageReturns)
       recordEndOffsets[i++] = offset;
   }
+  
+  public int getLength() {
+    return data.length;
+  }
 
 }

Added: hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpChunks.java
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpChunks.java?rev=796033&view=auto
==============================================================================
--- hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpChunks.java (added)
+++ hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpChunks.java Mon Jul 20 21:51:33 2009
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.chukwa.util;
+
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.regex.*;
+import java.util.*;
+import java.io.*;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.hadoop.chukwa.ChukwaArchiveKey;
+import org.apache.hadoop.chukwa.ChunkImpl;
+import org.apache.hadoop.chukwa.conf.ChukwaConfiguration;
+import org.apache.hadoop.chukwa.extraction.engine.RecordUtil;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.conf.Configuration;
+
+public class DumpChunks {
+
+  static class SearchRule {
+    Pattern p;
+    String targ;
+    
+    SearchRule(Pattern p, String t) {
+      this.p = p;
+      this.targ = t;
+    }
+    
+    boolean matches(ChunkImpl chunk) {
+      if(targ.equals("datatype")) {
+        return p.matcher(chunk.getDataType()).matches();
+      } else if(targ.equals("name")) {
+        return p.matcher(chunk.getStreamName()).matches();
+      } else if(targ.equals("host")) {
+        return p.matcher(chunk.getSource()).matches();
+      } else if(targ.equals("cluster")) {
+        String cluster = RecordUtil.getClusterName(chunk);
+        return p.matcher(cluster).matches();
+      }
+      else { 
+        assert false: "unknown target: " +targ;
+        return false;
+      }
+    }
+    
+  }
+  
+  static final String[] SEARCH_TARGS = {"datatype", "name", "host", "cluster"};
+
+    static final String SEPARATOR="&";
+  /**
+   * Tries to find chunks matching a given pattern.
+   * Takes as input a set of &-delimited patterns, followed
+   * by a list of file names.
+   * 
+   * E.g.:  Dump datatype=Iostat&name=/my/log/.* *.done
+   */
+  public static void main(String[] args) throws IOException, URISyntaxException {
+    
+    if(args.length < 2) {
+      System.out.println("usage: Dump pattern1&pattern2&pattern3... file1 file2 file3...");
+      System.exit(-1);
+    }
+    System.err.println("Patterns:" + args[0]);
+    for(int i=1; i < args.length; ++i)
+        System.err.println("FileGlob: " + args[i]);
+
+    ChukwaConfiguration conf = new ChukwaConfiguration();
+    FileSystem fs;
+    if(args[1].contains("://")) {
+      fs = FileSystem.get(new URI(args[1]), conf);
+    } else {
+      String fsName = conf.get("writer.hdfs.filesystem");
+      if(fsName == null)
+        fs = FileSystem.getLocal(conf);
+      else
+        fs = FileSystem.get(conf);
+    }
+    System.err.println("filesystem is " + fs.getUri());
+
+    dump(args, conf, fs, System.out);
+  }
+
+  static void dump(String[] args, Configuration conf,
+      FileSystem fs, PrintStream out) throws IOException {
+    List<SearchRule> patterns = buildPatterns(args[0]);
+    ArrayList<Path> filesToSearch = new ArrayList<Path>();
+
+    Map<String, SortedMap<Long, ChunkImpl> > matchCatalog = new HashMap<String, SortedMap<Long, ChunkImpl> >();
+    
+    for(int i=1; i < args.length; ++i){
+      Path[] globbedPaths = FileUtil.stat2Paths(fs.globStatus(new Path(args[i])));
+      for(Path p: globbedPaths)
+        filesToSearch.add(p);
+    }
+    
+    System.err.println("expands to " + filesToSearch.size() + " actual files");
+
+    try {
+      for(Path p: filesToSearch) {
+      
+        SequenceFile.Reader r = new SequenceFile.Reader(fs, p, conf);
+  
+        ChukwaArchiveKey key = new ChukwaArchiveKey();
+        ChunkImpl chunk = ChunkImpl.getBlankChunk();
+        while (r.next(key, chunk)) {
+          if(matchesPattern(patterns, chunk)) {
+            updateMatchCatalog(matchCatalog, key.getStreamName(), chunk);
+            chunk = ChunkImpl.getBlankChunk();
+          }
+        }
+      }
+      
+      for(SortedMap<Long, ChunkImpl> stream: matchCatalog.values()) {
+        printNoDups(stream, out);
+        out.println("\n--------------------");
+      }
+      
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+  
+  private static void printNoDups(SortedMap<Long, ChunkImpl> stream, OutputStream out) throws IOException {
+    long nextToPrint = 0;
+
+   System.err.println("---- map starts at "+ stream.firstKey());
+    for(Map.Entry<Long, ChunkImpl> e: stream.entrySet()) {
+      if(e.getKey() >= nextToPrint) {
+        System.err.println("---- printing bytes starting at " + e.getKey());
+        out.write(e.getValue().getData());
+        nextToPrint = e.getValue().getSeqID();
+      } else if(e.getValue().getSeqID() < nextToPrint) {
+        continue; //data already printed
+      } else {
+        //tricky case: chunk overlaps with already-printed data, but not completely
+        ChunkImpl c = e.getValue();
+        long chunkStartPos = e.getKey();
+        int numToPrint = (int) (c.getSeqID() - nextToPrint);
+        int printStartOffset = (int) ( nextToPrint -  chunkStartPos);
+        out.write(c.getData(), printStartOffset, numToPrint);
+        nextToPrint = c.getSeqID();
+      }
+    }
+    
+  }
+
+  private static void updateMatchCatalog(
+      Map<String, SortedMap<Long, ChunkImpl>> matchCatalog, String streamName,
+      ChunkImpl chunk) {
+
+    SortedMap<Long, ChunkImpl> chunksInStream = matchCatalog.get(streamName);
+    if(chunksInStream == null ) {
+      chunksInStream = new TreeMap<Long, ChunkImpl>();
+      matchCatalog.put(streamName, chunksInStream);
+    }
+    
+    long startPos = chunk.getSeqID() - chunk.getLength();
+    
+    ChunkImpl prevMatch = chunksInStream.get(startPos);
+    if(prevMatch == null)
+      chunksInStream.put(startPos, chunk);
+    else { //pick longest
+      if(chunk.getLength() > prevMatch.getLength())
+        chunksInStream.put (startPos, chunk);
+    }
+    
+  }
+
+  static List<SearchRule> buildPatterns(String listOfPatterns) throws
+  PatternSyntaxException{
+    List<SearchRule> compiledPatterns = new ArrayList<SearchRule>();
+    //FIXME: could escape these
+    String[] patterns = listOfPatterns.split(SEPARATOR);
+    for(String p: patterns) {
+      int equalsPos = p.indexOf('=');
+      
+      if(equalsPos < 0 || equalsPos > (p.length() -2)) {
+        throw new PatternSyntaxException(
+            "pattern must be of form targ=pattern", p, -1);
+      }
+      
+      String targ = p.substring(0, equalsPos);
+      if(!ArrayUtils.contains(SEARCH_TARGS, targ)) {
+        throw new PatternSyntaxException(
+            "pattern doesn't start with recognized search target", p, -1);
+      }
+      
+      Pattern pat = Pattern.compile(p.substring(equalsPos+1));
+      compiledPatterns.add(new SearchRule(pat, targ));
+    }
+    
+    return compiledPatterns;
+  }
+
+  static boolean matchesPattern(List<SearchRule> matchers, ChunkImpl chunk) {
+    for(SearchRule r: matchers) {
+      if(!r.matches(chunk))
+        return false;
+    }
+    return true;
+  }
+
+}

Modified: hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpDataType.java
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpDataType.java?rev=796033&r1=796032&r2=796033&view=diff
==============================================================================
--- hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpDataType.java (original)
+++ hadoop/chukwa/trunk/src/java/org/apache/hadoop/chukwa/util/DumpDataType.java Mon Jul 20 21:51:33 2009
@@ -1,50 +0,0 @@
-package org.apache.hadoop.chukwa.util;
-
-
-import java.io.IOException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import org.apache.hadoop.chukwa.ChukwaArchiveKey;
-import org.apache.hadoop.chukwa.ChunkImpl;
-import org.apache.hadoop.chukwa.conf.ChukwaConfiguration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-
-public class DumpDataType {
-
-  /**
-   * @param args
-   * @throws URISyntaxException
-   * @throws IOException
-   */
-  public static void main(String[] args) throws IOException, URISyntaxException {
-    System.err.println("Input file:" + args[0]);
-    System.err.println("DataType:" + args[1]);
-    System.err.println("Source:" + args[2]);
-
-    ChukwaConfiguration conf = new ChukwaConfiguration();
-    String fsName = conf.get("writer.hdfs.filesystem");
-    FileSystem fs = FileSystem.get(new URI(fsName), conf);
-
-    SequenceFile.Reader r = new SequenceFile.Reader(fs, new Path(args[0]), conf);
-
-    ChukwaArchiveKey key = new ChukwaArchiveKey();
-    ChunkImpl chunk = ChunkImpl.getBlankChunk();
-    try {
-      while (r.next(key, chunk)) {
-        if (args[1].equalsIgnoreCase(chunk.getDataType())) {
-          if (args[2].equalsIgnoreCase("ALL")
-              || args[2].equalsIgnoreCase(chunk.getSource())) {
-            System.out.print(new String(chunk.getData()));
-          }
-        }
-
-      }
-    } catch (Exception e) {
-      e.printStackTrace();
-    }
-
-  }
-
-}

Modified: hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/archive/TestArchive.java
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/archive/TestArchive.java?rev=796033&r1=796032&r2=796033&view=diff
==============================================================================
--- hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/archive/TestArchive.java (original)
+++ hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/archive/TestArchive.java Mon Jul 20 21:51:33 2009
@@ -61,7 +61,7 @@
         + r.nextInt() + "\n";
 
     ChunkImpl c = new ChunkImpl("HadoopLogProcessor", "test",
-        line.length() - 1L + lastSeqID, line.getBytes(), null);
+        line.length()  + lastSeqID, line.getBytes(), null);
     lastSeqID += line.length();
     c.addTag("cluster=\"foocluster\"");
     return c;

Modified: hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/TestDemux.java
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/TestDemux.java?rev=796033&r1=796032&r2=796033&view=diff
==============================================================================
--- hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/TestDemux.java (original)
+++ hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/TestDemux.java Mon Jul 20 21:51:33 2009
@@ -51,7 +51,7 @@
         + r.nextInt() + "\n";
 
     ChunkImpl c = new ChunkImpl("HadoopLogProcessor", "test",
-        line.length() - 1L, line.getBytes(), null);
+        line.length() , line.getBytes(), null);
     return c;
   }
 

Modified: hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/processor/mapper/TestHadoopLogProcessor.java
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/processor/mapper/TestHadoopLogProcessor.java?rev=796033&r1=796032&r2=796033&view=diff
==============================================================================
--- hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/processor/mapper/TestHadoopLogProcessor.java (original)
+++ hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/extraction/demux/processor/mapper/TestHadoopLogProcessor.java Mon Jul 20 21:51:33 2009
@@ -70,7 +70,7 @@
         + " INFO org.apache.hadoop.dfs.DataNode: Some text goes here"
         + r.nextInt() + "\n";
     ChunkImpl c = new ChunkImpl("HadoopLogProcessor", "test",
-        line.length() - 1, line.getBytes(), null);
+        line.length() , line.getBytes(), null);
 
     return c;
   }

Added: hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/util/TestDumpChunks.java
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/util/TestDumpChunks.java?rev=796033&view=auto
==============================================================================
--- hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/util/TestDumpChunks.java (added)
+++ hadoop/chukwa/trunk/src/test/org/apache/hadoop/chukwa/util/TestDumpChunks.java Mon Jul 20 21:51:33 2009
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.chukwa.util;
+
+import junit.framework.TestCase;
+import java.util.*;
+import java.io.*;
+import org.apache.hadoop.chukwa.ChukwaArchiveKey;
+import org.apache.hadoop.chukwa.ChunkImpl;
+import org.apache.hadoop.chukwa.datacollection.DataFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+
+public class TestDumpChunks extends TestCase {
+  
+  public static void writeSeqFile(Configuration conf, FileSystem fileSys, Path dest,
+      List<ChunkImpl> chunks) throws IOException {
+    FSDataOutputStream out = fileSys.create(dest);
+
+    Calendar calendar = Calendar.getInstance();
+    SequenceFile.Writer seqFileWriter = SequenceFile.createWriter(conf, out,
+        ChukwaArchiveKey.class, ChunkImpl.class,
+        SequenceFile.CompressionType.NONE, null);
+    for (ChunkImpl chunk: chunks) {
+      ChukwaArchiveKey archiveKey = new ChukwaArchiveKey();
+      
+      calendar.set(Calendar.SECOND, 0);
+      calendar.set(Calendar.MILLISECOND, 0);
+      archiveKey.setTimePartition(calendar.getTimeInMillis());
+      
+      archiveKey.setDataType(chunk.getDataType());
+      archiveKey.setStreamName(chunk.getStreamName());
+      archiveKey.setSeqId(chunk.getSeqID());
+      seqFileWriter.append(archiveKey, chunk);
+    }
+    seqFileWriter.close();
+    out.close();
+  }
+  
+  public void testBasicPatternMatching() {
+   try {
+     List<DumpChunks.SearchRule> rules = DumpChunks.buildPatterns("host=foo.*&cluster=bar&datatype=Data");
+     assertEquals(3, rules.size());
+     byte[] dat = "someText".getBytes();
+     ChunkImpl chunkNone = new ChunkImpl("badData","aname", dat.length, dat, null);
+     assertFalse(DumpChunks.matchesPattern(rules, chunkNone));
+
+     ChunkImpl chunkSome = new ChunkImpl("badData", "aname", dat.length, dat, null);
+     chunkSome.setSource("fooly");
+     chunkSome.addTag("cluster=\"bar\"");
+     assertFalse(DumpChunks.matchesPattern(rules, chunkSome));
+
+     ChunkImpl chunkAll = new ChunkImpl("Data", "aname", dat.length, dat, null);
+     chunkAll.setSource("fooly");
+     chunkAll.addTag("cluster=\"bar\"");
+
+     System.out.println("chunk is " + chunkAll);
+     assertTrue(DumpChunks.matchesPattern(rules, chunkAll));
+   } catch(Exception e) {
+     fail("exception " + e);
+   } 
+  }
+  
+  public void testFilePatternMatching() throws IOException {
+    
+    File tempDir = new File(System.getProperty("test.build.data", "/tmp"));
+
+    File tmpFile = File.createTempFile("dumpchunkTest", ".seq", tempDir);
+    tmpFile.deleteOnExit();
+    
+    Configuration conf = new Configuration();
+    Path path = new Path(tmpFile.getAbsolutePath());
+    List<ChunkImpl> chunks = new ArrayList<ChunkImpl>();
+    byte[] dat = "test".getBytes();
+    
+    ChunkImpl c = new ChunkImpl("Data", "aname", dat.length, dat, null);
+    chunks.add(c);
+    
+    dat = "ing".getBytes();
+    c = new ChunkImpl("Data", "aname", dat.length+4, dat, null);
+    chunks.add(c);
+    
+    writeSeqFile(conf, FileSystem.getLocal(conf), path, chunks);
+    
+    String[] args = new String[] {"datatype=Data",path.toString()};
+    ByteArrayOutputStream capture = new ByteArrayOutputStream();
+    DumpChunks.dump(args, conf, FileSystem.getLocal(conf), new PrintStream(capture));
+    
+    assertTrue(new String(capture.toByteArray()).startsWith("testing\n---"));
+    //now test for matches.
+    
+    
+    
+  }
+
+}


