incubator-blur-commits mailing list archives

From amccu...@apache.org
Subject [1/2] git commit: Added more documentation.
Date Sat, 24 Aug 2013 18:15:48 GMT
Updated Branches:
  refs/heads/master 4fc43e954 -> a4840274d


Added more documentation.


Project: http://git-wip-us.apache.org/repos/asf/incubator-blur/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-blur/commit/0dcc0764
Tree: http://git-wip-us.apache.org/repos/asf/incubator-blur/tree/0dcc0764
Diff: http://git-wip-us.apache.org/repos/asf/incubator-blur/diff/0dcc0764

Branch: refs/heads/master
Commit: 0dcc0764ebada1d33a427755f0f212e8e501ebef
Parents: 4fc43e9
Author: Aaron McCurry <amccurry@gmail.com>
Authored: Sat Aug 24 14:09:25 2013 -0400
Committer: Aaron McCurry <amccurry@gmail.com>
Committed: Sat Aug 24 14:11:59 2013 -0400

----------------------------------------------------------------------
 .../blur/mapreduce/lib/CsvBlurDriver.java       | 17 ++++--
 .../main/java/org/apache/blur/shell/Main.java   |  5 +-
 distribution/src/main/scripts/bin/blur          |  2 +-
 docs/using-blur.base.html                       | 46 ++++++++++++++++-
 docs/using-blur.html                            | 54 +++++++++++++++++++-
 5 files changed, 113 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/0dcc0764/blur-mapred/src/main/java/org/apache/blur/mapreduce/lib/CsvBlurDriver.java
----------------------------------------------------------------------
diff --git a/blur-mapred/src/main/java/org/apache/blur/mapreduce/lib/CsvBlurDriver.java b/blur-mapred/src/main/java/org/apache/blur/mapreduce/lib/CsvBlurDriver.java
index 48bb902..570db31 100644
--- a/blur-mapred/src/main/java/org/apache/blur/mapreduce/lib/CsvBlurDriver.java
+++ b/blur-mapred/src/main/java/org/apache/blur/mapreduce/lib/CsvBlurDriver.java
@@ -60,8 +60,17 @@ import com.google.common.base.Splitter;
 @SuppressWarnings("static-access")
 public class CsvBlurDriver {
 
+  public static final String CSVLOADER = "csvloader";
   public static final String MAPRED_COMPRESS_MAP_OUTPUT = "mapred.compress.map.output";
   public static final String MAPRED_MAP_OUTPUT_COMPRESSION_CODEC = "mapred.map.output.compression.codec";
+  public static final int DEFAULT_WIDTH = 100;
+  public static final String HEADER = "The \"" + CSVLOADER +
+      "\" command is used to load delimited data into a Blur table.\nThe required options are \"-c\", \"-t\", \"-d\". The " +
+      "standard format for the contents of a file is: \"rowid,recordid,family,col1,col2,...\". However, there are " +
+      "several options; for example, the rowid and recordid can be generated based on the data in the record via the " +
+      "\"-A\" and \"-a\" options, and the family can be assigned based on the path via the \"-I\" option. The column " +
+      "name order can be mapped via the \"-d\" option. You can also set the input " +
+      "format to sequence files via the \"-S\" option or leave the default of text files.";
 
   enum COMPRESSION {
     SNAPPY(SnappyCodec.class), GZIP(GzipCodec.class), BZIP(BZip2Codec.class), DEFAULT(DefaultCodec.class);
@@ -258,9 +267,9 @@ public class CsvBlurDriver {
                 + "charactors like the default hadoop separator of ASCII value 1, you can
use standard "
                 + "java escaping (\\u0001)").create("s"));
     options.addOption(OptionBuilder.withArgName("path*").hasArg()
-        .withDescription("The directory to index. (hdfs://namenode/input/in1)").create("i"));
+        .withDescription("The directory to index, the family name is assumed to BE present
in the file contents. (hdfs://namenode/input/in1)").create("i"));
     options.addOption(OptionBuilder.withArgName("family path*").hasArgs()
-        .withDescription("The directory to index with family name. (family hdfs://namenode/input/in1)").create("I"));
+        .withDescription("The directory to index with a family name, the family name is assumed
to NOT be present in the file contents. (family hdfs://namenode/input/in1)").create("I"));
     options
         .addOption(OptionBuilder
             .withArgName("auto generate record ids")
@@ -319,7 +328,7 @@ public class CsvBlurDriver {
       System.err.println(e.getMessage());
       HelpFormatter formatter = new HelpFormatter();
       PrintWriter pw = new PrintWriter(System.err, true);
-      formatter.printHelp(pw, HelpFormatter.DEFAULT_WIDTH, "csvindexer", null, options, HelpFormatter.DEFAULT_LEFT_PAD,
+      formatter.printHelp(pw, DEFAULT_WIDTH, CSVLOADER, HEADER, options, HelpFormatter.DEFAULT_LEFT_PAD,
           HelpFormatter.DEFAULT_DESC_PAD, null, false);
       return null;
     }
@@ -328,7 +337,7 @@ public class CsvBlurDriver {
       System.err.println("Missing input directory, see options 'i' and 'I'.");
       HelpFormatter formatter = new HelpFormatter();
       PrintWriter pw = new PrintWriter(System.err, true);
-      formatter.printHelp(pw, HelpFormatter.DEFAULT_WIDTH, "csvindexer", null, options, HelpFormatter.DEFAULT_LEFT_PAD,
+      formatter.printHelp(pw, DEFAULT_WIDTH, CSVLOADER, HEADER, options, HelpFormatter.DEFAULT_LEFT_PAD,
           HelpFormatter.DEFAULT_DESC_PAD, null, false);
       return null;
     }
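
The patch above swaps HelpFormatter.DEFAULT_WIDTH and the null header for the new DEFAULT_WIDTH and HEADER constants. For context, a minimal, self-contained sketch of the same Commons CLI call pattern follows; the class name, the single option, and the header text are illustrative stand-ins, not the driver's full option set:

import java.io.PrintWriter;

import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;

@SuppressWarnings("static-access")
public class CsvLoaderHelpDemo {
  public static void main(String[] args) {
    Options options = new Options();
    // One illustrative option; CsvBlurDriver registers many more.
    options.addOption(OptionBuilder.withArgName("tablename").hasArg()
        .withDescription("* Blur table name.").create("t"));
    HelpFormatter formatter = new HelpFormatter();
    PrintWriter pw = new PrintWriter(System.err, true);
    // Same call shape as the patch: a custom width (100) and a descriptive
    // header in place of HelpFormatter.DEFAULT_WIDTH and a null header.
    formatter.printHelp(pw, 100, "csvloader", "Loads delimited data into a Blur table.",
        options, HelpFormatter.DEFAULT_LEFT_PAD, HelpFormatter.DEFAULT_DESC_PAD, null, false);
  }
}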

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/0dcc0764/blur-shell/src/main/java/org/apache/blur/shell/Main.java
----------------------------------------------------------------------
diff --git a/blur-shell/src/main/java/org/apache/blur/shell/Main.java b/blur-shell/src/main/java/org/apache/blur/shell/Main.java
index eb23a06..819f7ab 100644
--- a/blur-shell/src/main/java/org/apache/blur/shell/Main.java
+++ b/blur-shell/src/main/java/org/apache/blur/shell/Main.java
@@ -341,9 +341,10 @@ public class Main {
       }
 
       out.println();
-      out.println("  " + buffer("shell", bufferLength) + " - enters into the Blur interactive
shell");
+      out.println("  " + buffer("shell", bufferLength) + " - enters into the Blur interactive
shell.");
       out.println("  " + buffer("execute", bufferLength)
-          + " - executes a custom class passing all the command line args to the main method");
+          + " - executes a custom class passing all the command line args to the main method.");
+      out.println("  " + buffer("csvloader", bufferLength) + " - runs a MapReduce job to
bulk load data into a table.");
     }
 
     private int getMaxCommandLength(Set<String> keySet) {
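
The buffer(...) helper used above pads each command name to a common width so the descriptions line up; its body falls outside this diff. A hypothetical sketch of such a helper, assuming simple right-padding:

// Hypothetical sketch; the real helper lives in Main.java but is not shown in this diff.
// Right-pads a command name with spaces to a fixed width so the help text aligns.
static String buffer(String name, int width) {
  StringBuilder sb = new StringBuilder(name);
  while (sb.length() < width) {
    sb.append(' ');
  }
  return sb.toString();
}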

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/0dcc0764/distribution/src/main/scripts/bin/blur
----------------------------------------------------------------------
diff --git a/distribution/src/main/scripts/bin/blur b/distribution/src/main/scripts/bin/blur
index 8bed999..935386e 100755
--- a/distribution/src/main/scripts/bin/blur
+++ b/distribution/src/main/scripts/bin/blur
@@ -26,7 +26,7 @@ elif [ $1 = "shell" ]; then
   "$JAVA_HOME"/bin/java -Dblur.name=$PROC_NAME -Djava.library.path=$JAVA_LIBRARY_PATH $BLUR_COMMAND
-Dblur.logs.dir=$BLUR_LOGS -Dblur.log.file=blur-$USER-$PROC_NAME -Dlog4j.configuration=file://$BLUR_HOME/conf/log4j-command.xml
-cp $BLUR_CLASSPATH org.apache.blur.shell.Main ${@:2}
 elif [ $1 = "execute" ]; then
   "$JAVA_HOME"/bin/java -Dblur.name=$PROC_NAME -Djava.library.path=$JAVA_LIBRARY_PATH $BLUR_COMMAND
-Dblur.logs.dir=$BLUR_LOGS -Dblur.log.file=blur-$USER-$PROC_NAME -Dlog4j.configuration=file://$BLUR_HOME/conf/log4j-command.xml
-cp $BLUR_CLASSPATH ${@:2}
-elif [ $1 = "csv" ]; then
+elif [ $1 = "csvloader" ]; then
   for f in $BLUR_HOME/lib/*.jar; do
     BLUR_BASE_CLASSPATH=${BLUR_BASE_CLASSPATH}:$f;
   done

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/0dcc0764/docs/using-blur.base.html
----------------------------------------------------------------------
diff --git a/docs/using-blur.base.html b/docs/using-blur.base.html
index 3070d11..693daaa 100644
--- a/docs/using-blur.base.html
+++ b/docs/using-blur.base.html
@@ -249,7 +249,7 @@ CsvBlurMapper.addColumns(job, "cf1", "col");
 
 BlurOutputFormat.setupJob(job, tableDescriptor);
 BlurOutputFormat.setIndexLocally(job, true);
-BlurOutputFormat.setOptimizeInFlight(job, false);
+BlurOutputFormat.setOptimizeInFlight(job, true);
 
 job.waitForCompletion(true);</code></pre>
             <h3>Options</h3>
@@ -277,7 +277,49 @@ job.waitForCompletion(true);</code></pre>
               <h1 id="csv-loader">CSV Loader</h1>
             </div>
 <p>
-TODO	
+The CSV Loader program can be invoked by running:<pre><code class="bash">$BLUR_HOME/bin/blur csvloader</code></pre>
+<div class="bs-callout bs-callout-warning"><h4>Caution</h4>The machine that executes this command needs to have Hadoop installed and configured locally; otherwise the script will not work correctly.</div>
+<pre><code class="bash">usage: csvloader
+The "csvloader" command is used to load delimited data into a Blur table.
+The required options are "-c", "-t", "-d". The standard format for the contents of a file
+is: "rowid,recordid,family,col1,col2,...". However, there are several options; for example, the
+rowid and recordid can be generated based on the data in the record via the "-A" and "-a" options,
+and the family can be assigned based on the path via the "-I" option. The column name order can be
+mapped via the "-d" option. You can also set the input format to sequence files via the "-S" option
+or leave the default of text files.
+ -A                     No Row Ids - Automatically generate row ids for each record based on an MD5
+                        hash of the data within the record.
+ -a                     No Record Ids - Automatically generate record ids for each record based on
+                        an MD5 hash of the data within the record.
+ -b &lt;size&gt;              The maximum number of Lucene documents to buffer in the reducer for a single
+                        row before spilling over to disk. (default 1000)
+ -c &lt;controller*&gt;       * Thrift controller connection string. (host1:40010 host2:40010 ...)
+ -C &lt;minimum maximum&gt;   Enables a combine file input to help deal with many small files as the
+                        input. Provide the minimum and maximum size per mapper. For a minimum of
+                        1GB and a maximum of 2.5GB: (1000000000 2500000000)
+ -d &lt;family column*&gt;    * Define the mapping of fields in the CSV file to column names. (family col1
+                        col2 col3 ...)
+ -I &lt;family path*&gt;      The directory to index with a family name; the family name is assumed to NOT
+                        be present in the file contents. (family hdfs://namenode/input/in1)
+ -i &lt;path*&gt;             The directory to index; the family name is assumed to BE present in the file
+                        contents. (hdfs://namenode/input/in1)
+ -l                     Disable the use of local storage on the server that is running the reduce
+                        task and the copy to the Blur table once complete. (enabled by default)
+ -o                     Disable optimizing the indexes during the copy; this has very little
+                        overhead. (enabled by default)
+ -p &lt;codec&gt;             Sets the compression codec for the map output compression setting.
+                        (SNAPPY, GZIP, BZIP, DEFAULT, or a class name)
+ -r &lt;multiplier&gt;        The reducer multiplier allows for an increase in the number of reducers per
+                        shard in the given table. For example, if the table has 128 shards and the
+                        reducer multiplier is 4, the total number of reducers will be 512, 4 reducers
+                        per shard. (default 1)
+ -s &lt;delimiter&gt;         The file delimiter to be used. (default value ',') NOTE: For special
+                        characters like the default Hadoop separator of ASCII value 1, you can use
+                        standard Java escaping (\u0001)
+ -S                     The input files are sequence files.
+ -t &lt;tablename&gt;         * Blur table name.</code></pre>
+
 </p>
           </section>
           <section>
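
The new docs section documents the default line layout "rowid,recordid,family,col1,col2,...". A minimal sketch of how those positions break down follows; the class name and all values in it are illustrative assumptions, not part of the commit:

import java.util.Arrays;

// Illustrative only: walks through the documented default CSV layout that
// csvloader expects when the -A, -a, and -I options are not used.
public class CsvLineLayoutDemo {
  public static void main(String[] args) {
    String line = "row1,rec1,cf1,value1,value2";
    String[] parts = line.split(",");
    String rowId = parts[0];    // Blur row id (or generated with -A)
    String recordId = parts[1]; // record id within the row (or generated with -a)
    String family = parts[2];   // column family (or taken from the path with -I)
    // The remaining fields map, in order, to the column names declared with -d.
    String[] columns = Arrays.copyOfRange(parts, 3, parts.length);
    System.out.println(rowId + " / " + recordId + " / " + family + " / " + Arrays.toString(columns));
  }
}

A typical invocation would then combine the required flags, for example: blur csvloader -c controller1:40010 -t table1 -d cf1 col1 col2 -i hdfs://namenode/input/in1 (hypothetical controller, table, and path names).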

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/0dcc0764/docs/using-blur.html
----------------------------------------------------------------------
diff --git a/docs/using-blur.html b/docs/using-blur.html
index 567fc6b..192f2c0 100644
--- a/docs/using-blur.html
+++ b/docs/using-blur.html
@@ -106,7 +106,7 @@ limitations under the License.
 </ul>
 </li>
 <li><a href="#map-reduce">Map Reduce</a></li>
-
+<li><a href="#csv-loader">CSV Loader</a></li>
 <li><a href="#jdbc">JDBC</a></li>
 </ul>
 </div>
@@ -381,7 +381,7 @@ CsvBlurMapper.addColumns(job, "cf1", "col");
 
 BlurOutputFormat.setupJob(job, tableDescriptor);
 BlurOutputFormat.setIndexLocally(job, true);
-BlurOutputFormat.setOptimizeInFlight(job, false);
+BlurOutputFormat.setOptimizeInFlight(job, true);
 
 job.waitForCompletion(true);</code></pre>
 <h3>Options</h3>
@@ -406,6 +406,56 @@ BlurOutputFormat.setReducerMultiplier(Job,int)
 </section>
 <section>
 <div class="page-header">
+<h1 id="csv-loader">CSV Loader</h1>
+</div>
+<p>
+The CSV Loader program can be invoked by running:<pre><code class="bash">$BLUR_HOME/bin/blur csvloader</code></pre>
+<div class="bs-callout bs-callout-warning"><h4>Caution</h4>The machine that executes this command needs to have Hadoop installed and configured locally; otherwise the script will not work correctly.</div>
+<pre><code class="bash">usage: csvloader
+The "csvloader" command is used to load delimited data into a Blur table.
+The required options are "-c", "-t", "-d". The standard format for the contents of a file
+is: "rowid,recordid,family,col1,col2,...". However, there are several options; for example, the
+rowid and recordid can be generated based on the data in the record via the "-A" and "-a" options,
+and the family can be assigned based on the path via the "-I" option. The column name order can be
+mapped via the "-d" option. You can also set the input format to sequence files via the "-S" option
+or leave the default of text files.
+ -A                     No Row Ids - Automatically generate row ids for each record based on an MD5
+                        hash of the data within the record.
+ -a                     No Record Ids - Automatically generate record ids for each record based on
+                        an MD5 hash of the data within the record.
+ -b &lt;size&gt;              The maximum number of Lucene documents to buffer in the reducer for a single
+                        row before spilling over to disk. (default 1000)
+ -c &lt;controller*&gt;       * Thrift controller connection string. (host1:40010 host2:40010 ...)
+ -C &lt;minimum maximum&gt;   Enables a combine file input to help deal with many small files as the
+                        input. Provide the minimum and maximum size per mapper. For a minimum of
+                        1GB and a maximum of 2.5GB: (1000000000 2500000000)
+ -d &lt;family column*&gt;    * Define the mapping of fields in the CSV file to column names. (family col1
+                        col2 col3 ...)
+ -I &lt;family path*&gt;      The directory to index with a family name; the family name is assumed to NOT
+                        be present in the file contents. (family hdfs://namenode/input/in1)
+ -i &lt;path*&gt;             The directory to index; the family name is assumed to BE present in the file
+                        contents. (hdfs://namenode/input/in1)
+ -l                     Disable the use of local storage on the server that is running the reduce
+                        task and the copy to the Blur table once complete. (enabled by default)
+ -o                     Disable optimizing the indexes during the copy; this has very little
+                        overhead. (enabled by default)
+ -p &lt;codec&gt;             Sets the compression codec for the map output compression setting.
+                        (SNAPPY, GZIP, BZIP, DEFAULT, or a class name)
+ -r &lt;multiplier&gt;        The reducer multiplier allows for an increase in the number of reducers per
+                        shard in the given table. For example, if the table has 128 shards and the
+                        reducer multiplier is 4, the total number of reducers will be 512, 4 reducers
+                        per shard. (default 1)
+ -s &lt;delimiter&gt;         The file delimiter to be used. (default value ',') NOTE: For special
+                        characters like the default Hadoop separator of ASCII value 1, you can use
+                        standard Java escaping (\u0001)
+ -S                     The input files are sequence files.
+ -t &lt;tablename&gt;         * Blur table name.</code></pre>
+
+</p>
+</section>
+<section>
+<div class="page-header">
 <h1 id="jdbc">JDBC</h1>
 </div>
 <p>TODO</p>

