hadoop-hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From j..@apache.org
Subject svn commit: r959481 - in /hadoop/hive/trunk: ./ docs/ docs/stylesheets/ docs/xdocs/ docs/xdocs/language_manual/ ivy/
Date Thu, 01 Jul 2010 01:20:41 GMT
Author: jvs
Date: Thu Jul  1 01:20:40 2010
New Revision: 959481

URL: http://svn.apache.org/viewvc?rev=959481&view=rev
Log:
HIVE-1135. Use Anakia for version controlled documentation
(Edward Capriolo via jvs)


Added:
    hadoop/hive/trunk/docs/site.css
    hadoop/hive/trunk/docs/stylesheets/
    hadoop/hive/trunk/docs/stylesheets/project.xml
    hadoop/hive/trunk/docs/stylesheets/site.vsl
    hadoop/hive/trunk/docs/velocity.properties
    hadoop/hive/trunk/docs/xdocs/
    hadoop/hive/trunk/docs/xdocs/index.xml
    hadoop/hive/trunk/docs/xdocs/language_manual/
    hadoop/hive/trunk/docs/xdocs/language_manual/data-manipulation-statements.xml
    hadoop/hive/trunk/docs/xdocs/language_manual/joins.xml
    hadoop/hive/trunk/docs/xdocs/language_manual/working_with_bucketed_tables.xml
Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/build-common.xml
    hadoop/hive/trunk/build.xml
    hadoop/hive/trunk/ivy.xml
    hadoop/hive/trunk/ivy/libraries.properties

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=959481&r1=959480&r2=959481&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Thu Jul  1 01:20:40 2010
@@ -259,6 +259,9 @@ Release 0.6.0 -  Unreleased
     HIVE-1359. Unit test should be shim-aware
     (Ning Zhang via jvs)
 
+    HIVE-1135. Use Anakia for version controlled documentation
+    (Edward Capriolo via jvs)
+
   OPTIMIZATIONS
 
     HIVE-1348. Move inputFileChanged() from ExecMapper to where it is needed

Modified: hadoop/hive/trunk/build-common.xml
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/build-common.xml?rev=959481&r1=959480&r2=959481&view=diff
==============================================================================
--- hadoop/hive/trunk/build-common.xml (original)
+++ hadoop/hive/trunk/build-common.xml Thu Jul  1 01:20:40 2010
@@ -179,6 +179,14 @@
       pattern="${build.dir.hadoop}/[artifact]-[revision].[ext]"/>
   </target>
 
+  <target name="ivy-docs" depends="ivy-init"
+    description="Resolve, Retrieve Ivy-managed artifacts for docs configuration">
+    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="docs"/>
+    <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
+                  pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}" conf="docs"/>
+    <ivy:cachepath pathid="docs-classpath" conf="docs"/> 		
+  </target>
+
   <available property="hadoopcore.${hadoop.version.ant-internal}.install.done"
     file="${build.dir.hadoop}/hadoop-${hadoop.version.ant-internal}.installed"/>
 

Modified: hadoop/hive/trunk/build.xml
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/build.xml?rev=959481&r1=959480&r2=959481&view=diff
==============================================================================
--- hadoop/hive/trunk/build.xml (original)
+++ hadoop/hive/trunk/build.xml Thu Jul  1 01:20:40 2010
@@ -36,6 +36,8 @@
   <property name="build.docs" value="${target.dir}/docs"/>
   <property name="build.javadoc" value="${build.docs}/api"/>
   <property name="docs.src" value="${hive.root}/docs"/>
+  <property name="anakia.docs.src" value="${docs.src}/xdocs"/>
+  <property name="anakia.docs.dest" value="${target.dir}/docs"/>
   <property name="changes.src" value="${docs.src}/changes"/>
   <property name="images.src" value="${docs.src}/images"/>
   <property name="javadoc.link.java"
@@ -388,8 +390,9 @@
   <!-- Documentation                                                      -->
   <!-- ================================================================== -->
 
-  <target name="docs">
+  <target name="docs" description="Generate documentation">
     <antcall target="changes-to-html"/>
+    <antcall target="docs-anakia"/>
   </target>
 
   <target name="changes-to-html" description="Convert CHANGES.txt into an html file">
@@ -552,4 +555,31 @@
     </available>
   </target>
 
+  <target name="docs-anakia" depends="ivy-docs">
+    <echo message="Building xdocs with anakia"/>
+    <mkdir dir="${build.dir.hive}/docs"/>
+    <taskdef name="anakia" classname="org.apache.velocity.anakia.AnakiaTask">
+      <classpath refid="common-classpath"/>
+      <classpath refid="docs-classpath"/>
+    </taskdef>
+    <anakia basedir="${anakia.docs.src}" destdir="${anakia.docs.dest}"
+      extension=".html" style="./docs/stylesheets/site.vsl"
+      projectFile="../stylesheets/project.xml"
+      excludes="**/stylesheets/**"
+      includes="**/*.xml"
+      lastModifiedCheck="false"
+      velocityPropertiesFile="${docs.src}/velocity.properties">
+    </anakia>
+    <copy todir="${anakia.docs.dest}/images" filtering="no">
+      <fileset dir="${docs.src}/images">
+        <include name="**/*.gif"/>
+        <include name="**/*.jpeg"/>
+        <include name="**/*.jpg"/>
+        <include name="**/*.png"/>
+      </fileset>
+    </copy>
+    <copy file="${docs.src}/site.css" tofile="${anakia.docs.dest}/site.css" />
+  </target>
+
+  
 </project>

Added: hadoop/hive/trunk/docs/site.css
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/docs/site.css?rev=959481&view=auto
==============================================================================
--- hadoop/hive/trunk/docs/site.css (added)
+++ hadoop/hive/trunk/docs/site.css Thu Jul  1 01:20:40 2010
@@ -0,0 +1,305 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.    
+ */
+
+
+/** defined standard tags **/
+body {
+	background-color: #ffffff;
+	color: #000000;
+}
+
+a:link, a:active, a:visited {
+    color: #525D76;
+}
+
+
+h1 { /* section heading, emitted by the #section macro in site.vsl */
+	background-color: #525D76;
+	color: #ffffff;
+	font-family: arial,helvetica,sans-serif;
+	font-size: large;
+	padding-left:2px;
+}
+
+h2 { /* subsection heading, emitted by the #subsection macro in site.vsl */
+	background-color: #828DA6;
+	color: #ffffff;
+	font-family: arial,helvetica,sans-serif;
+	font-size: medium;
+	padding-left:2px;
+}
+
+table {
+	border: none;
+	border-spacing:0px;
+	border-collapse: collapse;
+}
+
+img {
+	border: none 0px;
+}
+
+/** define layout **/
+
+/** table used to force footer to end of page **/
+table#layout {
+	width:100%;
+}
+
+table#layout td {
+	padding:0px;
+}
+
+div#container {
+	width: 95%;
+	margin: 10px;
+	margin-left: 0;
+	margin-right: auto;
+	padding: 10px;
+}
+
+div#header {
+	padding: 5px;
+	margin: 0px;
+	margin-top:5px;
+	margin-bottom:5px;
+	height:80px;
+	border-bottom: 1px solid #333333;
+}
+
+div#menu {
+	float: left;
+	width: 200px;
+	margin: 0;
+	margin-left: 0px;
+	margin-right: 5px;
+
+	/** little higher margin since it doesn't start with a header **/
+	margin-top:10px;
+	margin-bottom:0px;
+
+	padding: 5px;
+}
+
+div#body {
+	margin-right:0px;
+	margin-left: 215px;
+	margin-top:5px;
+	margin-bottom:0px;
+
+	padding: 5px;
+
+}
+
+div#footer {
+
+	clear: both;
+
+	padding-top:15px;
+	margin-top:25px;
+	border-top: 1px solid #333333;
+
+
+	text-align:center;
+	color: #525D76;
+	font-style: italic;
+	font-size: smaller;
+}
+
+div#logo1 {
+	float:left;
+	margin-left:5px;
+	margin-top:10px;
+}
+
+
+div#logo2 {
+	float:right;
+	margin-top:10px;
+}
+
+
+/** define body tag redefinitions **/
+
+
+div#body th { /* header cells of tables inside the page body */
+	background-color: #039acc;
+	color: #000000;
+	font-family: arial,helvetica,sans-serif;
+	font-size: smaller;
+	vertical-align: top;
+	text-align:left;
+	border:1px #FFFFFF solid;
+	padding: 2px;
+}
+
+div#body td { /* data cells of tables inside the page body */
+	background-color: #a0ddf0;
+	color: #000000;
+	font-family: arial,helvetica,sans-serif;
+	font-size: smaller;
+	vertical-align: top;
+	text-align:left;
+	border:1px #FFFFFF solid;
+	padding: 2px;
+}
+
+
+div#body li {
+	 margin-top:3px;
+}
+
+/** define other body styles **/
+
+div.section {
+	margin-left: 25px;
+}
+
+div.subsection {
+	margin-left: 25px;
+}
+
+div.source { /* box for code listings, produced by the #source macro in site.vsl */
+	margin-left:25px;
+	margin-top:20px;
+	margin-bottom:20px;
+	padding-left:4px;
+	padding-right:4px;
+	padding-bottom:4px;
+	padding-top:5px;
+
+	width:600px; /* a non-zero CSS length must carry a unit */
+
+	border: 1px solid #333333;
+	background-color: #EEEEEE;
+	color: #333333;
+
+	/** bug: puts a extra line before the block in IE and after the block in FireFox **/
+	white-space: pre;
+
+	font-family: Courier;
+	font-size: smaller;
+	text-align: left;
+
+	overflow:auto;
+}
+
+
+div.license {
+	margin-left:0px;
+	margin-top:20px;
+	margin-bottom:20px;
+	padding:5px;
+
+	border: 1px solid #333333;
+	background-color: #EEEEEE;
+	color: #333333;
+
+	text-align: left;
+}
+
+/** define menu styles **/
+
+div.menusection {
+	margin-bottom:10px;
+}
+
+.menuheader {
+	font-weight:bold;
+	margin-bottom:0px;
+}
+
+div.menusection ul {
+	margin-top:5px;
+
+}
+div.menusection li {
+
+}
+
+
+
+
+/** printing **/
+@page Section1
+    {
+    size:8.5in 11.0in;
+    margin:1.0in .75in 1.0in .75in;
+}
+
+@media print {
+
+	/** make sure this fits the page **/
+	div#container {
+		width:100%;
+		min-height:0px;
+	}
+
+
+	div#menu {
+		display:none;
+	}
+
+	div#header {
+		display:none;
+	}
+
+	div#body {
+		margin-left:5px;
+	}
+
+
+	div.source {
+		width:95%;
+		margin-left:0px;
+	}
+
+	/** make a bit more room on the page **/
+	div.section {
+		margin-left: 0px;
+	}
+
+	div.subsection {
+		margin-left: 0px;
+	}
+
+	h1 {
+		background-color: #FFFFFF;
+		color: #000000;
+	}
+
+	h2 {
+		background-color: #FFFFFF;
+		color: #000000;
+	}
+
+	div#body td {
+		background-color: #FFFFFF;
+		color: #000000;
+		border: #333333 1px solid;
+	}
+
+		div#body th { /* print: plain bordered header cells, kept bold */
+		background-color: #FFFFFF;
+		color: #000000;
+		border: #333333 1px solid;
+		font-weight:bold;
+	}
+
+}

Added: hadoop/hive/trunk/docs/stylesheets/project.xml
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/docs/stylesheets/project.xml?rev=959481&view=auto
==============================================================================
--- hadoop/hive/trunk/docs/stylesheets/project.xml (added)
+++ hadoop/hive/trunk/docs/stylesheets/project.xml Thu Jul  1 01:20:40 2010
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.    
+-->
+
+<project name="Apache Hadoop Hive" href="http://hadoop.apache.org/hive">
+  <title>Hadoop Hive</title>
+  <logo href="images/hive-logo.jpg">Hadoop Hive</logo>
+  <body>
+    <menu name="Hadoop hive">
+      <item name="General"  href="/index.html" />
+    </menu>
+    <menu name="Hive Language Manual">
+      <item name="Data Manipulation Statements" href="/language_manual/data-manipulation-statements.html" />
+      <item name="Joins" href="/language_manual/joins.html" />
+    </menu>
+    <menu name="Developer Guide">
+      <item name="Issue Tracking (JIRA)" href="https://issues.apache.org/jira/browse/HIVE"/>
+    </menu>
+  </body>
+</project>

Added: hadoop/hive/trunk/docs/stylesheets/site.vsl
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/docs/stylesheets/site.vsl?rev=959481&view=auto
==============================================================================
--- hadoop/hive/trunk/docs/stylesheets/site.vsl (added)
+++ hadoop/hive/trunk/docs/stylesheets/site.vsl Thu Jul  1 01:20:40 2010
@@ -0,0 +1,317 @@
+## Licensed to the Apache Software Foundation (ASF) under one
+## or more contributor license agreements.  See the NOTICE file
+## distributed with this work for additional information
+## regarding copyright ownership.  The ASF licenses this file
+## to you under the Apache License, Version 2.0 (the
+## "License"); you may not use this file except in compliance
+## with the License.  You may obtain a copy of the License at
+##
+##   http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing,
+## software distributed under the License is distributed on an
+## "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+## KIND, either express or implied.  See the License for the
+## specific language governing permissions and limitations
+## under the License.    
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.    
+-->
+
+<!-- start the processing -->
+#document()
+<!-- end the processing -->
+
+## This is where the macro's live
+
+#macro ( table $table)
+<table>
+    #foreach ( $items in $table.getChildren() )
+        #if ($items.getName().equals("tr"))
+            #tr ($items)
+        #end
+    #end
+</table>
+#end
+
+#macro ( tr $tr)
+<tr>
+    #foreach ( $items in $tr.getChildren() )
+        #if ($items.getName().equals("td"))
+            #td ($items)
+        #elseif ($items.getName().equals("th"))
+            #th ($items)
+        #end
+    #end
+</tr>
+#end
+
+#macro ( td $value)
+## #set writes to the shared Velocity context, so start every cell from the HTML
+## default of 1; otherwise a span set by an earlier cell would be reused here.
+#set ($colspan = "1")
+#set ($rowspan = "1")
+#if ($value.getAttributeValue("colspan"))#set ($colspan = $value.getAttributeValue("colspan"))#end
+#if ($value.getAttributeValue("rowspan"))#set ($rowspan = $value.getAttributeValue("rowspan"))#end
+<td colspan="$!colspan" rowspan="$!rowspan">
+	#foreach ( $items in $value.getContent() )
+		#if($items.name)
+			#display($items)
+		#else
+			$items.value
+		#end
+	#end
+</td>
+#end
+
+#macro ( th $value)
+## #set writes to the shared Velocity context, so start every cell from the HTML
+## default of 1; otherwise a span set by an earlier cell would be reused here.
+#set ($colspan = "1")
+#set ($rowspan = "1")
+#if ($value.getAttributeValue("colspan"))#set ($colspan = $value.getAttributeValue("colspan"))#end
+#if ($value.getAttributeValue("rowspan"))#set ($rowspan = $value.getAttributeValue("rowspan"))#end
+<th colspan="$!colspan" rowspan="$!rowspan">
+	#foreach ( $items in $value.getContent() )
+		#if($items.name)
+			#display($items)
+		#else
+			$items.value
+		#end
+	#end
+</th>
+#end
+
+#macro ( projectanchor $name $value )
+#if ($value.startsWith("http://"))
+    <a href="$value">$name</a>
+#elseif ($value.startsWith("https://"))
+    <a href="$value">$name</a>
+#else
+    <a href="$relativePath$value">$name</a>
+#end
+#end
+
+#macro ( metaauthor $author $email )## author/e-mail meta tags; HTML meta carries its data in "content"
+            <meta name="author" content="$author">
+            <meta name="email" content="$email">
+#end
+
+#macro ( image $value )
+## Renders an img element, copying the optional width/height/align attributes.
+## #set writes to the shared Velocity context, so clear the three values first;
+## otherwise attributes of a previously rendered image would be reused here.
+#set ($width = "")
+#set ($height = "")
+#set ($align = "")
+#if ($value.getAttributeValue("width"))#set ($width=$value.getAttributeValue("width"))#end
+#if ($value.getAttributeValue("height"))#set ($height=$value.getAttributeValue("height"))#end
+#if ($value.getAttributeValue("align"))#set ($align=$value.getAttributeValue("align"))#end
+<img src="$relativePath$value.getAttributeValue("src")" width="$!width" height="$!height" align="$!align">
+#end
+
+#macro ( source $value)## escaped listing in a div.source box (site.css sets white-space: pre on it)
+<div class="source">$escape.getText($value.getText())</div>
+#end
+
+
+## need these to catch special macros within lists
+#macro(list $node)
+<$node.getName()>
+	#foreach ( $items in $node.getChildren() )
+		#listitem($items)
+	#end
+</$node.getName()>
+#end
+
+#macro (listitem $node)
+<$node.getName()>
+## use getContent instead of getChildren
+## to include both text and nodes
+	#foreach ( $items in $node.getContent() )
+		#if($items.name)
+			#display($items)
+		#else
+			$items.value
+		#end
+	#end
+</$node.getName()>
+#end
+
+
+## # displays a basic node, calling macros if appropriate
+#macro ( display $node )
+		#if ($node.getName().equals("img"))
+			#image ($node)
+		#elseif ($node.getName().equals("source"))
+			#source ($node)
+		#elseif ($node.getName().equals("table"))
+			#table ($node)
+		#elseif ($node.getName().equals("ul"))
+			#list ($node)
+		#elseif ($node.getName().equals("ol"))
+			#list ($node)
+		#else
+			$node
+		#end
+#end
+
+#macro ( section $section)
+	<a name="#anchorName($section)"></a>
+	<h1>$section.getAttributeValue("name")</h1>
+
+	<div class="subsection">
+		#foreach ( $items in $section.getChildren() )
+			#if ($items.getName().equals("subsection"))
+				#subsection ($items)
+			#else
+				#display($items)
+			#end
+		#end
+	</div>
+#end
+
+#macro ( subsection $subsection)
+	<a name="#anchorName($subsection)"></a>
+	<h2>$subsection.getAttributeValue("name")</h2>
+	<div class="subsection">
+		#foreach ( $items in $subsection.getChildren() )
+			#display($items)
+		#end
+	</div>
+#end
+
+#macro ( anchorName $section)
+#if ($section.getAttributeValue("href"))
+$section.getAttributeValue("href")##
+#else
+$section.getAttributeValue("name")##
+#end
+#end
+
+#macro ( makeProject )
+
+    <!-- ============================================================ -->
+
+    #set ($menus = $project.getChild("body").getChildren("menu"))
+    #foreach ( $menu in $menus )
+    	<div class="menusection">
+    		<span class="menuheader">$menu.getAttributeValue("name")</span>
+			<ul>
+			#foreach ( $item in $menu.getChildren() )
+				#set ($name = $item.getAttributeValue("name"))
+				<li>#projectanchor($name $item.getAttributeValue("href"))</li>
+			#end
+			</ul>
+        </div>
+    #end
+#end
+
+#macro (getProjectImage)
+
+<div id="logo1">
+	<a href="http://hadoop.apache.org/hive/"><img src="${relativePath}/images/hive-logo.jpg" border="0"/></a>
+</div>
+
+
+#if ($project.getChild("logo"))
+
+<div id="logo2">
+
+#set ( $logoString = $project.getChild("logo").getAttributeValue("href") )
+#if ( $logoString.startsWith("/") )
+<a href="$project.getAttributeValue("href")"><img src="$relativePath$logoString" alt="$project.getChild("logo").getText()" border="0"/></a>
+#else
+<a href="$project.getAttributeValue("href")"><img src="$relativePath/$logoString" alt="$project.getChild("logo").getText()" border="0"/></a>
+#end
+
+</div>
+
+#end
+#end
+
+#macro (printMeta $metaElement)
+<meta #set ($attribs = $metaElement.getAttributes())
+#foreach ($a in $attribs) $a.getName()="$a.getValue()" #end />
+#end
+
+#macro (document)
+    <!-- ====================================================================== -->
+    <!-- GENERATED FILE, DO NOT EDIT, EDIT THE XML FILE IN xdocs INSTEAD! -->
+    <!-- Main Page Section -->
+    <!-- ====================================================================== -->
+    <html>
+        <head>
+            <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"/>
+
+            #set ($authors = $root.getChild("properties").getChildren("author"))
+            #foreach ( $au in $authors )
+                #metaauthor ( $au.getText() $au.getAttributeValue("email") )
+            #end
+
+           #set ($metas = $root.getChildren("meta"))
+
+            ##    Parse meta directives such as
+            ##    <meta name="keyword" content="apache, velocity, java"/>
+            #foreach ($meta in $metas) #printMeta($meta) #end
+
+            ##    Support for <base> tags.
+            #if ($root.getChild("properties").getChild("base"))
+              #set ($url = $root.getChild("properties").getChild("base").getAttributeValue("href"))
+              <base href="$url"/>
+            #end
+
+            <title>$project.getChild("title").getText() - $root.getChild("properties").getChild("title").getText()</title>
+
+			## use a relative CSS for when the page is displayed locally (will overwrite
+			## previous CSS settings)
+			<link rel="stylesheet" href="${relativePath}/site.css" type="text/css">
+        </head>
+
+        <body>
+
+			## use a table in order to force footer to end of page
+
+			<div id="container">
+
+				<div id="header">
+					#getProjectImage()
+				</div>
+
+				<div id="menu">
+					#makeProject()
+				</div>
+
+				<div id="body">
+					#set ($allSections = $root.getChild("body").getChildren("section"))
+					#foreach ( $section in $allSections )
+						#section ($section)
+					#end
+				</div>
+
+				<div id="footer">
+					Copyright &#169; 1999-2007, <a href="http://www.apache.org/">The Apache Software Foundation</a>.
+				</div>
+
+			</div>
+
+        </body>
+    </html>
+#end

Added: hadoop/hive/trunk/docs/velocity.properties
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/docs/velocity.properties?rev=959481&view=auto
==============================================================================
--- hadoop/hive/trunk/docs/velocity.properties (added)
+++ hadoop/hive/trunk/docs/velocity.properties Thu Jul  1 01:20:40 2010
@@ -0,0 +1,2 @@
+#
+runtime.log=build/docs/velocity.log

Added: hadoop/hive/trunk/docs/xdocs/index.xml
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/docs/xdocs/index.xml?rev=959481&view=auto
==============================================================================
--- hadoop/hive/trunk/docs/xdocs/index.xml (added)
+++ hadoop/hive/trunk/docs/xdocs/index.xml Thu Jul  1 01:20:40 2010
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.    
+-->
+<document>
+  <properties>
+    <title>Hadoop Hive</title>
+    <author email="hive-user@hadoop.apache.org">Hadoop Hive Documentation Team</author>
+  </properties>
+  <body>
+    <section name="What is Hive?" href="WhatisHive?">
+      <p>Hive is a data warehouse infrastructure built on top of Hadoop. It provides tools to enable easy data ETL, a mechanism to put structures on the data, and the capability to query and analyze large data sets stored in Hadoop files. Hive defines a simple SQL-like query language, called QL, that enables users familiar with SQL to query the data. At the same time, this language also allows programmers who are familiar with the MapReduce framework to plug in their custom mappers and reducers to perform more sophisticated analysis that may not be supported by the built-in capabilities of the language.</p>
+
+<p>
+Hive does not mandate that data read or written be in a "Hive format"; there is no such thing. Hive works equally well on Thrift, control delimited, or your specialized data formats. Please see File Format and SerDe in Developer Guide for details. </p>
+    </section>
+    <section name="What Hive is NOT" href="WhatHiveIsNot?">
+<p>Hive is based on Hadoop, which is a batch processing system. As a result, Hive does not and cannot promise low latencies on queries. The paradigm here is strictly of submitting jobs and being notified when the jobs are completed as opposed to real-time queries. In contrast to systems such as Oracle, where analysis is run on a significantly smaller amount of data but proceeds much more iteratively, with response times between iterations of less than a few minutes, Hive query response times for even the smallest jobs can be of the order of several minutes. Larger jobs (e.g., jobs processing terabytes of data) may in general run for hours.</p>
+
+<p>In summary, low latency performance is not the top-priority of Hive's design principles. What Hive values most are scalability (scale out with more machines added dynamically to the Hadoop cluster), extensibility (with MapReduce framework and UDF/UDAF/UDTF), fault-tolerance, and loose-coupling with its input formats.</p>
+    </section>
+  </body>
+</document>

Added: hadoop/hive/trunk/docs/xdocs/language_manual/data-manipulation-statements.xml
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/docs/xdocs/language_manual/data-manipulation-statements.xml?rev=959481&view=auto
==============================================================================
--- hadoop/hive/trunk/docs/xdocs/language_manual/data-manipulation-statements.xml (added)
+++ hadoop/hive/trunk/docs/xdocs/language_manual/data-manipulation-statements.xml Thu Jul  1 01:20:40 2010
@@ -0,0 +1,234 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.    
+-->
+
+<document>
+
+  <properties>
+    <title>Hadoop Hive - Data Manipulation Statements</title>
+    <author email="hive-user@hadoop.apache.org">Hadoop Hive Documentation Team</author>
+  </properties>
+
+  <body>
+
+    <section name="Create Table Syntax" href="create_table_syntax">
+
+    <source><![CDATA[
+CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
+  [(col_name data_type [COMMENT col_comment], ...)]
+  [COMMENT table_comment]
+  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
+  [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
+  [ROW FORMAT row_format]
+  [STORED AS file_format]
+  [LOCATION hdfs_path]
+  [TBLPROPERTIES (property_name=property_value, ...)]  
+  [AS select_statement]  
+
+CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
+  LIKE existing_table_name
+  [LOCATION hdfs_path]
+
+data_type
+  : primitive_type
+  | array_type
+  | map_type
+  | struct_type
+
+primitive_type
+  : TINYINT
+  | SMALLINT
+  | INT
+  | BIGINT
+  | BOOLEAN
+  | FLOAT
+  | DOUBLE
+  | STRING
+
+array_type
+  : ARRAY < data_type >
+
+map_type
+  : MAP < primitive_type, data_type >
+
+struct_type
+  : STRUCT < col_name : data_type [COMMENT col_comment], ...>
+
+row_format
+  : DELIMITED [FIELDS TERMINATED BY char] [COLLECTION ITEMS TERMINATED BY char]
+        [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
+  | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
+
+file_format
+  : SEQUENCEFILE
+  | TEXTFILE
+  | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
+]]></source>
+ 
+<p>
+CREATE TABLE creates a table with the given name. An error is thrown if a table or view with the same name already exists. You can use IF NOT EXISTS to skip the error.
+</p>
+
+<p>
+The EXTERNAL keyword lets you create a table and provide a LOCATION so that Hive does not use a default location for this table. This comes in handy if you already have data generated. When dropping an EXTERNAL table, data in the table is NOT deleted from the file system.
+</p>
+<p>The LIKE form of CREATE TABLE allows you to copy an existing table definition exactly (without copying its data).</p>
+
+<p>
+You can create tables with custom SerDe or using native SerDe. A native SerDe is used if ROW FORMAT is not specified or ROW FORMAT DELIMITED is specified. You can use the DELIMITED clause to read delimited files. Use the SERDE clause to create a table with custom SerDe. Refer to SerDe section of the User Guide for more information on SerDe.
+</p>
+
+<p>
+You must specify a list of columns for tables that use a native SerDe. Refer to the Types part of the User Guide for the allowable column types. A list of columns for tables that use a custom SerDe may be specified but Hive will query the SerDe to determine the actual list of columns for this table.
+</p>
+
+<p>
+Use STORED AS TEXTFILE if the data needs to be stored as plain text files. Use STORED AS SEQUENCEFILE if the data needs to be compressed. Please read more about Hive/CompressedStorage if you are planning to keep data compressed in your Hive tables. Use INPUTFORMAT and OUTPUTFORMAT to specify the name of a corresponding InputFormat and OutputFormat class as a string literal, e.g. 'org.apache.hadoop.hive.contrib.fileformat.base64.Base64TextInputFormat'.
+</p>
+
+<p>
+Partitioned tables can be created using the PARTITIONED BY clause. A table can have one or more partition columns and a separate data directory is created for each distinct value combination in the partition columns. Further, tables or partitions can be bucketed using CLUSTERED BY columns, and data can be sorted within that bucket via SORTED BY columns. This can improve performance on certain kinds of queries.
+</p>
+
+<p>
+Table names and column names are case insensitive but SerDe and property names are case sensitive. Table and column comments are string literals (single-quoted). The TBLPROPERTIES clause allows you to tag the table definition with your own metadata key/value pairs.
+</p>
+
+<p>A create table example:</p>
+  <source><![CDATA[CREATE TABLE page_view(viewTime INT, userid BIGINT,
+     page_url STRING, referrer_url STRING,
+     ip STRING COMMENT 'IP Address of the User')
+ COMMENT 'This is the page view table'
+ PARTITIONED BY(dt STRING, country STRING)
+ STORED AS SEQUENCEFILE;]]></source>  
+
+ <p>The statement above creates the page_view table with viewTime, userid, page_url, referrer_url, and ip columns (including comments). The table is also partitioned and data is stored in sequence files. The data format in the files is assumed to be field-delimited by ctrl-A and row-delimited by newline.
+  </p>
+
+</section>
+
+<section name="Create Table as Select (CTAS)" href="ctas?">
+
+  <p>
+  Tables can also be created and populated by the results of a query in one create-table-as-select (CTAS) statement. The table created by CTAS is atomic, meaning that the table is not seen by other users until all the query results are populated. So other users will either see the table with the complete results of the query or will not see the table at all.
+  </p>
+
+  <p>
+  There are two parts in CTAS. The SELECT part can be any SELECT statement supported by HiveQL. The CREATE part of the CTAS takes the resulting schema from the SELECT part and creates the target table with other table properties such as the SerDe and storage format. The only restriction in CTAS is that the target table cannot be a partitioned table (nor can it be an external table).
+  </p> 
+
+  <source><![CDATA[CREATE TABLE page_view(viewTime INT, userid BIGINT,
+     page_url STRING, referrer_url STRING,
+     ip STRING COMMENT 'IP Address of the User')
+ COMMENT 'This is the page view table'
+ PARTITIONED BY(dt STRING, country STRING)
+ STORED AS SEQUENCEFILE;
+]]></source>
+
+</section>
+
+<section name="Using SerDes" href="SerDes">
+
+<p>
+This example CTAS statement creates the target table new_key_value_store with the 
+schema (new_key DOUBLE, key_value_pair STRING) derived from the results of the 
+SELECT statement. If the SELECT statement does not specify column aliases, the 
+column names will be automatically assigned to _col0, _col1, and _col2 etc. 
+In addition, the new target table is created using a specific SerDe and a storage 
+format independent of the source tables in the SELECT statement. 
+</p>
+
+<source><![CDATA[CREATE TABLE new_key_value_store
+   ROW FORMAT SERDE "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe"
+   STORED AS RCFile AS
+SELECT (key % 1024) new_key, concat(key, value) key_value_pair
+FROM key_value_store
+SORT BY new_key, key_value_pair;
+]]></source>
+
+<p>
+<b>Being able to select data from one table to another is one of the most
+powerful features of Hive. Hive handles the conversion of the data from the source
+format to the destination format as the query is being executed!</b>
+</p>
+
+</section>
+
+<section name="Bucketed Sorted Table" href="bucketed_sorted_table">
+
+<source><![CDATA[CREATE TABLE page_view(viewTime INT, userid BIGINT,
+     page_url STRING, referrer_url STRING,
+     ip STRING COMMENT 'IP Address of the User')
+ COMMENT 'This is the page view table'
+ PARTITIONED BY(dt STRING, country STRING)
+ CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS
+ ROW FORMAT DELIMITED
+   FIELDS TERMINATED BY '\001'
+   COLLECTION ITEMS TERMINATED BY '\002'
+   MAP KEYS TERMINATED BY '\003'
+ STORED AS SEQUENCEFILE;
+]]></source>
+
+<p>In the example above, the page_view table is bucketed (clustered by) userid and within each bucket the data is sorted in increasing order of viewTime. Such an organization allows the user to do efficient sampling on the clustered column - in this case userid. The sorting property allows internal operators to take advantage of the better-known data structure while evaluating queries, also increasing efficiency. MAP KEYS and COLLECTION ITEMS keywords can be used if any of the columns are lists or maps.
+</p>
+
+<p>
+The CLUSTERED BY and SORTED BY creation commands do not affect how data is inserted into a table -- only how it is read. This means that users must be careful to insert data correctly by specifying the number of reducers to be equal to the number of buckets, and using CLUSTER BY and SORT BY commands in their query. See
+<a href="working_with_bucketed_tables.html">Working with Bucketed tables</a> to see how these
+are used. 
+</p>
+
+</section>
+
+<section name="External Tables" href="external_table?">
+
+<p>
+Unless a table is specified as EXTERNAL it will be stored inside a folder specified by the
+configuration property hive.metastore.warehouse.dir.
+An EXTERNAL table points to any HDFS location for its storage. You still have to make sure that the data format is specified to match the data.
+ 
+</p>
+<source><![CDATA[CREATE EXTERNAL TABLE page_view(viewTime INT, userid BIGINT,
+     page_url STRING, referrer_url STRING,
+     ip STRING COMMENT 'IP Address of the User',
+     country STRING COMMENT 'country of origination')
+ COMMENT 'This is the staging page view table'
+ ROW FORMAT DELIMITED FIELDS TERMINATED BY '\054'
+ STORED AS TEXTFILE
+ LOCATION '<hdfs_location>';
+ ]]></source>
+
+</section>
+
+<section name="Create Table ... Like" href="create_table_like?">
+
+<p>The statement below creates a new empty_key_value_store table whose definition exactly matches the existing key_value_store in all particulars other than table name. The new table contains no rows.
+</p>
+
+<source><![CDATA[CREATE TABLE empty_key_value_store
+LIKE key_value_store;
+]]></source>
+
+</section>
+
+<section name="drop" href="drop">
+<p>Drop it like it is hot</p>
+</section>
+  </body>
+</document>

Added: hadoop/hive/trunk/docs/xdocs/language_manual/joins.xml
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/docs/xdocs/language_manual/joins.xml?rev=959481&view=auto
==============================================================================
--- hadoop/hive/trunk/docs/xdocs/language_manual/joins.xml (added)
+++ hadoop/hive/trunk/docs/xdocs/language_manual/joins.xml Thu Jul  1 01:20:40 2010
@@ -0,0 +1,212 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.    
+-->
+
+<document>
+
+  <properties>
+    <title>Hadoop Hive- Joins</title>
+    <author email="hive-user@hadoop.apache.org">Hadoop Hive Documentation Team</author>
+  </properties>
+
+  <body>
+
+    <section name="Join Syntax" href="join_syntax">
+
+    <source><![CDATA[join_table:
+    table_reference JOIN table_factor [join_condition]
+  | table_reference {LEFT|RIGHT|FULL} [OUTER] JOIN table_reference join_condition
+  | table_reference LEFT SEMI JOIN table_reference join_condition
+
+table_reference:
+    table_factor
+  | join_table
+
+table_factor:
+    tbl_name [alias]
+  | table_subquery alias
+  | ( table_references )
+
+join_condition:
+    ON equality_expression ( AND equality_expression )*
+
+equality_expression: 
+    expression = expression
+]]></source>
+
+<p>
+Only equality joins, outer joins, and left semi joins are supported in Hive. Hive does not support join conditions that are not equality conditions as it is very difficult to express such conditions as a map/reduce job. Also, more than two tables can be joined in Hive. 
+</p>
+
+<b>Allowed Equality Joins</b>
+
+<source><![CDATA[SELECT a.* FROM a JOIN b ON (a.id = b.id) 
+]]></source>
+
+<source><![CDATA[SELECT a.* FROM a JOIN b ON (a.id = b.id AND a.department = b.department)
+]]></source>
+
+<b>Disallowed Joins</b>
+
+<source><![CDATA[SELECT a.* FROM a JOIN b ON (a.id <> b.id)
+]]></source>
+
+<p>Multiple Tables can be joined in the same query</p>
+
+<source><![CDATA[SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)
+]]></source>
+
+
+
+<source><![CDATA[
+]]></source>
+ 
+</section>
+
+<section name="Join implementation with Map Reduce" href="join_map_reduce">
+
+<p>Hive converts joins over multiple tables into a single map/reduce job if for every table the same column is used in the join clauses. The query below is
+converted into a single map/reduce job as only key1 column for b is involved in the join.</p>
+
+<source><![CDATA[SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key1)]]></source>
+<i>It is very interesting to note that any number of tables can be joined in a single map/reduce process as long as they fit the above criteria.</i>
+
+<p>However, if the join columns are not the same for all tables, the query is converted into multiple map/reduce jobs.</p>
+
+<source><![CDATA[SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)
+]]></source>
+
+<p>In this case the first map/reduce job joins a with b and the results are then joined with c in the second map/reduce job. </p>
+</section>
+
+<section name="Largest Table LAST" href="lagest_table_last">
+
+<p>In every map/reduce stage of the join, the last table in the sequence is streamed through the reducers whereas the others are buffered. Therefore, it helps to reduce the memory needed in the reducer for buffering the rows for a particular value of the join key by organizing the tables such that the largest tables appear last in the sequence. For example, in</p>
+
+<source><![CDATA[SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key1)]]></source>
+
+<p>all the three tables are joined in a single map/reduce job and the values for a particular value of the key for tables a and b are buffered in the memory in the reducers. Then for each row retrieved from c, the join is computed with the buffered rows.</p>
+
+<p>For the query:</p>
+
+<source><![CDATA[SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)]]></source>
+
+<p>There are two map/reduce jobs involved in computing the join. The first of these joins a with b and buffers the values of a while streaming the values of b in the reducers. The second of these jobs buffers the results of the first join while streaming the values of c through the reducers. </p>
+
+</section>
+
+<section name="Streamtable hint" href="stream_table_hint">
+
+<p>In every map/reduce stage of the join, the table to be streamed can be specified via a hint:</p>
+
+<source><![CDATA[SELECT /*+ STREAMTABLE(a) */ a.val, b.val, c.val 
+FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key1)]]></source>
+
+<p>All the three tables are joined in a single map/reduce job and the values for a particular value of the key for tables b and c are buffered in the memory in the reducers. Then for each row retrieved from a, the join is computed with the buffered rows.
+</p>
+
+</section>
+
+<section name="Outer Joins" href="outer_joins">
+
+<p>LEFT, RIGHT, and FULL OUTER joins exist in order to provide more control over ON clauses for which there is no match. For example:</p>
+
+<source><![CDATA[SELECT a.val, b.val FROM a LEFT OUTER JOIN b ON (a.key=b.key)
+]]></source>
+
+<p>The above query will return a row for every row in a. This output row will be a.val,b.val when there is a b.key that equals a.key, and the output row will be a.val,NULL when there is no corresponding b.key. Rows from b which have no corresponding a.key will be dropped. The syntax "FROM a LEFT OUTER JOIN b" must be written on one line in order to understand how it works--a is to the LEFT of b in this query, and so all rows from a are kept; a RIGHT OUTER JOIN will keep all rows from b, and a FULL OUTER JOIN will keep all rows from a and all rows from b. OUTER JOIN semantics should conform to standard SQL specs.
+</p>
+
+<p>Joins occur BEFORE WHERE CLAUSES. So, if you want to restrict the OUTPUT of a join, a requirement should be in the WHERE clause, otherwise it should be in the JOIN clause. A big point of confusion for this issue is partitioned tables:</p>
+
+<source><![CDATA[SELECT a.val, b.val FROM a LEFT OUTER JOIN b ON (a.key=b.key)
+  WHERE a.ds='2009-07-07' AND b.ds='2009-07-07']]></source>
+
+<p>will join a on b, producing a list of a.val and b.val. The WHERE clause, however, can also reference other columns of a and b that are in the output of the join, and then filter them out. However, whenever a row from the JOIN has found a key for a and no key for b, all of the columns of b will be NULL, including the ds column. This is to say, you will filter out all rows of join output for which there was no valid b.key, and thus you have outsmarted your LEFT OUTER requirement. In other words, the LEFT OUTER part of the join is irrelevant if you reference any column of b in the WHERE clause. Instead, when OUTER JOINing, use this syntax:</p>
+
+<source><![CDATA[SELECT a.val, b.val FROM a LEFT OUTER JOIN b
+  ON (a.key=b.key AND b.ds='2009-07-07' AND a.ds='2009-07-07')]]></source>
+
+<p>Joins are NOT commutative! Joins are left-associative regardless of whether they are LEFT or RIGHT joins. </p>
+
+<source><![CDATA[SELECT a.val1, a.val2, b.val, c.val
+FROM a
+JOIN b ON (a.key = b.key)
+LEFT OUTER JOIN c ON (a.key = c.key)]]></source>
+
+<p>The above query first joins a on b, throwing away everything in a or b that does not have a corresponding key in the other table. The reduced table is then joined on c. This provides unintuitive results if there is a key that exists in both a and c, but not b: The whole row (including a.val1, a.val2, and a.key) is dropped in the "a JOIN b" step, so when the result of that is joined with c, any row with a c.key that had a corresponding a.key or b.key (but not both) will show up as NULL, NULL, NULL, c.val.</p>
+</section>
+
+<section name="Left Semi Join" href="left_semi_join">
+
+<p>LEFT SEMI JOIN implements the correlated IN/EXISTS subquery semantics in an efficient way. Since Hive currently does not support IN/EXISTS subqueries, you can rewrite your queries using LEFT SEMI JOIN. The restriction of using LEFT SEMI JOIN is that the right-hand-side table should only be referenced in the join condition (ON-clause), but not in WHERE- or SELECT-clauses etc.</p>
+
+<p>This type of query</p>
+<source><![CDATA[SELECT a.key, a.value
+FROM a 
+WHERE a.key in 
+(SELECT b.key
+FROM B);]]></source>
+
+<p>Can be written as:</p>
+
+<source><![CDATA[SELECT a.key, a.val
+FROM a LEFT SEMI JOIN b on (a.key = b.key)]]></source>
+
+</section>
+
+<section name="Map Side Join" href="map_side_join">
+
+<p>If all but one of the tables being joined are small, the join can be performed as a map only job. The query
+does not need a reducer. For every mapper of a, b is read completely. A restriction is that a <b>FULL/RIGHT OUTER JOIN b</b> cannot be performed. </p>
+
+<source><![CDATA[SELECT /*+ MAPJOIN(b) */ a.key, a.value
+FROM a join b on a.key = b.key]]></source>
+
+</section>
+
+<section name="Bucketed Map Join" href="bucket_map_join">
+
+<p>If the tables being joined are bucketized, and the buckets are a multiple of each other, the buckets can be joined with each other. If table A has 8 buckets and table B has 4 buckets, the following join:</p>
+
+<source><![CDATA[SELECT /*+ MAPJOIN(b) */ a.key, a.value
+FROM a join b on a.key = b.key]]></source>
+
+<p>can be done on the mapper only. Instead of fetching B completely for each mapper of A, only the required buckets are fetched. For the query above, the mapper processing bucket 1 for A will only fetch bucket 1 of B. It is not the default behavior, and is governed by the following parameter </p>
+
+<i>set hive.optimize.bucketmapjoin = true</i>
+
+<p>If the tables being joined are sorted and bucketized, and the number of buckets is the same, a sort-merge join can be performed. The corresponding buckets are joined with each other at the mapper. If both A and B have 4 buckets</p>
+
+<source><![CDATA[ SELECT /*+ MAPJOIN(b) */ a.key, a.value
+FROM A a join B b on a.key = b.key]]></source>
+
+<p>can be done on the mapper only. The mapper for the bucket for A will traverse the corresponding bucket for B. This is not the default behavior, and the following parameters need to be set:</p>
+
+<source><![CDATA[set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;]]></source>
+
+</section>
+
+</body>
+
+
+
+</document>

Added: hadoop/hive/trunk/docs/xdocs/language_manual/working_with_bucketed_tables.xml
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/docs/xdocs/language_manual/working_with_bucketed_tables.xml?rev=959481&view=auto
==============================================================================
--- hadoop/hive/trunk/docs/xdocs/language_manual/working_with_bucketed_tables.xml (added)
+++ hadoop/hive/trunk/docs/xdocs/language_manual/working_with_bucketed_tables.xml Thu Jul  1 01:20:40 2010
@@ -0,0 +1,87 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<document>
+
+  <properties>
+    <title>Hadoop Hive- Working with Bucketed Tables</title>
+    <author email="hive-user@hadoop.apache.org">Hadoop Hive Documentation Team</author>
+  </properties>
+
+  <body>
+
+ <section name="Defining Bucketed Tables" href="defining_bucketed_tables?">
+
+<p>
+This is a brief example on creating and populating bucketed tables. Bucketed tables 
+are fantastic in that they allow much more efficient sampling than do non-bucketed 
+tables, and they may later allow for time saving operations such as mapside joins. 
+However, the bucketing specified at table creation is not enforced when the table 
+is written to, and so it is possible for the table's metadata to advertise 
+properties which are not upheld by the table's actual layout. This should obviously 
+be avoided. Here's how to do it right.
+</p>
+<p>First there’s table creation:</p>
+
+ <source><![CDATA[CREATE TABLE user_info_bucketed(user_id BIGINT, firstname STRING, lastname STRING)
+COMMENT 'A bucketed copy of user_info'
+PARTITIONED BY(ds STRING)
+CLUSTERED BY(user_id) INTO 256 BUCKETS;
+ ]]></source>
+
+<p>notice that we define user_id as the bucket column</p>
+</section>
+
+<section name="Populating Bucketed Tables" href="populating_bucketed_tables?">
+
+ <source><![CDATA[set hive.enforce.bucketing = true;  
+FROM user_id
+INSERT OVERWRITE TABLE user_info_bucketed
+PARTITION (ds='2009-02-25')
+SELECT userid, firstname, lastname WHERE ds='2009-02-25';
+ ]]></source>
+
+<p>The command <strong>set hive.enforce.bucketing = true;</strong>  allows the 
+correct number of reducers and the cluster by column to be automatically selected 
+based on the table. Otherwise, you would need to set the number of reducers to be 
+the same as the number of buckets with 
+<strong>set mapred.reduce.tasks = 256;</strong> and have a 
+<strong>CLUSTER BY ...</strong> clause in the select.</p>
+
+</section>
+
+<section name="Bucketing Explained" href="bucketing_explained?">
+<p>
+How does Hive distribute the rows across the buckets? In general, the bucket number is determined by the expression hash_function(bucketing_column) mod num_buckets. (There's a 0x7FFFFFFF in there too, but that's not that important.) The hash_function depends on the type of the bucketing column. For an int, it's easy, hash_int(i) == i. For example, if user_id were an int, and there were 10 buckets, we would expect all user_id's that end in 0 to be in bucket 1, all user_id's that end in a 1 to be in bucket 2, etc. For other datatypes, it's a little tricky. In particular, the hash of a BIGINT is not the same as the BIGINT. And the hash of a string or a complex datatype will be some number that's derived from the value, but not anything humanly-recognizable. For example, if user_id were a STRING, then the user_id's in bucket 1 would probably not end in 0. In general, distributing rows based on the hash will give you an even distribution in the buckets.
+</p>
+
+</section>
+
+<section name="What can go wrong?" href="bucketing_gone_wrong?">
+<p>
+So, what can go wrong? As long as you 
+<strong>set hive.enforce.bucketing = true</strong>, and use the syntax above, 
+the tables should be populated properly. Things can go wrong if the bucketing 
+column type is different during the insert and on read, or if you manually 
+cluster by a value that's different from the table definition. 
+</p>
+</section>
+</body>
+</document>

Modified: hadoop/hive/trunk/ivy.xml
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ivy.xml?rev=959481&r1=959480&r2=959481&view=diff
==============================================================================
--- hadoop/hive/trunk/ivy.xml (original)
+++ hadoop/hive/trunk/ivy.xml Thu Jul  1 01:20:40 2010
@@ -28,15 +28,18 @@
     <conf name="master" description="contains the artifact but no dependencies"/>
     <conf name="compile" description="contains the artifact but no dependencies"/>
     <conf name="runtime" description="runtime but not the artifact"/>
-
+    <!-- Private configurations -->
+    <conf name="docs" visibility="private"/>
     <conf name="checkstyle" visibility="private"/>
   </configurations>
 
 
   <dependencies>
    <dependency org="checkstyle" name="checkstyle" rev="${checkstyle.version}"
-               conf="checkstyle->default"/>
-    <conflict manager="all" />
+     conf="checkstyle->default"/>
+   <dependency org="org.jdom" name="jdom" rev="${jdom.version}"
+     conf="docs->default"/>
+   <conflict manager="all" />
   </dependencies>
   
 </ivy-module>

Modified: hadoop/hive/trunk/ivy/libraries.properties
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ivy/libraries.properties?rev=959481&r1=959480&r2=959481&view=diff
==============================================================================
--- hadoop/hive/trunk/ivy/libraries.properties (original)
+++ hadoop/hive/trunk/ivy/libraries.properties Thu Jul  1 01:20:40 2010
@@ -23,6 +23,7 @@ commons-collections.version=3.2.1
 commons-lang.version=2.4
 commons-logging.version=1.0.4
 commons-logging-api.version=1.0.4
+jdom.version=1.1
 ivy.version=2.1.0
 log4j.version=1.2.15
 



Mime
View raw message