hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From aihu...@apache.org
Subject hive git commit: HIVE-11072: Add data validation between Hive metastore upgrades tests (Naveen Gangam, reviewed by Chaoyu Tang, Aihua Xu)
Date Tue, 22 Nov 2016 14:45:28 GMT
Repository: hive
Updated Branches:
  refs/heads/master 893b2553a -> 4a235fc55


HIVE-11072: Add data validation between Hive metastore upgrades tests (Naveen Gangam, reviewed by Chaoyu Tang, Aihua Xu)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4a235fc5
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4a235fc5
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4a235fc5

Branch: refs/heads/master
Commit: 4a235fc5598d2a8a72ba673f8ba98334cd290593
Parents: 893b255
Author: Aihua Xu <aihuaxu@apache.org>
Authored: Tue Nov 22 09:44:26 2016 -0500
Committer: Aihua Xu <aihuaxu@apache.org>
Committed: Tue Nov 22 09:44:26 2016 -0500

----------------------------------------------------------------------
 testutils/metastore/dataload.properties         |    6 +
 testutils/metastore/dbs/derby/validate.sh       |   81 +
 testutils/metastore/dbs/mysql/validate.sh       |   76 +
 testutils/metastore/dbs/postgres/validate.sh    |   75 +
 testutils/metastore/metastore-upgrade-test.sh   |   31 +
 .../metastore/metastore-validation-test.sh      | 1442 ++++++++++++++++++
 6 files changed, 1711 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/4a235fc5/testutils/metastore/dataload.properties
----------------------------------------------------------------------
diff --git a/testutils/metastore/dataload.properties b/testutils/metastore/dataload.properties
new file mode 100644
index 0000000..caf20a9
--- /dev/null
+++ b/testutils/metastore/dataload.properties
@@ -0,0 +1,6 @@
+DBS.DB_ID=1
+DBS.DESC='testdatabase created by script to validate data in metastore post-upgrade'
+DBS.DB_LOCATION_URI='none'
+DBS.NAME='testdatabase'
+TBLS.TBL_ID=1
+TBLS.TBL_NAME='hive_testtable'

http://git-wip-us.apache.org/repos/asf/hive/blob/4a235fc5/testutils/metastore/dbs/derby/validate.sh
----------------------------------------------------------------------
diff --git a/testutils/metastore/dbs/derby/validate.sh b/testutils/metastore/dbs/derby/validate.sh
new file mode 100644
index 0000000..95a9424
--- /dev/null
+++ b/testutils/metastore/dbs/derby/validate.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script executes all hive metastore upgrade scripts on a specific
+# database server in order to verify that upgrade scripts are working
+# properly.
+
+cd $(pwd)/dbs/derby/
+
+echo "Executing query and insert script for version : $1"
+
+name="$(basename $1)"
+
+if [[ "$name" = hive-schema* ]]
+then
+  FROM_VER=`echo $name | cut -d'-' -f3 | sed -e 's/\.derby.sql//g'`
+  SCRIPT_INSERT="derby-$FROM_VER-insert.sql"
+else
+  FROM_VER=`echo $name | cut -d'-' -f2`
+  TO_VER=`echo $name | cut -d'-' -f4 | sed -e 's/\.derby.sql//g'`
+  SCRIPT_INSERT="derby-$TO_VER-insert.sql"
+  SCRIPT_QUERY="derby-$FROM_VER-query.sql"
+  QUERY_RESULT="derby-$FROM_VER-results.txt"
+fi
+
+JAVA_PATH=($(readlink /etc/alternatives/java))
+JAVA_PATH=`dirname $JAVA_PATH`
+
+export JAVA_HOME=$JAVA_PATH/..
+export PATH=$PATH:/usr/share/javadb/bin:$JAVA_HOME/bin
+
+if [[ -z "${SCRIPT_QUERY+x}" ]]
+then
+  echo "Nothing to query"
+else
+  echo "Test query script is $SCRIPT_QUERY"
+  if [[ -e "$SCRIPT_QUERY" ]]
+  then
+    echo "connect 'jdbc:derby:/tmp/hive_hms_testing;create=true';" > ./derbyQuery.sql
+    echo "run '$SCRIPT_QUERY';" >> ./derbyQuery.sql
+    echo "quit;" >> ./derbyQuery.sql
+    ij < ./derbyQuery.sql > "$QUERY_RESULT"
+
+    failures=`awk '/SELECT\ COUNT/{nr[NR]; nr[NR+3]}; NR in nr' "$QUERY_RESULT" | grep -B 1 "0 " | wc -l`
+    if [[ "$failures" -ne 0 ]]
+    then
+      echo "********* FAILED ($failures failed queries)  ********"
+      echo " Check $(pwd)/$QUERY_RESULT for results of test query"
+      echo `awk '/SELECT\ COUNT/{nr[NR]; nr[NR+3]}; NR in nr' "$QUERY_RESULT" | grep -B 1 "0 "`
+      exit 1
+    fi
+  else
+    echo "$SCRIPT_QUERY does not exist, skipping ..."
+  fi
+fi
+
+echo "Test insert script is $SCRIPT_INSERT"
+if [[ -e "$SCRIPT_INSERT" ]]
+then
+  echo "connect 'jdbc:derby:/tmp/hive_hms_testing;create=true';" > ./derbyInsert.sql
+  echo "run '$SCRIPT_INSERT';" >> ./derbyInsert.sql
+  echo "quit;" >> ./derbyInsert.sql
+  ij < ./derbyInsert.sql
+else
+  echo "$SCRIPT_INSERT does not exist, skipping ..."
+fi
+

http://git-wip-us.apache.org/repos/asf/hive/blob/4a235fc5/testutils/metastore/dbs/mysql/validate.sh
----------------------------------------------------------------------
diff --git a/testutils/metastore/dbs/mysql/validate.sh b/testutils/metastore/dbs/mysql/validate.sh
new file mode 100644
index 0000000..781cc76
--- /dev/null
+++ b/testutils/metastore/dbs/mysql/validate.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script executes all hive metastore upgrade scripts on a specific
+# database server in order to verify that upgrade scripts are working
+# properly.
+
+cd $(pwd)/dbs/mysql/
+
+echo "Executing query and insert script for version : $1"
+
+name="$(basename $1)"
+
+if [[ "$name" = hive-schema* ]]
+then
+  FROM_VER=`echo $name | cut -d'-' -f3 | sed -e 's/\.mysql.sql//g'`
+  SCRIPT_INSERT="mysql-$FROM_VER-insert.sql"
+else
+  FROM_VER=`echo $name | cut -d'-' -f2`
+  TO_VER=`echo $name | cut -d'-' -f4 | sed -e 's/\.mysql.sql//g'`
+  SCRIPT_INSERT="mysql-$TO_VER-insert.sql"
+  SCRIPT_QUERY="mysql-$FROM_VER-query.sql"
+  QUERY_RESULT="mysql-$FROM_VER-results.sql"
+fi
+
+if [[ -z "${SCRIPT_QUERY+x}" ]]
+then
+  echo "Nothing to query"
+else
+  echo "Test query script is $SCRIPT_QUERY"
+  if [[ -e "$SCRIPT_QUERY" ]]
+  then
+    sed -i 's/"//g' $SCRIPT_QUERY
+    if [[ -e "$QUERY_RESULT" ]]
+    then
+      rm $QUERY_RESULT
+    fi
+    mysql hive_hms_testing < "$(pwd)/$SCRIPT_QUERY" > "$QUERY_RESULT"
+
+    failures=`awk '/COUNT/{nr[NR]; nr[NR+3]}; NR in nr' "$QUERY_RESULT" | grep "0" | wc -l`
+    if [[ "$failures" -ne 0 ]]
+    then
+      echo "***** FAILED ($failures failed queries )*******"
+      echo "Check $(pwd)/$QUERY_RESULT for results of the test query"
+      echo `awk '/COUNT/{nr[NR]; nr[NR+3]}; NR in nr' "$QUERY_RESULT" | grep "0" | wc -l`
+      exit 1
+    fi
+
+  else
+    echo "$SCRIPT_QUERY does not exist, skipping ..."
+  fi
+fi
+
+echo "Test insert script is $SCRIPT_INSERT"
+if [[ -e "$SCRIPT_INSERT" ]]
+then
+  sed -i 's/"//g' $SCRIPT_INSERT
+
+  mysql hive_hms_testing < "$(pwd)/$SCRIPT_INSERT"
+else
+  echo "$SCRIPT_INSERT does not exist, skipping ..."
+fi

http://git-wip-us.apache.org/repos/asf/hive/blob/4a235fc5/testutils/metastore/dbs/postgres/validate.sh
----------------------------------------------------------------------
diff --git a/testutils/metastore/dbs/postgres/validate.sh b/testutils/metastore/dbs/postgres/validate.sh
new file mode 100644
index 0000000..888fc60
--- /dev/null
+++ b/testutils/metastore/dbs/postgres/validate.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script executes all hive metastore upgrade scripts on a specific
+# database server in order to verify that upgrade scripts are working
+# properly.
+
+cd $(pwd)/dbs/postgres/
+
+echo "Executing query and insert script for version : $1"
+
+name="$(basename $1)"
+
+if [[ "$name" = hive-schema* ]]
+then
+  FROM_VER=`echo $name | cut -d'-' -f3 | sed -e 's/\.postgres.sql//g'`
+  SCRIPT_INSERT="postgres-$FROM_VER-insert.sql"
+else
+  FROM_VER=`echo $name | cut -d'-' -f2`
+  TO_VER=`echo $name | cut -d'-' -f4 | sed -e 's/\.postgres.sql//g'`
+  SCRIPT_INSERT="postgres-$TO_VER-insert.sql"
+  SCRIPT_QUERY="postgres-$FROM_VER-query.sql"
+  QUERY_RESULT="postgres-$FROM_VER-results.txt"
+fi
+
+export PGPASSWORD=hivepw
+if [[ -z "${SCRIPT_QUERY+x}" ]]
+then
+  echo "Nothing to query"
+else
+  echo "Test query script is $SCRIPT_QUERY"
+  if [[ -e "$SCRIPT_QUERY" ]]
+  then
+    if [[ -e "$QUERY_RESULT" ]]
+    then
+      rm $QUERY_RESULT
+    fi
+
+    psql -h localhost -U hiveuser -d hive_hms_testing -f "$(pwd)/$SCRIPT_QUERY" -o "$(pwd)/$QUERY_RESULT"
+
+    failures=`awk '/count/{nr[NR]; nr[NR+2]}; NR in nr' "$QUERY_RESULT" | grep "  0" | wc -l`
+    if [[ "$failures" -ne 0 ]]
+    then
+      echo "***** FAILED ($failures failed queries) *******"
+      echo "Check $(pwd)/$QUERY_RESULT for results of the test query"
+      echo `awk '/SELECT\ COUNT/{nr[NR]; nr[NR+2]}; NR in nr' "$QUERY_RESULT" | grep -B 1 "  0"`
+      exit 1
+    fi
+
+  else
+    echo "$SCRIPT_QUERY does not exist, skipping ..."
+  fi
+fi
+
+echo "Test insert script is $SCRIPT_INSERT"
+if [[ -e "$SCRIPT_INSERT" ]]
+then
+  psql -h localhost -U hiveuser -d hive_hms_testing -f "$(pwd)/$SCRIPT_INSERT"
+else
+  echo "$SCRIPT_INSERT does not exist, skipping ..."
+fi

http://git-wip-us.apache.org/repos/asf/hive/blob/4a235fc5/testutils/metastore/metastore-upgrade-test.sh
----------------------------------------------------------------------
diff --git a/testutils/metastore/metastore-upgrade-test.sh b/testutils/metastore/metastore-upgrade-test.sh
index 66fb766..cbc610f 100644
--- a/testutils/metastore/metastore-upgrade-test.sh
+++ b/testutils/metastore/metastore-upgrade-test.sh
@@ -68,9 +68,13 @@ fi
 
 SCRIPT_PREPARE="$(pwd)/dbs/$DB_SERVER/prepare.sh"
 SCRIPT_EXECUTE="$(pwd)/dbs/$DB_SERVER/execute.sh"
+SCRIPT_VALIDATE="$(pwd)/metastore-validation-test.sh"
+DATA_VALIDATE="$(pwd)/dbs/$DB_SERVER/validate.sh"
 
 [ ! -e "$SCRIPT_PREPARE" ] && log "Error: $SCRIPT_PREPARE does not exist." && exit 1
 [ ! -e "$SCRIPT_EXECUTE" ] && log "Error: $SCRIPT_EXECUTE does not exist." && exit 1
+[ ! -e "$SCRIPT_VALIDATE" ] && log "Error: $SCRIPT_VALIDATE does not exist." && exit 1
+[ ! -e "$DATA_VALIDATE" ] && log "Error: $DATA_VALIDATE does not exist." && exit 1
 
 HMS_UPGRADE_DIR="$(pwd)/../../metastore/scripts/upgrade"
 if [ ! -d "$HMS_UPGRADE_DIR/$DB_SERVER" ]; then
@@ -92,6 +96,13 @@ if ! bash -e -x $SCRIPT_PREPARE; then
 fi
 log "Server prepared."
 
+log "Calling metastore data generation script ..."
+if ! bash -e $SCRIPT_VALIDATE --db $DB_SERVER; then
+        log "Error: $SCRIPT_VALIDATE failed. (see logs)"
+        exit 1
+fi
+log "Test scripts generated."
+
 #
 # Now we execute each of the .sql scripts on the database server.
 #
@@ -118,6 +129,18 @@ if ! execute_test $HIVE_SCHEMA_BASE; then
 	echo "Error: Cannot execute SQL file: $HIVE_SCHEMA_BASE"
 fi
 
+bash -e $DATA_VALIDATE $HIVE_SCHEMA_BASE
+
+success=$?
+
+if [[ "$success" != 0 ]];
+then
+  echo "Data verification failed!!!"
+  exit $success;
+else
+  echo "Data verification successful!!!"
+fi
+
 begin_upgrade_test="false"
 while read script
 do
@@ -126,6 +149,14 @@ do
 		if ! execute_test $HMS_UPGRADE_DIR/$DB_SERVER/upgrade-$script.$DB_SERVER.sql; then
 			echo "Error: Cannot execute SQL file: $HMS_UPGRADE_DIR/$DB_SERVER/upgrade-$script.$DB_SERVER.sql"
 		fi
+                bash -e $DATA_VALIDATE $HMS_UPGRADE_DIR/$DB_SERVER/upgrade-$script.$DB_SERVER.sql
+                success=$?;
+
+                if [[ $success != 0 ]];
+                then
+                  echo "Data verification failed!!!"
+                  exit $success;
+                fi
 	fi
 done < $HMS_UPGRADE_DIR/$DB_SERVER/upgrade.order.$DB_SERVER
 

http://git-wip-us.apache.org/repos/asf/hive/blob/4a235fc5/testutils/metastore/metastore-validation-test.sh
----------------------------------------------------------------------
diff --git a/testutils/metastore/metastore-validation-test.sh b/testutils/metastore/metastore-validation-test.sh
new file mode 100644
index 0000000..0eee9d9
--- /dev/null
+++ b/testutils/metastore/metastore-validation-test.sh
@@ -0,0 +1,1442 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script generates data for hive metastore, inserts it into the DB
+# It verifies that the data exists after the schema has been upgraded.
+# It is a multi-step process.
+
+# 1) This script parses the schema file for a database specified via --db option.
+# and generates a file that contains just the "create table statements" from
+# the schema.
+# 2) This file is then cleansed and sorted according to the foreign
+# key dependencies. The tables that have no foreign keys float above the
+# ones that depend on them.
+# 3) The file created in step 2 is string substituted to convert the create table
+# statements into "insert into <table>" statements. During this stage, string and int
+# data is generated randomly. These statements are saved to a file. This stage also
+# generates another file where the select count(*) statements are generated.
+
+# *************************************** README ************************************************
+# This script is used as part of the HIVE METASTORE PRE-COMMIT build to ensure that
+# any changes to the HMS schema is backward compatible with the older schema and that
+# the changes do not delete any data from the current HMS datastore.
+# For example, dropping a column from an existing schema table is something we want to
+# avoid. Adding a column should be fine.
+# This script does the following
+# * Parses the hive schema file (starts at hive-0.10.0) and then compiles a list of tables
+#    in this schema (into /tmp/<dbVendor>-<schemaVersion>-create.sql)
+# * It then generates 1 row (for now) with random column data for each of the tables (into
+#   dbs/<db>/<db>-<version>-insert.sql) for each version of the hive schema.
+# * It then generates a "select count(*)" query statement for the above row so the query output
+#   should be exactly ONE (1) in all cases.
+
+# * For example:
+# *   INSERT INTO TBLS ( `TBL_ID`, `CREATE_TIME`, `DB_ID`, `LAST_ACCESS_TIME`, `OWNER`, `RETENTION`,
+#       `SD_ID`, `TBL_NAME`, `TBL_TYPE`, `VIEW_EXPANDED_TEXT`, `VIEW_ORIGINAL_TEXT`, `LINK_TARGET_ID` )
+#       VALUES ( 1, 35142, 1, 33771, 'NOLxH4BzdiBbE8i', 83343, 88353, 'hive_testtable', 'bewmH7XTBmxb23C',
+#       'JWKYyQDDYB9UsyV', 'E3AFymvLju857DD', 1 );
+
+# *   SELECT COUNT(*) FROM TBLS where `TBL_ID` = 1 and `CREATE_TIME` = 35142 and `DB_ID` = 1 and
+#       `LAST_ACCESS_TIME` = 33771 and `OWNER` = 'NOLxH4BzdiBbE8i' and `RETENTION` = 83343 and
+#       `SD_ID` = 88353 and `TBL_NAME` = 'hive_testtable' and `TBL_TYPE` = 'bewmH7XTBmxb23C' and
+#       `VIEW_EXPANDED_TEXT` = 'JWKYyQDDYB9UsyV' and `VIEW_ORIGINAL_TEXT` = 'E3AFymvLju857DD'  ;
+
+# * Schema is then installed using the dbs/<db>/execute.sh.
+# * The generated row data is then inserted into the DB.
+# * The schema is then upgraded to the next version, say, hive-schema-0.11.0. After the upgrade, this script
+#   then executes the SELECT COUNT(*) queries for the tables from the prior version of schema. All tables are
+#   expected to pass the test with a rowcount of 1. If not, then we know we have lost data in the process of
+#   upgrade.
+
+# * This process of upgrading schema and executing select queries continues until it has reached the latest
+#   version of the schema.
+
+set -e
+
+cd $(dirname $0)
+
+usage() {
+  echo "Usage: "
+  echo "  $0 --db DB_SERVER"
+  echo
+  echo "Options"
+  echo "  --db DB_SERVER  Specifies a database server where to run the"
+  echo "                  upgrade tests."
+  echo
+}
+
+if [ $# -lt 2 ]; then
+  usage && exit 1
+fi
+
+if [ $1 != "--db" ]; then
+  echo "Error: unrecognized $1 option."
+  echo
+  usage && exit 1
+fi
+
+DB_SERVER="$2"
+if [ ! -d "dbs/$DB_SERVER" ]; then
+  log "Error: $DB_SERVER upgrade tests is not supported yet."
+  echo
+  echo "You must select from any of the following tests:"
+  ls dbs/
+  exit 1
+fi
+
+#
+# These tests work by calling special commands from specific
+# servers that are stored as scripts in a directory name
+# related to the server to test.
+#
+
+# 2 for INFO, 4 for DEBUG
+LOGLEVEL=4
+
+CONSOLELEVEL=2
+
+# default length for int datatype
+DEFAULT_INT_SIZE=5
+
+# default length for string datatype
+DEFAULT_STRING_SIZE=15
+
+# path to the log file
+OUT_LOG="/tmp/$(basename $0)-$2.log"
+rm -f $OUT_LOG
+
+getTime() {
+  echo `date +"[%m-%d-%Y %H:%M:%S]"`
+}
+
+# method to print messages to the log file
+log() {
+  echo "$(getTime) $@" >> $OUT_LOG
+}
+
+# method to print messages to STDOUT
+CONSOLE() {
+  echo "$(getTime) $@"
+}
+
+# log method to print error-level messages to the log file
+error() {
+  if [[ "$LOGLEVEL" -ge 1 ]]
+  then
+    log $@
+  fi
+
+  if [[ "$CONSOLELEVEL" -ge 1 ]]
+  then
+    CONSOLE $@
+  fi
+}
+
+# log method to print info-level messages to the log file
+info() {
+  if [[ "$LOGLEVEL" -ge 2 ]]
+  then
+    log $@
+  fi
+
+  if [[ "$CONSOLELEVEL" -ge 2 ]]
+  then
+    CONSOLE $@
+  fi
+}
+
+# log method to print debug-level messages to the log file
+debug() {
+  if [[ "$LOGLEVEL" -ge 4 ]]
+  then
+    log $@
+  fi
+
+  if [[ "$CONSOLELEVEL" -ge 4 ]]
+  then
+    CONSOLE $@
+  fi
+}
+
+# method that takes in 2 arguments and returns true if the second argument is a
+# substring of the first.
+isContains=0
+contains() {
+  isContains=0
+  string="$1"
+  substring="$2"
+  if test "${string#*$substring}" != "$string"
+  then
+    isContains=1
+  else
+    isContains=0
+  fi
+}
+
+# self test to ensure function is working as expected.
+str1="INTEGER NOT NULL"
+str2="INTEGER"
+contains "$str1" "$str2"
+if [[ "$isContains" -eq 1 ]]
+then
+  str2="VARCHAR"
+  contains "$str1" "$str2"
+  if [[ "$isContains" -eq 1 ]]
+  then
+    CONSOLE "contains self-test1 failed"
+    exit 1
+  fi
+else
+  CONSOLE "contains self-test2 failed"
+  exit 1
+fi
+
+
+FCOUNT=0
+# function to count the number of occurrences of a string within another string
+num_occurances() {
+  fulltext="$1"
+  searchChars="$2"
+  FCOUNT=0
+  debug "[num_occurances] Search params $fulltext:$searchChars"
+  FCOUNT=$(grep -o "$searchChars" <<< "$fulltext" | wc -l)
+  debug "[num_occurances] Returning $FCOUNT"
+}
+
+# self test to ensure function is working as expected.
+str1="( INTEGER ())) NOT NULL )"
+num_occurances "$str1" "("
+if [[ $FCOUNT -eq 2 ]]
+then
+  num_occurances "$str1" ")"
+  if [[ $FCOUNT -ne 4 ]]
+  then
+    CONSOLE "num_occurances self-test failed"
+    exit 1
+  fi
+else
+  CONSOLE "num_occurances self-test failed"
+  exit 1
+fi
+
+# array that store all legal datatypes from all DB vendors
+TYPES=( "bigint"
+        "smallint"
+        "int"
+        "integer"
+        "tinyint"
+        "boolean"
+        "long varchar"
+        "varchar"
+        "char"
+        "character varying"
+        "character"
+        "bytea"
+        "binary"
+        "varbinary"
+        "mediumtext"
+        "double"
+        "double precision"
+        "serial"
+        "bigserial"
+        "bit"
+        "BIGINT"
+        "SMALLINT"
+        "LONG VARCHAR"
+        "VARCHAR"
+        "INT"
+        "INTEGER"
+        "TINYINT"
+        "BOOLEAN"
+        "CHAR"
+        "CHARACTER VARYING"
+        "CHARACTER"
+        "BYTEA"
+        "BINARY"
+        "VARBINARY"
+        "MEDIUMTEXT"
+        "DOUBLE"
+        "DOUBLE PRECISION"
+        "SERIAL"
+        "BIGSERIAL"
+        "BIT"
+        )
+
+# function to create a new map-type datastructure.
+# this is implemented as file on the local filesystem that contains name/value pairs.
+# first argument is the name of the map. filename matches the map name.
+newhash() {
+  rm -f /tmp/hashmap.$1
+  touch /tmp/hashmap.$1
+}
+
+# function to add an entry to the map. Entry is always added to the end of the map.
+# so if you delete and re-add an entry to the map, its position within a file could change.
+# arg1 is name of the map to add to.
+# arg2 is the key for entry.
+# arg3 is the value for the map entry.
+put() {
+  echo "$2=$3" >> /tmp/hashmap.$1
+}
+
+# function to retrieve an entry from the map.
+# arg1 is name of the map to retrieve from.
+# arg2 is the key for entry
+get() {
+  grep "^$2=" /tmp/hashmap.$1 | cut -d'=' -f2
+}
+
+# function that returns the size of the map (aka number of entries)
+# arg1 is name of the map to retrieve from.
+size() {
+  echo `cat /tmp/hashmap.$1 | wc -l`
+}
+
+# function that returns the key for the first entry in the map.
+# maps to the key of the first line in the file on disk.
+# arg1 is name of the map to retrieve from.
+firstKey() {
+  head -n 1 /tmp/hashmap.$1 | cut -d'=' -f1
+}
+
+# function that returns the value of the first entry in the map.
+# maps to the value of the first line in the file on disk.
+# arg1 is name of the map to retrieve from.
+firstValue() {
+  head -n 1 /tmp/hashmap.$1 | cut -d'=' -f2
+}
+
+# function that returns true if the map contains the specified key
+# arg1 is name of the map to retrieve from.
+# arg2 key to be found in the map.
+containsKey() {
+  # find a line that contains the key value at the beginning of line.
+  ret=$(grep "^$2=" /tmp/hashmap.$1 | cut -d'=' -f1)
+  if [[ "$2" = "$ret" ]]
+  then
+    echo "true"
+  else
+   echo "false"
+  fi
+}
+
+# function that is specific to a map named "f_keys"
+# This map contains foreign key references. Given a table name, this function
+# returns a COMMA-separated list of tables its keys reference
+# arg1 is name of the map to retrieve from.
+# arg2 name of the table to return references for.
+get_referenced_tables() {
+  if [[ ! "$1" = "f_keys" ]]
+  then
+    echo "NONE"
+    return
+  fi
+
+  tables=""
+  while read fkc
+  do
+    key=`echo $fkc | cut -d'=' -f1`
+    table=`echo $key | cut -d'.' -f1`
+    if [[ "$table" != "$2" ]]
+    then
+      continue
+    fi
+
+    value=`echo $fkc | cut -d'=' -f2`
+    reftable=`echo $value | cut -d'.' -f1`
+    if [[ "$reftable" = "$table" ]]
+    then
+      continue
+    fi
+
+    if [[ "$tables" = "" ]]
+    then
+      tables="$reftable"
+    else
+      tables="$tables,$reftable"
+    fi
+
+  done < /tmp/hashmap.$1;
+  echo "$tables"
+}
+
+# function that deletes an entry from a named map.
+# arg1 is name of the map to delete from.
+# arg2 key of the entry to delete from the map.
+delete() {
+  if [[ "$1" = "" ]]
+  then
+    return
+  fi
+  debug "[delete] deleting entry for $2"
+  todo=$(size "$1")
+  exists=`cat /tmp/hashmap.$1 | grep "^$2=" | wc -l`
+  debug "[delete] entry found in map=$exists"
+  if [[ "$exists" -ge "1" ]]
+  then
+    if [[ "$todo" == 1 ]]
+    then
+      rm /tmp/hashmap.$1
+      touch /tmp/hashmap.$1
+      return
+    fi
+  fi
+  cat /tmp/hashmap.$1 | grep -v "^$2=" > /tmp/hashmap.$1.tmp
+  mv /tmp/hashmap.$1.tmp /tmp/hashmap.$1
+  todo=$(size "unsorted")
+  debug "[delete] map size after delete:$todo"
+}
+
+is_key_like() {
+  if [[ "$1" = "" ]]
+  then
+    return 0
+  fi
+
+  matched=0
+  while read fkc
+  do
+    local tabvalue=$(echo $fkc | grep "^$2." | cut -d'=' -f1 | cut -d'.' -f1)
+    if [[ "$2" = "$tabvalue" ]]
+    then
+      matched=1
+      break
+    fi
+  done < /tmp/hashmap.$1;
+  echo "[is_key_like] found match for $2:$matched"
+}
+
+# a map that maps SQL datatypes to datatype for the values to be generated.
+newhash typemap
+put typemap "bigint" "int"
+put typemap "smallint" "smallint"
+put typemap "int" "int"
+put typemap "integer" "int"
+put typemap "tinyint" "tinyint"
+put typemap "bit" "bit"
+put typemap "boolean" "boolean"
+put typemap "mediumint" "int"
+put typemap "long varchar" "longstring"
+put typemap "varchar" "string"
+put typemap "char" "string"
+put typemap "character" "string"
+put typemap "bytea" "string"
+put typemap "binary" "string"
+put typemap "varbinary" "string"
+put typemap "character varying" "string"
+put typemap "mediumtext" "string"
+put typemap "double" "int"
+put typemap "double precision" "int"
+put typemap "serial" "int"
+put typemap "bigserial" "int"
+put typemap "BIGINT" "int"
+put typemap "SMALLINT" "smallint"
+put typemap "MEDIUMINT" "int"
+put typemap "INT" "int"
+put typemap "INTEGER" "int"
+put typemap "TINYINT" "tinyint"
+put typemap "BIT" "bit"
+put typemap "BOOLEAN" "boolean"
+put typemap "LONG VARCHAR" "longstring"
+put typemap "VARCHAR" "string"
+put typemap "CHAR" "string"
+put typemap "BYTEA" "string"
+put typemap "BINARY" "string"
+put typemap "VARBINARY" "string"
+put typemap "CHARACTER" "string"
+put typemap "CHARACTER VARYING" "string"
+put typemap "MEDIUMTEXT" "string"
+put typemap "DOUBLE" "int"
+put typemap "DOUBLE PRECISION" "int"
+put typemap "SERIAL" "int"
+put typemap "BIGSERIAL" "int"
+
+# a map that contains mappings of foreign key references within the tables.
+# this is sort of hard to parse out of SQL. At some point in the future I will
+# look to have this map auto-generated.
+newhash "f_keys"
+put "f_keys" "BUCKETING_COLS.SD_ID" "SDS.SD_ID"
+put "f_keys" "COLUMNS.SD_ID" "SDS.SD_ID"
+put "f_keys" "COLUMNS_OLD.SD_ID" "SDS.SD_ID"
+put "f_keys" "COLUMNS_V2.CD_ID" "CDS.CD_ID"
+put "f_keys" "DATABASE_PARAMS.DB_ID" "DBS.DB_ID"
+put "f_keys" "DB_PRIVS.DB_ID" "DBS.DB_ID"
+put "f_keys" "FUNCS.DB_ID" "DBS.DB_ID"
+put "f_keys" "FUNC_RU.FUNC_ID" "FUNCS.FUNC_ID"
+put "f_keys" "IDXS.INDEX_TBL_ID" "TBLS.TBL_ID"
+put "f_keys" "IDXS.ORIG_TBL_ID" "TBLS.TBL_ID"
+put "f_keys" "IDXS.SD_ID" "SDS.SD_ID"
+put "f_keys" "INDEX_PARAMS.INDEX_ID" "IDXS.INDEX_ID"
+put "f_keys" "PARTITIONS.LINK_TARGET_ID" "PARTITIONS.PART_ID"
+put "f_keys" "PARTITIONS.SD_ID" "SDS.SD_ID"
+put "f_keys" "PARTITIONS.TBL_ID" "TBLS.TBL_ID"
+put "f_keys" "PARTITION_KEYS.TBL_ID" "TBLS.TBL_ID"
+put "f_keys" "PARTITION_KEY_VALS.PART_ID" "PARTITIONS.PART_ID"
+put "f_keys" "PARTITION_PARAMS.PART_ID" "PARTITIONS.PART_ID"
+put "f_keys" "PART_COL_PRIVS.PART_ID" "PARTITIONS.PART_ID"
+put "f_keys" "PART_COL_STATS.PART_ID" "PARTITIONS.PART_ID"
+put "f_keys" "PART_PRIVS.PART_ID" "PARTITIONS.PART_ID"
+put "f_keys" "ROLE_MAP.ROLE_ID" "ROLES.ROLE_ID"
+put "f_keys" "SDS.CD_ID" "CDS.CD_ID"
+put "f_keys" "SDS.SERDE_ID" "SERDES.SERDE_ID"
+put "f_keys" "SD_PARAMS.SD_ID" "SDS.SD_ID"
+put "f_keys" "SERDE_PARAMS.SERDE_ID" "SERDES.SERDE_ID"
+put "f_keys" "SKEWED_COL_NAMES.SD_ID" "SDS.SD_ID"
+put "f_keys" "SKEWED_COL_VALUE_LOC_MAP.SD_ID" "SDS.SD_ID"
+put "f_keys" "SKEWED_COL_VALUE_LOC_MAP.STRING_LIST_ID_KID" "SKEWED_STRING_LIST.STRING_LIST_ID"
+put "f_keys" "SKEWED_STRING_LIST_VALUES.STRING_LIST_ID" "SKEWED_STRING_LIST.STRING_LIST_ID"
+put "f_keys" "SKEWED_VALUES.SD_ID_OID" "SDS.SD_ID"
+put "f_keys" "SKEWED_VALUES.STRING_LIST_ID_EID" "SKEWED_STRING_LIST.STRING_LIST_ID"
+put "f_keys" "SORT_COLS.SD_ID" "SDS.SD_ID"
+put "f_keys" "TABLE_PARAMS.TBL_ID" "TBLS.TBL_ID"
+put "f_keys" "TAB_COL_STATS.TBL_ID" "TBLS.TBL_ID"
+put "f_keys" "TBLS.DB_ID" "DBS.DB_ID"
+put "f_keys" "TBLS.LINK_TARGET_ID" "TBLS.TBL_ID"
+put "f_keys" "TBLS.SD_ID" "SDS.SD_ID"
+put "f_keys" "TBL_COL_PRIVS.TBL_ID" "TBLS.TBL_ID"
+put "f_keys" "TBL_PRIVS.TBL_ID" "TBLS.TBL_ID"
+put "f_keys" "TXN_COMPONENTS.TC_TXNID" "TXNS.TXN_ID"
+put "f_keys" "TYPE_FIELDS.TYPE_NAME" "TYPES.TYPES_ID"
+
+# store generated values in map and print it to a file
+# resetting the values from any prior runs
+newhash "genvalues"
+
+# store names of the tables that have already been sorted
+# resetting the values from any prior runs
+newhash "sorted"
+
+skipParsing=0
+skipGeneration=0
+if [ "$3" = "--skipParsing" ]; then
+  debug "skipParsing is true"
+  skipParsing=1
+fi
+
+if [ "$3" = "--skipProcessing" ]; then
+  debug "skipProcessing is true"
+  skipParsing=1
+  skipGeneration=1
+fi
+
+VERSION_BASE="0.10.0"
+VERSION_CURR="0.10.0"
+SQL_OUT="/tmp/$DB_SERVER-$VERSION_CURR-create.sql"
+SQL_SORTED="/tmp/$DB_SERVER-$VERSION_CURR-sortedcreate.sql"
+SCRIPT_INSERT="$(pwd)/dbs/$DB_SERVER/$DB_SERVER-$VERSION_CURR-insert.sql"
+SCRIPT_QUERY="$(pwd)/dbs/$DB_SERVER/$DB_SERVER-$VERSION_CURR-query.sql"
+INSERTALL="$(pwd)/dbs/$DB_SERVER/$DB_SERVER-insertall.sql"
+QUERYALL="$(pwd)/dbs/$DB_SERVER/$DB_SERVER-queryall.sql"
+
+printSQL() {
+  echo $@ >> $SQL_OUT
+}
+
+# method to delete a file from the file system.
+rmfs() {
+  if [[ -e "$1" ]]
+  then
+    debug "Deleting generated file $1"
+    rm "$1"
+  fi
+}
+
+DATAFILE="$(pwd)/dataload.properties"
+
+# method that returns a static value loaded from the properties file for a column. These values are read from a
+# file on disk
+getStaticValue() {
+  grep -i "^$1=" "$DATAFILE" | cut -d'=' -f2
+}
+
# location of the upstream metastore upgrade scripts inside the hive source tree
HMS_UPGRADE_DIR="$(pwd)/../../metastore/scripts/upgrade"
if [ ! -d "$HMS_UPGRADE_DIR/$DB_SERVER" ]; then
  error "Error: $DB_SERVER upgrade scripts are not found on $HMS_UPGRADE_DIR."
  exit 1
fi

# full schema DDL of the base version; the starting point of the upgrade walk
HIVE_SCHEMA_BASE="$HMS_UPGRADE_DIR/$DB_SERVER/hive-schema-$VERSION_BASE.$DB_SERVER.sql"

SCRIPT_SRC="$HIVE_SCHEMA_BASE"

# fresh run: discard every derived file left over from previous runs
if [ "$skipParsing" -eq "0" ]; then
  rmfs "$SQL_OUT"
  rmfs "$SQL_SORTED"
  rmfs "$SCRIPT_INSERT"
  rmfs "$SCRIPT_QUERY"
  rmfs "$INSERTALL"
  rmfs "$QUERYALL"
else
  info "skipParsing flag enabled, skipping parsing for $SCRIPT_SRC"
fi

# parsing is reused but generation was requested: only the generated
# insert/query outputs need to be rebuilt
if [ "$skipGeneration" -eq "0" ] && [ "$skipParsing" -eq "1" ]; then
  rmfs "$SCRIPT_INSERT"
  rmfs "$SCRIPT_QUERY"
  rmfs "$INSERTALL"
  rmfs "$QUERYALL"
fi

#
# Before running any test, we need to prepare the environment.
# This is done by calling the prepare script. This script must
# install, configure and start the database server where to run
# the upgrade script.
#
+
# Run one SQL test file through the DB-specific execute wrapper script.
# Arguments: $1 - path of the .sql file to execute
# Globals:   SCRIPT_EXECUTE - wrapper invoked via bash -x -e
# Returns:   0 when the wrapper succeeds, 1 otherwise
execute_test() {
  log "Executing sql test: $DB_SERVER/$(basename $1)"
  bash -x -e $SCRIPT_EXECUTE $1 && return 0
  log "Test failed: $DB_SERVER/$(basename $1)"
  return 1
}
+
# this method parses the SQL schema file and generates a file with all the
# "create table" definitions, one per line. This file is later string-subbed
# to generate a file with a bunch of "insert into <table>" statements.
#
# Arguments: $1 - path of the schema/upgrade .sql file to parse
#            $2 - 0 to reset $SQL_OUT before parsing, non-zero to append
# Globals:   SQL_OUT (written via printSQL); FCOUNT (set by num_occurances)
parse_schema() {
  if [ -e "$SQL_OUT" ] && [ "$2" = 0 ]
  then
    debug "[parse_schema] Resetting position in file"
    rmfs "$SQL_OUT"
  fi

  info "[parse_schema] Parsing sql file: $1"
  CREATE_OPEN=false
  TABLE_DEF=""
  COUNTER=0
  # case-insensitive [[ ]] matching, e.g. for "create table" vs "CREATE TABLE"
  # NOTE(review): this shopt is never restored, so it leaks into every later
  # [[ ]] pattern match in the script
  shopt -s nocasematch

  while read line
  do
    copy=$line
    # skip SQL comment lines (those beginning with --). The pattern must be
    # unquoted: bash treats a quoted "^--" in =~ as a literal substring, so
    # the previous quoted form never matched and comments were not skipped.
    if [[ "$copy" =~ ^-- ]]
    then
      debug "[parse_schema] Comment, skipping ..."
      continue
    fi

    # The following code counts the number of open and close parenthesis within
    # each line read from the schema file. When the openparen count matches the
    # closeparen count, it assumes that the create table definition has ended.
    debug "[parse_schema] Processing $line with $COUNTER and $CREATE_OPEN"
    if [[ "$CREATE_OPEN" = true ]]
    then
      # inside a CREATE TABLE body: drop constraint lines, accumulate the rest
      ignore=$(is_throwaway "$copy")
      debug "ignore=$ignore=line=$copy="
      if [[ "$ignore" = "TRUE" ]]
      then
        debug "ignoring line"
        continue
      fi

      num_occurances "$copy" "("
      if [[ $FCOUNT -gt 0 ]]
      then
        COUNTER=$((COUNTER+FCOUNT))
        debug "[parse_schema] count=$COUNTER"
      fi

      num_occurances "$copy" ")"
      if [[ $FCOUNT -gt 0 ]]
      then
        if [[ "$COUNTER" -gt 1 ]]
        then
          debug "[parse_schema] Adding line to table: $line :: $COUNTER"
          TABLE_DEF="$TABLE_DEF $line"
          debug "[parse_schema] TABLEDEF=$TABLE_DEF"
          COUNTER=$((COUNTER-FCOUNT))
          debug "[parse_schema] count=$COUNTER"
        elif [[ "$COUNTER" -eq 1 ]]
        then
          # this ')' closes the CREATE TABLE: keep text up to it and flush
          copy=$line
          debug "[parse_schema] CLOSE FOUND $line"
          PART=`echo $copy | cut -d')' -f1 | sed 's/$/)/'`
          debug "[parse_schema] $PART appended to table def"
          TABLE_DEF="$TABLE_DEF $PART"
          COUNTER=0
          CREATE_OPEN=false
          debug "[parse_schema] Full Table definition is : $TABLE_DEF"
          printSQL "$TABLE_DEF"
        fi
      else
        debug "[parse_schema] Adding line to table: $line :: $COUNTER"
        TABLE_DEF="$TABLE_DEF $line"
        debug "[parse_schema] TABLEDEF=$TABLE_DEF"
      fi
    else
      if [[ "$copy" =~ "create table" ]];
      then
        debug "[parse_schema] OPEN FOUND $line"
        TABLE_DEF="$line"

        num_occurances "$copy" ")"
        local closeCount=$FCOUNT
        num_occurances "$copy" "("
        local openCount=$FCOUNT
        debug "[parse_schema] matches $openCount $closeCount"
        if [ "$openCount" -eq "$closeCount" ] && [ "$openCount" -gt 0 ]
        then
          # the whole CREATE TABLE fits on one line: emit it directly
          PART=`echo $copy | cut -d';' -f1 | sed 's/$/;/'`
          printSQL "$PART"
          COUNTER=0
          # fixed: was the misspelled (and therefore ineffective) CLOSE_OPEN
          CREATE_OPEN=false
          continue
        fi

        CREATE_OPEN=true
        if [[ "$FCOUNT" -gt 0 ]]
        then
          COUNTER=$((COUNTER+FCOUNT))
          debug "[parse_schema] count=$COUNTER"
        fi
      fi
    fi
  done < "$1";
}
+
# Predicate: succeed (status 0) when the argument is an integer. Used to
# decide the length of the column values that are to be generated.
# Diagnostics from 'test' on non-numeric input are discarded.
is_int() {
  test "$@" -eq "$@" > /dev/null 2>&1
}
+
# self-test to ensure is_int works as expected before anything relies on it.
# Call the predicate directly: the previous `if $(is_int …)` form ran it in a
# command substitution and then executed its empty output, which only worked
# by accident.
if is_int "500"
then
  if is_int "foo"
  then
    # "foo" must not be classified as an int; function not working properly
    exit 1
  fi
else
  # "500" must be classified as an int; function not working properly
  exit 1
fi
+
# Strip leading and trailing whitespace from the concatenated arguments and
# write the result (no trailing newline) to stdout.
trim() {
  local s="$*"
  s="${s#"${s%%[![:space:]]*}"}"   # drop leading whitespace
  s="${s%"${s##*[![:space:]]}"}"   # drop trailing whitespace
  printf '%s' "$s"
}
+
# Emit a random integer of the default digit count ($DEFAULT_INT_SIZE); a
# numeric argument smaller than the default caps the digit count instead.
int() {
  limit="$DEFAULT_INT_SIZE"
  if is_int "${1}" && [[ "$1" -lt "$limit" ]]
  then
    limit="$1"
  fi
  limit_int "$limit"
}
+
# Emit a random run of digits (1-9) of the requested length; falls back to
# length 5 when the argument is not a number. Only Darwin and Linux are
# handled -- on any other OS nothing is printed.
limit_int() {
  if is_int "${1}"
  then
    limit="$1"
  else
    limit=5
  fi

  case "$(uname)" in
    Darwin)
      cat /dev/urandom | env LC_CTYPE=C tr -dc '1-9' | fold -w ${limit:-15} | head -n 1
      ;;
    Linux*)
      cat /dev/urandom | tr -dc '1-9' | fold -w ${limit:-15} | head -n 1
      ;;
  esac
}
+
# Emit a random bit value: 0 or 1.
bit() {
  r_int=$(int)
  echo $(( r_int % 2 ))
}
+
# Emit a random boolean. Most databases get TRUE/FALSE, but derby has no
# native boolean/bit datatype -- it stores Y or N in a CHAR(1) instead.
boolean() {
  trueval="TRUE"
  falseval="FALSE"

  case "$DB_SERVER" in
    derby)
      trueval="Y"
      falseval="N"
      ;;
  esac

  r_bit=$(bit)
  case "$r_bit" in
    1) echo "$trueval" ;;
    *) echo "$falseval" ;;
  esac
}
+
# Random value that fits in a signed tinyint column: [0, 126].
tinyint() {
  r_int=$(int)
  echo $(( r_int % 127 ))
}
+
# Random value that fits in a smallint column: [0, 31999].
smallint() {
  r_int=$(int)
  echo $(( r_int % 32000 ))
}
+
# Hex-encode the argument. The here-string appends a newline, which is
# encoded as well, so the output always ends in "0a".
hex() {
  input="$1"
  xxd -pu <<< "$input"
}
+
# Emit a random alphanumeric string of the requested length (default 5 when
# the argument is not numeric). For derby a length-1 request yields a random
# Y/N instead, since derby models booleans as CHAR(1).
limit_string() {
  if is_int "${1}"
  then
    limit="$1"
  else
    limit=5
  fi

  if [[ "$DB_SERVER" = "derby" && "$limit" -eq 1 ]]
  then
    boolean
    return
  fi

  # NOTE(review): the two platforms draw from different alphabets (Darwin has
  # no upper case) -- presumably unintentional; behavior kept as-is.
  case "$(uname)" in
    Darwin)
      cat /dev/urandom | env LC_CTYPE=C tr -dc 'a-z0-9' | fold -w ${limit:-32} | head -n 1
      ;;
    Linux*)
      cat /dev/urandom | tr -dc 'a-z0-9A-Z' | fold -w ${limit:-32} | head -n 1
      ;;
  esac
}
+
# Emit a random string of the default length ($DEFAULT_STRING_SIZE); a
# numeric argument smaller than the default caps the length instead.
string()
{
  limit="$DEFAULT_STRING_SIZE"
  if is_int "${1}" && [[ "$1" -lt "$limit" ]]
  then
    limit="$1"
  fi
  limit_string "$limit"
}
+
# Decide whether a line inside a CREATE TABLE body is a key/constraint
# specification rather than a column definition. Echoes TRUE for lines that
# start with PRIMARY KEY or contain FOREIGN KEY, FALSE otherwise.
is_throwaway() {
  coldef=$(trim "$1")
  if [[ "$coldef" == PRIMARY\ KEY* || "$coldef" == *"FOREIGN KEY"* ]]
  then
    echo "TRUE"
  else
    echo "FALSE"
  fi
}
+
# Extract the column name from a column definition: the first
# whitespace-separated token after trimming.
extract_colname() {
  coldef=$(trim "$1")
  set -- $coldef
  echo "$1"
}
+
# Quote a column name per the target DB's convention: backticks for mysql,
# double quotes for derby, and bare everywhere else.
localize_colname() {
  coldef=$(trim "$1")
  case "$DB_SERVER" in
    mysql) echo "\`$coldef\`" ;;
    derby) echo "\"$coldef\"" ;;
    *)     echo "$coldef" ;;
  esac
}
+
# Extract the bare table name (quotes stripped) from a create statement in
# the parsed SQL. Assumes the third space-separated token is the table name;
# derby names may carry an APP. schema prefix, which is removed.
# ex: CREATE TABLE DBS ( DB_ID bigint(20) ) -> DBS
extract_tablename() {
  local raw
  if [[ "$DB_SERVER" = "derby" ]]
  then
    raw=$(echo "$1" | cut -d' ' -f3 | cut -d'(' -f1 | sed -e 's/APP.//g')
  else
    raw=$(echo "$1" | cut -d'(' -f1 | cut -d' ' -f3)
  fi
  echo "$raw" | tr -d '"'
}
+
# this function inserts the comma-separated column names for the table
# into the insert statement.
# Arguments: $1 - the SQL insert statement; $2 - "( col, ... )" list
# ex: INSERT INTO foo VALUES (1, "ABC")
# becomes
# INSERT INTO foo (id, name) VALUES (1, "ABC")
insert_column_names() {
  line="$1"
  colnames="$2"
  # pure-bash global replacement: unlike the previous sed pipeline this
  # cannot be broken by sed metacharacters ('&', '/') in the column list,
  # and it avoids spawning a process per statement
  merged="${line// VALUES / ${colnames} VALUES }"
  echo "$merged"
}
+
# Like extract_tablename, but keeps whatever quoting the schema used around
# the table name. Needed for postgres, where quoting controls the case of
# the created table (CREATE TABLE "TXNS" vs CREATE TABLE TXNS).
extract_raw_tablename() {
  local raw
  if [[ "$DB_SERVER" = "derby" ]]
  then
    raw=$(echo "$1" | cut -d' ' -f3 | cut -d'(' -f1 | sed -e 's/APP.//g')
  else
    raw=$(echo "$1" | cut -d'(' -f1 | cut -d' ' -f3)
  fi
  echo "$raw"
}
+
# Column length from a definition such as "varchar(256)" -> 256.
# Definitions without a parenthesised size default to 100.
extract_columnsize() {
  case "$1" in
    *\(*)
      local size="${1#*\(}"    # text after the first '('
      size="${size%%\(*}"      # stop at a second '(' (mimics cut -f2)
      echo "${size%%\)*}"      # trim everything from the first ')'
      ;;
    *)
      echo "100"
      ;;
  esac
}
+
# this functions reads all the create table statements from the file generated by
# parse_schema and then sorts them according to the order in which the insert
# statements are to be generated. It uses the foreign key references to sort the tables.
# Any tables without FK references appear above the tables with FKs to these.
# if table B's column has a FK reference to table A's column, this sort function
# guarantees that table A appears before table B in the sorted file.
#
# Arguments: $1 - 0 to reset $SQL_SORTED first, non-zero to append to it
# Globals:   reads $SQL_OUT, writes $SQL_SORTED; relies on the hash helpers
#            (newhash/put/delete/size/firstKey/firstValue/containsKey) and on
#            get_referenced_tables, all defined earlier in this script.
# NOTE(review): a circular FK chain in the schema would make the while loop
# below spin forever, since every table would be deferred indefinitely.
sort_tabledefs() {
  if [[ -e $SQL_SORTED ]] && [[ "$1" = 0 ]]
  then
    debug "[sort] Resetting the position in file"
    rmfs "$SQL_SORTED"
    #return
  fi

  if [ ! -e $SQL_OUT ]
  then
    info "[sort] $SQL_OUT does not exist, returning .."
    return
  fi

  info "[sort] Sorting table definitions based on dependencies"
  # hash of table name -> its (single-line) create statement, not yet sorted
  newhash "unsorted"

  while read line
  do
    table=$(extract_tablename "$line")
    debug "[sort] Extracted table name=$table from $line"
    put "unsorted" "$table" "$line"
  done < $SQL_OUT;

  table=""
  todo=$(size "unsorted")
  debug "[sort] $todo entries are to be sorted"
  # NOTE(review): [[ "$todo" > 0 ]] is a lexicographic string comparison, not
  # a numeric one; it happens to work because sizes are non-negative integers.
  while [[ "$todo" > 0 ]]
  do
    table=$(firstKey "unsorted")
    line=$(firstValue "unsorted")
    debug "[sort] table name $table"
    references=$(get_referenced_tables "f_keys" "$table")
    debug "[sort] table $table references $references"
    if [[ ! "$references" = "" ]]
    then
      # the table may only be committed once every table it references has
      # already been committed to the sorted output
      commit="true"
      OLDIFS=$IFS
      IFS=',' eval 'array=($references)'
      IFS=$OLDIFS
      for i in "${!array[@]}"
      do
        hashsize=$(size "sorted")
        if [[ "$hashsize" > 0 ]]
        then
          hasKey=$(containsKey "sorted" "${array[i]}")
          debug "[sort] is table ${array[i]} sorted:$hasKey"
          if [[ ! "$hasKey" = true ]]
          then
            commit="false"
            break
          fi
        else
          commit="false"
        fi
      done
      debug "[sort] commit table $table:$commit"
      if [[ "$commit" = "true" ]]
      then
        echo "$line" >> $SQL_SORTED
        put "sorted" "$table" "$line"
        debug "[sort] deleting table $table from unsorted"
        delete "unsorted" "$table"
        debug "[sort] table $table committed"
      else
        # defer: move the entry to the back of the unsorted hash and retry
        # after its dependencies have been committed
        delete "unsorted" "$table"
        put "unsorted" "$table" "$line"
        debug "[sort] table $table deferred"
      fi
    else
      # no FK references: can be committed immediately
      echo "$line" >> $SQL_SORTED
      put "sorted" "$table" "$line"
      delete "unsorted" "$table"
      debug "[sort] table $table committed as it has no references"
    fi

    todo=$(size "unsorted")
    debug "[sort] todo size=$todo"
  done
  info "[sort] Sorting Complete!!!"
}
+
# this functions performs some cleansing on the SQL statements to a unified format.
# for example, mysql schema file contains "create table if not exists" while PG and derby
# just have "create table"
# Globals: rewrites $SQL_OUT in place via the $TMP/$TMP2 scratch files
cleanse_sql() {
  info "[Cleansing] Cleansing data"
  TMP="/tmp/insert-$DB_SERVER-$VERSION_CURR-tmp.sql"
  TMP2="/tmp/insert-$DB_SERVER-$VERSION_CURR-tmp2.sql"
  # remove stale scratch files from a previous (possibly aborted) run: the
  # postgres branch below APPENDS to $TMP, so leftovers would corrupt output
  rm -f -- "$TMP" "$TMP2"
  if [[ -e "$SQL_OUT" ]]
  then
    if [[ "$DB_SERVER" = "postgres" ]]
    then
      # TODO perhaps there is a better means to do this with just cat and set
      # but due to inconsistencies in the schema files for postgres, some tables are upper case and some tables are lowercase
      # CREATE TABLE TXNS --> creates a lower case "txns" table
      # CREATE TABLE "TXNS" --> creates a upper case "TXNS" table
      # so we have retain the quotes or lack thereof from the original create table command.
      # read -r so backslashes in the schema text are not interpreted
      while read -r line; do
        # so we cut the statement into 2 parts and only massage the second part.
        part1="$( cut -d '(' -f 1 <<< "$line" )"
        part1modified=$(echo $part1 | sed -e 's/IF\ NOT\ EXISTS\ //g')
        part2="$( cut -d '(' -f 2- <<< "$line" )"
        echo "$part1modified( $part2" >> $TMP
      done < $SQL_OUT
    else
      # other DBs: strip all quoting and the IF NOT EXISTS clause outright
      cat $SQL_OUT | sed -e 's/"//g' | sed -e 's/`//g' | sed -e 's/IF\ NOT\ EXISTS\ //g' > $TMP
    fi

    if [[ "$DB_SERVER" = "mysql" ]]
    then
      # drop trailing ", PRIMARY KEY (...)" clauses from mysql table defs
      cat $TMP | tr -s ' ' | sed 's/,\ PRIMARY\ KEY.*$/)/g' > $TMP2
      mv $TMP2 $TMP
    fi
    mv $TMP $SQL_OUT
  fi
  info "[Cleansing] Cleansing complete!!"
}
+
# Third and final step: convert the dependency-sorted "create table" file into
#   1) $SCRIPT_INSERT - INSERT statements whose column values are looked up in
#      the static data file, copied from referenced FK values, or randomly
#      generated according to the datatype determined in step 1; and
#   2) $SCRIPT_QUERY / $QUERYALL - SELECT COUNT(*) probes used after the
#      metastore upgrade to verify that the inserted rows survived intact.
# Arguments: $1 - 0 to reset the output files first, non-zero to append
generate_data() {
  info "[generate_data] SQL inserts will be saved to $SCRIPT_INSERT, SQL query to $SCRIPT_QUERY"
  if [ -e $SCRIPT_INSERT ] && [ -e $SCRIPT_QUERY ] && [ "$1" = 0 ]
  then
    debug "[sort] Skipping data generation"
    rmfs "$SCRIPT_INSERT"
    rmfs "$SCRIPT_QUERY"
  fi

  # seed the per-version query file with all queries accumulated so far
  if [[ -e "$QUERYALL" ]]
  then
    cp "$QUERYALL" "$SCRIPT_QUERY"
  fi

  if [[ ! -e $SQL_SORTED ]]
  then
    info "[generate_data] $SQL_SORTED does not exist, returning .."
    return
  fi

  info "[generate_data] Generating table data for $DB_SERVER-$VERSION_CURR"
  INSERT="/tmp/$DB_SERVER-$VERSION_CURR-insert.sql"
  TMP="/tmp/insert-$DB_SERVER-$VERSION_CURR-tmp.sql"
  # rewrite "CREATE TABLE T (" skeletons into "INSERT INTO T VALUES ("
  cat $SQL_SORTED | sed -e 's/IF\ NOT\ EXISTS\ //g' | sed -e 's/CREATE\ TABLE/INSERT\ INTO/g' | sed -e 's/(/\ VALUES\ \(/' > $INSERT

  while read line
  do
    debug "[generate_data] Processing...$line"
    BEGIN=`echo $line | cut -d'(' -f1 | sed 's/$/(/'`
    insert_cols="("
    select="SELECT COUNT(*) FROM"
    REST=`echo $line | cut -d'(' -f2-`
    table=$(extract_raw_tablename "$BEGIN")
    select="$select $table where"
    table=`echo $table | tr -d '"'`
    # split the column definitions on commas
    OLDIFS=$IFS
    IFS=',' eval 'array=($REST)'
    IFS=$OLDIFS
    hasOpen=0
    hasClose=0
    tokenbuffer=""
    for i in "${!array[@]}"
    do
      token="${array[i]}"
      debug "[generate_data] Processing table:$table, token=>${array[i]},hasOpen=$hasOpen,hasClose=$hasClose"

      debug "token value before extracting column=$token"
      colname=$(extract_colname "$token")
      unquotedColname=$(echo $colname | sed -e 's/"//g')
      sqlColname=$(localize_colname "$colname")

      debug "[generate_data] Processing table:$table, column:$colname, tablename.columnname is $table.$colname"

      # the value for this column is generated by the DB. The column def for such DB managed
      # columns is very different. A value of default has to be supplied for this key.
      if [[ "$table.$colname" == "MASTER_KEYS.KEY_ID" ]]
      then
        info "$table.$colname is generated value"
        BEGIN="$BEGIN DEFAULT,"
        insert_cols="$insert_cols $sqlColname,"
        continue
      fi

      for type in "${!TYPES[@]}"
      do
        fk=""
        val=""
        # TODO we are over-iterating here for each column
        debug "[generate_data] column name=$colname"
        # 1st choice: a pre-seeded value from dataload.properties
        staticValue=$(getStaticValue "$table.$colname")
        if [[ ! "$staticValue" = "" ]]
        then
          info "pre-loaded value for $table.$colname is $staticValue"
          put "genvalues" "$table.$colname" "$staticValue"
          BEGIN="$BEGIN $staticValue,"
          insert_cols="$insert_cols $sqlColname,"
          if [ ! "$table.$colname" = "TBLS.LINK_TARGET_ID" ] && [ ! "$table.$colname" = "PARTITIONS.LINK_TARGET_ID" ]
          then
            select="$select $sqlColname = $staticValue and"
          fi
          continue 2
        fi

        contains "$token" "${TYPES[type]}"
        if [[ $isContains -eq 1 ]]
        then
          mappedtype=$(get "typemap" "${TYPES[type]}")
          # string comparison; the original -eq compared both sides as
          # arithmetic expressions (each evaluating to 0) and was always true
          if [[ "$DB_SERVER" = "derby" ]]
          then
            contains "$token" "for bit data"
            if [[ $isContains -eq 1 ]]
            then
              mappedtype="derbybinary"
            fi
          fi

          debug "[generate_data] column type for $table.$colname is ${TYPES[type]}, mapped type=$mappedtype"
          colsize=$(extract_columnsize "$token")
          debug "[generate_data] columnsize=$colsize for $table.$colname"
          # 2nd choice: copy the value already generated for the FK target
          fk=$(get "f_keys" "$table.$unquotedColname")
          if [[ ! "$fk" = "" ]]
          then
            val=$(get "genvalues" "$fk")
            info "[generate_data] column $table.$unquotedColname references $fk whose value is $val"
          fi

          if [[ "$val" != "" ]]
          then
            BEGIN="$BEGIN $val,"
            insert_cols="$insert_cols $sqlColname,"
            if [ ! "$table.$colname" = "TBLS.LINK_TARGET_ID" ] && [ ! "$table.$colname" = "PARTITIONS.LINK_TARGET_ID" ]
            then
              select="$select $sqlColname = $val and"
            fi
            continue 2
          fi

          # 3rd choice: generate a random value matching the mapped type
          info "[generate_data] Generating random value for column $table.$unquotedColname"
          case "$mappedtype" in
            int )
              if [[ "$val" = "" ]]
              then
                val=$(int "$colsize")
                put "genvalues" "$table.$unquotedColname" "$val"
              fi
              ;;
            longstring )
              if [[ "$val" = "" ]]
              then
                val=$(string "$colsize")
                val="'$val'"
                put "genvalues" "$table.$unquotedColname" "$val"
              fi

              # long columns are compared TRIMmed to sidestep padding issues
              BEGIN="$BEGIN $val,"
              insert_cols="$insert_cols $sqlColname,"
              select="$select TRIM($sqlColname) = TRIM($val) and"
              continue 2
              ;;
            string )
              if [[ "$val" = "" ]]
              then
                val=$(string "$colsize")
                val="'$val'"
                put "genvalues" "$table.$unquotedColname" "$val"
              fi
              ;;
            bit )
              if [[ "$val" = "" ]]
              then
                val=$(bit)
                put "genvalues" "$table.$unquotedColname" "$val"
              fi

              # bit columns need an explicit CAST in the verification query
              BEGIN="$BEGIN $val,"
              insert_cols="$insert_cols $sqlColname,"
              select="$select CAST( $sqlColname AS UNSIGNED) = '$val' and"
              continue 2
              ;;
            boolean )
              if [[ "$val" = "" ]]
              then
                val=$(boolean)
                put "genvalues" "$table.$unquotedColname" "$val"
              fi
              ;;
            tinyint )
              if [[ "$val" = "" ]]
              then
                val=$(tinyint)
                put "genvalues" "$table.$unquotedColname" "$val"
              fi
              ;;
            smallint )
              if [[ "$val" = "" ]]
              then
                val=$(smallint)
                put "genvalues" "$table.$unquotedColname" "$val"
              fi
              ;;
            derbybinary )
              if [[ "$val" = "" ]]
              then
                val=$(string "$colsize")
                val=$(hex "$val")
                val="X'$val'"
                put "genvalues" "$table.$unquotedColname" "$val"
              fi
              ;;
          esac

          BEGIN="$BEGIN $val,"
          insert_cols="$insert_cols $sqlColname,"
          if [ ! "$table.$unquotedColname" = "TBLS.LINK_TARGET_ID" ] && [ ! "$table.$unquotedColname" = "PARTITIONS.LINK_TARGET_ID" ]
          then
            select="$select $sqlColname = $val and"
          fi
          break
        fi
      done
    done
    # trim the trailing "," / " and" and close the statements
    BEGIN="${BEGIN: 0:${#BEGIN} - 1} );"
    insert_cols="${insert_cols: 0:${#insert_cols} - 1} )"
    foo=$(insert_column_names "$BEGIN" "$insert_cols")
    select="${select: 0:${#select} - 3} ;"
    echo "$foo" >> $SCRIPT_INSERT
    echo "$select" >> $SCRIPT_QUERY
    echo "$select" >> $QUERYALL
  done < $INSERT;
}
+
# sanity check: the base-version schema DDL must exist before processing starts
if [ ! -f $HIVE_SCHEMA_BASE ]; then
  log "Error: $HIVE_SCHEMA_BASE is not found."
  exit 1
fi
+
# Full pipeline for one schema file: parse, cleanse, dependency-sort, then
# generate insert/query data. Parsing is skipped when --skipParsing was given
# and the intermediate files already exist.
# Arguments: $1 - schema file path; $2 - append flag forwarded to the stages
processFile() {

  debug "[processFile] $VERSION_CURR"
  debug "parsed sql output file is $SQL_OUT, sorted file is $SQL_SORTED"
  debug "generated sql script is $SCRIPT_INSERT, query file is $SCRIPT_QUERY"

  if [[ -e "$SQL_OUT" && -e "$SQL_SORTED" && "$skipParsing" -eq 1 ]]
  then
    info "skipParsing flag enabled, skipping parsing for $HIVE_SCHEMA_BASE"
  else
    debug "Invoking the parsing routine for [$1]"
    parse_schema "$1" "$2"
    debug "Invoking the cleansing routine for[$1]"
    cleanse_sql
    debug "Sorting tables for [$1]"
    sort_tabledefs "$2"
  fi

  if [[ -e "$SQL_SORTED" && "$skipGeneration" -eq 0 ]]
  then
    debug "Invoking the generate routine for[$1]"
    generate_data "$2"
  fi

  debug "[$1] DONE!!"
}
+
# Parse one upgrade sub-script into $SQL_OUT, reusing the previously parsed
# output when --skipParsing was given and both intermediate files exist.
# Arguments: $1 - sub-script path; $2 - append flag for parse_schema
parseAndAppend()
{
  info "[parseAndAppend] $SQL_OUT:$SCRIPT_INSERT:$SQL_SORTED for $VERSION_CURR"
  if [[ "$skipParsing" -eq 1 && -e "$SQL_OUT" && -e "$SQL_SORTED" ]]
  then
    info "[parseAndAppend] Files exist, skipping parsing for $VERSION_CURR"
    return
  fi

  parse_schema "$1" "$2"
}
+
# Cleanse and dependency-sort the parsed SQL (unless already sorted and
# --skipParsing was given), then generate insert/query data from it.
# Arguments: $1 - append flag forwarded to sort_tabledefs/generate_data
sortAndGenerate()
{
  debug "sortAndGenerate for $SQL_SORTED::$SCRIPT_INSERT"
  if [[ "$skipParsing" -eq 1 && -e "$SQL_SORTED" ]]
  then
    debug "$SQL_SORTED exists and skipParsing is true, skipping sorting."
  else
    cleanse_sql
    sort_tabledefs "$1"
  fi

  if [[ "$skipGeneration" -eq 0 && -e "$SQL_SORTED" ]]
  then
    generate_data "$1"
  else
    debug "SQL file not found or skipGeneration is true, skipping generation..."
  fi
}
+
# Delete all per-version derived files so the next stage starts clean.
clearFiles() {
  local f
  for f in "$SQL_OUT" "$SQL_SORTED" "$SCRIPT_INSERT" "$SCRIPT_QUERY"
  do
    rmfs "$f"
  done
}
+
# Step 1: parse the full base-version schema and generate its data files
info "Processing $HIVE_SCHEMA_BASE"
processFile "$HIVE_SCHEMA_BASE" "0"

# collect this version's inserts into the cumulative master file
if [[ -e "$SCRIPT_INSERT" ]]
  then
    debug "Appending file to master:$SCRIPT_INSERT"
    cat "$SCRIPT_INSERT" >> "$INSERTALL"
fi

# ordered list of upgrade steps, e.g. "0.10.0-to-0.11.0", one per line
FILE_LIST="$HMS_UPGRADE_DIR/$DB_SERVER/upgrade.order.$DB_SERVER"

# each DB's upgrade driver includes its sub-scripts with a different
# statement; lines starting with this prefix name the .sql files to parse
prefix=""
  case "$DB_SERVER" in
    mysql)
      # TODO perform a case-insensitive match
      prefix="SOURCE"
      ;;
    postgres)
      prefix="i\ "
      ;;
    derby)
      # TODO perform a case-insensitive match
      prefix="RUN"
      ;;
    oracle)
      prefix="NOTSUPPORTED"
      ;;
    mssql)
      prefix="NOTSUPPORTED"
      ;;
  esac

# iterate thru every upgrade file listed in the upgrade.order.<db> file and for each
# upgrade file, parse the schema file and generate the data and a query file.
while read order
do
  # only process the step that continues from the version reached so far
  curr="$VERSION_CURR-to-"
  if test "${order#*$curr}" != "$order"
  then
    # advance VERSION_CURR and re-point all per-version file names at it
    VERSION_CURR="${order#*$curr}"
    SQL_OUT="/tmp/$DB_SERVER-$VERSION_CURR-create.sql"
    SQL_SORTED="/tmp/$DB_SERVER-$VERSION_CURR-sortedcreate.sql"
    SCRIPT_INSERT="$(pwd)/dbs/$DB_SERVER/$DB_SERVER-$VERSION_CURR-insert.sql"
    SCRIPT_QUERY="$(pwd)/dbs/$DB_SERVER/$DB_SERVER-$VERSION_CURR-query.sql"
    upgrade="$HMS_UPGRADE_DIR/$DB_SERVER/upgrade-$order.$DB_SERVER.sql"
    append=0

    if [ -e $SQL_OUT ] && [ -e $SCRIPT_INSERT ] && [ -e $SQL_SORTED ] && [ "$skipParsing" -eq "1" ] && [ "$skipGeneration" -eq "1" ];
    then
      info "Files exist, Skipping parsing/generation for $VERSION_CURR"
      continue
    fi

    if [ "$skipParsing" -eq "0" ];
    then
      clearFiles
    elif [ "$skipGeneration" -eq "0" ];
    then
      rmfs "$SCRIPT_INSERT"
      rmfs "$SCRIPT_QUERY"
    fi

    # scan the upgrade driver script for included sub-scripts, parsing each
    while read line
    do
      debug "$line"
      # NOTE(review): the regex is unanchored, so $prefix occurring anywhere
      # in the line (not only at the start) is accepted
      if [[ $line =~ $prefix.* ]]
      then
        debug "upgrade.order processing $line prefix=$prefix"
        minisql=`echo $line | cut -d' ' -f2- | cut -d';' -f1`
        # strip surrounding single quotes, if present
        minisql="${minisql%\'}"
        minisql="${minisql#\'}"
        info "Processing $DB_SERVER/$minisql"
        parseAndAppend "$HMS_UPGRADE_DIR/$DB_SERVER/$minisql" "$append"
        append=1
      fi
    done < $upgrade;

    sortAndGenerate "$append"

    # collect this version's inserts into the cumulative master file
    if [[ -e $SCRIPT_INSERT ]]
    then
      info "Appending file to master $SCRIPT_INSERT"
      cat "$SCRIPT_INSERT" >> "$INSERTALL"
    fi

    append=0
  fi
done < $FILE_LIST;

info "Hive Metastore data validation test successful."


Mime
View raw message