Subject: svn commit: r1494354 - /incubator/climate/trunk/rcmet/src/main/python/rcmes/toolkit/metrics.py
Date: Tue, 18 Jun 2013 22:41:44 -0000
From: huikyole@apache.org
To: commits@climate.incubator.apache.org
Reply-To: dev@climate.incubator.apache.org
Message-Id: <20130618224144.F0B1B23888D2@eris.apache.org>

Author: huikyole
Date: Tue Jun 18 22:41:44 2013
New Revision: 1494354

URL: http://svn.apache.org/r1494354
Log:
Review Board #11873: metrics improvements through vectorization.
Committing this for Alex until his svn access is approved.

Modified:
    incubator/climate/trunk/rcmet/src/main/python/rcmes/toolkit/metrics.py

Modified: incubator/climate/trunk/rcmet/src/main/python/rcmes/toolkit/metrics.py
URL: http://svn.apache.org/viewvc/incubator/climate/trunk/rcmet/src/main/python/rcmes/toolkit/metrics.py?rev=1494354&r1=1494353&r2=1494354&view=diff
==============================================================================
--- incubator/climate/trunk/rcmet/src/main/python/rcmes/toolkit/metrics.py (original)
+++ incubator/climate/trunk/rcmet/src/main/python/rcmes/toolkit/metrics.py Tue Jun 18 22:41:44 2013
@@ -21,96 +21,38 @@ Module storing functions to calculate st
 import subprocess
 import os, sys
+import datetime
 import numpy as np
 import numpy.ma as ma
 import scipy.stats as stats
-import scipy.stats.mstats as mstats
-import datetime
 
 from toolkit import plots, process
 from utils import misc
 from storage import files
 
-def calcAnnualCycleMeans(dataset1, times):
+def calcAnnualCycleMeans(dataset1):
     '''
     Purpose::
        Calculate annual cycle in terms of monthly means at every grid point.
 
     Input::
        dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array timeseries
-       times - an array of python datetime objects
 
     Output::
        means - if 3d numpy was entered, 3d (# of months,lat,lon), if 1d data entered
       it is a timeseries of the data of length # of months
    '''
-
-    # note: this routine is identical to 'calc_annual_cycle_means': must be converted to calculate the annual mean
-    # Extract months from time variable
-    months = np.empty(len(times))
-
-    for t in np.arange(len(times)):
-        months[t] = times[t].month
-
-    if dataset1.ndim == 3:
-        means = ma.empty((12, dataset1.shape[1], dataset1.shape[2])) # empty array to store means
-        # Calculate means month by month
-        for i in np.arange(12)+1:
-            means[i - 1, :, :] = dataset1[months == i, :, :].mean(0)
-
-    if dataset1.ndim == 1:
-        means = np.empty((12)) # empty array to store means
-        # Calculate means month by month
-        for i in np.arange(12)+1:
-            means[i - 1] = dataset1[months == i].mean(0)
-
-    return means
-
-'''
-def calcClimMonth(dataset1, times):
-
-    Purpose::
-        Calculate annual cycle in terms of monthly means at every grid point.
-
-    Input::
-        dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array timeseries
-        times - an array of python datetime objects
-
-    Output:: means - if 3d numpy was entered, 3d (# of months,lat,lon), if 1d data entered
-    it is a timeseries of the data of length # of months
-
-    Same as calcAnnulCycleMeans
-    Deprivable as not called anywhere in the code. 5 Jun 2013.
-
-    months = np.empty(len(times))
-
-    for t in np.arange(len(times)):
-        months[t] = time[t].month
-
-    if dataset1.ndim == 3:
-        means = ma.empty((12, dataset1.shape[1], dataset1.shape[2])) # empty array to store means
-        # Calculate means month by month
-        for i in np.arange(12)+1:
-            means[i - 1, :, :] = dataset1[months == i, :, :].mean(0)
-
-    if dataset1.ndim == 1:
-        means = np.empty((12)) # empty array to store means
-        # Calculate means month by month
-        for i in np.arange(12)+1:
-            means[i - 1] = dataset1[months == i].mean(0)
-
+    data = misc.reshapeMonthlyData(dataset1)
+    means = data.mean(axis = 0)
     return means
-'''
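Every rewritten routine in this diff leans on misc.reshapeMonthlyData(), which is not shown here. Presumably it folds the flat monthly time axis into separate year and month axes; a minimal sketch of such a helper, assuming the input holds whole consecutive years of monthly data (the body below is an illustration, not the actual utils/misc code):

    import numpy.ma as ma

    def reshapeMonthlyData(dataset):
        # Assumed behavior: fold a (12 * nYears, lat, lon) array, or a 1d
        # (12 * nYears,) timeseries, into (nYears, 12, ...) so that axis 0
        # indexes years and axis 1 indexes calendar months.
        nYears = dataset.shape[0] / 12    # Python 2 integer division
        return ma.asarray(dataset).reshape((nYears, 12) + dataset.shape[1:])

With that layout, mean(axis = 0) averages the same calendar month across years (the annual cycle), while mean(axis = 1) averages the twelve months within each year (the annual-mean time series).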
 
-def calcClimYear(nYR, dataset1, times):
+def calcClimYear(dataset1):
     '''
     Purpose::
        Calculate annual mean timeseries and climatology for both 2-D and point time series.
 
     Input::
-       nYR - an integer for the number of years
        dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array timeseries
-       times - an array of python datetime objects
 
     Output::
        tSeries - if 3d numpy was entered, 3d (nYr,lat,lon), if 1d data entered
@@ -118,264 +60,82 @@ def calcClimYear(nYR, dataset1, times):
        means - if 3d numpy was entered, 2d (lat,lon), if 1d data entered
       it is a floating point number representing the overall mean
    '''
-    # Extract months from time variable
-    nT = dataset1.shape[0]
-    ngrdY = dataset1.shape[1]
-    ngrdX = dataset1.shape[2]
-    yy = times.year
-    mm = times.month
-
-    #np.empty(nT)
-    #mm = np.empty(nT)
-
-    #for t in np.arange(nT):
-    #    yy[t] = time[t].year
-    #    mm[t] = time[t].month
-
-    if dataset1.ndim == 3:
-        tSeries = ma.zeros((nYR, ngrdY, ngrdX))
-        #i = 0
-        for i, myunit in enumerate(np.unique(yy)):
-            dataTemporary = dataset1[(yy == myunit), :, :]
-            tSeries[i, :, :] = ma.average(dataTemporary, axis = 0)
-            #print 'data.shape= ',data.shape,' i= ',i,' yy= ',yy
-        means = ma.average(tSeries, axis = 0)
-
-    elif dataset1.ndim == 1:
-        tSeries = ma.zeros((nYR))
-        for i, myunit in enumerate(np.unique(yy)):
-            dataTemporary = dataset1[(yy == myunit)]
-            tSeries[i] = ma.average(dataTemporary, axis = 0)
-            #print 'data.shape= ',data.shape,' i= ',i,' yy= ',yy
-        means = ma.average(tSeries, axis = 0)
-
+    data = misc.reshapeMonthlyData(dataset1)
+    tSeries = data.mean(axis = 1)
+    means = tSeries.mean(axis = 0)
     return tSeries, means
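For orientation, a shape walk-through of the vectorized calcClimYear, using a plain reshape in place of the assumed helper (illustrative only):

    import numpy as np

    dataset1 = np.random.rand(120, 45, 90)     # 10 years of monthly (t, lat, lon) data
    data = dataset1.reshape(10, 12, 45, 90)    # what reshapeMonthlyData presumably returns
    tSeries = data.mean(axis = 1)              # (10, 45, 90): one annual mean per year
    means = tSeries.mean(axis = 0)             # (45, 90): the overall climatology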
 
-def calcClimSeason(monthBegin, monthEnd, dataset1, times):
+def calcClimSeason(monthBegin, monthEnd, dataset1):
     '''
     Purpose ::
        Calculate seasonal mean timeseries and climatology for both 3-D and point time series.
        For example, to calculate DJF mean time series, monthBegin = 12, monthEnd = 2
       This can handle monthBegin=monthEnd i.e. for climatology of a specific month
 
     Input::
        monthBegin - an integer for the beginning month (Jan =1)
        monthEnd - an integer for the ending month (Jan = 1)
        dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array timeseries
-       times - an array of python datetime objects
 
     Output::
        tSeries - if 3d numpy was entered, 3d (number of years/number of years -1 if monthBegin > monthEnd,lat,lon),
        if 1d data entered it is a timeseries of the data of length number of years
        means - if 3d numpy was entered, 2d (lat,lon), if 1d data entered
       it is a floating point number representing the overall mean
    '''
-    #-------------------------------------------------------------------------------------
-    # Calculate seasonal mean timeseries and climatology for both 2-d and point time series
-    # The season to be calculated is defined by moB and moE; moE>=moB always
-    #-------------------------------------------------------------------------------------
-    # Extract months from time variable
-    nT = dataset1.shape[0]
-    ngrdY = dataset1.shape[1]
-    ngrdX = dataset1.shape[2]
-    mm = times.month
-
-    indexBeginMonth=np.where(mm==monthBegin) # beginning month of each year
-    indexEndMonth=np.where(mm==monthEnd) # ending month of each year
-    # For special case where monthEnd is smaller then monthBegin e.g. DJF
-    indexB2=np.array(indexBeginMonth[0])
-    indexE2=np.array(indexEndMonth[0])
-    indexBeginMonth=indexB2[np.where(indexB2 < indexE2.max())]
-    indexEndMonth=indexE2[np.where(indexE2 > indexB2.min())]
-
-    if dataset1.ndim == 3:
-        tSeries = ma.zeros((len(indexBeginMonth), ngrdY, ngrdX))
-
-        for i,myunit in enumerate(np.arange(len(indexBeginMonth))):
-            dataTemporary = dataset1[indexBeginMonth[i]:indexEndMonth[i]+1, :, :]
-            tSeries[i, :, :] = ma.average(dataTemporary, axis = 0)
-            #print 'data.shape= ',data.shape,' i= ',i,' yy= ',yy
-
-        means = ma.average(tSeries, axis = 0)
-
-    elif dataset1.ndim == 1:
-        tSeries = ma.zeros((len(indexBeginMonth)))
-        for i, myunit in enumerate(np.arange(len(indexBeginMonth))):
-            dataTemporary = dataset1[indexBeginMonth[i]:indexEndMonth[i]+1]
-            tSeries[i] = ma.average(dataTemporary, axis = 0)
-            #print 'data.shape= ',data.shape,' i= ',i,' yy= ',yy
-
-        means = ma.average(tSeries, axis = 0)
-    return tSeries, means
-
-'''
-def calc_clim_mo(nYR, nT, ngrdY, ngrdX, t2, time):
-
-    Purpose::
-        Calculate monthly means at every grid points and the annual time series of single model.
-
-    Input::
-        monthBegin - an integer for the beginning month (Jan =1)
-        monthEnd - an integer for the ending month (Jan = 1)
-        dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array timeseries
-        times - an array of python datetime objects
-
-    Output::
-        tSeries - if 3d numpy was entered, 3d (number of years/number of years -1 if monthBegin > monthEnd,lat,lon),
-        if 1d data entered it is a timeseries of the data of length number of years
-        means - if 3d numpy was entered, 2d (lat,lon), if 1d data entered
-        it is a floating point number representing the overall mean
-
-    Deprivable because this funtionality has been handled in the general function calcClimSeason
-
-    #-------------------------------------------------------------------------------------
-    # JK20: This routine is modified from 'calc_clim_month' with additional arguments and
-    #       output, the annual time series of single model output (mData)
-    # Calculate monthly means at every grid point including single point case (ndim=1)
-    #-------------------------------------------------------------------------------------
-    # Extract months and monthly time series from the time and raw variable, respectively
-    months = np.empty(nT)
-    for t in np.arange(nT):
-        months[t] = time[t].month
-        if t == 0:
-            yy0 = time[t].year
-    # for a 2-D time series data
-    if t2.ndim == 3:
-        mData = ma.empty((nYR, 12, ngrdY, ngrdX))
-        for t in np.arange(nT):
-            yy = time[t].year
-            mm = months[t]
-            yr = yy - yy0
-            mData[yr, mm, :, :] = t2[t, :, :]
-        # Calculate means month by month. means is an empty array to store means
-        means = ma.empty((12, ngrdY, ngrdX))
-        for i in np.arange(12) + 1:
-            means[i - 1, :, :] = t2[months == i, :, :].mean(0)
-    # for a point time series data
-    if t2.ndim == 1:
-        mData = ma.empty((nYR, 12))
-        for t in np.arange(nT):
-            yy = time[t].year
-            mm = months[t]
-            yr = yy - yy0
-            mData[yr, mm] = t2[t]
-        means = np.empty((12))
-        # Calculate means month by month. means is an empty array to store means
-        for i in np.arange(12) + 1:
-            means[i - 1] = t2[months == i].mean(0)
-    return mData, means
-'''
-'''
-def calc_clim_One_month(moID, nYR, nT, t2, time):
-
-    Calculate the montly mean at every grid point for a specified month.
-
-    Deprivable because this funtionality has been handled in the general function calcClimSeason
-
-    #-------------------------------------------------------------------------------------
-    # Calculate monthly means at every grid point for a specified month
-    #-------------------------------------------------------------------------------------
-    # Extract months and the corresponding time series from time variable
-    months = np.empty(nT)
-    for t in np.arange(nT):
-        months[t] = time[t].month
-    if t2.ndim == 3:
-        mData = ma.empty((nYR, t2.shape[1], t2.shape[2])) # empty array to store time series
-        n = 0
-        if months[t] == moID:
-            mData[n, :, :] = t2[t, :, :]
-            n += 1
-        means = ma.empty((t2.shape[1], t2.shape[2])) # empty array to store means
-        # Calculate means for the month specified by moID
-        means[:, :] = t2[months == moID, :, :].mean(0)
-    return mData, means
-'''
-'''
-def calc_annual_cycle_means(data, time):
-
-    Calculate monthly means for every grid point
-
-    Inputs::
-        data - masked 3d array of the model data (time, lon, lat)
-        time - an array of python datetime objects
-
-    Deprivable because this does the exact same calculation as calcAnnualCycleMeans
-
-    # Extract months from time variable
-    months = np.empty(len(time))
-
-    for t in np.arange(len(time)):
-        months[t] = time[t].month
-
-    #if there is data varying in t and space
-    if data.ndim == 3:
-        # empty array to store means
-        means = ma.empty((12, data.shape[1], data.shape[2]))
-
-        # Calculate means month by month
-        for i in np.arange(12) + 1:
-            means[i - 1, :, :] = data[months == i, :, :].mean(0)
-
-    #if the data is a timeseries over area-averaged values
-    if data.ndim == 1:
-        # TODO - Investigate using ma per KDW
-        means = np.empty((12)) # empty array to store means??WHY NOT ma?
-
-        # Calculate means month by month
-        for i in np.arange(12) + 1:
-            means[i - 1] = data[months == i].mean(0)
-
-    return means
-'''
+    if monthBegin > monthEnd:
+        # Offset the original array so that the first month
+        # becomes monthBegin; note that this cuts off the first year of data
+        offset = slice(monthBegin - 1, monthBegin - 13)
+        data = misc.reshapeMonthlyData(dataset1[offset])
+        monthIndex = slice(0, 13 - monthBegin + monthEnd)
+    else:
+        # Since monthBegin <= monthEnd, just take a slice containing those months
+        data = misc.reshapeMonthlyData(dataset1)
+        monthIndex = slice(monthBegin - 1, monthEnd)
+
+    tSeries = data[:, monthIndex].mean(axis = 1)
+    means = tSeries.mean(axis = 0)
+    return tSeries, means
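A worked example of the offset slice for the wrap-around case, here DJF (monthBegin = 12, monthEnd = 2) with ten years of stand-in data (illustrative, not part of the commit):

    import numpy as np

    monthBegin, monthEnd = 12, 2                        # DJF
    dataset1 = np.random.rand(120, 45, 90)              # Jan year 1 ... Dec year 10

    offset = slice(monthBegin - 1, monthBegin - 13)     # slice(11, -1)
    clipped = dataset1[offset]                          # drop first 11 and last 1 months
    assert clipped.shape[0] == 108                      # 9 December-aligned years remain
    data = clipped.reshape(9, 12, 45, 90)               # month index 0 is now December
    monthIndex = slice(0, 13 - monthBegin + monthEnd)   # slice(0, 3): Dec, Jan, Feb
    tSeries = data[:, monthIndex].mean(axis = 1)        # (9, 45, 90) seasonal means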
 
-def calcAnnualCycleStdev(dataset1, times):
+def calcAnnualCycleStdev(dataset1):
     '''
     Purpose::
        Calculate monthly standard deviations for every grid point
 
     Input::
        dataset1 - 3d numpy array of data in (12* number of years,lat,lon)
-       times - an array of python datetime objects
 
     Output::
       stds - if 3d numpy was entered, 3d (12,lat,lon)
    '''
-    # Extract months from time variable
-    months = times.month
-
-    # empty array to store means
-    stds = np.empty((12, dataset1.shape[1], dataset1.shape[2]))
-
-    # Calculate sample standard deviation month by month (January - December)
-    for i in np.arange(12):
-        stds[i, :, :] = dataset1[months == i+1, :, :].std(axis = 0, ddof = 1)
-
+    data = misc.reshapeMonthlyData(dataset1)
+    stds = data.std(axis = 0, ddof = 1)
     return stds
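A quick check that the vectorized standard deviation reproduces the old month-by-month loop, again with a plain reshape standing in for the assumed helper (illustrative only):

    import numpy as np

    dataset1 = np.random.rand(120, 45, 90)         # 10 years of monthly data
    data = dataset1.reshape(10, 12, 45, 90)
    vectorized = data.std(axis = 0, ddof = 1)      # (12, 45, 90): one field per month

    looped = np.empty((12, 45, 90))
    for i in np.arange(12):
        looped[i] = dataset1[i::12].std(axis = 0, ddof = 1)
    assert np.allclose(vectorized, looped)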
 
-def calcAnnualCycleDomainMeans(dataset1, times):
+
+def calcAnnualCycleDomainMeans(dataset1):
     '''
     Purpose::
        Calculate spatially averaged monthly climatology and standard deviation
 
     Input::
        dataset1 - 3d numpy array of data in (12* number of years,lat,lon)
-       times - an array of python datetime objects
 
     Output::
       means - time series (12)
    '''
-    # Extract months from time variable
-    months = times.month
-
-    means = np.empty(12) # empty array to store means
-
-    # Calculate means month by month
-    for i in np.arange(12) + 1:
-        means[i - 1] = dataset1[months == i, :, :].mean()
+    data = misc.reshapeMonthlyData(dataset1)
+    # Calculate the means, month by month
+    means = np.zeros(12)
+    for i in np.arange(12):
+        means[i] = data[:, i, :, :].mean()
+
     return means
 
+
 def calcTemporalStdev(dataset1):
     '''
     Purpose::
@@ -388,33 +148,29 @@ def calcTemporalStdev(dataset1):
     Output::
       stds - time series (lat, lon)
    '''
-    stds = dataset1.std(axis=0, ddof=1)
+    stds = dataset1.std(axis = 0, ddof = 1)
     return stds
 
-def calcAnnualCycleDomainStdev(dataset1, times):
+def calcAnnualCycleDomainStdev(dataset1):
     '''
     Purpose::
        Calculate sample standard deviations representing the domain in each month
 
     Input::
        dataset1 - 3d numpy array of data in (12* number of years,lat,lon)
-       times - an array of python datetime objects
 
     Output::
       stds - time series (12)
    '''
-    # Extract months from time variable
-    months = times.month
-
-    stds = np.empty(12) # empty array to store standard deviations
-
-    # Calculate standard deviations of the entire domain from January through December
-    for i in np.arange(12) + 1:
-        stds[i - 1] = dataset1[months == i, :, :].std(ddof = 1)
+    data = misc.reshapeMonthlyData(dataset1)
+    # Calculate the standard deviation, month by month
+    stds = np.zeros(12)
+    for i in np.arange(12):
+        stds[i] = data[:, i, :, :].std(ddof = 1)
+
     return stds
 
-
 def calcBiasAveragedOverTime(evaluationData, referenceData, option): # Mean Bias
     '''
     Purpose::
@@ -462,24 +218,23 @@ def calcBiasAveragedOverTimeAndSigLev(ev
       For example: sig[iy,ix] = 0.95 means that the observation and model is different at 95% confidence level at X=ix and Y=iy
    '''
+    # If either gridcell in each data set is missing, set that cell to
+    # missing for the output significance level
     evaluationDataMask = process.create_mask_using_threshold(evaluationData, threshold = 0.75)
     referenceDataMask = process.create_mask_using_threshold(referenceData, threshold = 0.75)
+    # The overall mask associated with missing data
+    overallMask = np.logical_or(evaluationDataMask, referenceDataMask)
+
     diff = evaluationData - referenceData
     bias = diff.mean(axis = 0)
-    ngrdY = evaluationData.shape[1]
-    ngrdX = evaluationData.shape[2]
-    sigLev = ma.zeros([ngrdY,ngrdX])-100.
-    for iy in np.arange(ngrdY):
-        for ix in np.arange(ngrdX):
-            if not evaluationDataMask[iy,ix] and not referenceDataMask[iy,ix]:
-                sigLev [iy,ix] = 1-stats.ttest_rel(evaluationData[:,iy,ix],referenceData[:,iy,ix])[1]
-
-    sigLev = ma.masked_equal(sigLev.data, -100.)
+    sigLev = 1 - stats.ttest_rel(evaluationData, referenceData, axis = 0)[1]
+    sigLev[overallMask] = -100.
+    sigLev = ma.masked_equal(sigLev, -100.)
     # Set mask for bias metric using missing data in obs or model data series
-    # i.e. if obs contains more than threshold (e.g.50%) missing data
-    # then classify time average bias as missing data for that location.
-    bias = ma.masked_array(bias.data, np.logical_or(evaluationDataMask, referenceDataMask))
+    # i.e. if obs contains more than threshold (e.g.50%) missing data
+    # then classify time average bias as missing data for that location.
+    bias = ma.masked_array(bias.data, overallMask)
 
     return bias, sigLev
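The double loop over grid points is replaced by a single stats.ttest_rel call with axis = 0; scipy applies the paired t-test along that axis of n-dimensional inputs and returns per-gridpoint t statistics and p-values. A shape illustration with stand-in data:

    import numpy as np
    import scipy.stats as stats

    model = np.random.rand(120, 45, 90)             # (time, lat, lon)
    obs = model + 0.1 * np.random.randn(120, 45, 90)
    t, p = stats.ttest_rel(model, obs, axis = 0)    # test applied along the time axis
    print t.shape, p.shape                          # both (45, 90)
    sigLev = 1 - p                                  # confidence level per grid point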
@@ -518,46 +273,6 @@ def calcBias(evaluationData, referenceDa
     diff = evaluationData - referenceData
     return diff
 
-'''
-def calc_mae(t1, t2):
-
-    Calculate mean difference between two fields over time for each grid point
-
-    Classify missing data resulting from multiple times (using threshold
-    data requirement)
-
-    i.e. if the working time unit is monthly data, and we are dealing with
-    multiple months of data then when we show mean of several months, we need
-    to decide what threshold of missing data we tolerate before classifying
-    a data point as missing data.
-
-    Deprivable because this funtionality has been handled in the general function calcBiasAveragedOverTime
-
-    t1Mask = process.create_mask_using_threshold(t1, threshold = 0.75)
-    t2Mask = process.create_mask_using_threshold(t2, threshold = 0.75)
-
-    diff = t1 - t2
-    adiff = abs(diff)
-
-    mae = adiff.mean(axis = 0)
-
-    # Set mask for mae metric using missing data in obs or model data series
-    # i.e. if obs contains more than threshold (e.g.50%) missing data
-    # then classify time average mae as missing data for that location.
-    mae = ma.masked_array(mae.data, np.logical_or(t1Mask, t2Mask))
-    return mae
-'''
-'''
-def calc_mae_dom(t1, t2):
-
-    Calculate domain mean difference between two fields over time
-
-    Deprivable because this funtionality has been handled in the general function calcBiasAveragedOverTimeAndDomain
-
-    diff = t1 - t2
-    adiff = abs(diff)
-    mae = adiff.mean()
-    return mae
-'''
-
 def calcRootMeanSquaredDifferenceAveragedOverTime(evaluationData, referenceData):
     '''
     Purpose::
@@ -596,24 +311,24 @@ def calcTemporalCorrelation(evaluationDa
     '''
     Purpose ::
        Calculate the temporal correlation.
-    
+
     Assumption(s) ::
        The first dimension of two datasets is the time axis.
-    
+
     Input ::
        evaluationData - model data array of any shape
        referenceData- observation data array of any shape
-    
+
     Output::
        temporalCorelation - A 2-D array of temporal correlation coefficients at each grid point.
        sigLev - A 2-D array of confidence levels related to temporalCorelation
-    
+
     REF: 277-281 in Stat methods in atmos sci by Wilks, 1995, Academic Press, 467pp.
    sigLev: the correlation between model and observation is significant at sigLev * 100 %
    '''
     evaluationDataMask = process.create_mask_using_threshold(evaluationData, threshold = 0.75)
     referenceDataMask = process.create_mask_using_threshold(referenceData, threshold = 0.75)
-    
+
     ngrdY = evaluationData.shape[1]
     ngrdX = evaluationData.shape[2]
     temporalCorrelation = ma.zeros([ngrdY,ngrdX])-100.
@@ -627,13 +342,12 @@ def calcTemporalCorrelation(evaluationDa
                 temporalCorrelation[iy,ix], sigLev[iy,ix]=stats.pearsonr(t1[(t1.mask==False) & (t2.mask==False)], t2[(t1.mask==False) & (t2.mask==False)])
                 sigLev[iy,ix]=1-sigLev[iy,ix]  # p-value => confidence level
-    
+
     temporalCorrelation=ma.masked_equal(temporalCorrelation.data, -100.)
     sigLev=ma.masked_equal(sigLev.data, -100.)
-    
+
     return temporalCorrelation, sigLev
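calcTemporalCorrelation itself still loops over grid points. The same Pearson r could in principle be computed with array operations along the time axis; the sketch below is a possible follow-up, not part of this commit, and it ignores the pairwise mask filtering that the loop performs:

    import numpy as np

    def temporalCorrelationVectorized(evaluationData, referenceData):
        # Pearson r per grid point, computed along axis 0 (time)
        e = evaluationData - evaluationData.mean(axis = 0)
        r = referenceData - referenceData.mean(axis = 0)
        numerator = (e * r).sum(axis = 0)
        denominator = np.sqrt((e ** 2).sum(axis = 0) * (r ** 2).sum(axis = 0))
        return numerator / denominator

A full replacement would also need to drop masked time steps pairwise and convert p-values to confidence levels, as the loop does.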
 
-
 def calcPatternCorrelation(evaluationData, referenceData):
     '''
     Purpose ::
@@ -669,8 +383,7 @@ def calcPatternCorrelationEachTime(evalu
     Output::
        patternCorrelation - a timeseries (time)
        sigLev - a time series (time)
-    '''
-
+    '''
     nT = evaluationData.shape[0]
     patternCorrelation = ma.zeros(nT)-100.
     sigLev = ma.zeros(nT)-100.
@@ -679,236 +392,40 @@ def calcPatternCorrelationEachTime(evalu
 
     return patternCorrelation,sigLev
 
-'''
-Deprivable because of overlapping function, calcPatternCorrelation, exist
-def calc_spatial_pat_cor(t1, t2, nY, nX):
-
-    Calcualte pattern correlation between 2-D arrays.
-
-    Input:
-        t1 - 2-D array of model data
-        t2 - 2-D array of observation data
-        nY
-        nX
-
-    Output:
-        Pattern correlation between two input arrays.
-
-    # TODO - Update docstring. What are nY and nX?
-    mt1 = t1.mean()
-    mt2 = t2.mean()
-    st1 = t1.std()
-    st2 = t2.std()
-    patcor = ((t1 - mt1) * (t2 - mt2)).sum() / (float(nX * nY) * st1 * st2)
-    return patcor
-'''
-
-'''
-Deprivable because of overlapping function, calcPatternCorrelationEachTime, exist
-def calc_pat_cor2D(t1, t2, nT):
-
-    Calculate the pattern correlation between 3-D input arrays.
-
-    Input:
-        t1 - 3-D array of model data
-        t2 - 3-D array of observation data
-        nT
-
-    Output:
-        1-D array (time series) of pattern correlation coefficients.
-
-    # TODO - Update docstring. What is nT?
-    nt = t1.shape[0]
-    if(nt != nT):
-        print 'input time levels do not match: Exit', nT, nt
-        return -1
-    # store results in list for convenience (then convert to numpy array at the end)
-    patcor = []
-    for t in xrange(nt):
-        mt1 = t1[t, :, :].mean()
-        mt2 = t2[t, :, :].mean()
-        sigma_t1 = t1[t, :, :].std()
-        sigma_t2 = t2[t, :, :].std()
-        # TODO: make means and standard deviations weighted by grid box area.
-        patcor.append((((((t1[t, :, :] - mt1) * (t2[t, :, :] - mt2)).sum()) /
-                        (t1.shape[1] * t1.shape[2]) ) / (sigma_t1 * sigma_t2)))
-        print t, mt1.shape, mt2.shape, sigma_t1.shape, sigma_t2.shape, patcor[t]
-    # TODO: deal with missing data appropriately, i.e. mask out grid points with missing data above tolerence level
-    # convert from list into numpy array
-    patcor = numpy.array(patcor)
-    print patcor.shape
-    return patcor
-'''
-'''
-Deprivable because of overlapping function, calcPatternCorrelationEachTime, exists
-def calc_pat_cor(dataset_1, dataset_2):
-
-    Purpose: Calculate the Pattern Correlation Timeseries
-
-    Assumption(s)::
-        Both dataset_1 and dataset_2 are the same shape.
-        * lat, lon must match up
-        * time steps must align (i.e. months vs. months)
-
-    Input::
-        dataset_1 - 3d (time, lat, lon) array of data
-        dataset_2 - 3d (time, lat, lon) array of data
-
-    Output:
-        patcor - a 1d array (time series) of pattern correlation coefficients.
-
-    **Note:** Standard deviation is using 1 degree of freedom. Debugging print
-    statements to show the difference the n-1 makes.
-    http://docs.scipy.org/doc/numpy/reference/generated/numpy.std.html
-
-    # TODO: Add in try block to ensure the shapes match
-    nt = dataset_1.shape[0]
-    # store results in list for convenience (then convert to numpy array)
-    patcor = []
-    for t in xrange(nt):
-        # find mean and std_dev
-        mt1 = dataset_1[t, :, :].mean()
-        mt2 = dataset_2[t, :, :].mean()
-        sigma_t1 = dataset_1[t, :, :].std(ddof = 1)
-        sigma_t2 = dataset_2[t, :, :].std(ddof=1)
-
-        # TODO: make means and standard deviations weighted by grid box area.
-        # Equation from Santer_et_al 1995
-        # patcor = (1/(N*M_std*O_std))*sum((M_i-M_bar)*(O_i-O_bar))
-        patcor.append((((((dataset_1[t, :, :] - mt1) *
-                          (dataset_2[t, :, :] - mt2)).sum()) /
-                        (dataset_1.shape[1] * dataset_1.shape[2])) / (sigma_t1 * sigma_t2)))
-        print t, mt1.shape, mt2.shape, sigma_t1.shape, sigma_t2.shape, patcor[t]
-
-    # TODO: deal with missing data appropriately, i.e. mask out grid points
-    # with missing data above tolerance level
-
-    # convert from list into numpy array
-    patcor = np.array(patcor)
-
-    print patcor.shape
-    return patcor
-'''
-'''
-TODO: KDW working on pattern correlation against reference climatology. 5 Jun 2013
-def calc_anom_corn(dataset_1, dataset_2, climatology = None):
-
-    Calculate the anomaly correlation.
-
-    Input:
-        dataset_1 - First input dataset
-        dataset_2 - Second input dataset
-        climatology - Optional climatology input array. Assumption is that it is for
-        the same time period by default.
-
-    Output:
-        The anomaly correlation.
-
-    # TODO: Update docstring with actual useful information
-
-    # store results in list for convenience (then convert to numpy array)
-    anomcor = []
-    nt = dataset_1.shape[0]
-    #prompt for the third file, i.e. climo file...
-    #include making sure the lat, lon and times are ok for comparision
-    # find the climo in here and then using for eg, if 100 yrs
-    # is given for the climo file, but only looking at 10yrs
-    # ask if want to input climo dataset for use....if no, call previous
-
-    if climatology != None:
-        climoFileOption = raw_input('Would you like to use the full observation dataset as \
-                                     the climatology in this calculation? [y/n] \n>')
-        if climoFileOption == 'y':
-            base_dataset = climatology
-    else:
-        base_dataset = dataset_2
-    for t in xrange(nt):
-        mean_base = base_dataset[t, :, :].mean()
-        anomcor.append((((dataset_1[t, :, :] - mean_base) * (dataset_2[t, :, :] - mean_base)).sum()) /
-                       np.sqrt(((dataset_1[t, :, :] - mean_base) ** 2).sum() *
-                               ((dataset_2[t, :, :] - mean_base) ** 2).sum()))
-        print t, mean_base.shape, anomcor[t]
-
-    # TODO: deal with missing data appropriately, i.e. mask out grid points
-    # with missing data above tolerence level
-
-    # convert from list into numpy array
-    anomcor = np.array(anomcor)
-    print anomcor.shape, anomcor.ndim, anomcor
-    return anomcor
-'''
-'''
-def calc_anom_cor(t1, t2):
-
-    Calculate the Anomaly Correlation (Deprecated)
-
-    nt = t1.shape[0]
-
-    # store results in list for convenience (then convert to numpy
-    # array at the end)
-    anomcor = []
-    for t in xrange(nt):
-
-        mt2 = t2[t, :, :].mean()
-
-        sigma_t1 = t1[t, :, :].std(ddof = 1)
-        sigma_t2 = t2[t, :, :].std(ddof = 1)
-
-        # TODO: make means and standard deviations weighted by grid box area.
-
-        anomcor.append(((((t1[t, :, :] - mt2) * (t2[t, :, :] - mt2)).sum()) /
-                        (t1.shape[1] * t1.shape[2])) / (sigma_t1 * sigma_t2))
-
-        print t, mt2.shape, sigma_t1.shape, sigma_t2.shape, anomcor[t]
-
-    # TODO: deal with missing data appropriately, i.e. mask out grid points with
-    # missing data above tolerence level
-
-    # convert from list into numpy array
-    anomcor = np.array(anomcor)
-    print anomcor.shape, anomcor.ndim, anomcor
-    return anomcor
-'''
-
-def calc_nash_sutcliff(dataset_1, dataset_2):
+def calcNashSutcliff(evaluationData, referenceData):
     '''
-    TODO: KYO and KDW 5 Jun 2013
     Routine to calculate the Nash-Sutcliff coefficient of efficiency (E)
 
     Assumption(s)::
-        Both dataset_1 and dataset_2 are the same shape.
+        Both evaluationData and referenceData are the same shape.
         * lat, lon must match up
        * time steps must align (i.e. months vs. months)
 
     Input::
-        dataset_1 - 3d (time, lat, lon) array of data
-        dataset_2 - 3d (time, lat, lon) array of data
+        evaluationData - 3d (time, lat, lon) array of data
+        referenceData - 3d (time, lat, lon) array of data
 
     Output:
       nashcor - 1d array aligned along the time dimension of the input datasets.
      Time Series of Nash-Sutcliff Coefficient of efficiency
-    '''
-
-    nt = dataset_1.shape[0]
-    nashcor = []
-    for t in xrange(nt):
-        mean_dataset_2 = dataset_2[t, :, :].mean()
-
-        nashcor.append(1 - ((((dataset_2[t, :, :] - dataset_1[t, :, :]) ** 2).sum()) /
-                            ((dataset_2[t, :, :] - mean_dataset_2) ** 2).sum()))
-
-        print t, mean_dataset_2.shape, nashcor[t]
-
-    nashcor = np.array(nashcor)
-    print nashcor.shape, nashcor.ndim, nashcor
+    '''
+    # Flatten the spatial dimensions
+    data1 = evaluationData[:]
+    data2 = referenceData[:]
+    nT = data1.shape[0]
+    data1.shape = nT, data1.size / nT
+    data2.shape = nT, data2.size / nT
+    meanData2 = data2.mean(axis = 1)
+
+    # meanData2 must be reshaped to 2D so as to obey
+    # numpy broadcasting rules
+    meanData2.shape = nT, 1
+    nashcor = 1 - ((((data2 - data1) ** 2).sum(axis = 1)) /
+                   (((data2 - meanData2) ** 2).sum(axis = 1)))
     return nashcor
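The list-based loop becomes one broadcast expression: for each time step t it evaluates E_t = 1 - sum((O_t - M_t)**2) / sum((O_t - mean(O_t))**2) over all grid points, with O the reference and M the evaluation field. An equivalent shape walk-through with stand-in arrays; reshape and np.newaxis here stand in for the in-place .shape assignments above, which additionally require contiguous inputs and Python 2 integer division:

    import numpy as np

    evaluationData = np.random.rand(120, 45, 90)      # stand-in (time, lat, lon) fields
    referenceData = np.random.rand(120, 45, 90)

    nT = evaluationData.shape[0]
    data1 = evaluationData.reshape(nT, -1)            # (time, space)
    data2 = referenceData.reshape(nT, -1)
    meanData2 = data2.mean(axis = 1)[:, np.newaxis]   # (time, 1) broadcasts over space
    nashcor = 1 - ((data2 - data1) ** 2).sum(axis = 1) / \
                  ((data2 - meanData2) ** 2).sum(axis = 1)
    print nashcor.shape                               # (120,): one coefficient per step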
 
-def calc_pdf(dataset_1, dataset_2):
+def calcPdf(evaluationData, referenceData):
     '''
-    TODO: KYO and KDW 5 Jun 2013
     Routine to calculate a normalized Probability Distribution Function with
     bins set according to data range.
     Equation from Perkins et al. 2007
@@ -941,18 +458,13 @@ def calc_pdf(dataset_1, dataset_2):
       PDF for the year
    '''
-    #list to store PDFs of modelData and obsData
-    pdf_mod = []
-    pdf_obs = []
     # float to store the final PDF similarity score
-    similarity_score = 0.0
-    d1_max = dataset_1.amax()
-    d1_min = dataset_1.amin()
-
-    print 'min modelData', dataset_1[:, :, :].min()
-    print 'max modelData', dataset_1[:, :, :].max()
-    print 'min obsData', dataset_2[:, :, :].min()
-    print 'max obsData', dataset_2[:, :, :].max()
+    similarityScore = 0.0
+
+    print 'min modelData', evaluationData[:, :, :].min()
+    print 'max modelData', evaluationData[:, :, :].max()
+    print 'min obsData', referenceData[:, :, :].min()
+    print 'max obsData', referenceData[:, :, :].max()
     # find a distribution for the entire dataset
     #prompt the user to enter the min, max and number of bin values.
     # The max, min info above is to help guide the user with these choices
@@ -964,59 +476,23 @@ def calc_pdf(dataset_1, dataset_2):
     mybins = np.linspace(minEdge, maxEdge, nbins)
     print 'nbins is', nbins, 'mybins are', mybins
 
+    pdfMod, edges = np.histogram(evaluationData, bins = mybins, density = True)
+    print 'evaluationData distribution and edges', pdfMod, edges
+    pdfObs, edges = np.histogram(referenceData, bins = mybins, density = True)
+    print 'referenceData distribution and edges', pdfObs, edges
-
-    # TODO: there is no 'new' kwargs for numpy.histogram
-    # per: http://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram.html
-    # PLAN: Replace new with density param.
-    pdf_mod, edges = np.histogram(dataset_1, bins = mybins, normed = True, new = True)
-    print 'dataset_1 distribution and edges', pdf_mod, edges
-    pdf_obs, edges = np.histogram(dataset_2, bins = mybins, normed = True, new = True)
-    print 'dataset_2 distribution and edges', pdf_obs, edges
-
-    # TODO: drop this
-    """
-    considering using pdf function from statistics package. It is not
-    installed. Have to test on Mac.
-    http://bonsai.hgc.jp/~mdehoon/software/python/Statistics/manual/index.xhtml#TOC31
-    pdf_mod, edges = stats.pdf(dataset_1, bins=mybins)
-    print 'dataset_1 distribution and edges', pdf_mod, edges
-    pdf_obs,edges=stats.pdf(dataset_2,bins=mybins)
-    print 'dataset_2 distribution and edges', pdf_obs, edges
-    """
-
     #find minimum at each bin between lists
     i = 0
-    for model_value in pdf_mod :
-        print 'model_value is', model_value, 'pdf_obs[', i, '] is', pdf_obs[i]
-        if model_value < pdf_obs[i]:
-            similarity_score += model_value
+    for model_value in pdfMod :
+        print 'model_value is', model_value, 'pdfObs[', i, '] is', pdfObs[i]
+        if model_value < pdfObs[i]:
+            similarityScore += model_value
         else:
-            similarity_score += pdf_obs[i]
+            similarityScore += pdfObs[i]
         i += 1
-    print 'similarity_score is', similarity_score
-    return similarity_score
+    print 'similarity_score is', similarityScore
+    return similarityScore
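The bin-by-bin minimum loop is the Perkins et al. (2007) skill score: the overlap of the two normalized histograms. The same number falls out of one np.minimum call, and with density-normalized histograms it must be scaled by the bin width to read as a fraction of 1 (an illustrative sketch with stand-in data, not part of the commit):

    import numpy as np

    mybins = np.linspace(0.0, 10.0, 21)                 # 20 equal-width bins
    pdfMod, edges = np.histogram(np.random.gamma(2.0, 2.0, 10000), bins = mybins, density = True)
    pdfObs, edges = np.histogram(np.random.gamma(2.5, 2.0, 10000), bins = mybins, density = True)

    similarityScore = np.minimum(pdfMod, pdfObs).sum()  # what the loop above computes
    binWidth = mybins[1] - mybins[0]
    print similarityScore * binWidth                    # overlap as a fraction of 1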
 
-'''
-Deprivable functionality in calcTemporalStdev
-def calcStdev(dataset1):
-
-    Purpose::
-        Calculate the standard deviation for a given dataset.
-
-    Input:
-        t1 - Dataset to calculate the standard deviation on.
-
-    Output:
-        Array of the standard deviations for each month in the provided dataset.
-
-    nt = t1.shape[0]
-    sigma_t1 = []
-    for t in xrange(nt):
-        sigma_t1.append(t1[t, :, :].std(ddof = 1))
-    sigma_t1 = np.array(sigma_t1)
-    print sigma_t1, sigma_t1.shape
-    return sigma_t1
-'''
 
 def metrics_plots(varName, numOBS, numMDL, nT, ngrdY, ngrdX, Times, lons,
                   lats, obsData, mdlData, obsList, mdlList, workdir,subRegions, FoutOption):
@@ -1089,7 +565,7 @@ def metrics_plots(varName, numOBS, numMD
     if ans == 'y':
         ans = raw_input('Input subregion info interactively? y/n: \n> ')
         if ans == 'y':
-            numSubRgn, subRgnName, subRgnLon0, subRgnLon1, subRgnLat0, subRgnLat1 = misc.assign_subRgns_interactively()
+            numSubRgn, subRgnName, subRgnLon0, subRgnLon1, subRgnLat0, subRgnLat1 = misc.createSubRegionObjectInteractively()
         else:
             print 'Read subregion info from a pre-fabricated text file'
             ans = raw_input('Read from a default file (workdir + "/sub_regions.txt")? y/n: \n> ')
@@ -1238,8 +714,8 @@ def metrics_plots(varName, numOBS, numMD
         timeScale = 'Annual'
         # compute the annual-mean time series and climatology.
         # mTser=ma.zeros((nYR,ngrdY,ngrdX)), mClim = ma.zeros((ngrdY,ngrdX))
-        mTser, mClim = calcClimYear(nYR, mdlData[mdlSelect, :, :, :], Times)
-        oTser, oClim = calcClimYear(nYR, obsData[obsSelect, :, :, :], Times)
+        mTser, mClim = calcClimYear(mdlData[mdlSelect, :, :, :])
+        oTser, oClim = calcClimYear(obsData[obsSelect, :, :, :])
     elif timeOption == 2:
         timeScale = 'Seasonal'
         # select the timeseries and climatology for a season specified by a user
@@ -1247,16 +723,16 @@ def metrics_plots(varName, numOBS, numMD
         moBgn = int(raw_input('Enter the beginning month for your season. 1-12: \n> '))
         moEnd = int(raw_input('Enter the ending month for your season. 1-12: \n> '))
         print ' '
-        mTser, mClim = calcClimSeason(moBgn, moEnd, mdlData[mdlSelect, :, :, :], Times)
-        oTser, oClim = calcClimSeason(moBgn, moEnd, obsData[obsSelect, :, :, :], Times)
+        mTser, mClim = calcClimSeason(moBgn, moEnd, mdlData[mdlSelect, :, :, :])
+        oTser, oClim = calcClimSeason(moBgn, moEnd, obsData[obsSelect, :, :, :])
     elif timeOption == 3:
         timeScale = 'Monthly'
         # compute the monthly-mean time series and climatology
         # Note that the shapes of the output vars are:
         # mTser = ma.zeros((nYR,12,ngrdY,ngrdX)) & mClim = ma.zeros((12,ngrdY,ngrdX))
         # Also same for oTser = ma.zeros((nYR,12,ngrdY,ngrdX)) &,oClim = ma.zeros((12,ngrdY,ngrdX))
-        mTser, mClim = calcAnnualCycleMeans(mdlData[mdlSelect, :, :, :], Times)
-        oTser, oClim = calcAnnualCycleMeans(obsData[mdlSelect, :, :, :], Times)
+        mTser, mClim = calcAnnualCycleMeans(mdlData[mdlSelect, :, :, :])
+        oTser, oClim = calcAnnualCycleMeans(obsData[obsSelect, :, :, :])
     else:
         # undefined process options. exit
         print 'The desired temporal scale is not available this time. END the job'
@@ -1428,8 +904,8 @@ def metrics_plots(varName, numOBS, numMD
                 mData = mdlRgn[mdlSelect, rgnSelect, :]
 
             # compute the monthly-mean climatology to construct the annual cycle
-            obsAnnCyc = calcAnnualCycleMeans(oData, Times)
-            mdlAnnCyc = calcAnnualCycleMeans(mData, Times)
+            obsAnnCyc = calcAnnualCycleMeans(oData)
+            mdlAnnCyc = calcAnnualCycleMeans(mData)
             print 'obsAnnCyc= ', obsAnnCyc
             print 'mdlAnnCyc= ', mdlAnnCyc
@@ -1473,5 +949,3 @@ def metrics_plots(varName, numOBS, numMD
 
     # Processing complete if a user enters 'n' for 'doMetricsOption'
     print 'RCMES processing completed.'
-
-