parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From alexleven...@apache.org
Subject [1/2] parquet-mr git commit: PARQUET-229 Add a strict thrift projection API with backwards compat support
Date Fri, 01 May 2015 00:45:12 GMT
Repository: parquet-mr
Updated Branches:
  refs/heads/master 22c6d0870 -> 7fc799839


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/StrictFieldProjectionFilter.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/StrictFieldProjectionFilter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/StrictFieldProjectionFilter.java
new file mode 100644
index 0000000..645ae96
--- /dev/null
+++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/StrictFieldProjectionFilter.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.thrift.projection;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.parquet.Log;
+import org.apache.parquet.Strings;
+import org.apache.parquet.glob.WildcardPath;
+
+/**
+ * Stricter Implementation of {@link FieldProjectionFilter}.
+ *
+ * See {@link org.apache.parquet.thrift.projection.deprecated.DeprecatedFieldProjectionFilter} for the previous
+ * syntax that allows for more powerful glob patterns, but has less error reporting and less strict requirements.
+ *
+ * This filter requires that every *possible* expansion of glob expressions (like '{x,y,z}') must match at least one
+ * column. Each expansion may match more than one if it contains wildcards ('*').
+ *
+ * Note that this class is stateful -- it keeps track of which expanded glob paths have matched a column, so that it can
+ * throw when {@link #assertNoUnmatchedPatterns()} is called.
+ */
+public class StrictFieldProjectionFilter implements FieldProjectionFilter {
+  private static final Log LOG = Log.getLog(FieldProjectionFilter.class);
+  private static final String GLOB_SEPARATOR = ";";
+
+  // use a list instead of a Set, so we can detect overlapping patterns and
+  // warn about it.
+  private final List<WildcardPathStatus> columnsToKeep;
+
+  // visible for testing
+  static List<String> parseSemicolonDelimitedString(String columnsToKeepGlobs) {
+    String[] splits = columnsToKeepGlobs.split(GLOB_SEPARATOR);
+    List<String> globs = new ArrayList<String>();
+    for (String s : splits) {
+      if (!s.isEmpty()) {
+        globs.add(s);
+      }
+    }
+
+    if (globs.isEmpty()) {
+      throw new ThriftProjectionException(String.format("Semicolon delimited string '%s' contains 0 glob strings",
+          columnsToKeepGlobs));
+    }
+
+    return globs;
+  }
+
+  /**
+   * Construct a StrictFieldProjectionFilter from a single string.
+   *
+   * columnsToKeepGlobs should be a list of Strings in the format expected by
+   * {@link Strings#expandGlobToWildCardPaths(String, char)}, separated by ';'.
+   * Should only be used for parsing values out of the hadoop config -- for APIs
+   * and programmatic access, use {@link #StrictFieldProjectionFilter(List)}.
+   */
+  public static StrictFieldProjectionFilter fromSemicolonDelimitedString(String columnsToKeepGlobs) {
+    return new StrictFieldProjectionFilter(parseSemicolonDelimitedString(columnsToKeepGlobs));
+  }
+
+  /**
+   * Construct a StrictFieldProjectionFilter from a list of Strings in the format expected by
+   * {@link Strings#expandGlobToWildCardPaths(String, char)}
+   */
+  public StrictFieldProjectionFilter(List<String> columnsToKeepGlobs) {
+    this.columnsToKeep = new ArrayList<WildcardPathStatus>();
+    for (String glob : columnsToKeepGlobs) {
+      for (WildcardPath wp : Strings.expandGlobToWildCardPaths(glob, '.')) {
+        columnsToKeep.add(new WildcardPathStatus(wp));
+      }
+    }
+  }
+
+  @Override
+  public boolean keep(FieldsPath path) {
+    return keep(path.toDelimitedString("."));
+  }
+
+  // visible for testing
+  boolean keep(String path) {
+    WildcardPath match = null;
+
+    // since we have a rule of every path must match at least one column,
+    // we visit every single wildcard path, instead of short circuiting,
+    // for the case where more than one pattern matches a column. Otherwise
+    // we'd get a misleading exception saying a path didn't match a column,
+    // even though it looks like it should have (but didn't because of short circuiting).
+    // This also allows us to log a warning when more than one glob path matches.
+    for (WildcardPathStatus wp : columnsToKeep) {
+      if (wp.matches(path)) {
+        if (match != null && !match.getParentGlobPath().equals(wp.getWildcardPath().getParentGlobPath())) {
+          String message = "Field path: '%s' matched more than one glob path pattern. First match: " +
+              "'%s' (when expanded to '%s') second match:'%s' (when expanded to '%s')";
+
+          warn(String.format(message,
+              path, match.getParentGlobPath(), match.getOriginalPattern(),
+              wp.getWildcardPath().getParentGlobPath(), wp.getWildcardPath().getOriginalPattern()));
+        } else {
+          match = wp.getWildcardPath();
+        }
+      }
+    }
+
+    return match != null;
+  }
+
+  // visible for testing
+  protected void warn(String warning) {
+    LOG.warn(warning);
+  }
+
+  private List<WildcardPath> getUnmatchedPatterns() {
+    List<WildcardPath> unmatched = new ArrayList<WildcardPath>();
+    for (WildcardPathStatus wp : columnsToKeep) {
+      if (!wp.hasMatched()) {
+        unmatched.add(wp.getWildcardPath());
+      }
+    }
+    return unmatched;
+  }
+
+  @Override
+  public void assertNoUnmatchedPatterns() throws ThriftProjectionException{
+    List<WildcardPath> unmatched = getUnmatchedPatterns();
+    if (!unmatched.isEmpty()) {
+      StringBuilder message =
+          new StringBuilder("The following projection patterns did not match any columns in this schema:\n");
+      for (WildcardPath wp : unmatched) {
+        message.append(String.format("Pattern: '%s' (when expanded to '%s')",
+            wp.getParentGlobPath(), wp.getOriginalPattern()));
+        message.append('\n');
+      }
+      throw new ThriftProjectionException(message.toString());
+    }
+  }
+
+  /**
+   * Holds a WildcardPath and a boolean, used to track whether
+   * this path has ever matched anything.
+   */
+  public static final class WildcardPathStatus {
+    private final WildcardPath wildcardPath;
+    private boolean hasMatched;
+
+    public WildcardPathStatus(WildcardPath wildcardPath) {
+      this.wildcardPath = wildcardPath;
+      this.hasMatched = false;
+    }
+
+    public boolean matches(String path) {
+      boolean matches = wildcardPath.matches(path);
+      this.hasMatched = hasMatched || matches;
+      return matches;
+    }
+
+    public WildcardPath getWildcardPath() {
+      return wildcardPath;
+    }
+
+    public boolean hasMatched() {
+      return hasMatched;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/DeprecatedFieldProjectionFilter.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/DeprecatedFieldProjectionFilter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/DeprecatedFieldProjectionFilter.java
new file mode 100644
index 0000000..78eef09
--- /dev/null
+++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/DeprecatedFieldProjectionFilter.java
@@ -0,0 +1,107 @@
+/* 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.thrift.projection.deprecated;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.thrift.projection.FieldsPath;
+import org.apache.parquet.thrift.projection.FieldProjectionFilter;
+import org.apache.parquet.thrift.projection.ThriftProjectionException;
+
+/**
+ * Filter thrift attributes using glob syntax.
+ * This is used for parsing values assigned to ThriftReadSupport.DEPRECATED_THRIFT_COLUMN_FILTER_KEY
+ * @author Tianshuo Deng
+ */
+@Deprecated
+public class DeprecatedFieldProjectionFilter implements FieldProjectionFilter {
+  public static final String PATTERN_SEPARATOR = ";";
+  private final List<PathGlobPatternStatus> filterPatterns;
+
+  /**
+   * Class for remembering if a glob pattern has matched anything.
+   * If there is an invalid glob pattern that matches nothing, it should throw.
+   */
+  @Deprecated
+  private static class PathGlobPatternStatus {
+    PathGlobPattern pattern;
+    boolean hasMatchingPath = false;
+
+    PathGlobPatternStatus(String pattern) {
+      this.pattern = new PathGlobPattern(pattern);
+    }
+
+    public boolean matches(String path) {
+      if (this.pattern.matches(path)) {
+        this.hasMatchingPath = true;
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  public DeprecatedFieldProjectionFilter(String filterDescStr) {
+    Preconditions.checkNotNull(filterDescStr, "filterDescStr");
+
+    filterPatterns = new LinkedList<PathGlobPatternStatus>();
+
+    if (filterDescStr == null || filterDescStr.isEmpty())
+      return;
+
+    String[] rawPatterns = filterDescStr.split(PATTERN_SEPARATOR);
+    for (String rawPattern : rawPatterns) {
+      filterPatterns.add(new PathGlobPatternStatus(rawPattern));
+    }
+  }
+
+  @Override
+  public boolean keep(FieldsPath path) {
+    if (filterPatterns.size() == 0)
+      return true;
+
+    for (PathGlobPatternStatus pattern : filterPatterns) {
+      if (pattern.matches(path.toDelimitedString("/")))
+        return true;
+    }
+    return false;
+  }
+
+  @Override
+  public void assertNoUnmatchedPatterns() throws ThriftProjectionException {
+    List<PathGlobPattern> unmatched = new LinkedList<PathGlobPattern>();
+    for (PathGlobPatternStatus p : filterPatterns) {
+      if (!p.hasMatchingPath) {
+        unmatched.add(p.pattern);
+      }
+    }
+
+    if (!unmatched.isEmpty()) {
+      StringBuilder message =
+          new StringBuilder("The following projection patterns did not match any columns in this schema:\n");
+      for (PathGlobPattern p : unmatched) {
+        message.append(p);
+        message.append('\n');
+      }
+      throw new ThriftProjectionException(message.toString());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPattern.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPattern.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPattern.java
new file mode 100644
index 0000000..0893ab4
--- /dev/null
+++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPattern.java
@@ -0,0 +1,187 @@
+/* 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.thrift.projection.deprecated;
+
+import org.apache.hadoop.fs.GlobPattern;
+
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Enhanced version of the GlobPattern class defined in hadoop, with the extra capability of matching
+ * full paths separated by '/' and double-star ('**') matching.
+ *
+ * This is used for parsing values assigned to ThriftReadSupport.DEPRECATED_THRIFT_COLUMN_FILTER_KEY
+ *
+ * @author Tianshuo Deng
+ */
+@Deprecated
+public class PathGlobPattern {
+  private static final char BACKSLASH = '\\';
+  private static final char PATH_SEPARATOR = '/';
+  private Pattern compiled;
+  private boolean hasWildcard = false;
+
+  /**
+   * Construct the glob pattern object with a glob pattern string
+   *
+   * @param globPattern the glob pattern string
+   */
+  public PathGlobPattern(String globPattern) {
+    set(globPattern);
+  }
+
+  /**
+   * Compile glob pattern string
+   *
+   * @param globPattern the glob pattern
+   * @return the pattern object
+   */
+  public static Pattern compile(String globPattern) {
+    return new GlobPattern(globPattern).compiled();
+  }
+
+  private static void error(String message, String pattern, int pos) {
+    throw new PatternSyntaxException(message, pattern, pos);
+  }
+
+  /**
+   * @return the compiled pattern
+   */
+  public Pattern compiled() {
+    return compiled;
+  }
+
+  /**
+   * Match input against the compiled glob pattern
+   *
+   * @param s input chars
+   * @return true for successful matches
+   */
+  public boolean matches(CharSequence s) {
+    return compiled.matcher(s).matches();
+  }
+
+  /**
+   * Set and compile a glob pattern
+   *
+   * @param glob the glob pattern string
+   */
+  public void set(String glob) {
+    StringBuilder regex = new StringBuilder();
+    int setOpen = 0;
+    int curlyOpen = 0;
+    int len = glob.length();
+    hasWildcard = false;
+
+    for (int i = 0; i < len; i++) {
+      char c = glob.charAt(i);
+
+      switch (c) {
+        case BACKSLASH:
+          if (++i >= len) {
+            error("Missing escaped character", glob, i);
+          }
+          regex.append(c).append(glob.charAt(i));
+          continue;
+        case '.':
+        case '$':
+        case '(':
+        case ')':
+        case '|':
+        case '+':
+          // escape regex special chars that are not glob special chars
+          regex.append(BACKSLASH);
+          break;
+        case '*':
+          if (i + 1 < len && glob.charAt(i + 1) == '*') {
+            regex.append('.');
+            i++;
+            break;
+          }
+          regex.append("[^" + PATH_SEPARATOR + "]");
+          hasWildcard = true;
+          break;
+        case '?':
+          regex.append('.');
+          hasWildcard = true;
+          continue;
+        case '{': // start of a group
+          regex.append("(?:"); // non-capturing
+          curlyOpen++;
+          hasWildcard = true;
+          continue;
+        case ',':
+          regex.append(curlyOpen > 0 ? '|' : c);
+          continue;
+        case '}':
+          if (curlyOpen > 0) {
+            // end of a group
+            curlyOpen--;
+            regex.append(")");
+            continue;
+          }
+          break;
+        case '[':
+          if (setOpen > 0) {
+            error("Unclosed character class", glob, i);
+          }
+          setOpen++;
+          hasWildcard = true;
+          break;
+        case '^': // ^ inside [...] can be unescaped
+          if (setOpen == 0) {
+            regex.append(BACKSLASH);
+          }
+          break;
+        case '!': // [! needs to be translated to [^
+          regex.append(setOpen > 0 && '[' == glob.charAt(i - 1) ? '^' : '!');
+          continue;
+        case ']':
+          // Many set errors like [][] could not be easily detected here,
+          // as []], []-] and [-] are all valid POSIX glob and java regex.
+          // We'll just let the regex compiler do the real work.
+          setOpen = 0;
+          break;
+        default:
+      }
+      regex.append(c);
+    }
+
+    if (setOpen > 0) {
+      error("Unclosed character class", glob, len);
+    }
+    if (curlyOpen > 0) {
+      error("Unclosed group", glob, len);
+    }
+    compiled = Pattern.compile(regex.toString());
+  }
+
+  @Override
+  public String toString() {
+    return compiled.toString();
+  }
+
+  /**
+   * @return true if this is a wildcard pattern (with special chars)
+   */
+  public boolean hasWildcard() {
+    return hasWildcard;
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestParquetToThriftReadWriteAndProjection.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestParquetToThriftReadWriteAndProjection.java b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestParquetToThriftReadWriteAndProjection.java
index ebf9494..bf9b2a3 100644
--- a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestParquetToThriftReadWriteAndProjection.java
+++ b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestParquetToThriftReadWriteAndProjection.java
@@ -197,7 +197,7 @@ public class TestParquetToThriftReadWriteAndProjection {
 
   private void shouldDoProjectionWithThriftColumnFilter(String filterDesc, TBase toWrite, TBase toRead, Class<? extends TBase<?, ?>> thriftClass) throws Exception {
     Configuration conf = new Configuration();
-    conf.set(ThriftReadSupport.THRIFT_COLUMN_FILTER_KEY, filterDesc);
+    conf.set(ThriftReadSupport.DEPRECATED_THRIFT_COLUMN_FILTER_KEY, filterDesc);
     shouldDoProjection(conf, toWrite, toRead, thriftClass);
   }
 

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftSchemaConverter.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftSchemaConverter.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftSchemaConverter.java
index 9837b4e..97e1a12 100644
--- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftSchemaConverter.java
+++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftSchemaConverter.java
@@ -27,7 +27,8 @@ import org.junit.Test;
 
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.MessageTypeParser;
-import org.apache.parquet.thrift.projection.FieldProjectionFilter;
+import org.apache.parquet.thrift.projection.StrictFieldProjectionFilter;
+import org.apache.parquet.thrift.projection.deprecated.DeprecatedFieldProjectionFilter;
 import org.apache.parquet.thrift.projection.ThriftProjectionException;
 import org.apache.parquet.thrift.struct.ThriftType;
 import org.apache.parquet.thrift.struct.ThriftType.StructType;
@@ -67,51 +68,44 @@ public class TestThriftSchemaConverter {
   @Test
   public void testToProjectedThriftType() {
 
-    shouldGetProjectedSchema("name/first_name", "message ParquetSchema {" +
+    shouldGetProjectedSchema("name/first_name", "name.first_name", "message ParquetSchema {" +
             "  required group name = 1 {" +
             "    optional binary first_name (UTF8) = 1;" +
             "  }}", Person.class);
 
-    shouldGetProjectedSchema("name/first_name;name/last_name", "message ParquetSchema {" +
+    shouldGetProjectedSchema("name/first_name;name/last_name", "name.first_name;name.last_name" ,"message ParquetSchema {" +
             "  required group name = 1 {" +
             "    optional binary first_name (UTF8) = 1;" +
             "    optional binary last_name (UTF8) = 2;" +
             "  }}", Person.class);
 
-    shouldGetProjectedSchema("name/{first,last}_name;", "message ParquetSchema {" +
+    shouldGetProjectedSchema("name/{first,last}_name;", "name.{first,last}_name;", "message ParquetSchema {" +
             "  required group name = 1 {" +
             "    optional binary first_name (UTF8) = 1;" +
             "    optional binary last_name (UTF8) = 2;" +
             "  }}", Person.class);
 
-    shouldGetProjectedSchema("name/*", "message ParquetSchema {" +
+    shouldGetProjectedSchema("name/*", "name" ,"message ParquetSchema {" +
             "  required group name = 1 {" +
             "    optional binary first_name (UTF8) = 1;" +
             "    optional binary last_name (UTF8) = 2;" +
             "  }" +
             "}", Person.class);
 
-    shouldGetProjectedSchema("name/*", "message ParquetSchema {" +
+    shouldGetProjectedSchema("*/*_name", "*.*_name" ,"message ParquetSchema {" +
             "  required group name = 1 {" +
             "    optional binary first_name (UTF8) = 1;" +
             "    optional binary last_name (UTF8) = 2;" +
             "  }" +
             "}", Person.class);
 
-    shouldGetProjectedSchema("*/*_name", "message ParquetSchema {" +
+    shouldGetProjectedSchema("name/first_*", "name.first_*","message ParquetSchema {" +
             "  required group name = 1 {" +
             "    optional binary first_name (UTF8) = 1;" +
-            "    optional binary last_name (UTF8) = 2;" +
             "  }" +
             "}", Person.class);
 
-    shouldGetProjectedSchema("name/first_*", "message ParquetSchema {" +
-            "  required group name = 1 {" +
-            "    optional binary first_name (UTF8) = 1;" +
-            "  }" +
-            "}", Person.class);
-
-    shouldGetProjectedSchema("*/*", "message ParquetSchema {" +
+    shouldGetProjectedSchema("*/*", "*.*", "message ParquetSchema {" +
             "  required group name = 1 {" +
             "  optional binary first_name (UTF8) = 1;" +
             "  optional binary last_name (UTF8) = 2;" +
@@ -122,10 +116,6 @@ public class TestThriftSchemaConverter {
             "      optional binary type (ENUM) = 2;" +
             "    }" +
             "}}", Person.class);
-
-
-//    MessageType mapSchema=  MessageTypeParser.parseMessageType()
-
   }
 
   /* Original message type, before projection
@@ -153,7 +143,7 @@ public class TestThriftSchemaConverter {
   @Test
   public void testProjectMapThriftType() {
     //project nested map
-    shouldGetProjectedSchema("name;names/key*;names/value/**", "message ParquetSchema {\n" +
+    shouldGetProjectedSchema("name;names/key*;names/value/**", "name;names.key*;names.value", "message ParquetSchema {\n" +
             "  optional binary name (UTF8) = 1;\n" +
             "  optional group names (MAP) = 2 {\n" +
             "    repeated group map (MAP_KEY_VALUE) {\n" +
@@ -175,7 +165,7 @@ public class TestThriftSchemaConverter {
             "}", TestStructInMap.class);
 
     //project only one level of nested map
-    shouldGetProjectedSchema("name;names/key;names/value/name/*", "message ParquetSchema {\n" +
+    shouldGetProjectedSchema("name;names/key;names/value/name/*", "name;names.key;names.value.name","message ParquetSchema {\n" +
             "  optional binary name (UTF8) = 1;\n" +
             "  optional group names (MAP) = 2 {\n" +
             "    repeated group map (MAP_KEY_VALUE) {\n" +
@@ -193,7 +183,7 @@ public class TestThriftSchemaConverter {
 
   @Test
   public void testProjectOnlyKeyInMap() {
-    shouldGetProjectedSchema("name;names/key","message ParquetSchema {\n" +
+    shouldGetProjectedSchema("name;names/key", "name;names.key", "message ParquetSchema {\n" +
             "  optional binary name (UTF8) = 1;\n" +
             "  optional group names (MAP) = 2 {\n" +
             "    repeated group map (MAP_KEY_VALUE) {\n" +
@@ -203,15 +193,14 @@ public class TestThriftSchemaConverter {
             "}",TestStructInMap.class);
   }
 
-
   private void shouldThrowWhenProjectionFilterMatchesNothing(String filters, String unmatchedFilter, Class<? extends TBase<?, ?>> thriftClass) {
     try {
-      getFilteredSchema(filters, thriftClass);
+      getDeprecatedFilteredSchema(filters, thriftClass);
+      fail("should throw projection exception when filter matches nothing");
     } catch (ThriftProjectionException e) {
-      assertEquals("unmatched projection filters: [" + unmatchedFilter + "]", e.getMessage());
-      return;
+      assertEquals("The following projection patterns did not match any columns in this schema:\n"
+          + unmatchedFilter + "\n", e.getMessage());
     }
-    fail("should throw projection exception when filter matches nothing");
   }
 
   @Test
@@ -220,29 +209,39 @@ public class TestThriftSchemaConverter {
     shouldThrowWhenProjectionFilterMatchesNothing("name;non_existing", "non_existing", TestStructInMap.class);
     shouldThrowWhenProjectionFilterMatchesNothing("**;non_existing", "non_existing", TestStructInMap.class);
     shouldThrowWhenProjectionFilterMatchesNothing("**;names/non_existing", "names/non_existing", TestStructInMap.class);
-    shouldThrowWhenProjectionFilterMatchesNothing("**;names/non_existing;non_existing", "names/non_existing, non_existing", TestStructInMap.class);
+    shouldThrowWhenProjectionFilterMatchesNothing("**;names/non_existing;non_existing", "names/non_existing\nnon_existing", TestStructInMap.class);
   }
 
-  @Test(expected = ThriftProjectionException.class)
   public void testProjectOnlyValueInMap() {
-    getFilteredSchema("name;names/value/**", TestStructInMap.class);
+    try {
+      getDeprecatedFilteredSchema("name;names/value/**", TestStructInMap.class);
+      fail("this should throw");
+    } catch (ThriftProjectionException e) {
+      assertEquals("", e.getMessage());
+    }
   }
 
-  private void shouldGetProjectedSchema(String filterDesc, String expectedSchemaStr, Class<? extends TBase<?,?>> thriftClass) {
-    MessageType requestedSchema = getFilteredSchema(filterDesc, thriftClass);
+  private void shouldGetProjectedSchema(String deprecatedFilterDesc, String strictFilterDesc, String expectedSchemaStr, Class<? extends TBase<?,?>> thriftClass) {
+    MessageType depRequestedSchema = getDeprecatedFilteredSchema(deprecatedFilterDesc, thriftClass);
+    MessageType strictRequestedSchema = getStrictFilteredSchema(strictFilterDesc, thriftClass);
     MessageType expectedSchema = parseMessageType(expectedSchemaStr);
-    assertEquals(expectedSchema, requestedSchema);
+    assertEquals(expectedSchema, depRequestedSchema);
+    assertEquals(expectedSchema, strictRequestedSchema);
   }
 
-  private MessageType getFilteredSchema(String filterDesc, Class<? extends TBase<?,?>> thriftClass) {
-    FieldProjectionFilter fieldProjectionFilter = new FieldProjectionFilter(filterDesc);
+  private MessageType getDeprecatedFilteredSchema(String filterDesc, Class<? extends TBase<?,?>> thriftClass) {
+    DeprecatedFieldProjectionFilter fieldProjectionFilter = new DeprecatedFieldProjectionFilter(filterDesc);
+    return new ThriftSchemaConverter(fieldProjectionFilter).convert(thriftClass);
+  }
+
+  private MessageType getStrictFilteredSchema(String semicolonDelimitedString, Class<? extends TBase<?,?>> thriftClass) {
+    StrictFieldProjectionFilter fieldProjectionFilter = StrictFieldProjectionFilter.fromSemicolonDelimitedString(semicolonDelimitedString);
     return new ThriftSchemaConverter(fieldProjectionFilter).convert(thriftClass);
   }
 
   @Test
   public void testToThriftType() throws Exception {
-    ThriftSchemaConverter schemaConverter = new ThriftSchemaConverter();
-    final StructType converted = schemaConverter.toStructType(AddressBook.class);
+    final StructType converted = ThriftSchemaConverter.toStructType(AddressBook.class);
     final String json = converted.toJSON();
     final ThriftType fromJSON = StructType.fromJSON(json);
     assertEquals(json, fromJSON.toJSON());

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/PathGlobPatternTest.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/PathGlobPatternTest.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/PathGlobPatternTest.java
deleted file mode 100644
index d4a8b9b..0000000
--- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/PathGlobPatternTest.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/* 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.parquet.thrift.projection;
-
-import org.junit.Test;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-/**
- * Test using glob syntax to specify which attribute to retain
- */
-public class PathGlobPatternTest {
-  @Test
-  public void testRecursiveGlob() {
-    PathGlobPattern g = new PathGlobPattern("a/**/b");
-    assertFalse(g.matches("a/b"));
-    assertTrue(g.matches("a/asd/b"));
-    assertTrue(g.matches("a/asd/ss/b"));
-
-    g = new PathGlobPattern("a/**");
-    assertTrue(g.matches("a/as"));
-    assertTrue(g.matches("a/asd/b"));
-    assertTrue(g.matches("a/asd/ss/b"));
-
-
-  }
-
-  @Test
-  public void testStandardGlob() {
-    PathGlobPattern g = new PathGlobPattern("a/*");
-    assertTrue(g.matches("a/as"));
-    assertFalse(g.matches("a/asd/b"));
-    assertFalse(g.matches("a/asd/ss/b"));
-
-    g = new PathGlobPattern("a/{bb,cc}/d");
-    assertTrue(g.matches("a/bb/d"));
-    assertTrue(g.matches("a/cc/d"));
-    assertFalse(g.matches("a/cc/bb/d"));
-    assertFalse(g.matches("a/d"));
-
-  }
-}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/TestFieldsPath.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/TestFieldsPath.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/TestFieldsPath.java
new file mode 100644
index 0000000..335a7ff
--- /dev/null
+++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/TestFieldsPath.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.thrift.projection;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.parquet.thrift.ThriftSchemaConverter;
+import org.apache.parquet.thrift.struct.ThriftField;
+import org.apache.parquet.thrift.struct.ThriftType;
+import org.apache.parquet.thrift.struct.ThriftType.BoolType;
+import org.apache.parquet.thrift.struct.ThriftType.ByteType;
+import org.apache.parquet.thrift.struct.ThriftType.DoubleType;
+import org.apache.parquet.thrift.struct.ThriftType.EnumType;
+import org.apache.parquet.thrift.struct.ThriftType.I16Type;
+import org.apache.parquet.thrift.struct.ThriftType.I32Type;
+import org.apache.parquet.thrift.struct.ThriftType.I64Type;
+import org.apache.parquet.thrift.struct.ThriftType.ListType;
+import org.apache.parquet.thrift.struct.ThriftType.MapType;
+import org.apache.parquet.thrift.struct.ThriftType.SetType;
+import org.apache.parquet.thrift.struct.ThriftType.StringType;
+import org.apache.parquet.thrift.struct.ThriftType.StructType;
+import org.junit.Test;
+
+import com.twitter.data.proto.tutorial.thrift.Person;
+import com.twitter.elephantbird.thrift.test.TestStructInMap;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestFieldsPath { // Checks the delimited paths FieldsPath yields for every primitive leaf of a thrift schema
+  @Test
+  public void testFieldsPath() {
+    StructType person = ThriftSchemaConverter.toStructType(Person.class);
+
+    List<String> paths = PrimitivePathVisitor.visit(person, "."); // leaf paths joined with "."
+    assertEquals(Arrays.asList("name.first_name", "name.last_name", "id", "email", "phones.number", "phones.type"),
+        paths);
+
+    paths = PrimitivePathVisitor.visit(person, "/"); // same schema, "/" as the delimiter
+    assertEquals(Arrays.asList("name/first_name", "name/last_name", "id", "email", "phones/number", "phones/type"),
+        paths);
+
+    StructType structInMap = ThriftSchemaConverter.toStructType(TestStructInMap.class);
+
+    paths = PrimitivePathVisitor.visit(structInMap, "."); // map fields are expected to show up as ".key" / ".value" segments
+    assertEquals(Arrays.asList("name", "names.key", "names.value.name.first_name", "names.value.name.last_name",
+            "names.value.phones.key", "names.value.phones.value", "name_to_id.key", "name_to_id.value"), paths);
+
+    paths = PrimitivePathVisitor.visit(structInMap, "/");
+    assertEquals(Arrays.asList("name", "names/key", "names/value/name/first_name", "names/value/name/last_name",
+        "names/value/phones/key", "names/value/phones/value", "name_to_id/key", "name_to_id/value"), paths);
+
+  }
+
+  private static class PrimitivePathVisitor implements ThriftType.TypeVisitor { // walks a schema and records one delimited path per primitive leaf
+    private List<String> paths = new ArrayList<String>(); // collected leaf paths, in visit order
+    private FieldsPath path = new FieldsPath(); // fields pushed on the way down to the node currently being visited
+    private String delim; // separator handed to FieldsPath.toDelimitedString
+
+    private PrimitivePathVisitor(String delim) {
+      this.delim = delim;
+    }
+
+    public static List<String> visit(StructType s, String delim) { // entry point: walks s with a fresh visitor and returns its collected paths
+      PrimitivePathVisitor v = new PrimitivePathVisitor(delim);
+      s.accept(v);
+      return v.getPaths();
+    }
+
+    public List<String> getPaths() {
+      return paths;
+    }
+
+    @Override
+    public void visit(MapType mapType) { // visits the key field and then the value field, each under its own path segment
+      ThriftField key = mapType.getKey();
+      ThriftField value = mapType.getValue();
+      path.push(key);
+      key.getType().accept(this);
+      path.pop(); // drop the key segment before descending into the value
+      path.push(value);
+      value.getType().accept(this);
+      path.pop();
+    }
+
+    @Override
+    public void visit(SetType setType) {
+      setType.getValues().getType().accept(this); // descends into the element type without pushing a path segment
+    }
+
+    @Override
+    public void visit(ListType listType) {
+      listType.getValues().getType().accept(this); // descends into the element type without pushing a path segment
+    }
+
+    @Override
+    public void visit(StructType structType) { // visits every child with that child pushed on the path
+      for (ThriftField child : structType.getChildren()) {
+        path.push(child);
+        child.getType().accept(this);
+        path.pop(); // restore the path before moving on to the next sibling
+      }
+    }
+
+    private void visitPrimitive() { // shared by all primitive visit overloads below
+      paths.add(path.toDelimitedString(delim)); // record the current path as one delimited string
+    }
+
+    @Override
+    public void visit(EnumType enumType) {
+      visitPrimitive();
+    }
+
+    @Override
+    public void visit(BoolType boolType) {
+      visitPrimitive();
+    }
+
+    @Override
+    public void visit(ByteType byteType) {
+      visitPrimitive();
+    }
+
+    @Override
+    public void visit(DoubleType doubleType) {
+      visitPrimitive();
+    }
+
+    @Override
+    public void visit(I16Type i16Type) {
+      visitPrimitive();
+    }
+
+    @Override
+    public void visit(I32Type i32Type) {
+      visitPrimitive();
+    }
+
+    @Override
+    public void visit(I64Type i64Type) {
+      visitPrimitive();
+    }
+
+    @Override
+    public void visit(StringType stringType) {
+      visitPrimitive();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/TestStrictFieldProjectionFilter.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/TestStrictFieldProjectionFilter.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/TestStrictFieldProjectionFilter.java
new file mode 100644
index 0000000..92e86a6
--- /dev/null
+++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/TestStrictFieldProjectionFilter.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.thrift.projection;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Test;
+
+import static org.easymock.EasyMock.createMockBuilder;
+import static org.easymock.EasyMock.replay;
+import static org.easymock.EasyMock.verify;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class TestStrictFieldProjectionFilter { // Exercises glob parsing, matching, strictness and overlap warnings of StrictFieldProjectionFilter
+
+  @Test
+  public void testFromSemicolonDelimitedString() {
+    List<String> globs = StrictFieldProjectionFilter.parseSemicolonDelimitedString(";x.y.z;*.a.b.c*;;foo;;;;bar;");
+    assertEquals(Arrays.asList("x.y.z", "*.a.b.c*", "foo", "bar"), globs); // empty segments between semicolons are expected to be dropped
+
+    try {
+      StrictFieldProjectionFilter.parseSemicolonDelimitedString(";;"); // nothing but separators, i.e. zero globs
+      fail("this should throw");
+    } catch (ThriftProjectionException e) {
+      assertEquals("Semicolon delimited string ';;' contains 0 glob strings", e.getMessage());
+    }
+  }
+
+  private static void assertMatches(StrictFieldProjectionFilter filter, String... strings) { // fails on the first string that filter.keep rejects
+    for (String s : strings) {
+      if (!filter.keep(s)) {
+        fail(String.format("String '%s' was expected to match", s));
+      }
+    }
+  }
+
+  private static void assertDoesNotMatch(StrictFieldProjectionFilter filter, String... strings) { // fails on the first string that filter.keep accepts
+    for (String s : strings) {
+      if (filter.keep(s)) {
+        fail(String.format("String '%s' was not expected to match", s));
+      }
+    }
+  }
+
+
+  @Test
+  public void testProjection() { // kept vs. rejected field paths for a mix of literal, wildcard and brace patterns
+    StrictFieldProjectionFilter filter = StrictFieldProjectionFilter.fromSemicolonDelimitedString(
+        "home.phone_number;home.address;work.address.zip;base_info;*.average;a.b.c.pre{x,y,z{a,b,c}}post");
+
+    assertMatches(filter, "home.phone_number", "home.address", "work.address.zip", "base_info",
+        "foo.average", "bar.x.y.z.average", "base_info.nested.field", "a.b.c.prexpost", "a.b.c.prezapost");
+
+    assertDoesNotMatch(filter, "home2.phone_number", "home2.address", "work.address", "base_info2",
+        "foo_average", "bar.x.y.z_average", "base_info_nested.field", "hi", "average", "a.b.c.pre{x,y,z{a,b,c}}post",
+        "");
+
+  }
+
+  @Test
+  public void testIsStrict() { // expansions that never matched a field path must be reported by assertNoUnmatchedPatterns
+    StrictFieldProjectionFilter filter = StrictFieldProjectionFilter.fromSemicolonDelimitedString(
+        "home.phone_number;a.b.c.pre{x,y,z{a,b,c}}post;bar.*.average");
+
+    assertMatches(filter, "home.phone_number", "bar.foo.average", "a.b.c.prexpost", "a.b.c.prezcpost"); // only the prexpost and prezcpost expansions get used
+    assertDoesNotMatch(filter, "hello");
+    try {
+      filter.assertNoUnmatchedPatterns(); // preypost, prezapost and prezbpost were never matched above
+      fail("this should throw");
+    } catch (ThriftProjectionException e) {
+      String expectedMessage = "The following projection patterns did not match any columns in this schema:\n" +
+          "Pattern: 'a.b.c.pre{x,y,z{a,b,c}}post' (when expanded to 'a.b.c.preypost')\n" +
+          "Pattern: 'a.b.c.pre{x,y,z{a,b,c}}post' (when expanded to 'a.b.c.prezapost')\n" +
+          "Pattern: 'a.b.c.pre{x,y,z{a,b,c}}post' (when expanded to 'a.b.c.prezbpost')\n";
+      assertEquals(expectedMessage, e.getMessage());
+    }
+  }
+
+  @Test
+  public void testWarnWhenMultiplePatternsMatch() { // a field path matched by two patterns is still kept, but warn(...) must be called
+    StrictFieldProjectionFilter filter = createMockBuilder(StrictFieldProjectionFilter.class)
+        .withConstructor(Arrays.asList("a.b.c.{x_average,z_average}", "a.*_average"))
+        .addMockedMethod("warn")
+        .createMock();
+
+    // set expectations: these calls are recorded now and checked by verify(filter) below
+    filter.warn("Field path: 'a.b.c.x_average' matched more than one glob path pattern. "
+        + "First match: 'a.b.c.{x_average,z_average}' (when expanded to 'a.b.c.x_average') "
+        + "second match:'a.*_average' (when expanded to 'a.*_average')");
+    filter.warn("Field path: 'a.b.c.z_average' matched more than one glob path pattern. "
+        + "First match: 'a.b.c.{x_average,z_average}' (when expanded to 'a.b.c.z_average') "
+        + "second match:'a.*_average' (when expanded to 'a.*_average')");
+
+    replay(filter);
+
+    assertMatches(filter, "a.b.c.x_average", "a.b.c.z_average", "a.other.w_average"); // no warn is expected for a.other.w_average
+    assertDoesNotMatch(filter, "hello");
+    verify(filter);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPatternTest.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPatternTest.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPatternTest.java
new file mode 100644
index 0000000..e3ea0f4
--- /dev/null
+++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPatternTest.java
@@ -0,0 +1,59 @@
+/* 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.thrift.projection.deprecated;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests PathGlobPattern on '/'-separated paths: single-segment wildcard, recursive wildcard and brace alternatives
+ */
+public class PathGlobPatternTest {
+  @Test
+  public void testRecursiveGlob() {
+    PathGlobPattern g = new PathGlobPattern("a/**/b");
+    assertFalse(g.matches("a/b")); // the recursive wildcard must consume at least one segment here
+    assertTrue(g.matches("a/asd/b"));
+    assertTrue(g.matches("a/asd/ss/b")); // and it may span several segments
+
+    g = new PathGlobPattern("a/**");
+    assertTrue(g.matches("a/as"));
+    assertTrue(g.matches("a/asd/b"));
+    assertTrue(g.matches("a/asd/ss/b"));
+
+
+  }
+
+  @Test
+  public void testStandardGlob() {
+    PathGlobPattern g = new PathGlobPattern("a/*");
+    assertTrue(g.matches("a/as"));
+    assertFalse(g.matches("a/asd/b")); // a single wildcard must not cross a '/' boundary
+    assertFalse(g.matches("a/asd/ss/b"));
+
+    g = new PathGlobPattern("a/{bb,cc}/d");
+    assertTrue(g.matches("a/bb/d"));
+    assertTrue(g.matches("a/cc/d"));
+    assertFalse(g.matches("a/cc/bb/d")); // exactly one of the alternatives, not a sequence of them
+    assertFalse(g.matches("a/d")); // the alternative segment cannot be omitted
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/parquet_cascading.md
----------------------------------------------------------------------
diff --git a/parquet_cascading.md b/parquet_cascading.md
index 3836e6c..9ea4837 100644
--- a/parquet_cascading.md
+++ b/parquet_cascading.md
@@ -84,44 +84,61 @@ One of the big benefit of using columnar format is to be able to read only a sub
 Parquet support projection pushdown for Thrift records and tuples.
 
 ### 2.1 Projection Pushdown with Thrift/Scrooge Records
-To read only a subset of attributes in a Thrift/Scrooge class, the columns of interest should be specified using glob syntax. For example, for a thrift class as follows:
-
-    
-    struct Address{
-      1: string street
-      2: string zip
-    }
-    struct Person{
-      1: string name
-      2: int16 age
-      3: Address addr
-    }
+To read only a subset of columns in a Thrift/Scrooge class, the columns of interest should be specified using a glob syntax.
 
+For example, imagine a Person struct defined as:
 
-In the above example, when reading records of type Person, we can use following glob expression to specify the attributes we want to read:
+    struct Person {
+      1: required string name
+      2: optional i16 age
+      3: optional Address primaryAddress
+      4: required map<string, Address> otherAddresses
+    }
 
-- Exact match:
-"`name`" will only fetch the name attribute.
+    struct Address {
+      1: required string street
+      2: required string zip
+      3: required PhoneNumber primaryPhone
+      4: required PhoneNumber secondaryPhone
+      5: required list<PhoneNumber> otherPhones
+    }
 
-- Alternative match:
-"`address/{street,zip}`" will fetch both street and zip in the Address
+    struct PhoneNumber {
+      1: required i32 areaCode
+      2: required i32 number
+      3: required bool doNotCall
+    }
 
-- Wildcard match:
-"`*`" will fetch name and age, but not address, since address is a nested structure
+A column is specified as the path from the root of the schema down to the field of interest, separated by `.`, just as you would access the field
+in Java or Scala code. For example: `primaryAddress.primaryPhone.doNotCall`.
+This applies for repeated fields as well, for example `primaryAddress.otherPhones.number` selects all the `number`s from all the elements of `otherPhones`.
+Maps are a special case -- the map is split into two columns, the key and the value. All the columns in the key are required, but you can select a subset of the
+columns in the value (or skip the value entirely), for example: `otherAddresses.{key,value.street}` will select only the streets from the
+values of the map, but the entire key will be kept. To select an entire map, you can do: `otherAddresses.{key,value}`, 
+and to select only the keys: `otherAddresses.key`. When selecting a field that is a struct, for example `primaryAddress.primaryPhone`, 
+it will select the entire struct. So `primaryAddress.primaryPhone.*` is redundant.
 
-- Recursive match:
-"`**`" will recursively match all attributes defined in Person.
+Columns can be specified concretely (like `primaryAddress.primaryPhone.doNotCall`), or a restricted glob syntax can be used.
+The glob syntax supports only wildcards (`*`) and glob expressions (`{}`).
 
-- Joined match:
-Multiple glob expression can be joined together separated by ";". eg. "name;address/street" will match only name and street in Address.
+For example:
 
-To specify the glob filter for thrift/scrooge, simply set the conf with "parquet.thrift.column.filter" set to the glob expression string.
+  * `name` will select just the `name` from the Person
+  * `{name,age}` will select both the `name` and `age` from the Person
+  * `primaryAddress` will select the entire `primaryAddress` struct, including all of its children (recursively)
+  * `primaryAddress.*Phone` will select all of `primaryAddress.primaryPhone` and `primaryAddress.secondaryPhone`
+  * `primaryAddress.*Phone*` will select all of `primaryAddress.primaryPhone` and `primaryAddress.secondaryPhone` and `primaryAddress.otherPhones`
+  * `{name,age,primaryAddress.{*Phone,street}}` will select `name`, `age`, `primaryAddress.primaryPhone`, `primaryAddress.secondaryPhone`, and `primaryAddress.street`
 
+Multiple Patterns:
+Multiple glob expressions can be joined together, separated by `;`. For example, `name;primaryAddress.street` will match only `name` and the `street` of `primaryAddress`.
+This is useful if you want to combine a list of patterns without making a giant `{}` group.
 
-    Map<Object, Object> props=new HashMap<Object, Object>();
-    props.put("parquet.thrift.column.filter","name;address/street");
-    HadoopFlowConnector hadoopFlowConnector = new HadoopFlowConnector(props);
+Note: all possible glob patterns must match at least one column. For example, if you provide the glob: `a.b.{c,d,e}` but only columns `a.b.c` and `a.b.d` exist, an
+exception will be thrown.
 
+You can provide your projection globs to parquet by setting `parquet.thrift.column.projection.globs` in the hadoop config, or using the methods in the
+scheme builder classes.
 
 ### 2.2 Projection Pushdown with Tuples
 When using ParquetTupleScheme, specifying projection pushdown is as simple as specifying fields as the parameter of the constructor of ParquetTupleScheme:

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7fc79983/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 880191d..ea6d67f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -234,6 +234,9 @@
                    <dumpDetails>true</dumpDetails>
                    <previousVersion>${previous.version}</previousVersion>
                    <excludes>
+                     <exclude>parquet/hadoop/thrift/**</exclude>
+                     <exclude>parquet/thrift/projection/**</exclude>
+                     <exclude>parquet/thrift/ThriftSchemaConverter</exclude>
                      <exclude>parquet/filter2/**</exclude>
                      <exclude>parquet/org/**</exclude>
                      <exclude>parquet/column/**</exclude>


Mime
View raw message