orc-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From prasanthj <...@git.apache.org>
Subject [GitHub] orc pull request #55: ORC-54: Evolve schemas based on field name rather than...
Date Tue, 16 Aug 2016 23:18:14 GMT
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/55#discussion_r75038675
  
    --- Diff: java/core/src/java/org/apache/orc/impl/SchemaEvolution.java ---
    @@ -20,59 +20,132 @@
     
     import java.util.ArrayList;
     import java.util.Arrays;
    +import java.util.HashMap;
     import java.util.List;
    +import java.util.Map;
    +import java.util.regex.Pattern;
     
    +import org.apache.orc.Reader;
     import org.apache.orc.TypeDescription;
     import org.slf4j.Logger;
     import org.slf4j.LoggerFactory;
     
     /**
    - * Take the file types and the (optional) configuration column names/types and
    - * see if there has been schema evolution.
    + * Infer and track the evolution between the schema as stored in the file and
    + * the schema that has been requested by the reader.
      */
     public class SchemaEvolution {
       // indexed by reader column id
       private final TypeDescription[] readerFileTypes;
       // indexed by reader column id
    -  private final boolean[] included;
    +  private final boolean[] readerIncluded;
    +  // indexed by file column id
    +  private final boolean[] fileIncluded;
       private final TypeDescription fileSchema;
       private final TypeDescription readerSchema;
       private boolean hasConversion = false;
    +  private final boolean isAcid;
    +
       // indexed by reader column id
       private final boolean[] ppdSafeConversion;
     
    -  public SchemaEvolution(TypeDescription fileSchema, boolean[] includedCols) {
    -    this(fileSchema, null, includedCols);
    +  private static final Logger LOG =
    +    LoggerFactory.getLogger(SchemaEvolution.class);
    +  private static final Pattern missingMetadataPattern =
    +    Pattern.compile("_col\\d+");
    +
    +  public static class IllegalEvolutionException extends RuntimeException {
    +    public IllegalEvolutionException(String msg) {
    +      super(msg);
    +    }
    +  }
    +
    +  public SchemaEvolution(TypeDescription fileSchema,
    +                         Reader.Options options) {
    +    this(fileSchema, null, options);
       }
     
       public SchemaEvolution(TypeDescription fileSchema,
                              TypeDescription readerSchema,
    -                         boolean[] includedCols) {
    -    this.included = includedCols == null ? null :
    +                         Reader.Options options) {
    +    boolean allowMissingMetadata = options.getTolerateMissingSchema();
    +    boolean[] includedCols = options.getInclude();
    +    this.readerIncluded = includedCols == null ? null :
           Arrays.copyOf(includedCols, includedCols.length);
    +    this.fileIncluded = new boolean[fileSchema.getMaximumId() + 1];
         this.hasConversion = false;
         this.fileSchema = fileSchema;
    +    isAcid = checkAcidSchema(fileSchema);
         if (readerSchema != null) {
    -      if (checkAcidSchema(fileSchema)) {
    +      if (isAcid) {
             this.readerSchema = createEventSchema(readerSchema);
           } else {
             this.readerSchema = readerSchema;
           }
    -      this.readerFileTypes = new TypeDescription[this.readerSchema.getMaximumId() + 1];
    -      buildConversionFileTypesArray(fileSchema, this.readerSchema);
    +      this.readerFileTypes =
    +        new TypeDescription[this.readerSchema.getMaximumId() + 1];
    +      int positionalLevels = 0;
    +      if (!hasColumnNames(isAcid? getBaseRow(fileSchema) : fileSchema)){
    +        if (!this.fileSchema.equals(this.readerSchema)) {
    +          if (!allowMissingMetadata) {
    +            throw new RuntimeException("Found that schema metadata is missing"
    +                + " from file. This is likely caused by"
    +                + " a writer earlier than HIVE-4243. Will"
    +                + " not try to reconcile schemas");
    +          } else {
    +            LOG.warn("Column names are missing from this file. This is"
    +                + " caused by a writer earlier than HIVE-4243. The reader will"
    +                + " reconcile schemas based on index. File type: " +
    +                this.fileSchema + ", reader type: " + this.readerSchema);
    +            positionalLevels = isAcid ? 2 : 1;
    --- End diff --
    
    What does positional level mean? Is it the real row level?
    Does the ACID file schema look like this: struct<struct<[acid_cols]>,struct<[real_cols]>>?
    If so, can you leave a comment about it?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working,
please contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

Mime
View raw message