accumulo-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From joshelser <...@git.apache.org>
Subject [GitHub] accumulo pull request #224: ACCUMULO-4500 ACCUMULO-96 Added summarization
Date Wed, 01 Mar 2017 16:49:58 GMT
Github user joshelser commented on a diff in the pull request:

    https://github.com/apache/accumulo/pull/224#discussion_r103729447
  
    --- Diff: core/src/main/java/org/apache/accumulo/core/client/summary/CountingSummarizer.java
---
    @@ -0,0 +1,302 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.accumulo.core.client.summary;
    +
    +import java.util.HashMap;
    +import java.util.List;
    +import java.util.Map;
    +import java.util.Map.Entry;
    +import java.util.function.Consumer;
    +import java.util.function.Function;
    +import java.util.function.UnaryOperator;
    +import java.util.stream.Collectors;
    +
    +import org.apache.accumulo.core.client.summary.summarizers.VisibilitySummarizer;
    +import org.apache.accumulo.core.data.Key;
    +import org.apache.accumulo.core.data.Value;
    +import org.apache.commons.lang.mutable.MutableLong;
    +
    +//checkstyle and formatter are in conflict
    +//@formatter:off
    +/**
    + * This class counts arbitrary keys while defending against too many keys and keys that
are too long.
    + *
    + * <p>
    + * During collection and summarization this class will use the functions from {@link
#converter()} and {@link #encoder()}. For each key/value the function from
    + * {@link #converter()} will be called to create zero or more counter objects. A counter
associated with each counter object will be incremented, as long as
    + * there are not too many counters and the counter object is not too long.
    + *
    + * <p>
    + * When {@link Summarizer.Collector#summarize(Summarizer.StatisticConsumer)} is called,
the function from {@link #encoder()} will be used to convert counter
    + * objects to strings. These strings will be used to emit statistics. Overriding {@link
#encoder()} is optional. One reason to override is if the counter object
    + * contains binary or special data. For example, a function that base64 encodes counter
objects could be created.
    + *
    + * <p>
    + * If the counter key type is mutable, then consider overriding {@link #copier()}.
    + *
    + * <p>
    + * The function returned by {@link #converter()} will be called frequently and should
be very efficient. The function returned by {@link #encoder()} will be
    + * called less frequently and can be more expensive. The reason these two functions exist
is to avoid the conversion to string for each key value, if that
    + * conversion is unnecessary.
    + *
    + * <p>
    + * Below is an example implementation that counts column visibilities. This example avoids
converting column visibility to string for each key/value. This
    + * example shows the source code for {@link VisibilitySummarizer}.
    + *
    + * <pre>
    + * <code>
    + *   public class VisibilitySummarizer extends CountingSummarizer&lt;ByteSequence&gt;
{
    + *     &#064;Override
    + *     protected UnaryOperator&lt;ByteSequence&gt; copier() {
    + *       // ByteSequences are mutable, so override and provide a copy function
    + *       return ArrayByteSequence::new;
    + *     }
    + *
    + *     &#064;Override
    + *     protected Converter&lt;ByteSequence&gt; converter() {
    + *       return (key, val, consumer) -&gt; consumer.accept(key.getColumnVisibilityData());
    + *     }
    + *   }
    + * </code>
    + * </pre>
    + *
    + * @param <K>
    + *          The counter key type. This type must have good implementations of {@link
Object#hashCode()} and {@link Object#equals(Object)}.
    + * @see CounterSummary
    + * @since 2.0.0
    + */
    +//@formatter:on
    +public abstract class CountingSummarizer<K> implements Summarizer {
    +
    +  /**
    +   * A configuration option for specifying the maximum number of unique counters an instance
of this summarizer should track. If not specified, a default of
    +   * {@value #MAX_COUNTER_DEFAULT} will be used.
    +   */
    +  public static final String MAX_COUNTERS_OPT = "maxCounters";
    +
    +  /**
    +   * A configuration option for specifying the maximum length of an individual counter
key. If not specified, a default of {@value #MAX_CKL_DEFAULT} will be
    +   * used.
    +   */
    +  public static final String MAX_COUNTER_LEN_OPT = "maxCounterLen";
    +
    +  /**
    +   * A configuration option to determine if delete keys should be counted. If set to
true then delete keys will not be passed to the {@link Converter} and the
    +   * statistic {@value #DELETES_IGNORED_STAT} will track the number of deletes ignored.
This option defaults to {@value #INGNORE_DELETES_DEFAULT}.
    +   */
    +  public static final String INGNORE_DELETES_OPT = "ignoreDeletes";
    +
    +  /**
    +   * This prefixes all counters when emitting statistics in {@link Summarizer.Collector#summarize(Summarizer.StatisticConsumer)}.
    +   */
    +  public static final String COUNTER_STAT_PREFIX = "c:";
    +
    +  /**
    +   * This is the name of the statistic that tracks how many counter objects were ignored
because the number of unique counters was exceeded. The max number of
    +   * unique counters is specified by {@link #MAX_COUNTERS_OPT}.
    +   */
    +  public static final String TOO_MANY_STAT = "tooMany";
    +
    +  /**
    +   * This is the name of the statistic that tracks how many counter objects were ignored
because they were too long. The maximum length is specified by
    +   * {@link #MAX_COUNTER_LEN_OPT}.
    +   */
    +  public static final String TOO_LONG_STAT = "tooLong";
    +
    +  /**
    +   * This is the name of the statistic that tracks the total number of counter objects
emitted by the {@link Converter}. This includes emitted Counter objects
    +   * that were ignored.
    +   */
    +  public static final String EMITTED_STAT = "emitted";
    +
    +  /**
    +   * This is the name of the statistic that tracks the total number of deleted keys seen.
This statistic is only incremented when the
    +   * {@value #INGNORE_DELETES_OPT} option is set to true.
    +   */
    +  public static final String DELETES_IGNORED_STAT = "deletesIgnored";
    +
    +  /**
    +   * This tracks the total number of key/values seen by the {@link Summarizer.Collector}
    +   */
    +  public static final String SEEN_STAT = "seen";
    +
    +  // this default can not be changed as persisted summary data depends on it
    --- End diff --
    
    Can you expand on this comment?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

Mime
View raw message