spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tarekauel <...@git.apache.org>
Subject [GitHub] spark pull request: [SPARK-8255][SPARK-8256][SQL]Add regex_extract...
Date Sun, 19 Jul 2015 07:19:55 GMT
Github user tarekauel commented on a diff in the pull request:

    https://github.com/apache/spark/pull/7468#discussion_r34955738
  
    --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
---
    @@ -673,6 +673,110 @@ case class Encode(value: Expression, charset: Expression)
     }
     
     /**
    + * Replace all substrings of str that match regexp with rep
    + */
    +case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression)
    +  extends Expression with ImplicitCastInputTypes {
    +
    +  // last regex in string, we will update the pattern iff regexp value changed.
    +  @transient private var lastRegex: UTF8String = _
    +  // last regex pattern, we cache it for performance concern
    +  @transient private var pattern: Pattern = _
    +  // last replacement string, we don't want to convert a UTF8String => java.langString
every time.
    +  @transient private var lastReplacement: String = _
    +  @transient private var lastReplacementInUTF8: UTF8String = _
    +  // result buffer write by Matcher
    +  @transient private val result: StringBuffer = new StringBuffer
    +
    +  override def nullable: Boolean = children.foldLeft(false)(_ || _.nullable)
    +  override def foldable: Boolean = children.foldLeft(true)(_ && _.foldable)
    +
    +  override def eval(input: InternalRow): Any = {
    +    val s = subject.eval(input)
    +    if (null != s) {
    +      val p = regexp.eval(input)
    +      if (null != p) {
    +        val r = rep.eval(input)
    +        if (null != r) {
    +          if (!p.equals(lastRegex)) {
    +            // regex value changed
    +            lastRegex = p.asInstanceOf[UTF8String]
    +            pattern = Pattern.compile(lastRegex.toString)
    +          }
    +          if (!r.equals(lastReplacementInUTF8)) {
    +            // replacement string changed
    +            lastReplacementInUTF8 = r.asInstanceOf[UTF8String]
    +            lastReplacement = lastReplacementInUTF8.toString
    +          }
    +          val m = pattern.matcher(s.toString())
    +          result.delete(0, result.length())
    +
    +          while (m.find) {
    +            m.appendReplacement(result, lastReplacement)
    +          }
    +          m.appendTail(result)
    +
    +          return UTF8String.fromString(result.toString)
    +        }
    +      }
    +    }
    +
    +    null
    +  }
    +
    +  override def dataType: DataType = StringType
    +  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType)
    +  override def children: Seq[Expression] = subject :: regexp :: rep :: Nil
    +  override def prettyName: String = "regexp_replace"
    +}
    +
    +/**
    + * UDF to extract a specific(idx) group identified by a java regex.
    + */
    +case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expression)
    +  extends Expression with ImplicitCastInputTypes {
    +  def this(s: Expression, r: Expression) = this(s, r, Literal(1))
    +
    +  // last regex in string, we will update the pattern iff regexp value changed.
    +  @transient private var lastRegex: UTF8String = _
    +  // last regex pattern, we cache it for performance concern
    +  @transient private var pattern: Pattern = _
    +
    +  override def nullable: Boolean = children.foldLeft(false)(_ || _.nullable)
    +  override def foldable: Boolean = children.foldLeft(true)(_ && _.foldable)
    +
    +  override def eval(input: InternalRow): Any = {
    +    val s = subject.eval(input)
    +    if (null != s) {
    +      val p = regexp.eval(input)
    +      if (null != p) {
    +        val r = idx.eval(input)
    +        if (null != r) {
    +          if (!p.equals(lastRegex)) {
    +            // regex value changed
    +            lastRegex = p.asInstanceOf[UTF8String]
    +            pattern = Pattern.compile(lastRegex.toString)
    +          }
    +          val m = pattern.matcher(s.toString())
    +          if (m.find) {
    +            val mr: MatchResult = m.toMatchResult
    +            return UTF8String.fromString(mr.group(r.asInstanceOf[Int]))
    +          }
    +          return UTF8String.fromString("")
    --- End diff --
    
    `UTF8String.fromBytes(Array[Byte]())` should be slightly faster and avoid creating the
string.
    
    @rxin / @davies A little bit off-topic, but do you guys think we should add something
to `UTF8String` to create an empty UTF8String? Something like:
    ```
    public UTF8String EMPTY_STRING() {
      return UTF8String.fromBytes(new byte[0]);
    }
    ```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message