flume-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hshreedha...@apache.org
Subject [1/9] FLUME-2070. Add a Flume Morphline Solr Sink.
Date Tue, 18 Jun 2013 07:35:17 GMT
Updated Branches:
  refs/heads/trunk 296fc9f92 -> cf6298415


http://git-wip-us.apache.org/repos/asf/flume/blob/cf629841/flume-ng-sinks/flume-ng-morphline-solr-sink/src/test/resources/test-morphlines/solrCellDocumentTypes.conf
----------------------------------------------------------------------
diff --git a/flume-ng-sinks/flume-ng-morphline-solr-sink/src/test/resources/test-morphlines/solrCellDocumentTypes.conf b/flume-ng-sinks/flume-ng-morphline-solr-sink/src/test/resources/test-morphlines/solrCellDocumentTypes.conf
new file mode 100644
index 0000000..2574144
--- /dev/null
+++ b/flume-ng-sinks/flume-ng-morphline-solr-sink/src/test/resources/test-morphlines/solrCellDocumentTypes.conf
@@ -0,0 +1,260 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Application configuration file in HOCON format (Human-Optimized Config Object Notation).
+# HOCON syntax is defined at http://github.com/typesafehub/config/blob/master/HOCON.md
+# and also used by Akka (http://www.akka.io) and Play (http://www.playframework.org/).
+# For more examples see http://doc.akka.io/docs/akka/2.1.2/general/configuration.html
+
+# morphline.conf example file
+# this is a comment
+// this is yet another comment
+
+morphlines : [
+  {
+    id : morphline1
+    importCommands : ["com.cloudera.**", "org.apache.solr.**"]
+
+    commands : [
+      { separateAttachments {} }
+
+      # java command that doesn't do anything except for test compilation
+      {
+        java {
+          imports : "import java.util.*;"
+          code: """
+            List tags = record.get("javaWithImports");
+            return child.process(record);
+                """
+        }
+      }
+
+      # java command that doesn't do anything except for test compilation
+      {
+        java {
+          code: """
+            List tags = record.get("javaWithoutImports");
+            return child.process(record);
+                """
+        }
+      }
+
+      {
+        # used for auto-detection if MIME type isn't explicitly supplied
+        detectMimeType {
+          includeDefaultMimeTypes : true
+          mimeTypesFiles : [target/test-classes/custom-mimetypes.xml]
+        }
+      }
+
+      {
+        tryRules {
+          throwExceptionIfAllRulesFailed : true
+          rules : [
+            # next top-level rule:
+            {
+              commands : [
+                { logDebug { format : "hello unpack" } }
+                { unpack {} }
+                { generateUUID {} }
+                { callParentPipe {} }
+              ]
+            }
+
+            {
+              commands : [
+                { logDebug { format : "hello decompress" } }
+                { decompress {} }
+                { callParentPipe {} }
+              ]
+            }
+
+            {
+              commands : [
+                {
+                  readAvroContainer {
+                    supportedMimeTypes : [avro/binary]
+                    # readerSchemaString : "<json can go here>" # optional, avro json schema blurb for getSchema()
+                    # readerSchemaFile : /path/to/syslog.avsc
+                  }
+                }
+
+                { extractAvroTree {} }
+
+                {
+                  setValues {
+                    id : "@{/id}"
+                    user_screen_name : "@{/user_screen_name}"
+                    text : "@{/text}"
+                  }
+                }
+
+                {
+                  sanitizeUnknownSolrFields {
+                    solrLocator : ${SOLR_LOCATOR}
+                  }
+                }
+              ]
+            }
+
+            {
+              commands : [
+                {
+                  readJsonTestTweets {
+                    supportedMimeTypes : ["mytwittertest/json+delimited+length"]
+                  }
+                }
+
+                {
+                  sanitizeUnknownSolrFields {
+                    solrLocator : ${SOLR_LOCATOR}
+                  }
+                }
+              ]
+            }
+
+            # next top-level rule:
+            {
+              commands : [
+                { logDebug { format : "hello solrcell" } }
+                {
+                  # wrap SolrCell around an HTML Tika parser
+                  solrCell {
+                    solrLocator : ${SOLR_LOCATOR}
+                    # captureAttr : true # default is false
+                    capture : [
+
+                      # twitter feed schema
+                      user_friends_count
+                      user_location
+                      user_description
+                      user_statuses_count
+                      user_followers_count
+                      user_name
+                      user_screen_name
+                      created_at
+                      text
+                      retweet_count
+                      retweeted
+                      in_reply_to_user_id
+                      source
+                      in_reply_to_status_id
+                      media_url_https
+                      expanded_url
+
+                      # file metadata
+                      file_download_url
+                      file_upload_url
+                      file_scheme
+                      file_host
+                      file_port
+                      file_path
+                      file_name
+                      file_length
+                      file_last_modified
+                      file_owner
+                      file_group
+                      file_permissions_user
+                      file_permissions_group
+                      file_permissions_other
+                      file_permissions_stickybit
+                    ]
+
+                    fmap : { content : text, content-type : content_type } # rename "content" field to "text" fields
+                    dateFormats : [ "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"] # various java.text.SimpleDateFormat
+                    # xpath : "/xhtml:html/xhtml:body/xhtml:div/descendant:node()"
+                    uprefix : "ignored_"
+                    lowernames : true
+                    # solrContentHandlerFactory : org.apache.solr.tika.TrimSolrContentHandlerFactory
+
+                    # Tika parsers to be registered. If multiple parsers support the same MIME type,
+                    # the parser is chosen that is closest to the bottom in this list:
+                    parsers : [
+                      { parser : org.apache.tika.parser.asm.ClassParser }
+                      # { parser : org.gagravarr.tika.OggParser, additionalSupportedMimeTypes : [audio/ogg] }
+                      { parser : org.gagravarr.tika.FlacParser }
+                      { parser : org.apache.tika.parser.audio.AudioParser }
+                      { parser : org.apache.tika.parser.audio.MidiParser }
+                      { parser : org.apache.tika.parser.crypto.Pkcs7Parser }
+                      { parser : org.apache.tika.parser.dwg.DWGParser }
+                      { parser : org.apache.tika.parser.epub.EpubParser }
+                      { parser : org.apache.tika.parser.executable.ExecutableParser }
+                      { parser : org.apache.tika.parser.feed.FeedParser }
+                      { parser : org.apache.tika.parser.font.AdobeFontMetricParser }
+                      { parser : org.apache.tika.parser.font.TrueTypeParser }
+                      { parser : org.apache.tika.parser.xml.XMLParser }
+                      { parser : org.apache.tika.parser.html.HtmlParser }
+                      { parser : org.apache.tika.parser.image.ImageParser }
+                      { parser : org.apache.tika.parser.image.PSDParser }
+                      { parser : org.apache.tika.parser.image.TiffParser }
+                      { parser : org.apache.tika.parser.iptc.IptcAnpaParser }
+                      { parser : org.apache.tika.parser.iwork.IWorkPackageParser }
+                      { parser : org.apache.tika.parser.jpeg.JpegParser }
+                      { parser : org.apache.tika.parser.mail.RFC822Parser }
+                      { parser : org.apache.tika.parser.mbox.MboxParser, additionalSupportedMimeTypes : [message/x-emlx] }
+                      { parser : org.apache.tika.parser.microsoft.OfficeParser }
+                      { parser : org.apache.tika.parser.microsoft.TNEFParser }
+                      { parser : org.apache.tika.parser.microsoft.ooxml.OOXMLParser }
+                      { parser : org.apache.tika.parser.mp3.Mp3Parser }
+                      { parser : org.apache.tika.parser.mp4.MP4Parser }
+                      { parser : org.apache.tika.parser.hdf.HDFParser }
+                      { parser : org.apache.tika.parser.netcdf.NetCDFParser }
+                      { parser : org.apache.tika.parser.odf.OpenDocumentParser }
+                      { parser : org.apache.tika.parser.pdf.PDFParser }
+                      { parser : org.apache.tika.parser.pkg.CompressorParser }
+                      { parser : org.apache.tika.parser.pkg.PackageParser }
+                      { parser : org.apache.tika.parser.rtf.RTFParser }
+                      { parser : org.apache.tika.parser.txt.TXTParser }
+                      { parser : org.apache.tika.parser.video.FLVParser }
+                      { parser : org.apache.tika.parser.xml.DcXMLParser }
+                      { parser : org.apache.tika.parser.xml.FictionBookParser }
+                      { parser : org.apache.tika.parser.chm.ChmParser }
+                    ]
+                  }
+                }
+
+                { generateUUID { field : ignored_base_id } }
+
+                {
+                  generateSolrSequenceKey {
+                    baseIdField: ignored_base_id
+                    solrLocator : ${SOLR_LOCATOR}
+                  }
+                }
+
+              ]
+            }
+          ]
+        }
+      }
+
+      {
+        loadSolr {
+          solrLocator : ${SOLR_LOCATOR}
+        }
+      }
+
+      {
+        logDebug {
+          format : "My output record: {}"
+          args : ["@{}"]
+        }
+      }
+
+    ]
+  }
+]

http://git-wip-us.apache.org/repos/asf/flume/blob/cf629841/flume-ng-sinks/pom.xml
----------------------------------------------------------------------
diff --git a/flume-ng-sinks/pom.xml b/flume-ng-sinks/pom.xml
index 7170348..3ee75e5 100644
--- a/flume-ng-sinks/pom.xml
+++ b/flume-ng-sinks/pom.xml
@@ -45,5 +45,6 @@ limitations under the License.
     <module>flume-irc-sink</module>
     <module>flume-ng-hbase-sink</module>
     <module>flume-ng-elasticsearch-sink</module>
+    <module>flume-ng-morphline-solr-sink</module>
   </modules>
 </project>

http://git-wip-us.apache.org/repos/asf/flume/blob/cf629841/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 8026936..251f345 100644
--- a/pom.xml
+++ b/pom.xml
@@ -549,7 +549,7 @@ limitations under the License.
                   <exclude>**/*.avsc</exclude>
                   <exclude>**/*.avro</exclude>
                   <exclude>**/docs/**</exclude>
-                  <exclude>**/test/resources/test_command.txt</exclude>
+                  <exclude>**/test/resources/**</exclude>
                   <exclude>**/.settings/*</exclude>
                   <exclude>**/.classpath</exclude>
                   <exclude>**/.project</exclude>
@@ -942,6 +942,12 @@ limitations under the License.
       </dependency>
 
       <dependency>
+        <groupId>org.apache.flume.flume-ng-sinks</groupId>
+        <artifactId>flume-ng-morphline-solr-sink</artifactId>
+        <version>1.4.0-SNAPSHOT</version>
+      </dependency>
+
+      <dependency>
         <groupId>org.apache.flume.flume-ng-sources</groupId>
         <artifactId>flume-scribe-source</artifactId>
         <version>1.4.0-SNAPSHOT</version>


Mime
View raw message