oodt-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ric...@apache.org
Subject svn commit: r1628235 - in /oodt/trunk: ./ metadata/ metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/ metadata/src/site/xdoc/user/ metadata/src/test/org/apache/oodt/cas/metadata/extractors/ metadata/src/testdata/
Date Mon, 29 Sep 2014 17:47:55 GMT
Author: rickdn
Date: Mon Sep 29 17:47:54 2014
New Revision: 1628235

URL: http://svn.apache.org/r1628235
Log:
OODT-754 contribute ProdTypePatternMetExtractor with unit tests and documentation.

Added:
    oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/AbstractSAXConfigReader.java
    oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/ProdTypePatternMetExtractor.java
    oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestAbstractSAXConfigReader.java
    oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestProdTypePatternMetExtractor.java
    oodt/trunk/metadata/src/testdata/product-type-patterns-2.xml
    oodt/trunk/metadata/src/testdata/product-type-patterns.xml
Modified:
    oodt/trunk/CHANGES.txt
    oodt/trunk/metadata/pom.xml
    oodt/trunk/metadata/src/site/xdoc/user/basic.xml

Modified: oodt/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/oodt/trunk/CHANGES.txt?rev=1628235&r1=1628234&r2=1628235&view=diff
==============================================================================
--- oodt/trunk/CHANGES.txt (original)
+++ oodt/trunk/CHANGES.txt Mon Sep 29 17:47:54 2014
@@ -3,6 +3,8 @@ Apache OODT Change Log
 
 Release 0.8 - Current Development
 
+* OODT-754 Contribute ProdTypePatternMetExtractor (rickdn)
+
 * OODT-750 Issue with running mvn site:site due to old findbugs plugin (mattmann)
 
 

Modified: oodt/trunk/metadata/pom.xml
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/pom.xml?rev=1628235&r1=1628234&r2=1628235&view=diff
==============================================================================
--- oodt/trunk/metadata/pom.xml (original)
+++ oodt/trunk/metadata/pom.xml Mon Sep 29 17:47:54 2014
@@ -16,7 +16,7 @@ License for the specific language govern
 the License.
 -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-  
+
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.oodt</groupId>
@@ -27,11 +27,11 @@ the License.
   <artifactId>cas-metadata</artifactId>
   <name>Catalog and Archive Service Generic Multi-valued Metadata Container</name>
   <description>A Multi-valued, generic Metadata container class. The class uses an internal
-	Map of string keys pointing to vectors of strings. The data 
-	structure looks like the following: 
-	
+	Map of string keys pointing to vectors of strings. The data
+	structure looks like the following:
+
 	[std:string key]⇒std:vector of std:strings
-	
+
 	The multi-valued nature of the class is handled transparently by this Metadata
 	container. Since all values are stored internally as string vectors, the difference
 	between a scalar value and a non-scalar is handled by determining whether the list of
@@ -56,10 +56,11 @@ the License.
         <targetPath>org/apache/oodt/cas/metadata</targetPath>
         <directory>${basedir}/src/testdata</directory>
         <includes>
-             <include>extern-config.xml</include>
              <include>copyandrewrite.test.conf</include>
              <include>extern-config.xml</include>
              <include>met_extr_preconditions.xml</include>
+             <include>product-type-patterns.xml</include>
+             <include>product-type-patterns-2.xml</include>
              <include>samplemet.xml</include>
              <include>testExtractor</include>
              <include>testfile.txt</include>

Added: oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/AbstractSAXConfigReader.java
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/AbstractSAXConfigReader.java?rev=1628235&view=auto
==============================================================================
--- oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/AbstractSAXConfigReader.java (added)
+++ oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/AbstractSAXConfigReader.java Mon Sep 29 17:47:54 2014
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.metadata.extractors;
+
+import org.apache.oodt.cas.metadata.MetExtractorConfig;
+import org.apache.oodt.cas.metadata.MetExtractorConfigReader;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractorConfigReaderException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import java.io.File;
+
+/**
+ * Parses an xml config file for MetExtractors using SAX.
+ *
+ * @author rickdn (Ricky Nguyen)
+ */
+public class AbstractSAXConfigReader extends DefaultHandler implements MetExtractorConfigReader, MetExtractorConfig {
+
+    @Override
+    public AbstractSAXConfigReader parseConfigFile(File configFile) throws MetExtractorConfigReaderException {
+        try {
+            SAXParser p = SAXParserFactory.newInstance().newSAXParser();
+            p.parse(configFile, this);
+        } catch (Exception e) {
+            throw new MetExtractorConfigReaderException(e.getMessage());
+        }
+        return this;
+    }
+
+}

Added: oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/ProdTypePatternMetExtractor.java
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/ProdTypePatternMetExtractor.java?rev=1628235&view=auto
==============================================================================
--- oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/ProdTypePatternMetExtractor.java (added)
+++ oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/ProdTypePatternMetExtractor.java Mon Sep 29 17:47:54 2014
@@ -0,0 +1,251 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.metadata.extractors;
+
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractorConfigReaderException;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Assigns a ProductType based on a filename pattern, while simultaneously assigning values to metadata elements
+ * embedded in the filename pattern.
+ * <p/>
+ * Suppose I have files in the staging area ready to be ingested. These files usually have information encoded into the
+ * filename in order to distinguish the contents of one file from other files. For example book-1234567890.txt might be
+ * the contents of a book with ISBN 1234567890. Or page-1234567890-12.txt might be the text on page 12 of book with ISBN
+ * 1234567890.
+ * <p/>
+ * It would be useful to generate metadata from the information encoded in the filename (think: filename => metadata).
+ * The {@link ProdTypePatternMetExtractor} allows this in a flexible manner using regular expressions. Let's take a look
+ * at the config file for this met extractor.
+ * <p/>
+ * <pre>
+ * product-type-patterns.xml
+ *
+ * {@code
+ * <config>
+ *   <!-- <element> MUST be defined before <product-type> so their patterns can be resolved -->
+ *   <!-- name MUST be an element defined in elements.xml (also only upper and lower case alpha chars) -->
+ *   <!-- regexp MUST be valid input to java.util.regex.Pattern.compile() -->
+ *   <element name="ISBN" regexp="[0-9]{10}"/>
+ *   <element name="Page" regexp="[0-9]*"/>
+ *
+ *   <!-- name MUST be a ProductType name defined in product-types.xml -->
+ *   <!-- metadata elements inside brackets MUST be mapped to the ProductType,
+ *        as defined in product-type-element-map.xml -->
+ *   <product-type name="Book" template="book-[ISBN].txt"/>
+ *   <product-type name="BookPage" template="page-[ISBN]-[Page].txt"/>
+ * </config>
+ * }
+ * </pre>
+ * <p/>
+ * <p/>
+ * This file defines a regular expression for the "ISBN" metadata element, in this case, a 10-digit number. Also, the
+ * "Page" metadata element is defined as a sequence of 0 or more digits.
+ * <p/>
+ * Next, the file defines a filename pattern for the "Book" product type. The pattern is compiled into a regular
+ * expression, substituting the previously defined regexes as capture groups. For example, "book-[ISBN].txt" compiles to
+ * "book-([0-9]{10}).txt", and the ISBN met element is assigned to capture group 1. When the filename matches this
+ * pattern, 2 metadata assignments occur: (1) the ISBN met element is set to the matched regex group, and (2) the
+ * ProductType met element is set to "Book".
+ * <p/>
+ * Similarly, the second pattern sets ISBN, Page, and ProductType for files matching "page-([0-9]{10})-([0-9]*).txt".
+ * <p/>
+ * This achieves several things: <ol> <li>assigning met elements based on regular expressions</li> <li>assigning product
+ * type based on easy-to-understand pattern with met elements clearly indicated</li> <li>reuse of met element regular
+ * expressions</li> </ol>
+ * <p/>
+ * Differences from {@link FilenameTokenMetExtractor}:
+ * <ol>
+ *     <li>Allows dynamic length metadata (does not rely on offset and length of metadata)</li>
+ *     <li>Assigns ProductType</li>
+ * </ol>
+ * <p/>
+ * Differences from {@link org.apache.oodt.cas.crawl.AutoDetectProductCrawler}:
+ * <ol>
+ *     <li>Does not require definition of custom MIME type and MIME-type regex. Really, all you want is to assign a
+ *     ProductType, rather than indirectly assigning a custom MIME type that maps to a Product Type.</li>
+ * </ol>
+ * <p/>
+ * Differences from {@link org.apache.oodt.cas.filemgr.metadata.extractors.examples.FilenameRegexMetExtractor}:
+ * <ol>
+ *     <li>Assigns ProductType. FilenameRegexMetExtractor runs after ProductType is already determined.</li>
+ *     <li>Runs on the client-side (crawler). FilenameRegexMetExtractor runs on the server-side (filemgr).</li>
+ *     <li>Different patterns for different ProductTypes. FilenameRegexMetExtractor config applies the same pattern to
+ *     all files.</li>
+ * </ol>
+ * <p/>
+ * Prerequisites:
+ * <ol>
+ *     <li>{@code <element>} tag occurs before {@code <product-type>} tag</li>
+ *     <li>{@code <element> @name} attribute <strong>MUST</strong> be defined in FileManager policy elements.xml</li>
+ *     <li>{@code <element> @regexp} attribute <strong>MUST</strong> be valid input to
+ *     {@link java.util.regex.Pattern#compile(String)}</li>
+ *     <li>{@code <product-type> @name} attribute <strong>MUST</strong> be a ProductType name (not ID) defined in
+ *     product-types.xml</li>
+ *     <li>met elements used in {@code <product-type> @template} attribute <strong>MUST</strong> be
+ *     mapped to the ProductType, as defined in product-type-element-map.xml</li>
+ * </ol>
+ * <p/>
+ * <strong>Words of Caution</strong>
+ * <ul>
+ *     <li><strong>Does not support nested met elements.</strong></li>
+ *     <li><strong>Each pattern should map to one product type.</strong> Watch out for similar patterns. Don't do this:
+ * <pre>
+ * {@code
+ * <element name="Page" regexp="[0-9]*"/>
+ * <element name="Chapter" regexp="[0-9]*"/>
+ *
+ * <product-type name="Page" template="data-[Page].txt"/>
+ * <product-type name="Chapter" template="data-[Chapter].txt"/>
+ * }</pre>
+ * Instead, encode the product type information into the filename, for example:
+ * <pre>
+ * {@code
+ * <element name="Page" regexp="[0-9]*"/>
+ * <element name="Chapter" regexp="[0-9]*"/>
+ *
+ * <product-type name="Page" template="page-[Page].txt"/>
+ * <product-type name="Chapter" template="chapter-[Chapter].txt"/>
+ * }</pre>
+ * </li>
+ * </ul>
+ *
+ * @author rickdn (Ricky Nguyen)
+ */
+public class ProdTypePatternMetExtractor extends CmdLineMetExtractor {
+
+    static class ConfigReader extends AbstractSAXConfigReader {
+        private static final String ELEMENT_TAG = "element";
+        private static final String ELEMENT_NAME_ATTR = "name";
+        private static final String ELEMENT_REGEXP_ATTR = "regexp";
+        private static final String PRODUCT_TYPE_TAG = "product-type";
+        private static final String PRODUCT_TYPE_NAME_ATTR = "name";
+        private static final String PRODUCT_TYPE_TEMPLATE_ATTR = "template";
+        private static final Pattern MET_TOKEN = Pattern.compile("\\[([A-Za-z]*)\\]");
+
+        /*
+         * full file name reg exp => prod type
+         */
+        private final Map<Pattern, String> prodTypePatterns = new HashMap<Pattern, String>();
+
+        /*
+         * prod type => list of met elements in the file name
+         */
+        private final Map<String, List<String>> prodTypeElements = new HashMap<String, List<String>>();
+
+        /*
+         * met elements => element reg exp patterns
+         */
+        private final Map<String, Pattern> elementPatterns = new HashMap<String, Pattern>();
+
+
+        Map<Pattern, String> getProdTypePatterns() {
+            return prodTypePatterns;
+        }
+
+        Map<String, List<String>> getProdTypeElements() {
+            return prodTypeElements;
+        }
+
+        void addProductType(String id, String template) {
+            template = template.replaceAll("\\.", "\\\\.");
+            Matcher m = MET_TOKEN.matcher(template);
+            List<String> elemList = prodTypeElements.get(id);
+            if (elemList == null) {
+                elemList = new ArrayList<String>();
+                prodTypeElements.put(id, elemList);
+            }
+            String newTemplate = template;
+            while (m.find()) {
+                String elem = m.group(1);
+                String regex = elementPatterns.get(elem).toString();
+                newTemplate = newTemplate.replaceAll("\\[" + elem + "\\]", "(" + regex + ")");
+                elemList.add(elem);
+            }
+            prodTypePatterns.put(Pattern.compile(newTemplate), id);
+        }
+
+        void addElement(String name, String regexp) {
+            elementPatterns.put(name, Pattern.compile(regexp));
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+            if (qName.equals(ELEMENT_TAG)) {
+                String name = attributes.getValue(ELEMENT_NAME_ATTR);
+                String regexp = attributes.getValue(ELEMENT_REGEXP_ATTR);
+                addElement(name, regexp);
+            } else if (qName.equals(PRODUCT_TYPE_TAG)) {
+                String id = attributes.getValue(PRODUCT_TYPE_NAME_ATTR);
+                String template = attributes.getValue(PRODUCT_TYPE_TEMPLATE_ATTR);
+                addProductType(id, template);
+            }
+        }
+
+        @Override
+        public AbstractSAXConfigReader parseConfigFile(File configFile) throws MetExtractorConfigReaderException {
+            // reset internal state whenever parsing a new config file
+            prodTypePatterns.clear();
+            prodTypeElements.clear();
+            elementPatterns.clear();
+            return super.parseConfigFile(configFile);
+        }
+    }
+
+    private static final String PRODUCT_TYPE_MET_KEY = "ProductType";
+
+    public ProdTypePatternMetExtractor() {
+        super(new ConfigReader());
+    }
+
+    @Override
+    protected Metadata extrMetadata(File file) throws MetExtractionException {
+        Metadata met = new Metadata();
+        ConfigReader mConfig = (ConfigReader) config;
+
+        for (Pattern p : mConfig.getProdTypePatterns().keySet()) {
+            Matcher m = p.matcher(file.getName());
+            if (m.matches()) {
+                String prodType = mConfig.getProdTypePatterns().get(p);
+                met.addMetadata(PRODUCT_TYPE_MET_KEY, prodType);
+                List<String> elemList = mConfig.getProdTypeElements().get(prodType);
+                for (int i = 0; i < m.groupCount(); i++) {
+                    met.addMetadata(elemList.get(i), m.group(i + 1));
+                }
+            }
+        }
+
+        return met;
+    }
+
+    public static void main(String[] args) throws Exception {
+        processMain(args, new ProdTypePatternMetExtractor());
+    }
+
+}

Modified: oodt/trunk/metadata/src/site/xdoc/user/basic.xml
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/site/xdoc/user/basic.xml?rev=1628235&r1=1628234&r2=1628235&view=diff
==============================================================================
--- oodt/trunk/metadata/src/site/xdoc/user/basic.xml (original)
+++ oodt/trunk/metadata/src/site/xdoc/user/basic.xml Mon Sep 29 17:47:54 2014
@@ -329,6 +329,56 @@ public class Metadata {
      <i>DataVersion</i>, <i>CollectionName</i>, and <i>DataProvider</i>.</p>
      
      </subsection>
+
+     <subsection name="Product Type Pattern Metadata Extractor">
+     <p>The <code>ProdTypePatternMetExtractor</code> can also be used to extract
+     metadata from the filename.  Unlike the <code>FilenameTokenMetExtractor</code>,
+     metadata elements do not have to be fixed-offset and fixed-length.  Instead,
+     metadata elements are represented by regular expressions.  These elements are
+     used in filename templates that, when matched, dynamically determine the
+     ProductType of the file.</p>
+
+     <p>Below is an example of a <code>product-type-patterns.xml</code> configuration
+     file used by the <code>ProdTypePatternMetExtractor:</code></p>
+
+<source><![CDATA[
+<config>
+  <!-- <element> MUST be defined before <product-type> so their patterns can be resolved -->
+  <!-- name MUST be an element defined in elements.xml (also only upper and lower case alpha chars) -->
+  <!-- regexp MUST be valid input to java.util.regex.Pattern.compile() -->
+  <element name="ISBN" regexp="[0-9]{10}"/>
+  <element name="Page" regexp="[0-9]*"/>
+
+  <!-- name MUST be a ProductType name defined in product-types.xml -->
+  <!-- metadata elements inside brackets MUST be mapped to the ProductType,
+       as defined in product-type-element-map.xml -->
+  <product-type name="Book" template="book-[ISBN].txt"/>
+  <product-type name="BookPage" template="page-[ISBN]-[Page].txt"/>
+</config>
+]]></source>
+
+     <p>This file defines a regular expression for the "ISBN" metadata element, in this case, a 10-digit number. Also, the
+     "Page" metadata element is defined as a sequence of 0 or more digits.
+     <p/>
+     Next, the file defines a filename pattern for the "Book" product type. The pattern is compiled into a regular
+     expression, substituting the previously defined regexes as capture groups. For example, "book-[ISBN].txt" compiles to
+     "book-([0-9]{10}).txt", and the ISBN met element is assigned to capture group 1. When the filename matches this
+     pattern, 2 metadata assignments occur: (1) the ISBN met element is set to the matched regex group, and (2) the
+     ProductType met element is set to "Book".
+     <p/>
+     Similarly, the second pattern sets ISBN, Page, and ProductType for files matching "page-([0-9]{10})-([0-9]*).txt".
+     <p/>
+     This achieves several things:
+     <ol>
+         <li>assigning met elements based on regular expressions</li>
+         <li>assigning product type based on easy-to-understand pattern with met elements clearly indicated</li>
+         <li>reuse of met element regular expressions</li>
+     </ol>
+     <p>See the <a href="../apidocs/org/apache/oodt/cas/metadata/extractors/ProdTypePatternMetExtractor.html">JavaDoc</a>
+     for more detailed information about using the <code>ProdTypePatternMetExtractor</code></p>
+     </p>
+     </subsection>
+
      <subsection name="Metadata Reader Extractor">
      
      <p>The <code>MetReaderExtractor</code>, part of the OODT CAS-Metadata
project,
@@ -433,4 +483,4 @@ filemgr-client --url <url to xml rpc ser
     </section>
     
   </body>
-</document>
\ No newline at end of file
+</document>

Added: oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestAbstractSAXConfigReader.java
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestAbstractSAXConfigReader.java?rev=1628235&view=auto
==============================================================================
--- oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestAbstractSAXConfigReader.java (added)
+++ oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestAbstractSAXConfigReader.java Mon Sep 29 17:47:54 2014
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.metadata.extractors;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import java.io.File;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestAbstractSAXConfigReader {
+
+    private AbstractSAXConfigReader configReader;
+    private File configFile;
+
+    private List<String> uris = new ArrayList<String>();
+    private List<String> localNames = new ArrayList<String>();
+    private List<String> qNames = new ArrayList<String>();
+    private List<Integer> attributes = new ArrayList<Integer>();
+
+    @Before
+    public void setup() throws Exception {
+        URL url = getClass().getResource("/org/apache/oodt/cas/metadata/product-type-patterns.xml");
+        configFile = new File(url.toURI());
+        configReader = new AbstractSAXConfigReader() {
+            @Override
+            public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
+                uris.add(uri);
+                localNames.add(localName);
+                qNames.add(qName);
+                attributes.add(attrs.getLength());
+            }
+        };
+    }
+
+    @After
+    public void teardown() {
+        uris.clear();
+        localNames.clear();
+        qNames.clear();
+        attributes.clear();
+    }
+
+    @Test
+    public void testParseConfigFile() throws Exception {
+        configReader.parseConfigFile(configFile);
+
+        assertEquals(5, uris.size());
+        for (String uri : uris)
+            assertEquals("", uri);
+
+        assertEquals(5, localNames.size());
+        for (String local : localNames)
+            assertEquals("", local);
+
+        assertEquals(5, qNames.size());
+        assertEquals("config", qNames.get(0));
+        assertEquals("element", qNames.get(1));
+        assertEquals("element", qNames.get(2));
+        assertEquals("product-type", qNames.get(3));
+        assertEquals("product-type", qNames.get(4));
+
+        assertEquals(5, attributes.size());
+
+        assertEquals(0, (int)attributes.get(0));
+        assertEquals(2, (int)attributes.get(1));
+        assertEquals(2, (int)attributes.get(2));
+        assertEquals(2, (int)attributes.get(3));
+        assertEquals(2, (int)attributes.get(4));
+    }
+
+}

Added: oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestProdTypePatternMetExtractor.java
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestProdTypePatternMetExtractor.java?rev=1628235&view=auto
==============================================================================
--- oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestProdTypePatternMetExtractor.java (added)
+++ oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/extractors/TestProdTypePatternMetExtractor.java Mon Sep 29 17:47:54 2014
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.metadata.extractors;
+
+import com.google.common.io.Files;
+import org.apache.commons.io.FileUtils;
+import org.apache.oodt.cas.metadata.Metadata;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.net.URL;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestProdTypePatternMetExtractor {
+
+    private ProdTypePatternMetExtractor extractor;
+    private File configFile;
+    private File configFile2;
+
+    private File tmpDir;
+    private File book1;
+    private File book2;
+    private File page1;
+    private File page2;
+    private File page1a;
+    private File page2a;
+
+    @Before
+    public void setup() throws Exception {
+        URL url = getClass().getResource("/org/apache/oodt/cas/metadata/product-type-patterns.xml");
+        configFile = new File(url.toURI());
+        extractor = new ProdTypePatternMetExtractor();
+        extractor.setConfigFile(configFile);
+
+        tmpDir = Files.createTempDir();
+        book1 = new File(tmpDir, "book-1234567890.txt");
+        book2 = new File(tmpDir, "book-0987654321.txt");
+        page1 = new File(tmpDir, "page-1234567890-111.txt");
+        page2 = new File(tmpDir, "page-0987654321-222.txt");
+        Files.touch(book1);
+        Files.touch(book2);
+        Files.touch(page1);
+        Files.touch(page2);
+
+        url = getClass().getResource("/org/apache/oodt/cas/metadata/product-type-patterns-2.xml");
+        configFile2 = new File(url.toURI());
+        page1a = new File(tmpDir, "page-111-1234567890.txt");
+        page2a = new File(tmpDir, "page-222-0987654321.txt");
+        Files.touch(page1a);
+        Files.touch(page2a);
+    }
+
+    @After
+    public void teardown() {
+        FileUtils.deleteQuietly(tmpDir);
+    }
+
+    @Test
+    public void testExtractMetadata() throws Exception {
+        Metadata met = extractor.extractMetadata(book1);
+        assertEquals(2, met.getAllKeys().size());
+        assertEquals("Book", met.getMetadata("ProductType"));
+        assertEquals("1234567890", met.getMetadata("ISBN"));
+
+        met = extractor.extractMetadata(book2);
+        assertEquals(2, met.getAllKeys().size());
+        assertEquals("Book", met.getMetadata("ProductType"));
+        assertEquals("0987654321", met.getMetadata("ISBN"));
+
+        met = extractor.extractMetadata(page1);
+        assertEquals(3, met.getAllKeys().size());
+        assertEquals("BookPage", met.getMetadata("ProductType"));
+        assertEquals("1234567890", met.getMetadata("ISBN"));
+        assertEquals("111", met.getMetadata("Page"));
+
+        met = extractor.extractMetadata(page2);
+        assertEquals(3, met.getAllKeys().size());
+        assertEquals("BookPage", met.getMetadata("ProductType"));
+        assertEquals("0987654321", met.getMetadata("ISBN"));
+        assertEquals("222", met.getMetadata("Page"));
+    }
+
+    @Test
+    public void testNewConfigFile() throws Exception {
+        // make sure that duplicate met entries do not exist when re-parsing a new config file
+        Metadata met = extractor.extractMetadata(book1, configFile);
+        assertEquals(2, met.getAllKeys().size());
+        assertEquals(1, met.getAllMetadata("ProductType").size());
+        assertEquals(1, met.getAllMetadata("ISBN").size());
+
+        met = extractor.extractMetadata(book2, configFile);
+        assertEquals(2, met.getAllKeys().size());
+        assertEquals(1, met.getAllMetadata("ProductType").size());
+        assertEquals(1, met.getAllMetadata("ISBN").size());
+
+        met = extractor.extractMetadata(page1, configFile);
+        assertEquals(3, met.getAllKeys().size());
+        assertEquals(1, met.getAllMetadata("ProductType").size());
+        assertEquals(1, met.getAllMetadata("ISBN").size());
+        assertEquals(1, met.getAllMetadata("Page").size());
+
+        met = extractor.extractMetadata(page2, configFile);
+        assertEquals(3, met.getAllKeys().size());
+        assertEquals(1, met.getAllMetadata("ProductType").size());
+        assertEquals(1, met.getAllMetadata("ISBN").size());
+        assertEquals(1, met.getAllMetadata("Page").size());
+    }
+
+    @Test
+    public void testElementDeclarationOrder() throws Exception {
+        // the relative order of element declarations shouldn't matter
+        extractor.setConfigFile(configFile2);
+
+        Metadata met = extractor.extractMetadata(book1);
+        assertEquals(2, met.getAllKeys().size());
+        assertEquals("Book", met.getMetadata("ProductType"));
+        assertEquals("1234567890", met.getMetadata("ISBN"));
+
+        met = extractor.extractMetadata(book2);
+        assertEquals(2, met.getAllKeys().size());
+        assertEquals("Book", met.getMetadata("ProductType"));
+        assertEquals("0987654321", met.getMetadata("ISBN"));
+
+        met = extractor.extractMetadata(page1a);
+        assertEquals(3, met.getAllKeys().size());
+        assertEquals("BookPage", met.getMetadata("ProductType"));
+        assertEquals("1234567890", met.getMetadata("ISBN"));
+        assertEquals("111", met.getMetadata("Page"));
+
+        met = extractor.extractMetadata(page2a);
+        assertEquals(3, met.getAllKeys().size());
+        assertEquals("BookPage", met.getMetadata("ProductType"));
+        assertEquals("0987654321", met.getMetadata("ISBN"));
+        assertEquals("222", met.getMetadata("Page"));
+    }
+}

Added: oodt/trunk/metadata/src/testdata/product-type-patterns-2.xml
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/testdata/product-type-patterns-2.xml?rev=1628235&view=auto
==============================================================================
--- oodt/trunk/metadata/src/testdata/product-type-patterns-2.xml (added)
+++ oodt/trunk/metadata/src/testdata/product-type-patterns-2.xml Mon Sep 29 17:47:54 2014
@@ -0,0 +1,12 @@
+<config>
+    <!-- <element> MUST be defined before <product-type> so their patterns can be resolved -->
+    <!-- name MUST be an element defined in elements.xml (also only upper and lower case alpha chars) -->
+    <!-- regexp MUST be valid input to java.util.regex.Pattern.compile() -->
+    <element name="ISBN" regexp="[0-9]{10}"/>
+    <element name="Page" regexp="[0-9]*"/>
+
+    <!-- name MUST be a ProductType name defined in product-types.xml -->
+    <!-- metadata elements inside brackets MUST be mapped to the ProductType, as defined in product-type-element-map.xml -->
+    <product-type name="Book" template="book-[ISBN].txt"/>
+    <product-type name="BookPage" template="page-[Page]-[ISBN].txt"/>
+</config>

Added: oodt/trunk/metadata/src/testdata/product-type-patterns.xml
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/testdata/product-type-patterns.xml?rev=1628235&view=auto
==============================================================================
--- oodt/trunk/metadata/src/testdata/product-type-patterns.xml (added)
+++ oodt/trunk/metadata/src/testdata/product-type-patterns.xml Mon Sep 29 17:47:54 2014
@@ -0,0 +1,12 @@
+<config>
+    <!-- <element> MUST be defined before <product-type> so their patterns can be resolved -->
+    <!-- name MUST be an element defined in elements.xml (also only upper and lower case alpha chars) -->
+    <!-- regexp MUST be valid input to java.util.regex.Pattern.compile() -->
+    <element name="ISBN" regexp="[0-9]{10}"/>
+    <element name="Page" regexp="[0-9]*"/>
+
+    <!-- name MUST be a ProductType name defined in product-types.xml -->
+    <!-- metadata elements inside brackets MUST be mapped to the ProductType, as defined in product-type-element-map.xml -->
+    <product-type name="Book" template="book-[ISBN].txt"/>
+    <product-type name="BookPage" template="page-[ISBN]-[Page].txt"/>
+</config>



Mime
View raw message