lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From uschind...@apache.org
Subject svn commit: r1712629 - in /lucene/dev/trunk/solr: ./ contrib/extraction/src/java/org/apache/solr/handler/extraction/ contrib/extraction/src/test-files/extraction/ contrib/extraction/src/test-files/extraction/solr/collection1/conf/ contrib/extraction/sr...
Date Wed, 04 Nov 2015 20:13:41 GMT
Author: uschindler
Date: Wed Nov  4 20:13:40 2015
New Revision: 1712629

URL: http://svn.apache.org/viewvc?rev=1712629&view=rev
Log:
SOLR-8166: Introduce possibility to configure ParseContext in ExtractingRequestHandler/ExtractingDocumentLoader

Added:
    lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
  (with props)
    lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf
  (with props)
    lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml
  (with props)
    lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
  (with props)
Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
    lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
    lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
    lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1712629&r1=1712628&r2=1712629&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Nov  4 20:13:40 2015
@@ -195,6 +195,10 @@ New Features
 
 * SOLR-8139: Create/delete fields/dynamic fields/copy fields via schema tab on Angular UI
 
+* SOLR-8166: Introduce possibility to configure ParseContext in
+  ExtractingRequestHandler/ExtractingDocumentLoader (Andriy Binetsky
+  via Uwe Schindler)
+
 Bug Fixes
 ----------------------
 

Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1712629&r1=1712628&r2=1712629&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Nov  4 20:13:40 2015
@@ -91,13 +91,16 @@ public class ExtractingDocumentLoader ex
   private final AddUpdateCommand templateAdd;
 
   protected TikaConfig config;
+  protected ParseContextConfig parseContextConfig;
   protected SolrContentHandlerFactory factory;
 
   public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
-                           TikaConfig config, SolrContentHandlerFactory factory) {
+                           TikaConfig config, ParseContextConfig parseContextConfig,
+                                  SolrContentHandlerFactory factory) {
     this.params = req.getParams();
     this.core = req.getCore();
     this.config = config;
+    this.parseContextConfig = parseContextConfig;
     this.processor = processor;
 
     templateAdd = new AddUpdateCommand(req);
@@ -199,7 +202,10 @@ public class ExtractingDocumentLoader ex
 
         try{
           //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
-          ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+          ParseContext context = parseContextConfig.create();
+
+
+          context.set(Parser.class, parser);
           context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
 
           // Password handling

Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java?rev=1712629&r1=1712628&r2=1712629&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java Wed Nov  4 20:13:40 2015
@@ -48,10 +48,12 @@ public class ExtractingRequestHandler ex
 
   private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);
 
+  public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
   public static final String CONFIG_LOCATION = "tika.config";
   public static final String DATE_FORMATS = "date.formats";
 
   protected TikaConfig config;
+  protected ParseContextConfig parseContextConfig;
 
 
   protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
@@ -79,6 +81,16 @@ public class ExtractingRequestHandler ex
           throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
       }
+
+      String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
+      if (parseContextConfigLoc != null) {
+        try {
+          parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
+        } catch (Exception e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        }
+      }
+
       NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
       if (configDateFormats != null && configDateFormats.size() > 0) {
         dateFormats = new HashSet<>();
@@ -97,6 +109,9 @@ public class ExtractingRequestHandler ex
         throw new SolrException(ErrorCode.SERVER_ERROR, e);
       }
     }
+    if (parseContextConfig == null) {
+      parseContextConfig = new ParseContextConfig();
+    }
     factory = createFactory();
   }
 
@@ -111,7 +126,7 @@ public class ExtractingRequestHandler ex
 
   @Override
   protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
-    return new ExtractingDocumentLoader(req, processor, config, factory);
+    return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
   }
 
   // ////////////////////// SolrInfoMBeans methods //////////////////////

Added: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java?rev=1712629&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java (added)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java Wed Nov  4 20:13:40 2015
@@ -0,0 +1,114 @@
+package org.apache.solr.handler.extraction;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import java.beans.BeanInfo;
+import java.beans.Introspector;
+import java.beans.PropertyDescriptor;
+import java.beans.PropertyEditor;
+import java.beans.PropertyEditorManager;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.tika.parser.ParseContext;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+public class ParseContextConfig {
+  private final Map<Class<?>, Object> entries = new HashMap<>();
+
+  /** Creates an empty Config without any settings (used as placeholder). */
+  public ParseContextConfig() {
+  }
+
+  /** Creates a {@code ParseContextConfig} from the given XML DOM element. */
+  public ParseContextConfig(SolrResourceLoader resourceLoader, Element element) throws Exception {
+    extract(element, resourceLoader);
+  }
+
+  /** Creates a {@code ParseContextConfig} from the given XML file, loaded from the given {@link SolrResourceLoader}. */
+  public ParseContextConfig(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
+    this(resourceLoader, loadConfigFile(resourceLoader, parseContextConfigLoc).getDocumentElement());
+  }
+  
+  private static Document loadConfigFile(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
+    try (InputStream in = resourceLoader.openResource(parseContextConfigLoc)) {
+      return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in, parseContextConfigLoc);
+    }
+  }
+
+  private void extract(Element element, SolrResourceLoader loader) throws Exception {
+    final NodeList xmlEntries = element.getElementsByTagName("entry");
+    for (int i = 0, c1 = xmlEntries.getLength(); i < c1; i++) {
+      final NamedNodeMap xmlEntryAttributes = xmlEntries.item(i).getAttributes();
+      final String className = xmlEntryAttributes.getNamedItem("class").getNodeValue();
+      final String implementationName = xmlEntryAttributes.getNamedItem("impl").getNodeValue();
+
+      final NodeList xmlProperties = ((Element)xmlEntries.item(i)).getElementsByTagName("property");
+
+      final Class<?> interfaceClass = loader.findClass(className, Object.class);
+      final BeanInfo beanInfo = Introspector.getBeanInfo(interfaceClass, Introspector.IGNORE_ALL_BEANINFO);
+      
+      final HashMap<String, PropertyDescriptor> descriptorMap = new HashMap<>();
+      for (final PropertyDescriptor pd : beanInfo.getPropertyDescriptors()) {
+        descriptorMap.put(pd.getName(), pd);
+      }
+
+      final Object instance = loader.newInstance(implementationName, Object.class);
+      if (!interfaceClass.isInstance(instance)) {
+        throw new IllegalArgumentException("Implementation class does not extend " + interfaceClass.getName());
+      }
+
+      for (int j = 0, c2 = xmlProperties.getLength(); j < c2; j++) {
+        final Node xmlProperty = xmlProperties.item(j);
+        final NamedNodeMap xmlPropertyAttributes = xmlProperty.getAttributes();
+
+        final String propertyName = xmlPropertyAttributes.getNamedItem("name").getNodeValue();
+        final String propertyValue = xmlPropertyAttributes.getNamedItem("value").getNodeValue();
+
+        final PropertyDescriptor propertyDescriptor = descriptorMap.get(propertyName);
+        propertyDescriptor.getWriteMethod().invoke(instance, getValueFromString(propertyDescriptor.getPropertyType(), propertyValue));
+      }
+
+      entries.put(interfaceClass, instance);
+    }
+  }
+
+  private Object getValueFromString(Class<?> targetType, String text) {
+    final PropertyEditor editor = PropertyEditorManager.findEditor(targetType);
+    editor.setAsText(text);
+    return editor.getValue();
+  }
+
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  public ParseContext create() {
+    final ParseContext result = new ParseContext();
+
+    for (Map.Entry<Class<?>, Object> entry : entries.entrySet()){
+      result.set((Class) entry.getKey(), entry.getValue());
+    }
+
+    return result;
+  }
+}

Added: lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf?rev=1712629&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml?rev=1712629&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml (added)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml Wed Nov  4 20:13:40 2015
@@ -0,0 +1,22 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<entries>
+  <entry class="org.apache.tika.parser.pdf.PDFParserConfig" impl="org.apache.tika.parser.pdf.PDFParserConfig">
+    <property name="extractInlineImages" value="true"/>
+  </entry>
+</entries>

Modified: lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml?rev=1712629&r1=1712628&r2=1712629&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml Wed Nov  4 20:13:40 2015
@@ -185,7 +185,9 @@
     </lst>
   </requestHandler>
 
-  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"/>
+  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
+    <str name="parseContext.config">parseContext.xml</str>
+  </requestHandler>
 
   <requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
     <lst name="defaults">

Modified: lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1712629&r1=1712628&r2=1712629&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Wed Nov  4 20:13:40 2015
@@ -657,6 +657,28 @@ public class ExtractingRequestHandlerTes
   }
 
   @Test
+  public void testPdfWithImages() throws Exception {
+    //Tests possibility to configure ParseContext (by example to extract embedded images from pdf)
+    loadLocal("extraction/pdf-with-image.pdf",
+        "fmap.created", "extractedDate",
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator",
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfWithImage",
+        "resource.name", "pdf-with-image.pdf",
+        "resource.password", "solrRules",
+        "fmap.Last-Modified", "extractedDate");
+
+    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
+  }
+
+  @Test
   public void testPasswordProtected() throws Exception {
     // PDF, Passwords from resource.password
     loadLocal("extraction/encrypted-password-is-solrRules.pdf",
@@ -705,7 +727,7 @@ public class ExtractingRequestHandlerTes
 
     // DOCX, Passwords from file
     loadLocal("extraction/password-is-Word2010.docx", 
-        "fmap.created", "extractedDate", 
+        "fmap.created", "extractedDate",
         "fmap.producer", "extractedProducer",
         "fmap.creator", "extractedCreator", 
         "fmap.Keywords", "extractedKeywords",

Added: lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java?rev=1712629&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java (added)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java Wed Nov  4 20:13:40 2015
@@ -0,0 +1,53 @@
+package org.apache.solr.handler.extraction;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+public class ParseContextConfigTest extends SolrTestCaseJ4 {
+
+  public void  testAll() throws Exception {
+    Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
+    Element entries = document.createElement("entries");
+    Element entry = document.createElement("entry");
+
+
+    entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
+    entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
+
+    Element property = document.createElement("property");
+
+    property.setAttribute("name", "extractInlineImages");
+    property.setAttribute("value", "true");
+    entry.appendChild(property);
+    entries.appendChild(entry);
+
+    ParseContext parseContext = new ParseContextConfig(new SolrResourceLoader("."), entries).create();
+
+    PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+
+    assertEquals(true, pdfParserConfig.getExtractInlineImages());
+  }
+
+}



Mime
View raw message