manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1446153 - in /manifoldcf/trunk: ./ connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/
Date Thu, 14 Feb 2013 13:03:24 GMT
Author: kwright
Date: Thu Feb 14 13:03:24 2013
New Revision: 1446153

URL: http://svn.apache.org/r1446153
Log:
Revisit the fix for CONNECTORS-623.  Override the HttpSolrServer class instead, and force
multipart post, in order to guarantee delivery of content type, name, and length to SolrCell.

Added:
    manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
  (with props)
Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1446153&r1=1446152&r2=1446153&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Thu Feb 14 13:03:24 2013
@@ -3,6 +3,11 @@ $Id$
 
 ======================= 1.2-dev =====================
 
+CONNECTORS-623: A better fix, overriding the posting method for
+SolrJ and forcing multipart post.  Although still a hack, this should cause
+the behavior to work like MCF 1.0.1 now.
+(Karl Wright)
+
 CONNECTORS-645: Handle null data properly in worker thread.
 (Maciej Liżewski, Karl Wright)
 

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1446153&r1=1446152&r2=1446153&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java Thu Feb 14 13:03:24 2013
@@ -243,9 +243,7 @@ public class HttpPoster
     }
 
     String httpSolrServerUrl = protocol + "://" + server + ":" + port + location;
-    HttpSolrServer httpSolrServer = new HttpSolrServer(httpSolrServerUrl, localClient);
-    // For portability with older versions of Solr
-    httpSolrServer.setParser(new XMLResponseParser());
+    HttpSolrServer httpSolrServer = new ModifiedHttpSolrServer(httpSolrServerUrl, localClient, new XMLResponseParser());
     // Set the solrj instance we want to use
     solrServer = httpSolrServer;
   }
@@ -812,9 +810,10 @@ public class HttpPoster
               writeField(out,LITERAL+newFieldName,values);
             }
           }
-               
-          writeField(out,LITERAL+"stream_size",String.valueOf(length));
-          writeField(out,LITERAL+"stream_name",document.getFileName());
+             
+          // These are unnecessary now in the case of non-solrcloud setups, because we overrode the SolrJ posting method to use multipart.
+          //writeField(out,LITERAL+"stream_size",String.valueOf(length));
+          //writeField(out,LITERAL+"stream_name",document.getFileName());
           
           // Write the commitWithin parameter
           if (commitWithin != null)

Added: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java?rev=1446153&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java (added)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java Thu Feb 14 13:03:24 2013
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.manifoldcf.agents.output.solr;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.ConnectException;
+import java.net.SocketTimeoutException;
+import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.http.Header;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.NameValuePair;
+import org.apache.http.NoHttpResponseException;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.entity.UrlEncodedFormEntity;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.client.methods.HttpRequestBase;
+import org.apache.http.client.params.ClientPNames;
+import org.apache.http.conn.ClientConnectionManager;
+import org.apache.http.entity.InputStreamEntity;
+import org.apache.http.entity.mime.FormBodyPart;
+import org.apache.http.entity.mime.HttpMultipartMode;
+import org.apache.http.entity.mime.MultipartEntity;
+import org.apache.http.entity.mime.content.InputStreamBody;
+import org.apache.http.entity.mime.content.StringBody;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.message.BasicHeader;
+import org.apache.http.message.BasicNameValuePair;
+import org.apache.http.util.EntityUtils;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.client.solrj.ResponseParser;
+import org.apache.solr.client.solrj.SolrRequest;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.request.RequestWriter;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.client.solrj.response.UpdateResponse;
+import org.apache.solr.client.solrj.util.ClientUtils;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.NamedList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/** This class overrides and somewhat changes the behavior of the
+* SolrJ HttpSolrServer class.  The point of all this is simply to get
+* the right information to Tika.  When SolrJ uses GET or POST but not
+* multipart-post, it does not include multipart headers that Tika uses -
+* specifically, the name of the document and the length of the document.
+* Patches have been submitted to the SOLR ticket queue to address this
+* problem in a method-insensitive way, but so far there has been no sign that
+* the Solr team is interested in committing them.
+*/
+public class ModifiedHttpSolrServer extends HttpSolrServer
+{
+  // Here we duplicate all the private fields we need
+  
+  private static final String UTF_8 = "UTF-8";
+  private static final String DEFAULT_PATH = "/select";
+
+  private final HttpClient httpClient;
+  private boolean followRedirects = false;
+  private int maxRetries = 0;
+  private boolean useMultiPartPost = true;
+
+  public ModifiedHttpSolrServer(String baseURL, HttpClient client, ResponseParser parser) {
+    super(baseURL, client, parser);
+    httpClient = client;
+  }
+  
+  @Override
+  public NamedList<Object> request(final SolrRequest request,
+      final ResponseParser processor) throws SolrServerException, IOException {
+    HttpRequestBase method = null;
+    InputStream is = null;
+    SolrParams params = request.getParams();
+    Collection<ContentStream> streams = requestWriter.getContentStreams(request);
+    String path = requestWriter.getPath(request);
+    if (path == null || !path.startsWith("/")) {
+      path = DEFAULT_PATH;
+    }
+    
+    ResponseParser parser = request.getResponseParser();
+    if (parser == null) {
+      parser = this.parser;
+    }
+    
+    // The parser 'wt=' and 'version=' params are used instead of the original
+    // params
+    ModifiableSolrParams wparams = new ModifiableSolrParams(params);
+    if (parser != null) {
+      wparams.set(CommonParams.WT, parser.getWriterType());
+      wparams.set(CommonParams.VERSION, parser.getVersion());
+    }
+    if (invariantParams != null) {
+      wparams.add(invariantParams);
+    }
+    params = wparams;
+    
+    int tries = maxRetries + 1;
+    try {
+      while( tries-- > 0 ) {
+        // Note: since we aren't doing intermittent time keeping
+        // ourselves, the potential non-timeout latency could be as
+        // much as tries-times (plus scheduling effects) the given
+        // timeAllowed.
+        try {
+          if( SolrRequest.METHOD.GET == request.getMethod() ) {
+            if( streams != null ) {
+              throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "GET can't send streams!" );
+            }
+            method = new HttpGet( baseUrl + path + ClientUtils.toQueryString( params, false ) );
+          }
+          else if( SolrRequest.METHOD.POST == request.getMethod() ) {
+
+            String url = baseUrl + path;
+            boolean isMultipart = ( streams != null && streams.size() > 1 );
+
+            LinkedList<NameValuePair> postParams = new LinkedList<NameValuePair>();
+            if (streams == null || isMultipart) {
+              HttpPost post = new HttpPost(url);
+              post.setHeader("Content-Charset", "UTF-8");
+              if (!this.useMultiPartPost && !isMultipart) {
+                post.addHeader("Content-Type",
+                    "application/x-www-form-urlencoded; charset=UTF-8");
+              }
+
+              List<FormBodyPart> parts = new LinkedList<FormBodyPart>();
+              Iterator<String> iter = params.getParameterNamesIterator();
+              while (iter.hasNext()) {
+                String p = iter.next();
+                String[] vals = params.getParams(p);
+                if (vals != null) {
+                  for (String v : vals) {
+                    if (this.useMultiPartPost || isMultipart) {
+                      parts.add(new FormBodyPart(p, new StringBody(v, Charset.forName("UTF-8"))));
+                    } else {
+                      postParams.add(new BasicNameValuePair(p, v));
+                    }
+                  }
+                }
+              }
+
+              if (isMultipart) {
+                for (ContentStream content : streams) {
+                  String contentType = content.getContentType();
+                  if(contentType==null) {
+                    contentType = "application/octet-stream"; // default
+                  }
+                  parts.add(new FormBodyPart(content.getName(), 
+                       new InputStreamBody(
+                           content.getStream(), 
+                           contentType, 
+                           content.getName())));
+                }
+              }
+              
+              if (parts.size() > 0) {
+                MultipartEntity entity = new MultipartEntity(HttpMultipartMode.STRICT);
+                for(FormBodyPart p: parts) {
+                  entity.addPart(p);
+                }
+                post.setEntity(entity);
+              } else {
+                //not using multipart
+                post.setEntity(new UrlEncodedFormEntity(postParams, "UTF-8"));
+              }
+
+              method = post;
+            }
+            // If it has one stream, it is the post body, put the params in the URL
+            else {
+              String pstr = ClientUtils.toQueryString(params, false);
+              HttpPost post = new HttpPost(url + pstr);
+
+              // Single stream as body
+              // Using a loop just to get the first one
+              final ContentStream[] contentStream = new ContentStream[1];
+              for (ContentStream content : streams) {
+                contentStream[0] = content;
+                break;
+              }
+              if (contentStream[0] instanceof RequestWriter.LazyContentStream) {
+                post.setEntity(new InputStreamEntity(contentStream[0].getStream(), -1) {
+                  @Override
+                  public Header getContentType() {
+                    return new BasicHeader("Content-Type", contentStream[0].getContentType());
+                  }
+                  
+                  @Override
+                  public boolean isRepeatable() {
+                    return false;
+                  }
+                  
+                });
+              } else {
+                post.setEntity(new InputStreamEntity(contentStream[0].getStream(), -1) {
+                  @Override
+                  public Header getContentType() {
+                    return new BasicHeader("Content-Type", contentStream[0].getContentType());
+                  }
+                  
+                  @Override
+                  public boolean isRepeatable() {
+                    return false;
+                  }
+                });
+              }
+              method = post;
+            }
+          }
+          else {
+            throw new SolrServerException("Unsupported method: "+request.getMethod() );
+          }
+        }
+        catch( NoHttpResponseException r ) {
+          method = null;
+          if(is != null) {
+            is.close();
+          }
+          // If out of tries then just rethrow (as normal error).
+          if (tries < 1) {
+            throw r;
+          }
+        }
+      }
+    } catch (IOException ex) {
+      throw new SolrServerException("error reading streams", ex);
+    }
+    
+    // XXX client already has this set, is this needed?
+    method.getParams().setParameter(ClientPNames.HANDLE_REDIRECTS,
+        followRedirects);
+    method.addHeader("User-Agent", AGENT);
+    
+    InputStream respBody = null;
+    boolean shouldClose = true;
+    
+    try {
+      // Execute the method.
+      final HttpResponse response = httpClient.execute(method);
+      int httpStatus = response.getStatusLine().getStatusCode();
+      
+      // Read the contents
+      respBody = response.getEntity().getContent();
+      
+      // handle some http level checks before trying to parse the response
+      switch (httpStatus) {
+        case HttpStatus.SC_OK:
+        case HttpStatus.SC_BAD_REQUEST:
+        case HttpStatus.SC_CONFLICT:  // 409
+          break;
+        case HttpStatus.SC_MOVED_PERMANENTLY:
+        case HttpStatus.SC_MOVED_TEMPORARILY:
+          if (!followRedirects) {
+            throw new SolrServerException("Server at " + getBaseURL()
+                + " sent back a redirect (" + httpStatus + ").");
+          }
+          break;
+        default:
+          throw new SolrException(SolrException.ErrorCode.getErrorCode(httpStatus), "Server at " + getBaseURL()
+              + " returned non ok status:" + httpStatus + ", message:"
+              + response.getStatusLine().getReasonPhrase());
+          
+      }
+      if (processor == null) {
+        // no processor specified, return raw stream
+        NamedList<Object> rsp = new NamedList<Object>();
+        rsp.add("stream", respBody);
+        // Only case where stream should not be closed
+        shouldClose = false;
+        return rsp;
+      }
+      String charset = EntityUtils.getContentCharSet(response.getEntity());
+      NamedList<Object> rsp = processor.processResponse(respBody, charset);
+      if (httpStatus != HttpStatus.SC_OK) {
+        String reason = null;
+        try {
+          NamedList err = (NamedList) rsp.get("error");
+          if (err != null) {
+            reason = (String) err.get("msg");
+            // TODO? get the trace?
+          }
+        } catch (Exception ex) {}
+        if (reason == null) {
+          StringBuilder msg = new StringBuilder();
+          msg.append(response.getStatusLine().getReasonPhrase());
+          msg.append("\n\n");
+          msg.append("request: " + method.getURI());
+          reason = java.net.URLDecoder.decode(msg.toString(), UTF_8);
+        }
+        throw new SolrException(
+            SolrException.ErrorCode.getErrorCode(httpStatus), reason);
+      }
+      return rsp;
+    } catch (ConnectException e) {
+      throw new SolrServerException("Server refused connection at: "
+          + getBaseURL(), e);
+    } catch (SocketTimeoutException e) {
+      throw new SolrServerException(
+          "Timeout occured while waiting response from server at: "
+              + getBaseURL(), e);
+    } catch (IOException e) {
+      throw new SolrServerException(
+          "IOException occured when talking to server at: " + getBaseURL(), e);
+    } finally {
+      if (respBody != null && shouldClose) {
+        try {
+          respBody.close();
+        } catch (Throwable t) {} // ignore
+      }
+    }
+  }
+
+  @Override
+  public void setFollowRedirects(boolean followRedirects) {
+    super.setFollowRedirects(followRedirects);
+    this.followRedirects = followRedirects;
+  }
+
+}

Propchange: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
------------------------------------------------------------------------------
    svn:keywords = Id



Mime
View raw message