Return-Path: X-Original-To: apmail-manifoldcf-commits-archive@www.apache.org Delivered-To: apmail-manifoldcf-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 3F1E017FFC for ; Sat, 16 May 2015 11:34:16 +0000 (UTC) Received: (qmail 64172 invoked by uid 500); 16 May 2015 11:34:16 -0000 Delivered-To: apmail-manifoldcf-commits-archive@manifoldcf.apache.org Received: (qmail 64121 invoked by uid 500); 16 May 2015 11:34:15 -0000 Mailing-List: contact commits-help@manifoldcf.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@manifoldcf.apache.org Delivered-To: mailing list commits@manifoldcf.apache.org Received: (qmail 64103 invoked by uid 99); 16 May 2015 11:34:15 -0000 Received: from eris.apache.org (HELO hades.apache.org) (140.211.11.105) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 16 May 2015 11:34:15 +0000 Received: from hades.apache.org (localhost [127.0.0.1]) by hades.apache.org (ASF Mail Server at hades.apache.org) with ESMTP id A8635AC02E6 for ; Sat, 16 May 2015 11:34:15 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1679730 [2/2] - in /manifoldcf/trunk: ./ connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/ Date: Sat, 16 May 2015 11:34:15 -0000 To: commits@manifoldcf.apache.org From: kwright@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20150516113415.A8635AC02E6@hades.apache.org> Modified: manifoldcf/trunk/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java?rev=1679730&r1=1679729&r2=1679730&view=diff ============================================================================== --- manifoldcf/trunk/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java (original) +++ manifoldcf/trunk/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxDocument.java Sat May 16 11:34:14 2015 @@ -51,316 +51,316 @@ import java.util.*; */ public class SearchBloxDocument { - static final String API_KEY = "apikey"; - static final String SEARCHBLOX_COLLECTION = "collection"; - static final String DATE_FORMAT = "dd MMMM yyyy HH:mm:ss z"; - - public enum IndexingFormat { - JSON, XML - } - - public enum DocumentAction { - ADD_UPDATE, DELETE, STATUS, CREATE, CLEAR - } - static final List xmlElements= Lists.newArrayList("searchblox","document","url","title","keywords","content","description","lastmodified","size", - "alpha","contenttype","category","meta","uid"); - - static final String COLNAME_ATTRIBUTE = "colname"; - static final String APIKEY_ATTRIBUTE = "apikey"; - static final String NAME_ATTRIBUTE = "name"; - static final String UID_ATTRIBUTE = "uid"; - static final String BOOST_ATTRIBUTE = "boost"; - - private Multimap data_fields = HashMultimap.create(); - - /** - * API key accessible in the SearchBlox Admin Console. - */ - String apiKey; - - /** - * Name of the Custom collection - */ - String colName; - - /** - * unique identifer for a document (default when unassigned is url location) - */ - String uid; - - public SearchBloxDocument(String apikey) { - this.apiKey = apikey; - } - - public SearchBloxDocument(String apikey, String documentURI, - RepositoryDocument rd, Map> args) { - this(apikey); - SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_FORMAT); - - this.uid = documentURI; - this.colName = args.get(SEARCHBLOX_COLLECTION).get(0); - - Date date = rd.getModifiedDate(); - if(date!=null){ - data_fields.put(xmlElements.get(7), - dateFormat.format(rd.getModifiedDate())); - } - - // content - String content = ""; - try { - if (rd.getField(xmlElements.get(5)) != null) - content = (String) rd.getField(xmlElements.get(5))[0]; - else - content = this.buildString(rd.getBinaryStream()); - } catch (IOException e) { - Logging.connectors - .error("[Parsing Content]Content is not text plain, verify you are properly using Apache Tika Transformer", - e); - } - data_fields.put(xmlElements.get(5), this.clean(content)); - - // Content Type - data_fields.put(xmlElements.get(10), rd.getMimeType()); - - // Size - data_fields.put(xmlElements.get(8), "" + rd.getBinaryLength()); - - // Boosting - for(String boostId:args.keySet()){ - if(boostId.endsWith("_boost")){ - List argBoost = args.get(boostId); - if(argBoost!=null && !argBoost.isEmpty()) - data_fields.put(boostId,argBoost.get(0)); - } - } - - // Metadata - Multimap metadata = HashMultimap.create(); - Iterator it = rd.getFields(); - while (it.hasNext()) { - String name = it.next(); - try { - String[] values = rd.getFieldAsStrings(name); - for (String value : values) { - String key = name.toLowerCase(); - int indexOf = xmlElements.indexOf(key); - if(indexOf != 5) - if (indexOf != -1 && - indexOf != 0 && - indexOf != 7 && - indexOf != 8) { - data_fields.put(key, value); - } else - metadata.put(name, value); - } - } catch (IOException e) { - Logging.connectors.error( - "[Getting Field Values]Impossible to read value for metadata " - + name, e); - } - } - - // ACLS must be stored as metadata, as Searchblox use that construct to index custom named fields - //the approach has been implemented and tested live - Iterator aclTypes = rd.securityTypesIterator(); - while (aclTypes.hasNext()) { - String aclType = aclTypes.next(); - String[] allow_tokens = rd.getSecurityACL(aclType); - for (String token : allow_tokens) - metadata.put(aclType+"_allow", token); - String[] deny_tokens = rd.getSecurityDenyACL(aclType); - for (String token : deny_tokens) - metadata.put(aclType+"_deny", token); - } - data_fields.put(xmlElements.get(12), metadata); - } - - /** - * Clean a String from html tags or break lines - * @param content - * @return - */ - private String clean(String content) { - content = content.replaceAll("(\r\n|\n)", " "); - String cleanContent= Jsoup.parseBodyFragment(content).text(); - return cleanContent; - } - - private String buildString(InputStream binaryStream) throws IOException { - StringWriter writer = new StringWriter(); - IOUtils.copy(binaryStream, writer, "UTF-8"); - return writer.toString(); - } - - public String toString(IndexingFormat format, DocumentAction action) - throws SearchBloxException { - if(format == IndexingFormat.XML) - return toStringXML(action); - else - return toStringJSON(action); - } - - private String toStringJSON(DocumentAction action) throws SearchBloxException { - JSONObject result = new JSONObject(); - if (apiKey == null) - throw new SearchBloxException( - "The API Key for accessing SearchBlox Server CAN'T be NULL"); - try { - result.put(APIKEY_ATTRIBUTE, apiKey); - - JSONObject document = new JSONObject(); - if (colName == null) - throw new SearchBloxException( - "The Collection Name of the SearchBlox Server CAN'T be NULL"); - document.put(COLNAME_ATTRIBUTE, colName); - document.put(UID_ATTRIBUTE, uid); - - if(action == DocumentAction.ADD_UPDATE){ - for(String element:xmlElements){ - if (!element.equals(xmlElements.get(12))) { - Collection values = data_fields.get(element); - if (values!=null && values.size()>0) { - Object next = values.iterator() - .next(); - String value =(String) next; - if (value != null && !value.isEmpty()) { - if(element.equals("keywords")) - document.put(element, StringUtils.join(values, ',')); - else - document.put(element, value); -// } - - } - } - } - } - - // Metadata - Collection metadataSet = data_fields - .get(xmlElements.get(12)); - JSONObject metaObject = new JSONObject(); - if(metadataSet!=null && metadataSet.size()>0){ - Multimap metadata = (Multimap) metadataSet.iterator().next(); - if (metadata != null && !metadata.isEmpty()) { - for (String name : metadata.keySet()){ - JSONArray nextMetadata = new JSONArray(); - for (String value : metadata.get(name)) { - nextMetadata.put(value); - } - metaObject.put(name, nextMetadata); - } - } - } - document.put(xmlElements.get(12), metaObject); - } - - result.put(xmlElements.get(1), document); - - } catch (JSONException e) { - throw new SearchBloxException("Error while building Document JSON object", e); - } - return result.toString(); - } - - private String toStringXML(DocumentAction action) throws SearchBloxException{ - Document doc = null; - try { - doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() - .newDocument(); - - } catch (ParserConfigurationException e) { - throw new SearchBloxException(e); - } - - // Document Base Data - Element root = doc.createElement(xmlElements.get(0)); - if (apiKey == null) - throw new SearchBloxException( - "The API Key for accessing SearchBlox Server CAN'T be NULL"); - root.setAttribute(APIKEY_ATTRIBUTE, apiKey); - doc.appendChild(root); - Element document = doc.createElement(xmlElements.get(1)); - if (colName == null) - throw new SearchBloxException( - "The Collection Name of the SearchBlox Server CAN'T be NULL"); - document.setAttribute(COLNAME_ATTRIBUTE, colName); - if(action == DocumentAction.DELETE) - document.setAttribute(UID_ATTRIBUTE,uid); - root.appendChild(document); - - if (action == DocumentAction.ADD_UPDATE) { - // Uid - if (uid != null && !uid.isEmpty()) { - Element uidElement = doc.createElement(xmlElements.get(13)); - uidElement.setTextContent(uid); - document.appendChild(uidElement); - } - - for(String element:xmlElements){ - if (!element.equals(xmlElements.get(12))) { - Collection values = data_fields.get(element); - if (values!=null && values.size()>0) { - Object next = values.iterator() - .next(); - String value =(String) next; - if (value != null && !value.isEmpty()) { - Element eValue = doc.createElement(element); - if(element.equals("keywords")) - eValue.setTextContent(StringUtils.join(values, ',')); - else - eValue.setTextContent(value); - Collection boostElement = data_fields - .get(element + "_boost"); - if(boostElement!=null && boostElement.size()>0){ - String value_boost = (String) boostElement.iterator() - .next(); - eValue.setAttribute(BOOST_ATTRIBUTE, "" + value_boost); - } - document.appendChild(eValue); - } - } - } - } - - // Metadata - Collection metadataSet = data_fields - .get(xmlElements.get(12)); - if(metadataSet!=null && metadataSet.size()>0){ - Multimap metadata = (Multimap) metadataSet.iterator().next(); - if (metadata != null && !metadata.isEmpty()) { - for (String name : metadata.keySet()) - for (String value : metadata.get(name)) { - Element metaElement = doc.createElement(xmlElements.get(12)); - metaElement.setAttribute(NAME_ATTRIBUTE, name); - metaElement.setTextContent(value); - document.appendChild(metaElement); - } - } } - } - - return getStringFromDocument(doc); - } - - /** - *

Transform a {@code Document} to its XML string representation

- * @param doc the document to transform - * @return the document in the XML-String format - */ - private String getStringFromDocument(Document doc) { - try { - DOMSource domSource = new DOMSource(doc); - StringWriter writer = new StringWriter(); - StreamResult result = new StreamResult(writer); - TransformerFactory tf = TransformerFactory.newInstance(); - Transformer transformer = tf.newTransformer(); - // transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no"); - transformer.transform(domSource, result); - return writer.toString(); - } catch (TransformerException ex) { - ex.printStackTrace(); - return null; - } + static final String API_KEY = "apikey"; + static final String SEARCHBLOX_COLLECTION = "collection"; + static final String DATE_FORMAT = "dd MMMM yyyy HH:mm:ss z"; + + public enum IndexingFormat { + JSON, XML + } + + public enum DocumentAction { + ADD_UPDATE, DELETE, STATUS, CREATE, CLEAR + } + static final List xmlElements= Lists.newArrayList("searchblox","document","url","title","keywords","content","description","lastmodified","size", + "alpha","contenttype","category","meta","uid"); + + static final String COLNAME_ATTRIBUTE = "colname"; + static final String APIKEY_ATTRIBUTE = "apikey"; + static final String NAME_ATTRIBUTE = "name"; + static final String UID_ATTRIBUTE = "uid"; + static final String BOOST_ATTRIBUTE = "boost"; + + private Multimap data_fields = HashMultimap.create(); + + /** + * API key accessible in the SearchBlox Admin Console. + */ + String apiKey; + + /** + * Name of the Custom collection + */ + String colName; + + /** + * unique identifer for a document (default when unassigned is url location) + */ + String uid; + + public SearchBloxDocument(String apikey) { + this.apiKey = apikey; + } + + public SearchBloxDocument(String apikey, String documentURI, + RepositoryDocument rd, Map> args) { + this(apikey); + SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_FORMAT); + + this.uid = documentURI; + this.colName = args.get(SEARCHBLOX_COLLECTION).get(0); + + Date date = rd.getModifiedDate(); + if(date!=null){ + data_fields.put(xmlElements.get(7), + dateFormat.format(rd.getModifiedDate())); + } + + // content + String content = ""; + try { + if (rd.getField(xmlElements.get(5)) != null) + content = (String) rd.getField(xmlElements.get(5))[0]; + else + content = this.buildString(rd.getBinaryStream()); + } catch (IOException e) { + Logging.connectors + .error("[Parsing Content]Content is not text plain, verify you are properly using Apache Tika Transformer", + e); + } + data_fields.put(xmlElements.get(5), this.clean(content)); + + // Content Type + data_fields.put(xmlElements.get(10), rd.getMimeType()); + + // Size + data_fields.put(xmlElements.get(8), "" + rd.getBinaryLength()); + + // Boosting + for(String boostId:args.keySet()){ + if(boostId.endsWith("_boost")){ + List argBoost = args.get(boostId); + if(argBoost!=null && !argBoost.isEmpty()) + data_fields.put(boostId,argBoost.get(0)); + } + } + + // Metadata + Multimap metadata = HashMultimap.create(); + Iterator it = rd.getFields(); + while (it.hasNext()) { + String name = it.next(); + try { + String[] values = rd.getFieldAsStrings(name); + for (String value : values) { + String key = name.toLowerCase(); + int indexOf = xmlElements.indexOf(key); + if(indexOf != 5) + if (indexOf != -1 && + indexOf != 0 && + indexOf != 7 && + indexOf != 8) { + data_fields.put(key, value); + } else + metadata.put(name, value); + } + } catch (IOException e) { + Logging.connectors.error( + "[Getting Field Values]Impossible to read value for metadata " + + name, e); + } + } + + // ACLS must be stored as metadata, as Searchblox use that construct to index custom named fields + //the approach has been implemented and tested live + Iterator aclTypes = rd.securityTypesIterator(); + while (aclTypes.hasNext()) { + String aclType = aclTypes.next(); + String[] allow_tokens = rd.getSecurityACL(aclType); + for (String token : allow_tokens) + metadata.put(aclType+"_allow", token); + String[] deny_tokens = rd.getSecurityDenyACL(aclType); + for (String token : deny_tokens) + metadata.put(aclType+"_deny", token); + } + data_fields.put(xmlElements.get(12), metadata); + } + + /** + * Clean a String from html tags or break lines + * @param content + * @return + */ + private String clean(String content) { + content = content.replaceAll("(\r\n|\n)", " "); + String cleanContent= Jsoup.parseBodyFragment(content).text(); + return cleanContent; + } + + private String buildString(InputStream binaryStream) throws IOException { + StringWriter writer = new StringWriter(); + IOUtils.copy(binaryStream, writer, "UTF-8"); + return writer.toString(); + } + + public String toString(IndexingFormat format, DocumentAction action) + throws SearchBloxException { + if(format == IndexingFormat.XML) + return toStringXML(action); + else + return toStringJSON(action); + } + + private String toStringJSON(DocumentAction action) throws SearchBloxException { + JSONObject result = new JSONObject(); + if (apiKey == null) + throw new SearchBloxException( + "The API Key for accessing SearchBlox Server CAN'T be NULL"); + try { + result.put(APIKEY_ATTRIBUTE, apiKey); + + JSONObject document = new JSONObject(); + if (colName == null) + throw new SearchBloxException( + "The Collection Name of the SearchBlox Server CAN'T be NULL"); + document.put(COLNAME_ATTRIBUTE, colName); + document.put(UID_ATTRIBUTE, uid); + + if(action == DocumentAction.ADD_UPDATE){ + for(String element:xmlElements){ + if (!element.equals(xmlElements.get(12))) { + Collection values = data_fields.get(element); + if (values!=null && values.size()>0) { + Object next = values.iterator() + .next(); + String value =(String) next; + if (value != null && !value.isEmpty()) { + if(element.equals("keywords")) + document.put(element, StringUtils.join(values, ',')); + else + document.put(element, value); + + } + } + } + } + + // Metadata + Collection metadataSet = data_fields + .get(xmlElements.get(12)); + JSONObject metaObject = new JSONObject(); + if(metadataSet!=null && metadataSet.size()>0){ + Multimap metadata = (Multimap) metadataSet.iterator().next(); + if (metadata != null && !metadata.isEmpty()) { + for (String name : metadata.keySet()){ + JSONArray nextMetadata = new JSONArray(); + for (String value : metadata.get(name)) { + nextMetadata.put(value); + } + metaObject.put(name, nextMetadata); + } + } + } + document.put(xmlElements.get(12), metaObject); + } + + result.put(xmlElements.get(1), document); + + } catch (JSONException e) { + throw new SearchBloxException("Error while building Document JSON object", e); + } + return result.toString(); + } + + private String toStringXML(DocumentAction action) throws SearchBloxException{ + Document doc = null; + try { + doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument(); + + } catch (ParserConfigurationException e) { + throw new SearchBloxException(e); + } + + // Document Base Data + Element root = doc.createElement(xmlElements.get(0)); + if (apiKey == null) + throw new SearchBloxException( + "The API Key for accessing SearchBlox Server CAN'T be NULL"); + root.setAttribute(APIKEY_ATTRIBUTE, apiKey); + doc.appendChild(root); + Element document = doc.createElement(xmlElements.get(1)); + if (colName == null) + throw new SearchBloxException( + "The Collection Name of the SearchBlox Server CAN'T be NULL"); + document.setAttribute(COLNAME_ATTRIBUTE, colName); + if(action == DocumentAction.DELETE) + document.setAttribute(UID_ATTRIBUTE,uid); + root.appendChild(document); + + if (action == DocumentAction.ADD_UPDATE) { + // Uid + if (uid != null && !uid.isEmpty()) { + Element uidElement = doc.createElement(xmlElements.get(13)); + uidElement.setTextContent(uid); + document.appendChild(uidElement); + } + + for(String element:xmlElements){ + if (!element.equals(xmlElements.get(12))) { + Collection values = data_fields.get(element); + if (values!=null && values.size()>0) { + Object next = values.iterator() + .next(); + String value =(String) next; + if (value != null && !value.isEmpty()) { + Element eValue = doc.createElement(element); + if(element.equals("keywords")) + eValue.setTextContent(StringUtils.join(values, ',')); + else + eValue.setTextContent(value); + Collection boostElement = data_fields + .get(element + "_boost"); + if(boostElement!=null && boostElement.size()>0){ + String value_boost = (String) boostElement.iterator() + .next(); + eValue.setAttribute(BOOST_ATTRIBUTE, "" + value_boost); + } + document.appendChild(eValue); + } + } + } + } + + // Metadata + Collection metadataSet = data_fields + .get(xmlElements.get(12)); + if(metadataSet!=null && metadataSet.size()>0){ + Multimap metadata = (Multimap) metadataSet.iterator().next(); + if (metadata != null && !metadata.isEmpty()) { + for (String name : metadata.keySet()) + for (String value : metadata.get(name)) { + Element metaElement = doc.createElement(xmlElements.get(12)); + metaElement.setAttribute(NAME_ATTRIBUTE, name); + metaElement.setTextContent(value); + document.appendChild(metaElement); + } + } + } + } + + return getStringFromDocument(doc); + } + + /** + *

Transform a {@code Document} to its XML string representation

+ * @param doc the document to transform + * @return the document in the XML-String format + */ + private String getStringFromDocument(Document doc) { + try { + DOMSource domSource = new DOMSource(doc); + StringWriter writer = new StringWriter(); + StreamResult result = new StreamResult(writer); + TransformerFactory tf = TransformerFactory.newInstance(); + Transformer transformer = tf.newTransformer(); + // transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no"); + transformer.transform(domSource, result); + return writer.toString(); + } catch (TransformerException ex) { + ex.printStackTrace(); + return null; + } - } + } } Modified: manifoldcf/trunk/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxException.java URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxException.java?rev=1679730&r1=1679729&r2=1679730&view=diff ============================================================================== --- manifoldcf/trunk/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxException.java (original) +++ manifoldcf/trunk/connectors/searchblox/connector/src/main/java/org/apache/manifoldcf/agents/output/searchblox/SearchBloxException.java Sat May 16 11:34:14 2015 @@ -21,19 +21,19 @@ package org.apache.manifoldcf.agents.out * @author Rafa Haro */ public class SearchBloxException - extends Exception { + extends Exception { - private static final long serialVersionUID = -6792055510634993398L; + private static final long serialVersionUID = -6792055510634993398L; - public SearchBloxException(String reason, Throwable cause) { - super(reason, cause); - } + public SearchBloxException(String reason, Throwable cause) { + super(reason, cause); + } - public SearchBloxException(String reason) { - super(reason); - } + public SearchBloxException(String reason) { + super(reason); + } - public SearchBloxException(Throwable cause) { - super(cause); - } + public SearchBloxException(Throwable cause) { + super(cause); + } }