manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1831269 - in /manifoldcf/trunk/connectors/html-extractor/connector/src/main: java/org/apache/manifoldcf/agents/transformation/htmlextractor/ native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/ resources/org/apache/manif...
Date Wed, 09 May 2018 16:24:06 GMT
Author: kwright
Date: Wed May  9 16:24:06 2018
New Revision: 1831269

URL: http://svn.apache.org/viewvc?rev=1831269&view=rev
Log:
CONNECTORS-1500: Update the connector with latest code from the contributor

Modified:
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java?rev=1831269&r1=1831268&r2=1831269&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java Wed May  9 16:24:06 2018
@@ -38,9 +38,6 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 
-/** This connector works as a transformation connector, but does nothing other than logging.
- *
- */
 public class HtmlExtractor extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
 {
 
@@ -61,6 +58,18 @@ public class HtmlExtractor extends org.a
 	private static final String EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML = "editSpecification_HTML_Extractor.html";
 
 
+
+	protected static final int HTML_STRIP_NONE = 0;
+	protected static final int HTML_STRIP_ALL = 1;
+
+	protected static int html_strip_usage = HTML_STRIP_ALL;
+
+	public static final String NODE_KEEPMETADATA = "striphtml";
+	public static final String NODE_FILTEREMPTY = "filterEmpty";
+	public static final String ATTRIBUTE_SOURCE = "source";
+	public static final String ATTRIBUTE_TARGET = "target";
+	public static final String ATTRIBUTE_VALUE = "value";
+
 	/** We handle up to 64K in memory; after that we go to disk. */
 	protected static final long inMemoryMaximumFile = 65536;
 
@@ -119,9 +128,7 @@ public class HtmlExtractor extends org.a
 				length =  new Long(binaryLength);
 
 				/*
-
 				DestinationStorage ds;
-
 				if (document.getBinaryLength() <= inMemoryMaximumFile)
 				{
 					ds = new MemoryDestinationStorage((int)document.getBinaryLength());
@@ -142,7 +149,8 @@ public class HtmlExtractor extends org.a
 				 * 
 				 */
 				Hashtable<String,String> metadataExtracted = new Hashtable<String,String>();
-				metadataExtracted = JsoupProcessing.extractTextAndMetadataHtmlDocument(document.getBinaryStream(),sp.includeFilters.get(0), sp.excludeFilters);
+				
+				metadataExtracted = JsoupProcessing.extractTextAndMetadataHtmlDocument(document.getBinaryStream(),sp.includeFilters.get(0), sp.excludeFilters, sp.striphtml);
 				InputStream newStream = new ByteArrayInputStream(metadataExtracted.get("extractedDoc").getBytes(StandardCharsets.UTF_8));
 				int lenghtNewStream = newStream.available();
 				document.setBinary(newStream, lenghtNewStream);
@@ -482,6 +490,7 @@ public class HtmlExtractor extends org.a
 		final List<String> excludeFilters = new ArrayList<String>();
 
 
+		String striphtmlValue = "true";
 
 
 		// Fill in context
@@ -499,255 +508,301 @@ public class HtmlExtractor extends org.a
 				if (excludeFilter != null) {
 					excludeFilters.add(excludeFilter);
 				}
-			}
 
 
-		}
+			} else if (sn.getType().equals(NODE_KEEPMETADATA))
+			{
+				striphtmlValue = sn.getAttributeValue(ATTRIBUTE_VALUE);
+			}
 
-		paramMap.put("INCLUDEFILTERS", includeFilters);
-		paramMap.put("EXCLUDEFILTERS", excludeFilters);
+		
 	}
 
-	/**
-	 * Output the specification header section. This method is called in the head
-	 * section of a job page which has selected a pipeline connection of the
-	 * current type. Its purpose is to add the required tabs to the list, and to
-	 * output any javascript methods that might be needed by the job editing HTML.
-	 *
-	 * @param out
-	 *          is the output to which any HTML should be sent.
-	 * @param locale
-	 * @param os
-	 *          is the current pipeline specification for this connection.
-	 * @param connectionSequenceNumber
-	 *          is the unique number of this connection within the job.
-	 * @param tabsArray
-	 *          is an array of tab names. Add to this array any tab names that are
-	 *          specific to the connector.
-	 */
-	@Override
-	public void outputSpecificationHeader(final IHTTPOutput out, final Locale locale, final Specification os,
-			final int connectionSequenceNumber, final List<String> tabsArray) throws ManifoldCFException, IOException {
-		final Map<String, Object> paramMap = new HashMap<>();
-		paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
 
-		tabsArray.add(Messages.getString(locale, "HtmlExtractor.HtmlExtractorTabName"));
 
-		// Fill in the specification header map, using data from all tabs.
-		fillInHtmlExtractorSpecification(paramMap, os);
+	paramMap.put("INCLUDEFILTERS", includeFilters);
+	paramMap.put("EXCLUDEFILTERS", excludeFilters);
+	paramMap.put("HTMLTAGUSAGE", html_strip_usage);
+	paramMap.put("STRIPHTML",striphtmlValue);
 
-		Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, paramMap);
-	}
+}
 
-	/**
-	 * Output the specification body section. This method is called in the body
-	 * section of a job page which has selected a pipeline connection of the
-	 * current type. Its purpose is to present the required form elements for
-	 * editing. The coder can presume that the HTML that is output from this
-	 * configuration will be within appropriate <html>, <body>, and <form> tags.
-	 * The name of the form is "editjob".
-	 *
-	 * @param out
-	 *          is the output to which any HTML should be sent.
-	 * @param locale
-	 *          is the preferred local of the output.
-	 * @param os
-	 *          is the current pipeline specification for this job.
-	 * @param connectionSequenceNumber
-	 *          is the unique number of this connection within the job.
-	 * @param actualSequenceNumber
-	 *          is the connection within the job that has currently been selected.
-	 * @param tabName
-	 *          is the current tab name.
-	 */
-	@Override
-	public void outputSpecificationBody(final IHTTPOutput out, final Locale locale, final Specification os,
-			final int connectionSequenceNumber, final int actualSequenceNumber, final String tabName)
-					throws ManifoldCFException, IOException {
-		final Map<String, Object> paramMap = new HashMap<>();
-
-		// Set the tab name
-		paramMap.put("TABNAME", tabName);
-		paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
-		paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber));
+/**
+ * Output the specification header section. This method is called in the head
+ * section of a job page which has selected a pipeline connection of the
+ * current type. Its purpose is to add the required tabs to the list, and to
+ * output any javascript methods that might be needed by the job editing HTML.
+ *
+ * @param out
+ *          is the output to which any HTML should be sent.
+ * @param locale
+ * @param os
+ *          is the current pipeline specification for this connection.
+ * @param connectionSequenceNumber
+ *          is the unique number of this connection within the job.
+ * @param tabsArray
+ *          is an array of tab names. Add to this array any tab names that are
+ *          specific to the connector.
+ */
+@Override
+public void outputSpecificationHeader(final IHTTPOutput out, final Locale locale, final Specification os,
+		final int connectionSequenceNumber, final List<String> tabsArray) throws ManifoldCFException, IOException {
+	final Map<String, Object> paramMap = new HashMap<>();
+	paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
 
-		// Fill in the field mapping tab data
-		fillInHtmlExtractorSpecification(paramMap, os);
+	tabsArray.add(Messages.getString(locale, "HtmlExtractorTransformationConnector.HtmlExtractorTabName"));
 
-		Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML, paramMap);
-	}
+	// Fill in the specification header map, using data from all tabs.
+	fillInHtmlExtractorSpecification(paramMap, os);
 
-	/**
-	 * Process a specification post. This method is called at the start of job's
-	 * edit or view page, whenever there is a possibility that form data for a
-	 * connection has been posted. Its purpose is to gather form information and
-	 * modify the transformation specification accordingly. The name of the posted
-	 * form is "editjob".
-	 *
-	 * @param variableContext
-	 *          contains the post data, including binary file-upload information.
-	 * @param locale
-	 *          is the preferred local of the output.
-	 * @param os
-	 *          is the current pipeline specification for this job.
-	 * @param connectionSequenceNumber
-	 *          is the unique number of this connection within the job.
-	 * @return null if all is well, or a string error message if there is an error
-	 *         that should prevent saving of the job (and cause a redirection to
-	 *         an error page).
-	 */
-	@Override
-	public String processSpecificationPost(final IPostParameters variableContext, final Locale locale,
-			final Specification os, final int connectionSequenceNumber) throws ManifoldCFException {
+	Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, paramMap);
+}
 
-		final String seqPrefix = "s" + connectionSequenceNumber + "_";
+/**
+ * Output the specification body section. This method is called in the body
+ * section of a job page which has selected a pipeline connection of the
+ * current type. Its purpose is to present the required form elements for
+ * editing. The coder can presume that the HTML that is output from this
+ * configuration will be within appropriate <html>, <body>, and <form> tags.
+ * The name of the form is "editjob".
+ *
+ * @param out
+ *          is the output to which any HTML should be sent.
+ * @param locale
+ *          is the preferred locale of the output.
+ * @param os
+ *          is the current pipeline specification for this job.
+ * @param connectionSequenceNumber
+ *          is the unique number of this connection within the job.
+ * @param actualSequenceNumber
+ *          is the connection within the job that has currently been selected.
+ * @param tabName
+ *          is the current tab name.
+ */
+@Override
+public void outputSpecificationBody(final IHTTPOutput out, final Locale locale, final Specification os,
+		final int connectionSequenceNumber, final int actualSequenceNumber, final String tabName)
+				throws ManifoldCFException, IOException {
+	final Map<String, Object> paramMap = new HashMap<>();
+
+	// Set the tab name
+	paramMap.put("TABNAME", tabName);
+	paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
+	paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber));
 
-		String x;
+	// Fill in the field mapping tab data
+	fillInHtmlExtractorSpecification(paramMap, os);
 
-		// Include filters
-		x = variableContext.getParameter(seqPrefix + "includefilter_count");
-		if (x != null && x.length() > 0) {
-			// About to gather the includefilter nodes, so get rid of the old ones.
-			int i = 0;
-			while (i < os.getChildCount()) {
-				final SpecificationNode node = os.getChild(i);
-				if (node.getType().equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
-					os.removeChild(i);
-				} else {
-					i++;
-				}
-			}
-			final int count = Integer.parseInt(x);
-			i = 0;
-			while (i < count) {
-				final String prefix = seqPrefix + "includefilter_";
-				final String suffix = "_" + Integer.toString(i);
-				final String op = variableContext.getParameter(prefix + "op" + suffix);
-				if (op == null || !op.equals("Delete")) {
-					// Gather the includefilters etc.
-					final String regex = variableContext.getParameter(prefix + HtmlExtractorConfig.ATTRIBUTE_REGEX + suffix);
-					final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_INCLUDEFILTER);
-					node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
-					os.addChild(os.getChildCount(), node);
-				}
+	Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML, paramMap);
+}
+
+/**
+ * Process a specification post. This method is called at the start of job's
+ * edit or view page, whenever there is a possibility that form data for a
+ * connection has been posted. Its purpose is to gather form information and
+ * modify the transformation specification accordingly. The name of the posted
+ * form is "editjob".
+ *
+ * @param variableContext
+ *          contains the post data, including binary file-upload information.
+ * @param locale
+ *          is the preferred locale of the output.
+ * @param os
+ *          is the current pipeline specification for this job.
+ * @param connectionSequenceNumber
+ *          is the unique number of this connection within the job.
+ * @return null if all is well, or a string error message if there is an error
+ *         that should prevent saving of the job (and cause a redirection to
+ *         an error page).
+ */
+@Override
+public String processSpecificationPost(final IPostParameters variableContext, final Locale locale,
+		final Specification os, final int connectionSequenceNumber) throws ManifoldCFException {
+
+
+	final String seqPrefix = "s" + connectionSequenceNumber + "_";
+
+	String x;
+
+	// Include filters
+	x = variableContext.getParameter(seqPrefix + "includefilter_count");
+	if (x != null && x.length() > 0) {
+		// About to gather the includefilter nodes, so get rid of the old ones.
+		int i = 0;
+		while (i < os.getChildCount()) {
+			final SpecificationNode node = os.getChild(i);
+			if (node.getType().equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
+				os.removeChild(i);
+			} else {
 				i++;
 			}
-
-			final String addop = variableContext.getParameter(seqPrefix + "includefilter_op");
-			if (addop != null && addop.equals("Add")) {
-				final String regex = variableContext.getParameter(seqPrefix + "includefilter_regex");
+		}
+		final int count = Integer.parseInt(x);
+		i = 0;
+		while (i < count) {
+			final String prefix = seqPrefix + "includefilter_";
+			final String suffix = "_" + Integer.toString(i);
+			final String op = variableContext.getParameter(prefix + "op" + suffix);
+			if (op == null || !op.equals("Delete")) {
+				// Gather the includefilters etc.
+				final String regex = variableContext.getParameter(prefix + HtmlExtractorConfig.ATTRIBUTE_REGEX + suffix);
 				final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_INCLUDEFILTER);
 				node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
 				os.addChild(os.getChildCount(), node);
 			}
+			i++;
 		}
 
-		// Exclude filters
-		x = variableContext.getParameter(seqPrefix + "excludefilter_count");
-		if (x != null && x.length() > 0) {
-			// About to gather the excludefilter nodes, so get rid of the old ones.
-			int i = 0;
-			while (i < os.getChildCount()) {
-				final SpecificationNode node = os.getChild(i);
-				if (node.getType().equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
-					os.removeChild(i);
-				} else {
-					i++;
-				}
-			}
-			final int count = Integer.parseInt(x);
-			i = 0;
-			while (i < count) {
-				final String prefix = seqPrefix + "excludefilter_";
-				final String suffix = "_" + Integer.toString(i);
-				final String op = variableContext.getParameter(prefix + "op" + suffix);
-				if (op == null || !op.equals("Delete")) {
-					// Gather the excludefilters etc.
-					final String regex = variableContext.getParameter(prefix + HtmlExtractorConfig.ATTRIBUTE_REGEX + suffix);
-					final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_EXCLUDEFILTER);
-					node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
-					os.addChild(os.getChildCount(), node);
-				}
+		final String addop = variableContext.getParameter(seqPrefix + "includefilter_op");
+		if (addop != null && addop.equals("Add")) {
+			final String regex = variableContext.getParameter(seqPrefix + "includefilter_regex");
+			final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_INCLUDEFILTER);
+			node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
+			os.addChild(os.getChildCount(), node);
+		}
+	}
+
+	// Exclude filters
+	x = variableContext.getParameter(seqPrefix + "excludefilter_count");
+	if (x != null && x.length() > 0) {
+		// About to gather the excludefilter nodes, so get rid of the old ones.
+		int i = 0;
+		while (i < os.getChildCount()) {
+			final SpecificationNode node = os.getChild(i);
+			if (node.getType().equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
+				os.removeChild(i);
+			} else {
 				i++;
 			}
-
-			final String addop = variableContext.getParameter(seqPrefix + "excludefilter_op");
-			if (addop != null && addop.equals("Add")) {
-				final String regex = variableContext.getParameter(seqPrefix + "excludefilter_regex");
+		}
+		final int count = Integer.parseInt(x);
+		i = 0;
+		while (i < count) {
+			final String prefix = seqPrefix + "excludefilter_";
+			final String suffix = "_" + Integer.toString(i);
+			final String op = variableContext.getParameter(prefix + "op" + suffix);
+			if (op == null || !op.equals("Delete")) {
+				// Gather the excludefilters etc.
+				final String regex = variableContext.getParameter(prefix + HtmlExtractorConfig.ATTRIBUTE_REGEX + suffix);
 				final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_EXCLUDEFILTER);
 				node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
 				os.addChild(os.getChildCount(), node);
 			}
+			i++;
 		}
 
-		return null;
+		final String addop = variableContext.getParameter(seqPrefix + "excludefilter_op");
+		if (addop != null && addop.equals("Add")) {
+			final String regex = variableContext.getParameter(seqPrefix + "excludefilter_regex");
+			final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_EXCLUDEFILTER);
+			node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
+			os.addChild(os.getChildCount(), node);
+		}
 	}
 
-	/**
-	 * View specification. This method is called in the body section of a job's
-	 * view page. Its purpose is to present the pipeline specification information
-	 * to the user. The coder can presume that the HTML that is output from this
-	 * configuration will be within appropriate <html> and <body> tags.
-	 *
-	 * @param out
-	 *          is the output to which any HTML should be sent.
-	 * @param locale
-	 *          is the preferred local of the output.
-	 * @param connectionSequenceNumber
-	 *          is the unique number of this connection within the job.
-	 * @param os
-	 *          is the current pipeline specification for this job.
-	 */
-	@Override
-	public void viewSpecification(final IHTTPOutput out, final Locale locale, final Specification os,
-			final int connectionSequenceNumber) throws ManifoldCFException, IOException {
-		final Map<String, Object> paramMap = new HashMap<>();
-		paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
-
-		// Fill in the map with data from all tabs
-		fillInHtmlExtractorSpecification(paramMap, os);
+	x = variableContext.getParameter(seqPrefix+"striphtml_present");
+    if (x != null && x.length() > 0)
+    {
+      String keepAll = variableContext.getParameter(seqPrefix+"striphtml");
+      if (keepAll == null)
+        keepAll = "false";
+      // About to gather the striphtml node, so get rid of the old ones.
+      int i = 0;
+      while (i < os.getChildCount())
+      {
+        SpecificationNode node = os.getChild(i);
+        if (node.getType().equals(NODE_KEEPMETADATA))
+          os.removeChild(i);
+        else
+          i++;
+      }
+
+      // Gather the striphtml parameter to be the last one
+      SpecificationNode node = new SpecificationNode(NODE_KEEPMETADATA);
+      node.setAttribute(ATTRIBUTE_VALUE, keepAll);
+      // Add the new striphtml config parameter 
+      os.addChild(os.getChildCount(), node);
+    }
 
-		Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, paramMap);
 
-	}
-	protected static class SpecPacker {
+	return null;
+}
 
-		private final List<String> includeFilters = new ArrayList<>();
-		private final List<String> excludeFilters = new ArrayList<>();
+/**
+ * View specification. This method is called in the body section of a job's
+ * view page. Its purpose is to present the pipeline specification information
+ * to the user. The coder can presume that the HTML that is output from this
+ * configuration will be within appropriate <html> and <body> tags.
+ *
+ * @param out
+ *          is the output to which any HTML should be sent.
+ * @param locale
+ *          is the preferred locale of the output.
+ * @param connectionSequenceNumber
+ *          is the unique number of this connection within the job.
+ * @param os
+ *          is the current pipeline specification for this job.
+ */
+@Override
+public void viewSpecification(final IHTTPOutput out, final Locale locale, final Specification os,
+		final int connectionSequenceNumber) throws ManifoldCFException, IOException {
+	final Map<String, Object> paramMap = new HashMap<>();
+	paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
 
+	// Fill in the map with data from all tabs
+	fillInHtmlExtractorSpecification(paramMap, os);
 
-		public SpecPacker(final Specification os) {
-			for (int i = 0; i < os.getChildCount(); i++) {
-				final SpecificationNode sn = os.getChild(i);
+	Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, paramMap);
 
-				if (sn.getType().equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
-					final String regex = sn.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
-					includeFilters.add(regex);
-				}
+}
+protected static class SpecPacker {
 
-				if (sn.getType().equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
-					final String regex = sn.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
-					excludeFilters.add(regex);
-				}
+	private final List<String> includeFilters = new ArrayList<>();
+	private final List<String> excludeFilters = new ArrayList<>();
+	private final boolean striphtml;
 
+	public SpecPacker(final Specification os) {
+		boolean striphtml = true;
+		for (int i = 0; i < os.getChildCount(); i++) {
+			final SpecificationNode sn = os.getChild(i);
 
+			if (sn.getType().equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
+				final String regex = sn.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
+				includeFilters.add(regex);
 			}
 
-			if (includeFilters.isEmpty()) {
-				includeFilters.add(HtmlExtractorConfig.WHITELIST_DEFAULT);
+			if (sn.getType().equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
+				final String regex = sn.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
+				excludeFilters.add(regex);
+			}
+			if(sn.getType().equals(NODE_KEEPMETADATA)) {
+				String value = sn.getAttributeValue(ATTRIBUTE_VALUE);
+				striphtml = Boolean.parseBoolean(value);
 			}
+
+		}
+
+		if (includeFilters.isEmpty()) {
+			includeFilters.add(HtmlExtractorConfig.WHITELIST_DEFAULT);
 		}
 
-		public String toPackedString() {
-			final StringBuilder sb = new StringBuilder();
+		this.striphtml = striphtml;
+	}
 
-			packList(sb, includeFilters, '+');
-			packList(sb, excludeFilters, '+');
+	public String toPackedString() {
+		final StringBuilder sb = new StringBuilder();
 
-			return sb.toString();
-		}
+		packList(sb, includeFilters, '+');
+		packList(sb, excludeFilters, '+');
+		if (striphtml)
+			sb.append('+');
+		else
+			sb.append('-');
 
+		return sb.toString();
 	}
+
 }
 
+}
 

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java?rev=1831269&r1=1831268&r2=1831269&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java Wed May  9 16:24:06 2018
@@ -24,12 +24,10 @@ package org.apache.manifoldcf.agents.tra
  */
 public class HtmlExtractorConfig {
 
-  // Configuration parameters
-	//TODO : remove the SOlr parameters
-  public static final String PARAM_SOLRUPDATEHANDLER = "solrUpdateHandler";
-  public static final String SOLRUPDATEHANDLER_DEFAULT = "/update/no-tika";
+ // Configuration parameters
   public static final String WHITELIST_DEFAULT = "body";
   public static final String BLACKLIST_DEFAULT = "";
+  public static final String PARAMETER_STRIP_HTML = "Strip HTML content";
   
   // Specification nodes and values
   public static final String NODE_INCLUDEFILTER = "includefilter";
@@ -37,5 +35,7 @@ public class HtmlExtractorConfig {
   public static final String INCLUDEFILTER_DEFAULT = "body";
   public static final String ATTRIBUTE_REGEX = "regex";
   public static final String ATTRIBUTE_VALUE = "value";
+  
+  
 
 }

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1831269&r1=1831268&r2=1831269&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Wed May  9 16:24:06 2018
@@ -38,10 +38,11 @@ public class JsoupProcessing {
 
 
 
-	public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist) throws IOException{
+	public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
 		Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
 		Hashtable<String,String> metadata = new Hashtable<String,String>();
 		for(Element meta : doc.select("meta")) {
+			Logging.root.warn("Name: " + meta.attr("name") + " - Content: " + meta.attr("content"));
 			metadata.put(meta.attr("name"), meta.attr("content"));
 		}
 
@@ -52,8 +53,10 @@ public class JsoupProcessing {
 		}
 
 		Element element_keywords = doc.select("meta[name='keywords']").first();
+		Logging.root.warn("keywordsjsoupnounet");
 		if (element_keywords != null) {
 			String keywords = (element_keywords.attr("content"));
+			Logging.root.warn("keyyyyyywords"+keywords);
 			metadata.put("keywords",keywords);
 		}
 
@@ -164,13 +167,15 @@ public class JsoupProcessing {
 			}
 		}
 
-		//finalDoc = docToKeep.text();
-		finalDoc = docToKeep.html();
+		if (stripHtml)
+			finalDoc = docToKeep.text();
+		else
+			finalDoc = docToKeep.html();
+		
+		
 		metadata.put("extractedDoc",finalDoc);
 
 		return metadata;
 	}
 
-}
-
-
+}
\ No newline at end of file

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties?rev=1831269&r1=1831268&r2=1831269&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties Wed May  9 16:24:06 2018
@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-HtmlExtractor.HtmlExtractorTabName=HTML Extractor
-HtmlExtractor.SolrUpdateHandler=Update handler:
-HtmlExtractor.IncludeFilters=Englobing tag :
-HtmlExtractor.ExcludeFilters=Tags to remove :
-HtmlExtractor.RegularExpression=CSS selector
-HtmlExtractor.Delete=Delete
-HtmlExtractor.DeleteIncludeFilter=Delete englobing tag
-HtmlExtractor.DeleteExcludeFilter=Delete blacklist tag
-HtmlExtractor.NoIncludeFilterSpecified=No englobing tag specified
-HtmlExtractor.NoExcludeFilterSpecified=No blacklist tag specified
-HtmlExtractor.Add=Add
-HtmlExtractor.AddIncludeFilter=Add englobing tag
-HtmlExtractor.AddExcludeFilter=Add blacklist tag
-HtmlExtractor.NoRegexSpecified=No CSS selector specified
+HtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+HtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+HtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+HtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+HtmlExtractorTransformationConnector.RegularExpression=CSS selector
+HtmlExtractorTransformationConnector.Delete=Delete
+HtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+HtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+HtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+HtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+HtmlExtractorTransformationConnector.Add=Add
+HtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+HtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+HtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified
+HtmlExtractorTransformationConnector.StripHTML=Strip HTML tags

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties?rev=1831269&r1=1831268&r2=1831269&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties Wed May  9 16:24:06 2018
@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-HtmlExtractor.HtmlExtractorTabName=HTML Extractor
-HtmlExtractor.SolrUpdateHandler=Update handler:
-HtmlExtractor.IncludeFilters=Englobing tag :
-HtmlExtractor.ExcludeFilters=Tags to remove :
-HtmlExtractor.RegularExpression=CSS selector
-HtmlExtractor.Delete=Delete
-HtmlExtractor.DeleteIncludeFilter=Delete englobing tag
-HtmlExtractor.DeleteExcludeFilter=Delete blacklist tag
-HtmlExtractor.NoIncludeFilterSpecified=No englobing tag specified
-HtmlExtractor.NoExcludeFilterSpecified=No blacklist tag specified
-HtmlExtractor.Add=Add
-HtmlExtractor.AddIncludeFilter=Add englobing tag
-HtmlExtractor.AddExcludeFilter=Add blacklist tag
-HtmlExtractor.NoRegexSpecified=No CSS selector specified
+HtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+HtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+HtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+HtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+HtmlExtractorTransformationConnector.RegularExpression=CSS selector
+HtmlExtractorTransformationConnector.Delete=Delete
+HtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+HtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+HtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+HtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+HtmlExtractorTransformationConnector.Add=Add
+HtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+HtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+HtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified
+HtmlExtractorTransformationConnector.StripHTML=Strip HTML tags

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties?rev=1831269&r1=1831268&r2=1831269&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties Wed May  9 16:24:06 2018
@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-HtmlExtractor.HtmlExtractorTabName=HTML Extractor
-HtmlExtractor.SolrUpdateHandler=Update handler:
-HtmlExtractor.IncludeFilters=Englobing tag :
-HtmlExtractor.ExcludeFilters=Tags to remove :
-HtmlExtractor.RegularExpression=CSS selector
-HtmlExtractor.Delete=Delete
-HtmlExtractor.DeleteIncludeFilter=Delete englobing tag
-HtmlExtractor.DeleteExcludeFilter=Delete blacklist tag
-HtmlExtractor.NoIncludeFilterSpecified=No englobing tag specified
-HtmlExtractor.NoExcludeFilterSpecified=No blacklist tag specified
-HtmlExtractor.Add=Add
-HtmlExtractor.AddIncludeFilter=Add englobing tag
-HtmlExtractor.AddExcludeFilter=Add blacklist tag
-HtmlExtractor.NoRegexSpecified=No CSS selector specified
+HtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+HtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+HtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+HtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+HtmlExtractorTransformationConnector.RegularExpression=CSS selector
+HtmlExtractorTransformationConnector.Delete=Delete
+HtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+HtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+HtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+HtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+HtmlExtractorTransformationConnector.Add=Add
+HtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+HtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+HtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified
+HtmlExtractorTransformationConnector.StripHTML=Strip HTML tags

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties?rev=1831269&r1=1831268&r2=1831269&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties Wed May  9 16:24:06 2018
@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-HtmlExtractor.HtmlExtractorTabName=HTML Extractor
-HtmlExtractor.SolrUpdateHandler=Update handler:
-HtmlExtractor.IncludeFilters=Englobing tag :
-HtmlExtractor.ExcludeFilters=Tags to remove :
-HtmlExtractor.RegularExpression=CSS selector
-HtmlExtractor.Delete=Delete
-HtmlExtractor.DeleteIncludeFilter=Delete englobing tag
-HtmlExtractor.DeleteExcludeFilter=Delete blacklist tag
-HtmlExtractor.NoIncludeFilterSpecified=No englobing tag specified
-HtmlExtractor.NoExcludeFilterSpecified=No blacklist tag specified
-HtmlExtractor.Add=Add
-HtmlExtractor.AddIncludeFilter=Add englobing tag
-HtmlExtractor.AddExcludeFilter=Add blacklist tag
-HtmlExtractor.NoRegexSpecified=No CSS selector specified
+HtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML Extractor
+HtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+HtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+HtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+HtmlExtractorTransformationConnector.RegularExpression=CSS selector
+HtmlExtractorTransformationConnector.Delete=Delete
+HtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete englobing tag
+HtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete blacklist tag
+HtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No englobing tag specified
+HtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No blacklist tag specified
+HtmlExtractorTransformationConnector.Add=Add
+HtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+HtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+HtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector specified
+HtmlExtractorTransformationConnector.StripHTML=Strip HTML tags

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html?rev=1831269&r1=1831268&r2=1831269&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html Wed May  9 16:24:06 2018
@@ -5,9 +5,7 @@
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at
-
      http://www.apache.org/licenses/LICENSE-2.0
-
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,18 +13,30 @@
  limitations under the License.
 -->
 
-#if($TABNAME == $ResourceBundle.getString('HtmlExtractor.HtmlExtractorTabName') && ${SEQNUM} == ${SELECTEDNUM})
+#if($TABNAME == $ResourceBundle.getString('HtmlExtractorTransformationConnector.HtmlExtractorTabName') && ${SEQNUM} == ${SELECTEDNUM})
 
 <table class="displaytable">
+
+<tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.StripHTML'))</nobr></td>
+    <td class="value" colspan="3">
+      <input type="hidden" name="s${SEQNUM}_striphtml_present" value="true"/>
+  #if($STRIPHTML == 'true')
+       <input type="checkbox" checked="true" name="s${SEQNUM}_striphtml" value="true"/>
+  #else
+       <input type="checkbox" name="s${SEQNUM}_striphtml" value="true"/>
+  #end
+    </td>
+  </tr>
 <tr>
     <td class="description">
-      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.IncludeFilters'))</nobr>
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.IncludeFilters'))</nobr>
     </td>
     <td class="boxcell">
       <table class="formtable">
         <tr class="formheaderrow">
           <td class="formcolumnheader"></td>
-          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.RegularExpression'))</nobr></td>
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
         </tr>
 
   #set($includecounter = 0)
@@ -39,8 +49,9 @@
     #end
           <td class="formcolumncell">
             <a name="s${SEQNUM}_includefilter_$includecounter">
-              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractor.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractor.DeleteIncludeFilter'))$includecounterdisplay" onclick='javascript:s${SEQNUM}_deleteIncludeFilter("$includecounter");'/>
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.DeleteIncludeFilter'))$includecounterdisplay" onclick='javascript:s${SEQNUM}_deleteIncludeFilter("$includecounter");'/>
               <input type="hidden" name="s${SEQNUM}_includefilter_op_$includecounter" value="Continue"/>
+              <input type="hidden" name="s${SEQNUM}_includefilter_regex_$includecounter" value="$Encoder.attributeEscape($includefilter)"/>
             </a>
           </td>
           <td class="formcolumncell">
@@ -49,17 +60,18 @@
         </tr>
     #set($includecounter = $includecounter + 1)
   #end
+
   
   
   #if($includecounter == 0)
-        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.NoIncludeFilterSpecified'))</td></tr>
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.NoIncludeFilterSpecified'))</td></tr>
   #end
       
         <tr class="formrow"><td class="formseparator" colspan="3"><hr/></td></tr>
         <tr class="formrow">
           <td class="formcolumncell">
             <a name="includefilter">
-              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractor.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractor.AddIncludeFilter'))" onclick="javascript:s${SEQNUM}_addIncludeFilter();"/>
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.AddIncludeFilter'))" onclick="javascript:s${SEQNUM}_addIncludeFilter();"/>
             </a>
             <input type="hidden" name="s${SEQNUM}_includefilter_count" value="$includecounter"/>
             <input type="hidden" name="s${SEQNUM}_includefilter_op" value="Continue"/>
@@ -73,13 +85,13 @@
   </tr>
   <tr>
     <td class="description">
-      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.ExcludeFilters'))</nobr>
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.ExcludeFilters'))</nobr>
     </td>
     <td class="boxcell">
       <table class="formtable">
         <tr class="formheaderrow">
           <td class="formcolumnheader"></td>
-          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.RegularExpression'))</nobr></td>
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
         </tr>
 
   #set($excludecounter = 0)
@@ -92,7 +104,7 @@
     #end
           <td class="formcolumncell">
             <a name="s${SEQNUM}_excludefilter_$excludecounter">
-              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractor.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractor.DeleteExcludeFilter'))$excludecounterdisplay" onclick='javascript:s${SEQNUM}_deleteExcludeFilter("$excludecounter");'/>
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.DeleteExcludeFilter'))$excludecounterdisplay" onclick='javascript:s${SEQNUM}_deleteExcludeFilter("$excludecounter");'/>
               <input type="hidden" name="s${SEQNUM}_excludefilter_op_$excludecounter" value="Continue"/>
               <input type="hidden" name="s${SEQNUM}_excludefilter_regex_$excludecounter" value="$Encoder.attributeEscape($excludefilter)"/>
             </a>
@@ -105,14 +117,14 @@
   #end
   
   #if($excludecounter == 0)
-        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.NoExcludeFilterSpecified'))</td></tr>
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.NoExcludeFilterSpecified'))</td></tr>
   #end
       
         <tr class="formrow"><td class="formseparator" colspan="3"><hr/></td></tr>
         <tr class="formrow">
           <td class="formcolumncell">
             <a name="excludefilter">
-              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractor.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractor.AddExcludeFilter'))" onclick="javascript:s${SEQNUM}_addExcludeFilter();"/>
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.AddExcludeFilter'))" onclick="javascript:s${SEQNUM}_addExcludeFilter();"/>
             </a>
             <input type="hidden" name="s${SEQNUM}_excludefilter_count" value="$excludecounter"/>
             <input type="hidden" name="s${SEQNUM}_excludefilter_op" value="Continue"/>
@@ -122,7 +134,7 @@
           </td>
         </tr>
       </table>
-    </td>
+   </td>
   </tr>
 </table>
 
@@ -142,5 +154,8 @@
   #end
 <input type="hidden" name="s${SEQNUM}_excludefilter_count" value="$excludecounter"/>
 
+<input type="hidden" name="s${SEQNUM}_striphtml_present" value="true"/>
+<input type="hidden" name="s${SEQNUM}_striphtml" value="$Encoder.attributeEscape($STRIPHTML)"/>
+
 
-#end
+#end
\ No newline at end of file

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html?rev=1831269&r1=1831268&r2=1831269&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html Wed May  9 16:24:06 2018
@@ -5,9 +5,7 @@
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at
-
      http://www.apache.org/licenses/LICENSE-2.0
-
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,12 +19,12 @@
  
   <tr>
     <td class="description">
-      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.IncludeFilters'))</nobr>
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.IncludeFilters'))</nobr>
     </td>
     <td class="boxcell">
       <table class="formtable">
         <tr class="formheaderrow">
-          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.RegularExpression'))</nobr></td>
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
         </tr>
 
   #set($includecounter = 0)
@@ -44,19 +42,19 @@
   #end
   
   #if($includecounter == 0)
-        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.NoIncludeFilterSpecified'))</td></tr>
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.NoIncludeFilterSpecified'))</td></tr>
   #end
       </table>
     </td>
   </tr>
   <tr>
     <td class="description">
-      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.ExcludeFilters'))</nobr>
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.ExcludeFilters'))</nobr>
     </td>
     <td class="boxcell">
       <table class="formtable">
         <tr class="formheaderrow">
-          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.RegularExpression'))</nobr></td>
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
         </tr>
 
   #set($excludecounter = 0)
@@ -74,9 +72,15 @@
   #end
   
   #if($excludecounter == 0)
-        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractor.NoExcludeFilterSpecified'))</td></tr>
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.NoExcludeFilterSpecified'))</td></tr>
   #end
       </table>
     </td>
   </tr>
-</table>
+  <tr><td class="separator" colspan="4"><hr/></td></tr>
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HtmlExtractorTransformationConnector.StripHTML'))</nobr></td>
+    <td class="value"><nobr>$Encoder.bodyEscape($STRIPHTML)</nobr></td>
+  </tr>
+ 
+</table>
\ No newline at end of file



Mime
View raw message