Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@ctakes.apache.org
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Subject: svn commit: r1500511 [1/6] - in
 /ctakes/sandbox/ctakes-scrubber-deid/src: ./
 main/ main/java/ main/java/org/ main/java/org/apache/
 main/java/org/apache/uima/ main/java/org/apache/uima/examples/
 main/java/org/spin/ main/java/org/spin/scrubber/ main/java...
Date: Sun, 07 Jul 2013 19:23:07 -0000
To: commits@ctakes.apache.org
From: brittfitch@apache.org
Message-Id: <20130707192309.D035723888CD@eris.apache.org>

Author: brittfitch
Date: Sun Jul  7 19:23:05 2013
New Revision: 1500511

URL: http://svn.apache.org/r1500511
Log:
ctakes-64
add main & test code dirs

Added:
    ctakes/sandbox/ctakes-scrubber-deid/src/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation_Type.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/ScrubberProperties.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/Setup.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/Annot.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/CaseFeature.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/FeatureSetGenerator.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractor.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/TFAnnotator.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/AnnotationPrinter.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotation.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotationConsumer.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/JDBCCasConsumer.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/ReferenceTextStripper.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/UIMARunner.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReaderXML.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/type/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/type/Calculation.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/type/Calculation_Type.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/type/KnownPHI.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/type/KnownPHI_Type.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/type/OntologyMatch.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/type/OntologyMatch_Type.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/main/resources/
    ctakes/sandbox/ctakes-scrubber-deid/src/main/resources/log4j.properties   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/test/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/ScrubberPropertiesTest.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/TemplateFileProcessorTest.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/classification/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtegeTest.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/org/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/org/spin/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/org/spin/scrubber/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/org/spin/scrubber/uima/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/org/spin/scrubber/uima/consumer/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/org/spin/scrubber/uima/consumer/CSVAnnotationTest.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/uima/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/uima/annotator/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/java/org/spin/scrubber/uima/annotator/DictionaryAnnotatorTest.java   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/test/resources/
    ctakes/sandbox/ctakes-scrubber-deid/src/test/resources/log4j.properties   (with props)
    ctakes/sandbox/ctakes-scrubber-deid/src/test/resources/scrubber.properties   (with props)

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,150 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/* First created by JCasGen Tue Aug 09 12:54:49 EDT 2011 */
+package org.apache.uima.examples;
+
+import org.apache.uima.jcas.JCas; 
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.jcas.cas.TOP_Type;
+
+import org.apache.uima.jcas.tcas.Annotation;
+
+
+/** Stores detailed information about the original source document from which the current CAS was initialized. All information (like size) refers to the source document and not to the document in the CAS which may be converted and filtered by a CAS Initializer. For example this information will be written to the Semantic Search index so that the original document contents can be retrieved by queries.
+ * Updated by JCasGen Tue Aug 09 14:30:26 EDT 2011
+ * XML source: C:/dev/scrubber-pipeline_tmp/desc/consumer/consumer_printer_debug.xml
+ * @generated */
+public class SourceDocumentInformation extends Annotation {
+  /** Index returned by JCasRegistry.register for this class. @generated
+   * @ordered 
+   */
+  public final static int typeIndexID = JCasRegistry.register(SourceDocumentInformation.class);
+  /** Same value as typeIndexID. @generated
+   * @ordered 
+   */
+  public final static int type = typeIndexID;
+  /** Returns typeIndexID. @generated  */
+  public              int getTypeIndexID() {return typeIndexID;}
+ 
+  /** Never called.  Disable default constructor
+   * @generated */
+  protected SourceDocumentInformation() {}
+    
+  /** Internal - constructor used by generator; passes addr and type to the superclass, then calls readObject()
+   * @generated */
+  public SourceDocumentInformation(int addr, TOP_Type type) {
+    super(addr, type);
+    readObject();
+  }
+  
+  /** Passes jcas to the superclass constructor, then calls readObject(). @generated */
+  public SourceDocumentInformation(JCas jcas) {
+    super(jcas);
+    readObject();   
+  } 
+
+  /** Passes jcas to the superclass constructor, sets begin and end, then calls readObject(). @generated */  
+  public SourceDocumentInformation(JCas jcas, int begin, int end) {
+    super(jcas);
+    setBegin(begin);
+    setEnd(end);
+    readObject();
+  }   
+
+  /** <!-- begin-user-doc -->
+    * Write your own initialization here; called as the last step of each public constructor, empty by default
+    * <!-- end-user-doc -->
+  @generated modifiable */
+  private void readObject() {}
+     
+ 
+    
+  //*--------------*
+  //* Feature: uri
+
+  /** getter for uri - gets URI of document. (For example, file:///MyDirectory/myFile.txt for a simple file or http://incubator.apache.org/uima/index.html for content from a web source.) Calls throwFeatMissing first when featOkTst is true and casFeat_uri is null.
+   * @generated */
+  public String getUri() {
+    if (SourceDocumentInformation_Type.featOkTst && ((SourceDocumentInformation_Type)jcasType).casFeat_uri == null)
+      jcasType.jcas.throwFeatMissing("uri", "org.apache.uima.examples.SourceDocumentInformation");
+    return jcasType.ll_cas.ll_getStringValue(addr, ((SourceDocumentInformation_Type)jcasType).casFeatCode_uri);}
+    
+  /** setter for uri - sets URI of document. (For example, file:///MyDirectory/myFile.txt for a simple file or http://incubator.apache.org/uima/index.html for content from a web source.) Calls throwFeatMissing first when featOkTst is true and casFeat_uri is null.
+   * @generated */
+  public void setUri(String v) {
+    if (SourceDocumentInformation_Type.featOkTst && ((SourceDocumentInformation_Type)jcasType).casFeat_uri == null)
+      jcasType.jcas.throwFeatMissing("uri", "org.apache.uima.examples.SourceDocumentInformation");
+    jcasType.ll_cas.ll_setStringValue(addr, ((SourceDocumentInformation_Type)jcasType).casFeatCode_uri, v);}    
+   
+    
+  //*--------------*
+  //* Feature: offsetInSource
+
+  /** getter for offsetInSource - gets Byte offset of the start of document content within original source file or other input source. Only used if the CAS document was retrieved from a source where one physical source file contained several conceptual documents. Zero otherwise. Calls throwFeatMissing first when featOkTst is true and casFeat_offsetInSource is null.
+   * @generated */
+  public int getOffsetInSource() {
+    if (SourceDocumentInformation_Type.featOkTst && ((SourceDocumentInformation_Type)jcasType).casFeat_offsetInSource == null)
+      jcasType.jcas.throwFeatMissing("offsetInSource", "org.apache.uima.examples.SourceDocumentInformation");
+    return jcasType.ll_cas.ll_getIntValue(addr, ((SourceDocumentInformation_Type)jcasType).casFeatCode_offsetInSource);}
+    
+  /** setter for offsetInSource - sets Byte offset of the start of document content within original source file or other input source. Only used if the CAS document was retrieved from a source where one physical source file contained several conceptual documents. Zero otherwise. Calls throwFeatMissing first when featOkTst is true and casFeat_offsetInSource is null.
+   * @generated */
+  public void setOffsetInSource(int v) {
+    if (SourceDocumentInformation_Type.featOkTst && ((SourceDocumentInformation_Type)jcasType).casFeat_offsetInSource == null)
+      jcasType.jcas.throwFeatMissing("offsetInSource", "org.apache.uima.examples.SourceDocumentInformation");
+    jcasType.ll_cas.ll_setIntValue(addr, ((SourceDocumentInformation_Type)jcasType).casFeatCode_offsetInSource, v);}    
+   
+    
+  //*--------------*
+  //* Feature: documentSize
+
+  /** getter for documentSize - gets Size of original document in bytes before processing by CAS Initializer. Either absolute file size or size within file or other source. Calls throwFeatMissing first when featOkTst is true and casFeat_documentSize is null.
+   * @generated */
+  public int getDocumentSize() {
+    if (SourceDocumentInformation_Type.featOkTst && ((SourceDocumentInformation_Type)jcasType).casFeat_documentSize == null)
+      jcasType.jcas.throwFeatMissing("documentSize", "org.apache.uima.examples.SourceDocumentInformation");
+    return jcasType.ll_cas.ll_getIntValue(addr, ((SourceDocumentInformation_Type)jcasType).casFeatCode_documentSize);}
+    
+  /** setter for documentSize - sets Size of original document in bytes before processing by CAS Initializer. Either absolute file size or size within file or other source. Calls throwFeatMissing first when featOkTst is true and casFeat_documentSize is null.
+   * @generated */
+  public void setDocumentSize(int v) {
+    if (SourceDocumentInformation_Type.featOkTst && ((SourceDocumentInformation_Type)jcasType).casFeat_documentSize == null)
+      jcasType.jcas.throwFeatMissing("documentSize", "org.apache.uima.examples.SourceDocumentInformation");
+    jcasType.ll_cas.ll_setIntValue(addr, ((SourceDocumentInformation_Type)jcasType).casFeatCode_documentSize, v);}    
+   
+    
+  //*--------------*
+  //* Feature: lastSegment
+
+  /** getter for lastSegment - gets For a CAS that represents a segment of a larger source document, this flag indicates whether this CAS is the final segment of the source document.  This is useful for downstream components that want to take some action after having seen all of the segments of a particular source document. Calls throwFeatMissing first when featOkTst is true and casFeat_lastSegment is null.
+   * @generated */
+  public boolean getLastSegment() {
+    if (SourceDocumentInformation_Type.featOkTst && ((SourceDocumentInformation_Type)jcasType).casFeat_lastSegment == null)
+      jcasType.jcas.throwFeatMissing("lastSegment", "org.apache.uima.examples.SourceDocumentInformation");
+    return jcasType.ll_cas.ll_getBooleanValue(addr, ((SourceDocumentInformation_Type)jcasType).casFeatCode_lastSegment);}
+    
+  /** setter for lastSegment - sets For a CAS that represents a segment of a larger source document, this flag indicates whether this CAS is the final segment of the source document.  This is useful for downstream components that want to take some action after having seen all of the segments of a particular source document. Calls throwFeatMissing first when featOkTst is true and casFeat_lastSegment is null.
+   * @generated */
+  public void setLastSegment(boolean v) {
+    if (SourceDocumentInformation_Type.featOkTst && ((SourceDocumentInformation_Type)jcasType).casFeat_lastSegment == null)
+      jcasType.jcas.throwFeatMissing("lastSegment", "org.apache.uima.examples.SourceDocumentInformation");
+    jcasType.ll_cas.ll_setBooleanValue(addr, ((SourceDocumentInformation_Type)jcasType).casFeatCode_lastSegment, v);}    
+  }
+
+    

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation_Type.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation_Type.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation_Type.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation_Type.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,162 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/* First created by JCasGen Tue Aug 09 12:54:49 EDT 2011 */
+package org.apache.uima.examples;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.cas.impl.CASImpl;
+import org.apache.uima.cas.impl.FSGenerator;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.TypeImpl;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.impl.FeatureImpl;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.jcas.tcas.Annotation_Type;
+
+/** Stores detailed information about the original source document from which the current CAS was initialized. All information (like size) refers to the source document and not to the document in the CAS which may be converted and filtered by a CAS Initializer. For example this information will be written to the Semantic Search index so that the original document contents can be retrieved by queries.
+ * Updated by JCasGen Tue Aug 09 14:30:26 EDT 2011
+ * @generated */
+public class SourceDocumentInformation_Type extends Annotation_Type {
+  /** Returns the fsGenerator field. @generated */
+  protected FSGenerator getFSGenerator() {return fsGenerator;}
+  /** Creates SourceDocumentInformation objects for a CAS address; when useExistingInstance is set, an instance already stored for that address is returned instead of a new one. @generated */
+  private final FSGenerator fsGenerator = 
+    new FSGenerator() {
+      public FeatureStructure createFS(int addr, CASImpl cas) {
+  			 if (SourceDocumentInformation_Type.this.useExistingInstance) {
+  			   // Return the fs instance already stored for addr if there is one; otherwise create it and store it
+  		     FeatureStructure fs = SourceDocumentInformation_Type.this.jcas.getJfsFromCaddr(addr);
+  		     if (null == fs) {
+  		       fs = new SourceDocumentInformation(addr, SourceDocumentInformation_Type.this);
+  			   SourceDocumentInformation_Type.this.jcas.putJfsFromCaddr(addr, fs);
+  			   return fs;
+  		     }
+  		     return fs;
+        } else return new SourceDocumentInformation(addr, SourceDocumentInformation_Type.this);
+  	  }
+    };
+  /** Same value as SourceDocumentInformation.typeIndexID. @generated */
+  public final static int typeIndexID = SourceDocumentInformation.typeIndexID;
+  /** Value of JCasRegistry.getFeatOkTst for this type name; gates the missing-feature checks in the accessors. @generated 
+     @modifiable */
+  public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.uima.examples.SourceDocumentInformation");
+ 
+  /** The "uri" feature as looked up in the constructor. @generated */
+  final Feature casFeat_uri;
+  /** Code of casFeat_uri, or JCas.INVALID_FEATURE_CODE when casFeat_uri is null. @generated */
+  final int     casFeatCode_uri;
+  /** Reads the string value for casFeatCode_uri at addr; calls throwFeatMissing first when featOkTst is true and casFeat_uri is null. @generated */ 
+  public String getUri(int addr) {
+        if (featOkTst && casFeat_uri == null)
+      jcas.throwFeatMissing("uri", "org.apache.uima.examples.SourceDocumentInformation");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_uri);
+  }
+  /** Writes v as the string value for casFeatCode_uri at addr; calls throwFeatMissing first when featOkTst is true and casFeat_uri is null. @generated */    
+  public void setUri(int addr, String v) {
+        if (featOkTst && casFeat_uri == null)
+      jcas.throwFeatMissing("uri", "org.apache.uima.examples.SourceDocumentInformation");
+    ll_cas.ll_setStringValue(addr, casFeatCode_uri, v);}
+    
+  
+ 
+  /** The "offsetInSource" feature as looked up in the constructor. @generated */
+  final Feature casFeat_offsetInSource;
+  /** Code of casFeat_offsetInSource, or JCas.INVALID_FEATURE_CODE when casFeat_offsetInSource is null. @generated */
+  final int     casFeatCode_offsetInSource;
+  /** Reads the int value for casFeatCode_offsetInSource at addr; calls throwFeatMissing first when featOkTst is true and casFeat_offsetInSource is null. @generated */ 
+  public int getOffsetInSource(int addr) {
+        if (featOkTst && casFeat_offsetInSource == null)
+      jcas.throwFeatMissing("offsetInSource", "org.apache.uima.examples.SourceDocumentInformation");
+    return ll_cas.ll_getIntValue(addr, casFeatCode_offsetInSource);
+  }
+  /** Writes v as the int value for casFeatCode_offsetInSource at addr; calls throwFeatMissing first when featOkTst is true and casFeat_offsetInSource is null. @generated */    
+  public void setOffsetInSource(int addr, int v) {
+        if (featOkTst && casFeat_offsetInSource == null)
+      jcas.throwFeatMissing("offsetInSource", "org.apache.uima.examples.SourceDocumentInformation");
+    ll_cas.ll_setIntValue(addr, casFeatCode_offsetInSource, v);}
+    
+  
+ 
+  /** The "documentSize" feature as looked up in the constructor. @generated */
+  final Feature casFeat_documentSize;
+  /** Code of casFeat_documentSize, or JCas.INVALID_FEATURE_CODE when casFeat_documentSize is null. @generated */
+  final int     casFeatCode_documentSize;
+  /** Reads the int value for casFeatCode_documentSize at addr; calls throwFeatMissing first when featOkTst is true and casFeat_documentSize is null. @generated */ 
+  public int getDocumentSize(int addr) {
+        if (featOkTst && casFeat_documentSize == null)
+      jcas.throwFeatMissing("documentSize", "org.apache.uima.examples.SourceDocumentInformation");
+    return ll_cas.ll_getIntValue(addr, casFeatCode_documentSize);
+  }
+  /** Writes v as the int value for casFeatCode_documentSize at addr; calls throwFeatMissing first when featOkTst is true and casFeat_documentSize is null. @generated */    
+  public void setDocumentSize(int addr, int v) {
+        if (featOkTst && casFeat_documentSize == null)
+      jcas.throwFeatMissing("documentSize", "org.apache.uima.examples.SourceDocumentInformation");
+    ll_cas.ll_setIntValue(addr, casFeatCode_documentSize, v);}
+    
+  
+ 
+  /** The "lastSegment" feature as looked up in the constructor. @generated */
+  final Feature casFeat_lastSegment;
+  /** Code of casFeat_lastSegment, or JCas.INVALID_FEATURE_CODE when casFeat_lastSegment is null. @generated */
+  final int     casFeatCode_lastSegment;
+  /** Reads the boolean value for casFeatCode_lastSegment at addr; calls throwFeatMissing first when featOkTst is true and casFeat_lastSegment is null. @generated */ 
+  public boolean getLastSegment(int addr) {
+        if (featOkTst && casFeat_lastSegment == null)
+      jcas.throwFeatMissing("lastSegment", "org.apache.uima.examples.SourceDocumentInformation");
+    return ll_cas.ll_getBooleanValue(addr, casFeatCode_lastSegment);
+  }
+  /** Writes v as the boolean value for casFeatCode_lastSegment at addr; calls throwFeatMissing first when featOkTst is true and casFeat_lastSegment is null. @generated */    
+  public void setLastSegment(int addr, boolean v) {
+        if (featOkTst && casFeat_lastSegment == null)
+      jcas.throwFeatMissing("lastSegment", "org.apache.uima.examples.SourceDocumentInformation");
+    ll_cas.ll_setBooleanValue(addr, casFeatCode_lastSegment, v);}
+    
+  
+
+
+
+  /** initialize variables to correspond with Cas Type and Features: registers the FS generator for this type, then looks up each of the four features and stores its code (JCas.INVALID_FEATURE_CODE when the lookup returns null)
+	* @generated */
+  public SourceDocumentInformation_Type(JCas jcas, Type casType) {
+    super(jcas, casType);
+    casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
+
+ 
+    casFeat_uri = jcas.getRequiredFeatureDE(casType, "uri", "uima.cas.String", featOkTst);
+    casFeatCode_uri  = (null == casFeat_uri) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_uri).getCode();
+
+ 
+    casFeat_offsetInSource = jcas.getRequiredFeatureDE(casType, "offsetInSource", "uima.cas.Integer", featOkTst);
+    casFeatCode_offsetInSource  = (null == casFeat_offsetInSource) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_offsetInSource).getCode();
+
+ 
+    casFeat_documentSize = jcas.getRequiredFeatureDE(casType, "documentSize", "uima.cas.Integer", featOkTst);
+    casFeatCode_documentSize  = (null == casFeat_documentSize) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_documentSize).getCode();
+
+ 
+    casFeat_lastSegment = jcas.getRequiredFeatureDE(casType, "lastSegment", "uima.cas.Boolean", featOkTst);
+    casFeatCode_lastSegment  = (null == casFeat_lastSegment) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_lastSegment).getCode();
+
+  }
+}
+
+
+
+    

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/apache/uima/examples/SourceDocumentInformation_Type.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/ScrubberProperties.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/ScrubberProperties.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/ScrubberProperties.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/ScrubberProperties.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,511 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber;
+
+import org.apache.log4j.Logger;
+import org.apache.uima.util.FileUtils;
+import org.spin.scrubber.uima.dao.BaseDAO;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Properties;
+
+/**
+ * @author Andrew McMurry, MS
+ *         <p/>
+ *         With primary support from Children's Hospital Informatics Program @
+ *         Harvard-MIT Health Sciences and Technology and
+ *         <p/>
+ *         Secondary support from the Harvard Medical School
+ *         Center for BioMedical Informatics
+ *         <p/>
+ *         PHD candidate, Boston University Bioinformatics
+ *         Member, I2b2 National Center for Biomedical Computing
+ *         <p/>
+ *         All works licensed under LGPL
+ *         <p/>
+ *         User: andy
+ *         Date: 6/10/12
+ *         Time: 5:17 PM
+ */
+public class ScrubberProperties
+{
+    private static Logger log =  Logger.getLogger(ScrubberProperties.class);
+    private static Properties   config          = null;
+
+    public  static final String PROPERTIES_FILE = "scrubber.properties";
+
+    public static final String CRLF   = System.getProperty("line.separator");
+    public static final String SLASH  = System.getProperty("file.separator");
+    public static final String TOKEN  = "\\$";
+
+    static
+    {
+        try
+        {
+            if(config==null)
+            {
+                log.debug("Loading " + PROPERTIES_FILE);
+
+                ClassLoader loader = ScrubberProperties.class.getClassLoader();
+
+                InputStream inputStream = loader.getResourceAsStream(PROPERTIES_FILE);
+
+                config = new Properties();
+                config.load(inputStream);
+
+                log.debug("Loaded scrubber properties.");
+            }
+        }
+        catch(Exception e)
+        {
+            log.fatal("Could not load " + PROPERTIES_FILE, e);
+        }
+    }
+
+    public static void main(String[] args)
+    {
+        //TODO: more documentation
+        if(args.length==0)
+        {
+            System.out.println("Usage: ScrubberProperties validate");
+            System.out.println("Usage: ScrubberProperties export");
+        }
+        else
+        {
+            try
+            {
+                if(args[0].equalsIgnoreCase("export"))
+                {
+                    if(isWindows()) writeWindowsShellScript();
+                    else            writeUnixShellScript();
+
+                }
+                else
+                {
+                    System.out.println("scrubber.properties is valid?"+validate());
+                }
+            }
+            catch(Exception e)
+            {
+                System.out.println("Error:" + e.getMessage());
+            }
+        }
+    }
+
+    public enum Param
+    {
+        /**
+         * JDBC Driver for the Scrubber database.
+         * Note that Only MySQL 5+ has been tested
+         * @see BaseDAO and its subclasses.
+         */
+        DB_DRIVER,
+
+        /**
+         * MySQL Database name
+         * @see BaseDAO and its subclasses.
+         */
+        DB_NAME,
+
+        /**
+         * MySQL user for Insert,selects, etc.
+         * @see BaseDAO and its subclasses.
+         */
+        DB_USER,
+
+        /**
+         * MySQL password for Insert,selects, etc.
+         * @see BaseDAO and its subclasses.
+         */
+        DB_PWD,
+
+        /**
+         * MySQL JDBC Connection String
+         * @see BaseDAO and its subclasses.
+         */
+        DB_URI,
+
+        /**
+         * MySQL needs access to a temp directory to write the output of "select" statements.
+         * These temp files are later used as feature sets that do not include any PHI.
+         * These temp files are used for training and testing (classification).
+         */
+        DIR_DB_TEMP,
+
+        /**
+         * Files annotated by a real person, such as from Protege or from the i2b2 challenge.
+         *
+         *
+         * @see org.spin.scrubber.classification.HumanAnnotationsExtractorProtege
+         * @see org.spin.scrubber.classification.HumanAnnotationsExtractorI2B2
+         * @see org.spin.scrubber.uima.dao.AnnotationsDAO
+         * @see org.spin.scrubber.classification.FeatureSetGenerator
+         */
+        DIR_INPUT_HUMAN_ANNOTATIONS_TRAIN,
+        DIR_INPUT_HUMAN_ANNOTATIONS_TEST,
+        HUMAN_ANNOTATIONS_IMPLEMENTATION,
+
+        /**
+         * Dir containing raw XML of open access publications.
+         * These are processed for you already, this param is optional if you want to reprocess them yourself.
+         *
+         */
+        DIR_INPUT_PUBS_XML,
+
+        /**
+         * Dir containing raw Text of open access publications.
+         * By default this is done by processing the @see DIR_INPUT_PUBLICATIONS_XML
+         */
+        DIR_INPUT_PUBS_TXT,
+
+        /**
+         * Dir containing processed Text of open access publications.
+         * This involves stripping escape characters, etc.
+         */
+        DIR_INPUT_PUBS_PROCESSED,
+
+        /**
+         * Dir containing input cases to scrub.
+         */
+        DIR_INPUT_TRAIN,
+        DIR_INPUT_TEST,
+
+        /**
+         * Dir containing scrubbed cases
+         */
+        //DIR_OUTPUT_TRAIN,
+        DIR_OUTPUT_TEST,
+
+        /**
+         *
+         */
+        DIR_MODELS,
+        FILE_MODEL_TRAIN,
+        FILE_MODEL_TEST,
+
+        UIMA_READER_FILE_TRAIN,
+        UIMA_READER_FILE_TEST,
+        UIMA_READER_FILE_PUBS,
+
+        UIMA_READER_IMPL_TRAIN,
+        UIMA_READER_IMPL_TEST,
+        UIMA_READER_IMPL_PUBS,
+
+        CLASSIFICATION_COST_MATRIX,
+
+        LOCALHOST_NUM_THREADS,
+        TEST_DB_AVAILABLE
+    }
+
+    public static String getFileModelTrain()
+    {
+    	return get(Param.FILE_MODEL_TRAIN);
+    }
+
+    public static String getFileModelTrainAbsolutePath()
+    {
+        String dir      = getDirModels();
+    	String filename = getFileModelTrain();
+
+        return dir + File.separator + filename;
+    }
+
+    public static String getFileModelTest()
+    {
+    	return get(Param.FILE_MODEL_TEST);
+    }
+
+    public static String getFileModelTestAbsolutePath()
+    {
+        String dir      = getDirModels();
+    	String filename = getFileModelTest();
+
+        return dir + File.separator + filename;
+    }
+
+    public static String getDirModels()
+    {
+    	return get(Param.DIR_MODELS);
+    }
+    
+    public static String getDbName()
+    {
+        return get(Param.DB_NAME);
+    }
+
+    public static String getDbDriver()
+    {
+        return get(Param.DB_DRIVER);
+    }
+
+    public static String getDbUser()
+    {
+        return get(Param.DB_USER);
+    }
+
+    public static String getDbPassword()
+    {
+        return get(Param.DB_PWD);
+    }
+
+    public static String getDbURI()
+    {
+        return get(Param.DB_URI);
+    }
+
+    public static String getDirInputHumanAnnotationsTrain()
+    {
+        return get(Param.DIR_INPUT_HUMAN_ANNOTATIONS_TRAIN);
+    }
+    
+    public static String getHumanAnnotationsImpl()
+    {
+        return get(Param.HUMAN_ANNOTATIONS_IMPLEMENTATION);
+    }
+    
+    public static String getDirInputHumanAnnotationsTest()
+    {
+        return get(Param.DIR_INPUT_HUMAN_ANNOTATIONS_TEST);
+    }
+
+    public static String getDirInputTrain()
+    {
+        return get(Param.DIR_INPUT_TRAIN);
+    }
+
+    public static String getDirInputTest()
+    {
+        return get(Param.DIR_INPUT_TEST);
+    }
+
+    public static String getDirInputPublicationsXML()
+    {
+        return get(Param.DIR_INPUT_PUBS_XML);
+    }
+
+    public static String getDirInputPublicationsTXT()
+    {
+        return get(Param.DIR_INPUT_PUBS_TXT);
+    }
+
+    public static String getDirInputPublicationsProcessed()
+    {
+        return get(Param.DIR_INPUT_PUBS_PROCESSED);
+    }
+
+    public static String getDirOuputTest()
+    {
+        return get(Param.DIR_OUTPUT_TEST);
+    }
+
+    public static String getUimaReaderFilePublications()
+    {
+        return get(Param.UIMA_READER_FILE_PUBS);
+    }
+
+    public static String getUimaReaderFileTrain()
+    {
+        return get(Param.UIMA_READER_FILE_TRAIN);
+    }
+
+    public static String getUimaReaderFileTest()
+    {
+        return get(Param.UIMA_READER_FILE_TEST);
+    }
+
+    public static String getUimaReaderImplTrain()
+    {
+        return get(Param.UIMA_READER_IMPL_TRAIN);
+    }
+
+    public static String getUimaReaderImplTest()
+    {
+        return get(Param.UIMA_READER_IMPL_TEST);
+    }
+
+    public static String getUimaReaderImplPubs()
+    {
+        return get(Param.UIMA_READER_IMPL_PUBS);
+    }
+
+    public static String getClassificationCostMatrix()
+    {
+        return get(Param.CLASSIFICATION_COST_MATRIX);
+    }
+
+    public static boolean isDBAvailableForTesting()
+    {
+        return new Boolean(get(Param.TEST_DB_AVAILABLE));
+    }
+
+    public static String getOSName()
+    {
+        return System.getProperty("os.name").toLowerCase();
+    }
+
+
+    public static boolean isWindows()
+    {
+        return getOSName().indexOf("win") >= 0;
+    }
+
+    public static boolean isUnix()
+    {
+        String os = getOSName();
+
+        return (os.indexOf("nix") >= 0 || os.indexOf("nux") >= 0);
+    }
+
+    public static int getLocalhostNumThreads()
+    {
+        return Integer.parseInt(get(Param.LOCALHOST_NUM_THREADS));
+    }
+
+    private static String get(Param param)
+    {
+     return get(param.name());
+    }
+
+    private static String get(String name)
+    {
+        return config.getProperty(name);
+    }
+
+    public static String asString()
+    {
+        return asString(Param.values());
+    }
+
+    public static String asString(Param[] params)
+    {
+        StringBuilder all = new StringBuilder();
+
+        for(Param p : params)
+        {
+            all.append(p + "="+get(p) + CRLF);
+        }
+
+        return all.toString();
+    }
+
+    public static String asUnixShellScript()
+    {
+        return asShell(Param.values(), "export");
+    }
+
+    public static void writeUnixShellScript() throws IOException
+    {
+        FileUtils.saveString2File(asUnixShellScript(), new File("scrubber.properties.sh"));
+    }
+
+    public static String asWindowsShellScript()
+    {
+        return asShell(Param.values(), "set");
+    }
+
+    public static void writeWindowsShellScript() throws IOException
+    {
+        FileUtils.saveString2File(asWindowsShellScript(), new File("scrubber.properties.cmd"));
+    }
+
+    public static String asShell(Param[] params, String export)
+    {
+        StringBuilder all = new StringBuilder();
+
+        for(Param p : params)
+        {
+            all.append(export + " SCRUBBER_"+ p+"="+get(p) + CRLF);
+        }
+
+        return all.toString();
+    }
+
+    public static HashMap<String, String> asTokenMap()
+    {
+        return asTokenMap(Param.values());
+    }
+
+    public static HashMap<String, String> asTokenMap(Param[] params)
+    {
+
+        HashMap<String, String> map = new HashMap<String, String>();
+
+        for(Param p : params)
+        {
+            map.put(TOKEN+p.name(), get(p));
+        }
+
+        return map;
+    }
+
+    public static Properties asProperties()
+    {
+        return new Properties(config);
+    }
+
+    public static boolean validate()
+    {
+        boolean isValid = true;
+
+        for(Param p : Param.values())
+        {
+            String userValue = get(p);
+
+            if(userValue == null)
+            {
+                log.warn("Missing from scrubber.properties " + p.name());
+                isValid = false;
+            }
+
+            if(p.name().startsWith("DIR_"))
+            {
+                if(userValue!=null)
+                {
+                    File dir = new File(get(p));
+
+                    if(!dir.exists())
+                    {
+                        log.warn("Directory does not exist "+ dir.getAbsolutePath());
+                        isValid = false;
+                    }
+                }
+            }
+
+            if(isDBAvailableForTesting())
+            {
+                try
+                {
+                    BaseDAO.getConnectionToScrubber();
+                }
+                catch(Exception e)
+                {
+                    log.warn("Check database connection");
+
+                    log.warn(asString(new Param[]{Param.DB_NAME, Param.DB_DRIVER, Param.DB_URI, Param.DB_USER, Param.DB_PWD}));
+                    isValid = false;
+                }
+            }
+        }
+
+        return isValid;
+    }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/ScrubberProperties.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/Setup.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/Setup.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/Setup.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/Setup.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,92 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber;
+
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * @author Andrew McMurry, MS
+ *         <p/>
+ *         With primary support from Children's Hospital Informatics Program @
+ *         Harvard-MIT Health Sciences and Technology and
+ *         <p/>
+ *         Secondary support from the Harvard Medical School
+ *         Center for BioMedical Informatics
+ *         <p/>
+ *         PHD candidate, Boston University Bioinformatics
+ *         Member, I2b2 National Center for Biomedical Computing
+ *         <p/>
+ *         All works licensed under LGPL
+ *         <p/>
+ *         User: andy
+ *         Date: 7/10/12
+ *         Time: 7:27 PM
+ */
+public class Setup {
+    // Names of the SQL scripts that make up a scrubber database installation.
+    public static final String CREATE_DB_AND_USER = "create_database_and_user.sql";
+    public static final String CREATE_TABLES      = "create_tables.sql";
+    public static final String INSERT_CENSUS      = "insert_census_names.sql";
+    public static final String INSERT_UMLS        = "scrubber_umls_lookup.sql";
+    public static final String INSERT_TF          = "insert_lookup_term_frequency.sql";
+
+    /**
+     * Prints every environment variable as {@code NAME=value}, then launches the
+     * command returned by {@link #getMysqlCommandAsRoot()}. Any failure is reported
+     * on the console together with its stack trace.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        for (Map.Entry<String, String> variable : System.getenv().entrySet()) {
+            System.out.format("%s=%s%n", variable.getKey(), variable.getValue());
+        }
+
+        try {
+            // Feeding sql/CREATE_DB_AND_USER into the client is currently disabled;
+            // only the bare client is started.
+            String rootCommand = getMysqlCommandAsRoot();
+            process(rootCommand);
+        } catch (Exception e) {
+            System.out.println("Could not setup scrubber");
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * @return a {@code mysql} command line carrying the configured user, password
+     *         and database name
+     */
+    private static String getMysqlCommand() {
+        String dbName = ScrubberProperties.getDbName();
+        String dbPass = ScrubberProperties.getDbPassword();
+        String dbUser = ScrubberProperties.getDbUser();
+
+        StringBuilder command = new StringBuilder("mysql -u ");
+        command.append(dbUser);
+        command.append(" -p").append(dbPass);
+        command.append(" -D ").append(dbName);
+        return command.toString();
+    }
+
+    /**
+     * @return the command used for root-level setup; currently just {@code mysql}
+     */
+    private static String getMysqlCommandAsRoot() {
+        // The database name is still looked up, but the "-u root -p -D <name>"
+        // variant is disabled, so the value is not part of the command.
+        String dbName = ScrubberProperties.getDbName();
+
+        return "mysql";
+    }
+
+    /**
+     * Starts {@code command} as an operating-system process.
+     *
+     * @param command command line handed to {@link Runtime#exec(String)}
+     * @return handle of the started process
+     * @throws IOException if the process cannot be started
+     */
+    public static Process process(String command) throws IOException {
+        Runtime runtime = Runtime.getRuntime();
+        return runtime.exec(command);
+    }
+}
+

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/Setup.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/Annot.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/Annot.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/Annot.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/Annot.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,117 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.beans;
+
+/**
+ * Mutable bean describing one annotation: the file it belongs to, its type,
+ * the annotated token with its start/end index, and the value and source
+ * that produced the match.
+ */
+public class Annot {
+    // Where the annotation was found.
+    private String filename_long;
+    private String filename_short;
+
+    // What kind of annotation it is.
+    private String annot_type_long;
+    private String annot_type_short;
+
+    // The annotated text and its position.
+    private int id;
+    private String token;
+    private int startIdx;
+    private int endIdx;
+
+    // What matched, and where the match came from.
+    private String match_value;
+    private String match_source;
+
+    /** @return the long form of the file name */
+    public String getFilename_long() {
+        return this.filename_long;
+    }
+
+    public void setFilename_long(String filename_long) {
+        this.filename_long = filename_long;
+    }
+
+    /** @return the short form of the file name */
+    public String getFilename_short() {
+        return this.filename_short;
+    }
+
+    public void setFilename_short(String filename_short) {
+        this.filename_short = filename_short;
+    }
+
+    /** @return the long form of the annotation type */
+    public String getAnnot_type_long() {
+        return this.annot_type_long;
+    }
+
+    public void setAnnot_type_long(String annot_type_long) {
+        this.annot_type_long = annot_type_long;
+    }
+
+    /** @return the short form of the annotation type */
+    public String getAnnot_type_short() {
+        return this.annot_type_short;
+    }
+
+    public void setAnnot_type_short(String annot_type_short) {
+        this.annot_type_short = annot_type_short;
+    }
+
+    /** @return the annotated token */
+    public String getToken() {
+        return this.token;
+    }
+
+    public void setToken(String token) {
+        this.token = token;
+    }
+
+    /** @return the identifier of this annotation */
+    public int getId() {
+        return this.id;
+    }
+
+    public void setId(int id) {
+        this.id = id;
+    }
+
+    /** @return the start index of the annotation */
+    public int getStartIdx() {
+        return this.startIdx;
+    }
+
+    public void setStartIdx(int startIdx) {
+        this.startIdx = startIdx;
+    }
+
+    /** @return the end index of the annotation */
+    public int getEndIdx() {
+        return this.endIdx;
+    }
+
+    public void setEndIdx(int endIdx) {
+        this.endIdx = endIdx;
+    }
+
+    /** @return the matched value */
+    public String getMatch_value() {
+        return this.match_value;
+    }
+
+    public void setMatch_value(String match_value) {
+        this.match_value = match_value;
+    }
+
+    /** @return the source of the match */
+    public String getMatch_source() {
+        return this.match_source;
+    }
+
+    public void setMatch_source(String match_source) {
+        this.match_source = match_source;
+    }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/Annot.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/CaseFeature.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/CaseFeature.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/CaseFeature.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/CaseFeature.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,523 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.beans;
+
+/**
+ * Mutable bean holding the feature values recorded for one word token of an
+ * input file: its position, part of speech, match counts per regular
+ * expression type and dictionary, term frequencies, and the class labels.
+ */
+public class CaseFeature {
+    // --- identity of the token instance ---
+    private int id;
+    private String token;
+
+    // --- character positions inside the input file ---
+    private String filename_short;
+    private int startIdx;
+    private int endIdx;
+
+    // --- part of speech, and whether the word contains a capital ---
+    private String pos;
+    private String pos_bin = "unknown";
+    private int has_capital = 0;
+
+    // --- number of regular expressions matching each of the 8 types of PHI ---
+    private int cnt_regex_phon;
+    private int cnt_regex_date;
+    private int cnt_regex_age;
+    private int cnt_regex_id;
+    private int cnt_regex_pat;
+    private int cnt_regex_doc;
+    private int cnt_regex_loc;
+    private int cnt_regex_hosp;
+
+    // --- number of private dictionary matches ---
+    private int cnt_priv;
+    private int cnt_hosp;
+    private int cnt_name;
+
+    // --- number of medical concept (UMLS) dictionary matches ---
+    private int cnt_dict_costar;
+    private int cnt_dict_hl7v25;
+    private int cnt_dict_hl7v30;
+    private int cnt_dict_icd10cm;
+    private int cnt_dict_icd10pcs;
+    private int cnt_dict_icd9cm;
+    private int cnt_dict_lnc;
+    private int cnt_dict_msh;
+    private int cnt_dict_rxnorm;
+    private int cnt_dict_snomedct;
+
+    // --- term frequencies ---
+    private float cnt_ham_w_pos;
+    private float cnt_ham_wo_pos;
+
+    // --- class labels ---
+    private int is_phi;
+    private String classified_as;
+    private String phi_type;
+
+    /** @return ID (not PHI) for this instance */
+    public int getId() {
+        return this.id;
+    }
+
+    public void setId(int id) {
+        this.id = id;
+    }
+
+    /** @return the word token */
+    public String getToken() {
+        return this.token;
+    }
+
+    public void setToken(String token) {
+        this.token = token;
+    }
+
+    /** @return name of the file this token refers to */
+    public String getFilename_short() {
+        return this.filename_short;
+    }
+
+    public void setFilename_short(String filename_short) {
+        this.filename_short = filename_short;
+    }
+
+    /** @return start position in the input text */
+    public int getStartIdx() {
+        return this.startIdx;
+    }
+
+    public void setStartIdx(int startIdx) {
+        this.startIdx = startIdx;
+    }
+
+    /** @return end position in the input text */
+    public int getEndIdx() {
+        return this.endIdx;
+    }
+
+    public void setEndIdx(int endIdx) {
+        this.endIdx = endIdx;
+    }
+
+    /** @return part of speech */
+    public String getPos() {
+        return this.pos;
+    }
+
+    public void setPos(String pos) {
+        this.pos = pos;
+    }
+
+    /** @return part of speech "bin"; {@code "unknown"} until set */
+    public String getPos_bin() {
+        return this.pos_bin;
+    }
+
+    public void setPos_bin(String pos_bin) {
+        this.pos_bin = pos_bin;
+    }
+
+    /** @return capital-letter flag; 0 until set */
+    public int getHas_capital() {
+        return this.has_capital;
+    }
+
+    public void setHas_capital(int has_capital) {
+        this.has_capital = has_capital;
+    }
+
+    /** @return number of regex matches, type = phone */
+    public int getCnt_regex_phon() {
+        return this.cnt_regex_phon;
+    }
+
+    public void setCnt_regex_phon(int cnt_regex_phon) {
+        this.cnt_regex_phon = cnt_regex_phon;
+    }
+
+    /** @return number of regex matches, type = date */
+    public int getCnt_regex_date() {
+        return this.cnt_regex_date;
+    }
+
+    public void setCnt_regex_date(int cnt_regex_date) {
+        this.cnt_regex_date = cnt_regex_date;
+    }
+
+    /** @return number of regex matches, type = age */
+    public int getCnt_regex_age() {
+        return this.cnt_regex_age;
+    }
+
+    public void setCnt_regex_age(int cnt_regex_age) {
+        this.cnt_regex_age = cnt_regex_age;
+    }
+
+    /** @return number of regex matches, type = id */
+    public int getCnt_regex_id() {
+        return this.cnt_regex_id;
+    }
+
+    public void setCnt_regex_id(int cnt_regex_id) {
+        this.cnt_regex_id = cnt_regex_id;
+    }
+
+    /** @return number of regex matches, type = patient */
+    public int getCnt_regex_pat() {
+        return this.cnt_regex_pat;
+    }
+
+    public void setCnt_regex_pat(int cnt_regex_pat) {
+        this.cnt_regex_pat = cnt_regex_pat;
+    }
+
+    /** @return number of regex matches, type = doctor */
+    public int getCnt_regex_doc() {
+        return this.cnt_regex_doc;
+    }
+
+    public void setCnt_regex_doc(int cnt_regex_doc) {
+        this.cnt_regex_doc = cnt_regex_doc;
+    }
+
+    /** @return number of regex matches, type = location */
+    public int getCnt_regex_loc() {
+        return this.cnt_regex_loc;
+    }
+
+    public void setCnt_regex_loc(int cnt_regex_loc) {
+        this.cnt_regex_loc = cnt_regex_loc;
+    }
+
+    /** @return number of regex matches, type = hospital */
+    public int getCnt_regex_hosp() {
+        return this.cnt_regex_hosp;
+    }
+
+    public void setCnt_regex_hosp(int cnt_regex_hosp) {
+        this.cnt_regex_hosp = cnt_regex_hosp;
+    }
+
+    /** @return number of private dictionary matches */
+    public int getCnt_priv() {
+        return this.cnt_priv;
+    }
+
+    public void setCnt_priv(int cnt_priv) {
+        this.cnt_priv = cnt_priv;
+    }
+
+    /** @return number of hospital dictionary matches */
+    public int getCnt_hosp() {
+        return this.cnt_hosp;
+    }
+
+    public void setCnt_hosp(int cnt_hosp) {
+        this.cnt_hosp = cnt_hosp;
+    }
+
+    /** @return number of name dictionary matches */
+    public int getCnt_name() {
+        return this.cnt_name;
+    }
+
+    public void setCnt_name(int cnt_name) {
+        this.cnt_name = cnt_name;
+    }
+
+    /** @return number of dictionary (UMLS) matches = Costar */
+    public int getCnt_dict_costar() {
+        return this.cnt_dict_costar;
+    }
+
+    public void setCnt_dict_costar(int cnt_dict_costar) {
+        this.cnt_dict_costar = cnt_dict_costar;
+    }
+
+    /** @return number of dictionary (UMLS) matches = HL7 version 2.5 */
+    public int getCnt_dict_hl7v25() {
+        return this.cnt_dict_hl7v25;
+    }
+
+    public void setCnt_dict_hl7v25(int cnt_dict_hl7v25) {
+        this.cnt_dict_hl7v25 = cnt_dict_hl7v25;
+    }
+
+    /** @return number of dictionary (UMLS) matches = HL7 version 3.0 */
+    public int getCnt_dict_hl7v30() {
+        return this.cnt_dict_hl7v30;
+    }
+
+    public void setCnt_dict_hl7v30(int cnt_dict_hl7v30) {
+        this.cnt_dict_hl7v30 = cnt_dict_hl7v30;
+    }
+
+    /** @return number of dictionary (UMLS) matches = ICD10 CM */
+    public int getCnt_dict_icd10cm() {
+        return this.cnt_dict_icd10cm;
+    }
+
+    public void setCnt_dict_icd10cm(int cnt_dict_icd10cm) {
+        this.cnt_dict_icd10cm = cnt_dict_icd10cm;
+    }
+
+    /** @return number of dictionary (UMLS) matches = ICD10 PCS */
+    public int getCnt_dict_icd10pcs() {
+        return this.cnt_dict_icd10pcs;
+    }
+
+    public void setCnt_dict_icd10pcs(int cnt_dict_icd10pcs) {
+        this.cnt_dict_icd10pcs = cnt_dict_icd10pcs;
+    }
+
+    /** @return number of dictionary (UMLS) matches = ICD9 CM */
+    public int getCnt_dict_icd9cm() {
+        return this.cnt_dict_icd9cm;
+    }
+
+    public void setCnt_dict_icd9cm(int cnt_dict_icd9cm) {
+        this.cnt_dict_icd9cm = cnt_dict_icd9cm;
+    }
+
+    /** @return number of dictionary (UMLS) matches = LOINC */
+    public int getCnt_dict_lnc() {
+        return this.cnt_dict_lnc;
+    }
+
+    public void setCnt_dict_lnc(int cnt_dict_lnc) {
+        this.cnt_dict_lnc = cnt_dict_lnc;
+    }
+
+    /** @return number of dictionary (UMLS) matches = MESH */
+    public int getCnt_dict_msh() {
+        return this.cnt_dict_msh;
+    }
+
+    public void setCnt_dict_msh(int cnt_dict_msh) {
+        this.cnt_dict_msh = cnt_dict_msh;
+    }
+
+    /** @return number of dictionary (UMLS) matches = RXNORM */
+    public int getCnt_dict_rxnorm() {
+        return this.cnt_dict_rxnorm;
+    }
+
+    public void setCnt_dict_rxnorm(int cnt_dict_rxnorm) {
+        this.cnt_dict_rxnorm = cnt_dict_rxnorm;
+    }
+
+    /** @return number of dictionary (UMLS) matches = SNOMEDCT */
+    public int getCnt_dict_snomedct() {
+        return this.cnt_dict_snomedct;
+    }
+
+    public void setCnt_dict_snomedct(int cnt_dict_snomedct) {
+        this.cnt_dict_snomedct = cnt_dict_snomedct;
+    }
+
+    /** @return term frequency: count of "ham" (non-PHI) words <em>with</em> POS */
+    public float getCnt_ham_w_pos() {
+        return this.cnt_ham_w_pos;
+    }
+
+    public void setCnt_ham_w_pos(float cnt_ham_w_pos) {
+        this.cnt_ham_w_pos = cnt_ham_w_pos;
+    }
+
+    /** @return term frequency: count of "ham" (non-PHI) words <em>without</em> POS */
+    public float getCnt_ham_wo_pos() {
+        return this.cnt_ham_wo_pos;
+    }
+
+    public void setCnt_ham_wo_pos(float cnt_ham_wo_pos) {
+        this.cnt_ham_wo_pos = cnt_ham_wo_pos;
+    }
+
+    /** @return the class label: is this PHI or not */
+    public int getIs_phi() {
+        return this.is_phi;
+    }
+
+    /** @param is_phi the class label: is this PHI or not */
+    public void setIs_phi(int is_phi) {
+        this.is_phi = is_phi;
+    }
+
+    /** @return the response from the classifier */
+    public String getClassified_as() {
+        return this.classified_as;
+    }
+
+    /** @param classified_as the response from the classifier */
+    public void setClassified_as(String classified_as) {
+        this.classified_as = classified_as;
+    }
+
+    /** @return type of PHI (patient, doctor, hospital, etc.) */
+    public String getPhi_type() {
+        return this.phi_type;
+    }
+
+    /** @param phi_type type of PHI (patient, doctor, hospital, etc.) */
+    public void setPhi_type(String phi_type) {
+        this.phi_type = phi_type;
+    }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/beans/CaseFeature.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/FeatureSetGenerator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/FeatureSetGenerator.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/FeatureSetGenerator.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/FeatureSetGenerator.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,554 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.beans.Annot;
+import org.spin.scrubber.beans.CaseFeature;
+import org.spin.scrubber.uima.dao.AnnotationsDAO;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+import org.spin.scrubber.uima.dao.HumanAnnotationsDAO;
+
+import java.util.*;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * TODO: documentation
+ * @see CaseFeature
+ */
+public class FeatureSetGenerator implements Runnable
+{
+    private static Logger log =  Logger.getLogger(FeatureSetGenerator.class);
+
+	public static AnnotationsDAO       dao;
+	public static FeatureMatrixDAO       cfDao;
+	public static HumanAnnotationsDAO  phiDao;
+	public static Map<String,String>   posBinMap;
+
+	public static List<String> regexPatList;
+	public static List<String> regexLocList;
+	public static List<String> regexIDList;
+	public static List<String> regexAgeList;
+	public static List<String> regexDoctorList;
+	public static List<String> regexDateList;
+	public static List<String> regexPhoneList;
+	public static List<String> regexHospitalList;
+	
+	public static int maxActiveThreads;
+	private static String tableSuffix;
+	
+	private String       filename;
+	private List<Annot>  fileAnnotList;
+		
+	public FeatureSetGenerator(String filename)
+	{
+		this.filename = filename;
+	}
+	
+	public static void main(String[] args) throws Exception
+	{
+		if(args.length!=1)
+        {
+        	System.out.println("USAGE:\t\t FeatureSetGenerator {_test|_train}");
+        }
+        else
+        {            
+        	tableSuffix = args[0];
+        	
+			//allow configurable max thread count.
+			maxActiveThreads = ScrubberProperties.getLocalhostNumThreads();
+					
+			dao = new AnnotationsDAO(tableSuffix);
+			cfDao = new FeatureMatrixDAO(tableSuffix);
+			phiDao = new HumanAnnotationsDAO(tableSuffix);
+			posBinMap = getPosBinMap();
+			regexPatList = getRegexPatientList();
+			regexLocList = getRegexLocationList();
+			regexIDList = getRegexIDList();
+			regexAgeList = getRegexAgeList();
+			regexDoctorList = getRegexDoctorList();
+			regexDateList = getRegexDateList();
+			regexPhoneList = getRegexPhoneList();
+			regexHospitalList = getRegexHospitalList();
+			
+			//get list of case filenames
+			List<String> filenameList = dao.selectDistinctFilenames();
+			
+			log.info("Total files retrieved for from annotations tb: " + filenameList.size());
+					
+			process(filenameList);
+        }
+	}
+	
+	public static void process(List<String> filenameList)
+	{
+		ThreadPoolExecutor pool = new ThreadPoolExecutor(5, Integer.MAX_VALUE, 30, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+
+		int cnt = 0;
+		
+		try 
+		{
+			for (String filename : filenameList)
+			{
+				FeatureSetGenerator thread = new FeatureSetGenerator(filename);
+				pool.execute(thread);
+				Thread.sleep(50); //quick nap to make sure getActiveCount() can update before all threads are spawned.
+				cnt++;
+				
+				log.debug("STATUS: " + pool.getActiveCount() + " active threads out of " + cnt + " total threads.");
+				
+				//sleep while active threads run.
+				while(pool.getActiveCount()>=maxActiveThreads)
+				{
+					log.debug("SLEEPING: " + pool.getActiveCount() + " active threads out of " + cnt + " total threads.");
+					try
+					{
+						Thread.sleep(200);
+					} 
+					catch (InterruptedException e)
+					{
+						e.printStackTrace();
+					}				
+				}
+			}
+						
+			//if all tasks are complete, exit. 
+			log.info("STATUS: all threads completed. shutting down.");
+			pool.shutdown();
+		}
+		catch(Exception e)
+		{
+			log.error("ERROR: executing from thread pool.", e);
+		}
+	}
+
+	public void run()
+	{
+		try 
+		{
+			//get list of annotations by filename 
+			this.setFileAnnotList(dao.selectAllAnnotByFilename(filename));
+			
+			List<CaseFeature> batchInsertList = new ArrayList<CaseFeature>();
+			
+			log.debug("file: " + this.getFilename() + " : " + new Date(System.currentTimeMillis()));
+			//get all WordToken and NumToken annotations for a file 
+			//(selecting pos annot because there are 2 kinds of wordTokens - pos or cap, and i dont want to double count. 
+			List<Annot> baseAnnotList = dao.selectPosWordAnnotations(filename);
+			baseAnnotList.addAll(dao.selectNumAnnotations(filename));
+			
+			//get Map of knownPHI. this is used to force classify tokens matching xml header sections containing known phi.
+			Map<String,Annot> knownPHIMap = dao.selectKnownPHIAnnotations(filename);
+			
+			//add knownPHI Annot obj to baseAnnotList to be added to case_feature table.
+			for (String key : knownPHIMap.keySet())
+			{
+				baseAnnotList.add(knownPHIMap.get(key));
+			}
+			
+			log.debug("Total annotations retrieved for " + filename + ": " + baseAnnotList.size());
+			
+			//get a list of all annotations matching this word/num
+			for(Annot a : baseAnnotList)
+			{
+				Map<String,String> wordAnnotMap = getFeaturesByFilenameIdx(a.getStartIdx(), this.getFileAnnotList());
+				
+				//create feature obj
+				CaseFeature cf = new CaseFeature();
+				cf.setFilename_short(a.getFilename_short());
+				cf.setToken(a.getToken());
+				cf.setStartIdx(a.getStartIdx());
+				cf.setEndIdx(a.getEndIdx());
+				
+				//check if isPhi
+	//			cf.setIs_phi(phiDao.isPHI(a.getFilename_short(), a.getStartIdx()));
+				String phiLabel = phiDao.selectPHIType(a.getFilename_short(), a.getStartIdx());
+				cf.setIs_phi((phiLabel==null) ? 0 : 1);
+				cf.setPhi_type((phiLabel==null) ? "NA" : phiLabel);
+				
+				//force classification if token is same as one found in xml header portion.
+				if (knownPHIMap.get(a.getToken().toLowerCase())!=null)
+				{
+					//get phi Annot with this token
+					Annot phi = knownPHIMap.get(a.getToken().toLowerCase());
+					
+					//set classifiedAs property according to what header type this token came from.
+					if (phi.getMatch_value().equalsIgnoreCase("FirstName") || phi.getMatch_value().equalsIgnoreCase("LastName"))
+					{
+						cf.setClassified_as("PATIENT");
+					}
+					else if (phi.getMatch_value().equalsIgnoreCase("DateOfBirth"))
+					{
+						cf.setClassified_as("DATE");
+					}
+					else if (phi.getMatch_value().equalsIgnoreCase("SSN") || phi.getMatch_value().equalsIgnoreCase("AccessionNumber") || phi.getMatch_value().equalsIgnoreCase("LocalMRN"))
+					{
+						cf.setClassified_as("ID");
+					}
+				}
+				else
+				{
+					cf.setClassified_as("NA");
+				}
+								
+				//populate feature obj based on remaining annotations
+				for(String matchValue : wordAnnotMap.keySet())
+				{
+					String matchSource = wordAnnotMap.get(matchValue);
+
+
+                    //TODO: REFACTOR
+
+					//group UMLS annotations
+					if(matchSource.equals("COSTAR")){ cf.setCnt_dict_costar(cf.getCnt_dict_costar() + 1); }
+
+					else if(matchSource.equals("HL7V2.5")){ cf.setCnt_dict_hl7v25(cf.getCnt_dict_hl7v25() + 1); }
+					else if(matchSource.equals("HL7V3.0")){ cf.setCnt_dict_hl7v30(cf.getCnt_dict_hl7v30() + 1); }
+					else if(matchSource.equals("ICD10CM")){ cf.setCnt_dict_icd10cm(cf.getCnt_dict_icd10cm() + 1); }
+					else if(matchSource.equals("ICD10PCS")){ cf.setCnt_dict_icd10pcs(cf.getCnt_dict_icd10pcs() + 1); }
+					
+					else if(matchSource.equals("ICD9CM")){ cf.setCnt_dict_icd9cm(cf.getCnt_dict_icd9cm() + 1); }
+					else if(matchSource.equals("LNC")){ cf.setCnt_dict_lnc(cf.getCnt_dict_lnc() + 1); }
+					else if(matchSource.equals("MSH")){ cf.setCnt_dict_msh(cf.getCnt_dict_msh() + 1); }
+					else if(matchSource.equals("RXNORM")){ cf.setCnt_dict_rxnorm(cf.getCnt_dict_rxnorm() + 1); }
+					else if(matchSource.equals("SNOMEDCT")){ cf.setCnt_dict_snomedct(cf.getCnt_dict_snomedct() + 1); }
+					
+					//group dict matches
+					else if(matchSource.equals("dict") && matchValue.equals("name")){ cf.setCnt_name(cf.getCnt_name() + 1); }
+					else if(matchSource.equals("regex") && matchValue.equals("hospital")){ cf.setCnt_hosp(cf.getCnt_hosp() + 1); }
+					else if(matchSource.equals("regex") && matchValue.equals("private")){ cf.setCnt_priv(cf.getCnt_priv() + 1); }
+					
+					//group regex matches
+					else if(matchSource.equals("regex") && regexPatList.contains(matchValue)){ cf.setCnt_regex_pat(cf.getCnt_regex_pat() + 1); }
+					else if(matchSource.equals("regex") && regexLocList.contains(matchValue)){ cf.setCnt_regex_loc(cf.getCnt_regex_loc() + 1); }
+					else if(matchSource.equals("regex") && regexAgeList.contains(matchValue)){ cf.setCnt_regex_age(cf.getCnt_regex_age() + 1); }
+					else if(matchSource.equals("regex") && regexDateList.contains(matchValue)){ cf.setCnt_regex_date(cf.getCnt_regex_date() + 1); }
+					else if(matchSource.equals("regex") && regexIDList.contains(matchValue)){ cf.setCnt_regex_id(cf.getCnt_regex_id() + 1); }
+					else if(matchSource.equals("regex") && regexDoctorList.contains(matchValue)){ cf.setCnt_regex_doc(cf.getCnt_regex_doc() + 1); }
+					else if(matchSource.equals("regex") && regexPhoneList.contains(matchValue)){ cf.setCnt_regex_phon(cf.getCnt_regex_phon() + 1); }
+					else if(matchSource.equals("regex") && regexHospitalList.contains(matchValue)){ cf.setCnt_regex_hosp(cf.getCnt_regex_hosp() + 1); }
+					
+					//set pos related features
+					else if(matchSource.equalsIgnoreCase("pos")) 
+					{ 
+						cf.setPos(matchValue); 
+						cf.setPos_bin((posBinMap.get(matchValue)==null) ? "unknown" : posBinMap.get(a.getMatch_value()));
+					} 
+					
+					//set capitalization feature.
+					else if(matchSource.equalsIgnoreCase("cap")) { cf.setHas_capital(Integer.parseInt(matchValue)); } 
+					
+					//get TF features
+					else if(matchSource.equalsIgnoreCase("tf_ham_with_pos")) { cf.setCnt_ham_w_pos(Float.parseFloat(matchValue)); }
+					else if(matchSource.equalsIgnoreCase("tf_ham_without_pos")) { cf.setCnt_ham_wo_pos(Float.parseFloat(matchValue)); }
+				}
+				
+				//add to batch insert list.
+				batchInsertList.add(cf);
+				
+				//save to feature table
+				//cfDao.insertCaseFeature(cf);
+				if (batchInsertList.size()==500)
+				{
+					cfDao.insertCaseFeatureBatch(batchInsertList);					
+					//reinitialize batchInsertList
+					batchInsertList.clear();
+				}
+			}
+			
+			//submit the final batch
+			cfDao.insertCaseFeatureBatch(batchInsertList);
+			//reinitialize batchInsertList
+			batchInsertList.clear();
+		}
+		catch(Exception e)
+		{
+			System.out.println("ERROR: issue executing thread for file: " + this.getFilename());
+			e.printStackTrace();
+		}
+	}
+	
+	private static Map<String, String> getFeaturesByFilenameIdx(int startIdx, List<Annot> fileAnnotList)
+	{
+		Map<String, String> results = new HashMap<String,String>();
+		
+		for (Annot a : fileAnnotList)
+		{
+			if(startIdx >= a.getStartIdx()
+					&& startIdx < a.getEndIdx())
+			{
+				results.put(a.getMatch_value(), a.getMatch_source());
+			}
+		}
+				
+		return results;
+	}
+
+    //TODO: refactor
+	private static List<String> getRegexPatientList()
+	{
+		List<String> list = new ArrayList<String>();
+		list.add("TITLES");
+		list.add("TITLE_THIRD");
+		list.add("PATIENT_NAME");
+		list.add("MR");
+		list.add("MS");
+		
+		return list;
+	}
+
+    //TODO: refactor
+	private static List<String> getRegexDoctorList()
+	{
+		List<String> list = new ArrayList<String>();
+		list.add("SURGEON");
+		list.add("SURGEON2");
+		list.add("SURGEON3");
+		list.add("SURGEON4");
+		list.add("SURGEON5");
+		list.add("SURGEON6");
+		list.add("SURGEON7");
+		list.add("SURGEON8");
+		list.add("ASST");
+		list.add("ASST2");
+		list.add("ASST3");
+		list.add("PA");
+		list.add("CC");
+		list.add("CC2");
+		list.add("FROM");
+		list.add("CONSULTATION");
+		list.add("TECH");
+		list.add("DOCTOR0_00");
+		list.add("DOCTOR0_0");
+		list.add("DOCTOR0_1");
+		list.add("DOCTOR0_2");
+		list.add("DOCTOR0");
+		list.add("DOCTOR1");
+		list.add("DOCTOR2");
+		list.add("DOCTOR3");
+		list.add("DOCTOR4");
+		list.add("DOCTOR_OLDER");
+		list.add("DOCTOR_GEN");
+		list.add("DOCTOR_SUBHEAD");
+		
+		return list;
+	}
+
+    //TODO: refactor
+	private static List<String> getRegexLocationList()
+	{
+		List<String> list = new ArrayList<String>();
+		list.add("ADDRESS");
+		list.add("ADDRESS2");
+		list.add("ADDRESS4");
+		list.add("ADDRESS5");
+		list.add("ADDRESS6");
+		list.add("ADDRESS7");
+		list.add("ADDRESS8");
+		list.add("POBOX");
+		list.add("ROOM");
+		list.add("ZIP_CODE");
+		list.add("LOCATION_FLOOR_1");
+		list.add("LOCATION_FLOOR_2");
+		return list;
+	}
+
+    //TODO: refactor
+	private static List<String> getRegexDateList()
+	{
+		List<String> list = new ArrayList<String>();
+		list.add("DATE");
+		list.add("DATE1");
+		list.add("DATE2");
+		list.add("DATE3");
+		list.add("DATE4");
+		list.add("DATE5");
+		list.add("DATE5_U");
+		list.add("DATE6");
+		list.add("DATE7");
+		list.add("DATE8");
+		list.add("DATE9");
+		list.add("DATE11");
+		list.add("DATE12");
+		list.add("DATE13");
+		list.add("DATE14");
+		list.add("DATE15");
+		list.add("DATE16");
+		list.add("DATE17");
+		list.add("DATE18");
+		list.add("DATE19");
+		list.add("DATE20");
+		list.add("DATE21");
+		list.add("DATE22");
+		list.add("DATE_SEPARATORS");
+		list.add("YEAR_CENTURY");
+		list.add("MONTH");
+		list.add("MONTH2");
+		list.add("DISCHARGED");
+		list.add("DISCHARGED2");
+		list.add("DATE_23");
+		
+		return list;
+	}
+
+    //TODO: refactor
+	private static List<String> getRegexIDList()
+	{
+		List<String> list = new ArrayList<String>();
+		list.add("SSN");
+		list.add("ACCESSION");
+		list.add("ACCESSION_KP");
+		list.add("ACCESSION_KP2");
+		list.add("RN_KPNW");
+		list.add("MRN_KPNW");
+		list.add("SUSPICIOUS_NUM");
+		list.add("SUSPICIOUS_NUM2");
+		list.add("ID_I2B2_SMOK");
+		list.add("ID_I2B2_SMOK_2");
+		list.add("ID_I2B2_SMOK_3");
+		
+		return list;
+	}
+
+    //TODO: refactor
+	private static List<String> getRegexAgeList()
+	{
+		List<String> list = new ArrayList<String>();
+		list.add("WRITTEN_AGE_10_100_DIV10");
+		list.add("WRITTEN_AGE_110_TO_119");
+		list.add("WRITTEN_AGE_20_TO_99");
+		list.add("WRITTEN_AGE_1_TO_19");
+		list.add("AGE");
+		list.add("AGED_OVER");
+		list.add("AGE4");
+		list.add("AGE5");
+		list.add("AGE6");
+		list.add("AGE7");
+		list.add("AGE8");
+		
+		return list;
+	}
+
+    //TODO: refactor
+	private static List<String> getRegexHospitalList()
+	{
+		List<String> list = new ArrayList<String>();
+		list.add("HOSPITAL_1");
+		list.add("HOSPITAL_2");
+		
+		return list;
+	}
+
+    //TODO: refactor
+	private static List<String> getRegexPhoneList()
+	{
+		List<String> list = new ArrayList<String>();
+		list.add("EMAIL_ADDRESS");
+		list.add("TELEPHONE2");
+		list.add("TELEPHONE3");
+		list.add("TELEPHONE0");
+		list.add("TELEPHONE1");
+		list.add("EXTENSION");
+		list.add("IP");
+		
+		return list;
+	}
+	
+
+    //TODO: refactor
+	private static Map<String,String> getPosBinMap()
+	{
+		Map<String,String> posBinMap = new HashMap<String,String>();
+		posBinMap.put("CC", "com-dep-wd");
+		posBinMap.put("CT", "com-dep-wd");
+		posBinMap.put("DT", "com-dep-wd");
+		posBinMap.put("EX", "com-dep-wd");
+		posBinMap.put("IN", "com-dep-wd");
+		posBinMap.put("MD", "com-dep-wd");
+		posBinMap.put("PDT", "com-dep-wd");
+		posBinMap.put("RP", "com-dep-wd");
+		posBinMap.put("TO", "com-dep-wd");
+		posBinMap.put("UH", "com-dep-wd");
+		posBinMap.put("WDT", "com-dep-wd");
+		posBinMap.put("FW", "FW-Symb");
+		posBinMap.put("SYM", "FW-Symb");
+		posBinMap.put("JJ", "Adjectives");
+		posBinMap.put("JJR", "Adjectives");
+		posBinMap.put("JJS", "Adjectives");
+		posBinMap.put("NN", "Nouns");
+		posBinMap.put("NNS", "Nouns");
+		posBinMap.put("NNP", "Nouns");
+		posBinMap.put("NNPS", "Nouns");
+		posBinMap.put("WRB", "Adverbs");
+		posBinMap.put("RB", "Adverbs");
+		posBinMap.put("RBR", "Adverbs");
+		posBinMap.put("RBS", "Adverbs");
+		posBinMap.put("VB", "Verbs");
+		posBinMap.put("VBD", "Verbs");
+		posBinMap.put("VBG", "Verbs");
+		posBinMap.put("VBN", "Verbs");
+		posBinMap.put("VBP", "Verbs");
+		posBinMap.put("VBZ", "Verbs");
+		posBinMap.put("PRP", "Pronouns");
+		posBinMap.put("PRP$", "Pronouns");
+		posBinMap.put("WP", "Pronouns");
+		posBinMap.put("WP$", "Pronouns");
+		posBinMap.put("CD", "Numbers");
+		posBinMap.put("LS", "Numbers");
+		posBinMap.put(".", "period");
+		posBinMap.put(",", "comma");
+		posBinMap.put("'", "apos");
+		posBinMap.put("\\N", "none");
+		
+		return posBinMap;
+	}
+
+	public String getFilename()
+	{
+		return filename;
+	}
+
+	public void setFilename(String filename)
+	{
+		this.filename = filename;
+	}
+
+	public List<Annot> getFileAnnotList()
+	{
+		return fileAnnotList;
+	}
+
+	public void setFileAnnotList(List<Annot> fileAnnotList)
+	{
+		this.fileAnnotList = fileAnnotList;
+	}
+
+	public int getMaxActiveThreads()
+	{
+		return maxActiveThreads;
+	}
+
+	public void setMaxActiveThreads(int maxActiveThreads)
+	{
+		this.maxActiveThreads = maxActiveThreads;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/FeatureSetGenerator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractor.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractor.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractor.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,43 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+/**
+ * @author Andrew McMurry, MS
+ *         <p/>
+ *         With primary support from Children's Hospital Informatics Program @
+ *         Harvard-MIT Health Sciences and Technology and
+ *         <p/>
+ *         Secondary support from the Harvard Medical School
+ *         Center for BioMedical Informatics
+ *         <p/>
+ *         PHD candidate, Boston University Bioinformatics
+ *         Member, I2b2 National Center for Biomedical Computing
+ *         <p/>
+ *         All works licensed under LGPL
+ *         <p/>
+ *         User: andy
+ *         Date: 6/10/12
+ *         Time: 6:07 PM
+ */
+public interface HumanAnnotationsExtractor
+{
+    /**
+     * Parses the human annotations. Takes no arguments and returns nothing; where the
+     * annotations come from and where the result goes is up to the implementation.
+     * NOTE(review): the contract is not specified here - see
+     * TODO: https://open.med.harvard.edu/jira/browse/SCRUBBER-86
+     */
+    void parseHumanAnnotations();
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain