ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From brittfi...@apache.org
Subject svn commit: r1500511 [3/6] - in /ctakes/sandbox/ctakes-scrubber-deid/src: ./ main/ main/java/ main/java/org/ main/java/org/apache/ main/java/org/apache/uima/ main/java/org/apache/uima/examples/ main/java/org/spin/ main/java/org/spin/scrubber/ main/java...
Date Sun, 07 Jul 2013 19:23:07 GMT
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,55 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+import com.thoughtworks.xstream.annotations.XStreamAsAttribute;
+
+@XStreamAlias("classMention")
+public class ClassMention
+{
+	@XStreamAlias("id")
+	@XStreamAsAttribute
+	private String id;
+	
+	private MentionClass mentionClass;
+
+	public ClassMention(String id, MentionClass mentionClass)
+	{
+		this.setId(id);
+		this.setMentionClass(mentionClass);
+	}
+	public String getId()
+	{
+		return id;
+	}
+	public void setId(String id)
+	{
+		this.id = id;
+	}
+	public MentionClass getMentionClass()
+	{
+		return mentionClass;
+	}
+	public void setMentionClass(MentionClass mentionClass)
+	{
+		this.mentionClass = mentionClass;
+	}
+		
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,44 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+import com.thoughtworks.xstream.annotations.XStreamAsAttribute;
+
+@XStreamAlias("mention")
+public class Mention
+{
+	@XStreamAlias("id")
+	@XStreamAsAttribute
+	private String id;
+
+	public Mention(String id)
+	{
+		this.setId(id);
+	}
+	public String getId()
+	{
+		return id;
+	}
+
+	public void setId(String id)
+	{
+		this.id = id;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,61 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+import com.thoughtworks.xstream.annotations.XStreamAsAttribute;
+import com.thoughtworks.xstream.annotations.XStreamConverter;
+import com.thoughtworks.xstream.converters.extended.ToAttributedValueConverter;
+
+@XStreamAlias("mentionClass")
+@XStreamConverter(value=ToAttributedValueConverter.class, strings={"name"})
+public class MentionClass
+{
+	@XStreamAlias("id")
+	@XStreamAsAttribute
+	private String id;
+	
+	private String name;
+	
+	public MentionClass(String id, String name)
+	{
+		this.setId(id);
+		this.setName(name);
+	}
+
+	public String getId()
+	{
+		return id;
+	}
+
+	public void setId(String id)
+	{
+		this.id = id;
+	}
+
+	public String getName()
+	{
+		return name;
+	}
+
+	public void setName(String name)
+	{
+		this.name = name;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,60 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+import com.thoughtworks.xstream.annotations.XStreamAsAttribute;
+
+@XStreamAlias("span")
+public class Span
+{
+	@XStreamAlias("start")
+	@XStreamAsAttribute
+	private String start;
+	
+	@XStreamAlias("end")
+	@XStreamAsAttribute
+	private String end;
+	
+	public Span(String start, String end)
+	{
+		this.setStart(start);
+		this.setEnd(end);
+	}
+
+	public String getStart()
+	{
+		return start;
+	}
+
+	public void setStart(String start)
+	{
+		this.start = start;
+	}
+
+	public String getEnd()
+	{
+		return end;
+	}
+
+	public void setEnd(String end)
+	{
+		this.end = end;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,171 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.protege.ontology;
+
+import com.thoughtworks.xstream.XStream;
+import org.apache.log4j.Logger;
+import org.spin.scrubber.beans.CaseFeature;
+import org.spin.scrubber.protege.beans.*;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.util.List;
+
+/**
+ * @author britt fitch BF19
+ *
+ *	This class is used to help bootstrap a local annotation effort.
+ *	This class can be used to read classifications from "_test" tables that were generated using the out of the box training model.
+ *	These classifications are then converted to protege xml format and can be loaded into the protoge ontology tool.
+ *	The loaded annotations can then be corrected/removed/added and re-exported for the purpose of generating new annotated training data. 
+ */
+public class ProtegeOntologyGenerator implements Runnable
+{
+    private static Logger log    =  Logger.getLogger(ProtegeOntologyGenerator.class);
+    
+    private String tableSuffix = "_test"; //ProtegeOntologyGenerator always uses "_test" tables.
+	private String outDir;
+
+	public ProtegeOntologyGenerator(String outDir)
+	{
+		this.setOutDir(outDir);
+	}
+	
+	public static void main(String[] args)
+	{
+		if (args.length != 1)
+		{
+			System.out.println("USAGE: ProtegeOntologyGenerator <outDir>");
+		}
+        else
+        {
+            ProtegeOntologyGenerator pog = new ProtegeOntologyGenerator(args[0]);
+            pog.run();
+        }
+	}
+
+    /**
+     * Read from feature_matrix
+     * All tokens that were not classified as N/A
+     * Take all those PHI and create XML representation in Protege format
+     */
+	public void run()
+	{
+        log.info("Running Protege Ontology Generator...");
+
+		try
+		{
+			XStream xstream = new XStream();
+			xstream.autodetectAnnotations(true);
+			
+			//get list of distinct filenames from feature_matrix
+			FeatureMatrixDAO dao = new FeatureMatrixDAO(tableSuffix);
+			List<String> fileList = dao.selectDistinctFilenames();
+				
+			//init id for annotations
+			String baseMentionId = "nci_all_init_Instance_";
+			int incrementingMentionId = 60000;
+			
+			for (String filename : fileList)
+			{
+				//get features for the current filename
+				Annotations annots = new Annotations();
+				List<CaseFeature> cfList = dao.selectPHICaseFeaturesByFilename(filename);
+				
+				//create Annotation object for each feature for the current filename.
+				for(CaseFeature c : cfList)
+				{
+					String mentionId = baseMentionId + incrementingMentionId++;
+					
+					annots.setTextSource(c.getFilename_short());
+					
+					Annotation annot = new Annotation();
+					annot.setMention(new Mention(mentionId));
+					annot.setAnnotator(new Annotator("nci_all_init_Instance_4", "britt fitch, hms"));
+					annot.setSpan(new Span(Integer.toString(c.getStartIdx()), Integer.toString(c.getEndIdx())));
+					annot.setSpannedText(c.getToken());
+					
+					ClassMention cm = new ClassMention(mentionId, new MentionClass(c.getClassified_as().toLowerCase(), c.getClassified_as().toLowerCase()));
+					
+					annots.getAnnotList().add(annot);
+					annots.getCmList().add(cm);
+				}
+			
+				//get xml representation of pojos
+				String xml = xstream.toXML(annots);
+				
+				//write xml annotation import file to outDir
+				File file = new File(this.getOutDir() + File.separatorChar + filename + ".xml");
+				file.createNewFile();
+				FileWriter writer = new FileWriter(file);
+				writer.write(xml);
+				writer.flush();
+				writer.close();
+			}
+		}
+		catch(Exception e)
+		{
+            log.error("failed to generate protege xml import files.", e);
+		}
+	}
+	
+
+	public String getOutDir()
+	{
+		return outDir;
+	}
+
+	public void setOutDir(String outDir)
+	{
+		this.outDir = outDir;
+	}
+
+    /**
+     * method generates a protege xml annotation import file based on the example xml input supplied with the scrubber project.
+     */
+    @Deprecated
+    public void runTest()
+    {
+        XStream xstream = new XStream();
+        xstream.autodetectAnnotations(true);
+
+        Annotations annots = new Annotations();
+        annots.setTextSource("testcase.xml");
+
+        String mentionId = "nci_all_init_Instance_80000";
+
+        Annotation annot = new Annotation();
+        annot.setMention(new Mention(mentionId));
+        annot.setAnnotator(new Annotator("nci_all_init_Instance_4", "britt fitch, hms"));
+        annot.setSpan(new Span("170", "173"));
+        annot.setSpannedText("Doe");
+
+        ClassMention cm = new ClassMention(mentionId, new MentionClass("contact", "contact"));
+
+        annots.getAnnotList().add(annot);
+        annots.getCmList().add(cm);
+
+        String xml = xstream.toXML(annots);
+        System.out.println(xml);
+    }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,204 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.publications;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.uima.dao.PubDAO;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author britt fitch (bf19)
+ * @link http://www.gnu.org/licenses/lgpl.html
+ *
+ *	to run: 
+ * 		java -classpath .:lucy.jar:uber-lucy-0.0.1-SNAPSHOT.jar edu.harvard.cbmi.lucy.PubExtractor outputDir
+ */
+public class PublicationsExtractorJDBC implements Runnable
+{
+    private static Logger log    =  Logger.getLogger(PublicationsExtractorJDBC.class);
+
+	public String dirInputPublicationsTXT;
+
+    public PublicationsExtractorJDBC()
+    {
+        this(ScrubberProperties.getDirInputPublicationsTXT());
+    }
+	
+	public PublicationsExtractorJDBC(String dirInputPublicationsTXT)
+	{
+		this.dirInputPublicationsTXT = dirInputPublicationsTXT;
+	}
+	
+	public static void main(String[] args) throws Exception
+	{
+		PublicationsExtractorJDBC runner = new PublicationsExtractorJDBC();
+		runner.run();
+	}
+
+	public void run()
+	{
+		extractAllFilterSet("all");
+	}
+	
+	public void extractAllFilterSet(String subDirName)
+	{
+		try 
+		{
+			File outDir = new File(dirInputPublicationsTXT + File.separator + subDirName);
+			
+            if(!outDir.exists()) outDir.mkdir();
+
+			//purge dir, if has contents. 
+			for (File f : outDir.listFiles())
+			{
+				f.delete();
+			}
+			
+			//get pubs
+			PubDAO dao = new PubDAO();
+			FileOutputStream output = null;
+			List<Integer> pubIdList = dao.selectAllPubIds();
+			for (Integer pubId : pubIdList)
+			{
+				Map<Integer,String> pubMap = dao.selectAllPubContentById(pubId);
+				
+				//write output files
+				for (Integer id : pubMap.keySet())
+				{
+					log.info("writing file for pubId = " + subDirName + "/" + id);
+					String body = pubMap.get(id).trim();
+					
+					//ignore files we know have 'no content'
+					if (body.length()<1 
+							|| body.equals("(To access the full article, please see PDF)")
+							|| body.equals("\"To access the full article, please see PDF\"")
+							|| body.startsWith("None declared."))
+					{
+						continue;
+					}
+					
+					File outFile = new File(outDir.getAbsoluteFile() + File.separator + id + ".txt");
+					output = new FileOutputStream(outFile);
+					output.write(body.getBytes());
+					output.close();
+			    }
+			}			
+		}
+		catch(Exception e)
+		{
+			log.error("extracting "+subDirName+" filter set ", e);
+		}
+	}
+
+    //TODO: Deprecated
+
+    @Deprecated
+	public void extractRandomFilterSet(String subDirName)
+	{
+		try 
+		{
+			PubDAO dao = new PubDAO();
+			FileOutputStream output = null;
+			Map<Integer,String> pubMap = dao.selectRandomPubContent();
+			
+			File outDir = new File(dirInputPublicationsTXT + File.separator + subDirName);
+			
+			//create subdir, if not already existing
+			outDir.mkdir();
+			
+			//purge dir, if has contents. 
+			for (File f : outDir.listFiles())
+			{
+				f.delete();
+			}
+						
+			//write output files
+			for (Integer id : pubMap.keySet())
+			{
+				System.out.println("INFO: writing file for pubId = " + subDirName + "/" + id);
+				String body = pubMap.get(id).trim();
+				
+				//ignore files we know have 'no content'
+				if (body.length()<1 
+						|| body.equals("(To access the full article, please see PDF)")
+						|| body.equals("\"To access the full article, please see PDF\""))
+				{
+					continue;
+				}
+				
+				File outFile = new File(outDir.getAbsoluteFile() + File.separator + id + ".txt");
+				output = new FileOutputStream(outFile);
+				output.write(body.getBytes());
+				output.close();
+		    }			
+		}
+		catch(Exception e)
+		{
+			System.out.println("ERROR: extracting "+subDirName+" filter set: " + e.getMessage());
+			e.printStackTrace();
+		}
+	}
+
+    @Deprecated
+	public void extractFilterSet(String[] keywordFragList, String subDirName)
+	{
+		try 
+		{
+			PubDAO dao = new PubDAO();
+			FileOutputStream output = null;
+			Map<Integer,String> pubMap = dao.selectPubContentByLikeKeywords(keywordFragList);
+			
+			//purge dir, if has contents. 
+			File outDir = new File(dirInputPublicationsTXT + File.separator + subDirName);
+			for (File f : outDir.listFiles())
+			{
+				f.delete();
+			}
+			
+			//create subdir, if not already existing
+			outDir.mkdir();
+			
+			//write output files
+			for (Integer id : pubMap.keySet())
+			{
+				System.out.println("INFO: writing file for pubId = " + subDirName + "/" + id);
+				String body = pubMap.get(id).trim();
+								
+				File outFile = new File(outDir.getAbsoluteFile() + File.separator + id + ".txt");
+				output = new FileOutputStream(outFile);
+				output.write(body.getBytes());
+				output.close();
+		    }			
+		}
+		catch(Exception e)
+		{
+			System.out.println("ERROR: extracting "+subDirName+" filter set: " + e.getMessage());
+			e.printStackTrace();
+		}
+	}
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,344 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.publications;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.uima.dao.PubDAO;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.EntityResolver;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+import java.io.File;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * @author bf19
+ *
+ * accepts 1 command line param which is a dir containing open access publications
+ * @link http://www.ncbi.nlm.nih.gov/pmc/tools/ftp/
+ */
+public class PublicationsParserOpenAccessXML implements Runnable
+{
+    private static Logger log    =  Logger.getLogger(PublicationsParserOpenAccessXML.class);
+
+	private String dirInputPublicationsXML;
+
+	public PublicationsParserOpenAccessXML()
+    {
+        this(ScrubberProperties.getDirInputPublicationsXML());
+    }
+
+	public PublicationsParserOpenAccessXML(String dirInputPublicationsXML)
+	{
+		this.dirInputPublicationsXML = dirInputPublicationsXML;
+	}
+	
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args)
+	{
+		PublicationsParserOpenAccessXML parser = new PublicationsParserOpenAccessXML();
+		parser.run();
+	}
+
+	public void run()
+	{
+		try
+		{
+			//extract fields
+			File inDir = new File(dirInputPublicationsXML);
+			List<File> files = getFileList(new ArrayList<File>(), inDir);
+			PubDAO dao = new PubDAO();
+			
+			for (File f : files)
+			{
+				log.debug("Parsing : " + f.getName());
+				
+				//read infile
+				DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+				DocumentBuilder builder = factory.newDocumentBuilder();
+				builder.setEntityResolver(new EntityResolver() {
+		            public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
+		                if (systemId.contains("dtd")) {
+		                    return new InputSource(new StringReader(""));
+		                } else {
+		                    return null;
+		                }
+		            }
+		        });
+				Document doc = builder.parse(f);
+				XPathFactory xPathfactory = XPathFactory.newInstance();
+				XPath xpath = xPathfactory.newXPath();
+				
+				XPathExpression journalTitleXP = xpath.compile("//journal-title");
+				XPathExpression articleTitleXP = xpath.compile("//article-title");
+				XPathExpression articleIdPMCXP = xpath.compile("//article-id[@pub-id-type='pmc']");
+				XPathExpression articleIdPMIDXP = xpath.compile("//article-id[@pub-id-type='pmid']");
+				XPathExpression articleIdDOIXP = xpath.compile("//article-id[@pub-id-type='doi']");
+				XPathExpression keywordXP = xpath.compile("//kwd");
+				XPathExpression bodyXP = xpath.compile("//body//p");
+				XPathExpression authorsXP = xpath.compile("//contrib[@contrib-type='author']");
+				XPathExpression refsXP = xpath.compile("//ref-list//name");
+	
+				//read all matching nodes
+				NodeList jtitleList = (NodeList) journalTitleXP.evaluate(doc, XPathConstants.NODESET);
+				NodeList atitleList = (NodeList) articleTitleXP.evaluate(doc, XPathConstants.NODESET);
+				NodeList pmcList = (NodeList) articleIdPMCXP.evaluate(doc, XPathConstants.NODESET);
+				NodeList pmidList = (NodeList) articleIdPMIDXP.evaluate(doc, XPathConstants.NODESET);
+				NodeList doiList = (NodeList) articleIdDOIXP.evaluate(doc, XPathConstants.NODESET);
+				NodeList keywordList = (NodeList) keywordXP.evaluate(doc, XPathConstants.NODESET);
+				NodeList bodyList = (NodeList) bodyXP.evaluate(doc, XPathConstants.NODESET);
+				NodeList authorList = (NodeList) authorsXP.evaluate(doc, XPathConstants.NODESET);
+				NodeList refList = (NodeList) refsXP.evaluate(doc, XPathConstants.NODESET);
+	
+				//populate pojo
+			    Pub pub = new Pub();
+			    pub.setJtitle(jtitleList);
+			    pub.setAtitle(atitleList);
+			    pub.setPmc(pmcList);
+			    pub.setPmid(pmidList);
+			    pub.setDoi(doiList);
+			    pub.setKeywords(keywordList);
+			    pub.setAuthors(authorList);
+			    pub.setRefs(refList);
+			    pub.setBody(bodyList);
+			    
+			    //insert records
+			    int id = dao.insertPub(pub.getJtitle(), pub.getAtitle(), pub.getPmc(), pub.getPmid(), pub.getDoi(), pub.getBody(), f.getAbsolutePath());
+			    if (id > 0)
+			    {
+				    for (Person p : pub.getAuthors())
+				    {
+				    	dao.insertAuthor(id, p.getSurname(), p.getGivenName());
+				    }
+				    for (Person p : pub.getRefs())
+				    {
+				    	dao.insertRef(id, p.getSurname(), p.getGivenName());
+				    }
+				    for (String s : pub.getKeywords())
+				    {
+				    	dao.insertKeyword(id, s);
+				    }
+			    }
+			}
+		}
+		catch (Exception e)
+		{
+			log.error("unknown error parsing xml: ", e);
+		}
+	}
+
+	/**
+	 * recursively get files from directory structure.
+	 */
+	public List<File> getFileList(List<File> files, File file)
+	{
+		if (!file.isDirectory())
+		{
+			files.add(file);
+		}
+		else if(file.isDirectory())
+		{
+			for(File f : file.listFiles())
+			{
+				getFileList(files, f);
+			}
+		}
+		
+		return files;
+	}
+	
+	class Pub
+	{
+		String jtitle;
+		String atitle;
+		String pmc;
+		String pmid;
+		String doi;
+		String body = "";
+		List<String> keywords = new ArrayList<String>();
+		List<Person> authors = new ArrayList<Person>(); 
+		List<Person> refs = new ArrayList<Person>();
+		public String getJtitle()
+		{
+			return jtitle;
+		}
+		public void setJtitle(NodeList list)
+		{
+			for (int k=0; k<list.getLength(); k++)
+			{
+				Node item = list.item(k);
+				jtitle = item.getTextContent().trim();
+			}
+		}
+		public String getAtitle()
+		{
+			return atitle;
+		}
+		public void setAtitle(NodeList list)
+		{
+			for (int k=0; k<list.getLength(); k++)
+			{
+				Node item = list.item(k);
+				atitle = item.getTextContent().trim();
+			}
+		}
+		public String getPmc()
+		{
+			return pmc;
+		}
+		public void setPmc(NodeList list)
+		{
+			for (int k=0; k<list.getLength(); k++)
+			{
+				Node item = list.item(k);
+				pmc = item.getTextContent().trim();
+			}
+		}
+		public String getPmid()
+		{
+			return pmid;
+		}
+		public void setPmid(NodeList list)
+		{
+			for (int k=0; k<list.getLength(); k++)
+			{
+				Node item = list.item(k);
+				pmid = item.getTextContent().trim();
+			}
+		}
+		public String getDoi()
+		{
+			return doi;
+		}
+		public void setDoi(NodeList list)
+		{
+			for (int k=0; k<list.getLength(); k++)
+			{
+				Node item = list.item(k);
+				doi = item.getTextContent().trim();
+			}
+		}
+		public String getBody()
+		{
+			return body;
+		}
+		public void setBody(NodeList list)
+		{
+			for (int k=0; k<list.getLength(); k++)
+			{
+				Node item = list.item(k);
+				body += item.getTextContent().trim() + " ";				
+			}
+		}
+		public List<String> getKeywords()
+		{
+			return keywords;
+		}
+		public void setKeywords(NodeList list)
+		{
+			for (int k=0; k<list.getLength(); k++)
+			{
+				Node item = list.item(k);
+				keywords.add(item.getTextContent().trim());
+			}
+		}
+		public List<Person> getAuthors()
+		{
+			return authors;
+		}
+		public void setAuthors(NodeList list)
+		{
+			for (int k=0; k<list.getLength(); k++)
+			{
+				Node item = list.item(k);
+				if (item.getNodeType() == Node.ELEMENT_NODE) 
+				{
+					Element elem = (Element)item;
+					String lname = (elem.getElementsByTagName("surname").getLength() > 0) ? elem.getElementsByTagName("surname").item(0).getTextContent().trim() : null;
+					String fname = (elem.getElementsByTagName("given-names").getLength() > 0) ? elem.getElementsByTagName("given-names").item(0).getTextContent().trim() : null;
+					authors.add(new Person(lname, fname));
+				}				
+			}
+		}
+		public List<Person> getRefs()
+		{
+			return refs;
+		}
+		public void setRefs(NodeList list)
+		{
+			for (int k=0; k<list.getLength(); k++)
+			{
+				Node item = list.item(k);
+				if (item.getNodeType() == Node.ELEMENT_NODE) 
+				{
+					Element elem = (Element)item;
+					String lname = (elem.getElementsByTagName("surname").getLength() > 0) ? elem.getElementsByTagName("surname").item(0).getTextContent().trim() : null;
+					String fname = (elem.getElementsByTagName("given-names").getLength() > 0) ? elem.getElementsByTagName("given-names").item(0).getTextContent().trim() : null;
+					refs.add(new Person(lname, fname));
+				}				
+			}
+		}
+	}
+	class Person
+	{
+		String surname;
+		String givenName;
+		public Person(String surname, String givenName)
+		{
+			this.surname = surname;
+			this.givenName = givenName;
+		}
+		public String getSurname()
+		{
+			return surname;
+		}
+		public void setSurname(String surname)
+		{
+			this.surname = surname;
+		}
+		public String getGivenName()
+		{
+			return givenName;
+		}
+		public void setGivenName(String givenName)
+		{
+			this.givenName = givenName;
+		}
+		
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,143 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.redactor;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.beans.CaseFeature;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+import org.w3c.dom.Document;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+import java.io.File;
+import java.io.FileWriter;
+import java.util.List;
+
+/**
+ * redactor used to replace tokens classified as PHI from i2b2 formatted xml 
+ * @author britt fitch BF19
+ *
+ */
+public class I2B2XMLRedactor implements Runnable
+{
+
+    private static Logger log    =  Logger.getLogger(I2B2XMLRedactor.class);
+
+	private String in;
+	private String out;
+	private String tableSuffix = "_test"; //always uses "_test" tables.
+	private FeatureMatrixDAO dao;
+	
+	public I2B2XMLRedactor(String in, String out) throws Exception
+	{
+		this.in = in;
+		this.out = out;
+
+		dao = new FeatureMatrixDAO(tableSuffix);
+	}
+	
+	/**
+	 * @param args
+	 * @throws Exception 
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		if (args.length!=2)
+		{
+			System.out.println("USAGE:\t\t Redactor inDir outDir");
+		}
+
+		I2B2XMLRedactor runner = new I2B2XMLRedactor(args[0], args[1]);
+		runner.run();
+	}
+
+	public void run()
+	{
+		try
+		{
+			File inDir = new File(in);
+			
+			if (!inDir.exists())
+			{
+				inDir.createNewFile();
+			}
+			
+			File[] files = inDir.listFiles();
+			
+			for (File f : files)
+			{
+				if (f.isDirectory())
+				{
+					continue;
+				}
+				
+				System.out.println("Redactor for: " + f.getName());
+				
+				//read infile
+				DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+				DocumentBuilder builder = factory.newDocumentBuilder();
+				Document doc = builder.parse(f);
+				XPathFactory xPathfactory = XPathFactory.newInstance();
+				XPath xpath = xPathfactory.newXPath();
+				XPathExpression expr = xpath.compile("//TEXT");
+	
+				//read all matching nodes
+			    NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
+	
+			    FileWriter writer = null; 
+			    
+			    //for each node in a file, write out to a flat txt file of the same name.
+			    for (int i=0; i<nodes.getLength(); i++)
+			    {
+			    	Node n = nodes.item(i);
+			    	NamedNodeMap attributes = n.getAttributes();
+					String id = n.getParentNode().getAttributes().getNamedItem("ID").getNodeValue();//attributes.getNamedItem("ID").getNodeValue();
+					
+					//make outfile
+					String fname = id+".txt";
+					writer = new FileWriter(new File(out + File.separatorChar + fname));
+			    	String txt = n.getTextContent();
+					
+			    	//get PHI to redact
+			    	List<CaseFeature> phiList = dao.selectClassifiedAsPHITest(fname);
+			    	
+			    	for (CaseFeature cf : phiList)
+			    	{
+			    		txt = txt.substring(0, cf.getStartIdx()) + "xxx"+cf.getClassified_as()+"xxx" + txt.substring(cf.getEndIdx());			    		
+			    	}
+			    	
+					writer.write( txt + "\n");
+			    	writer.flush();
+				    writer.close();
+			    }
+			}
+		}
+		catch (Exception e)
+		{
+			log.error("Unknown error redacting XXXX from the i2b2 XML text.", e);
+		}
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,133 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.redactor;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.beans.CaseFeature;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+
+import java.io.*;
+import java.util.List;
+
+/**
+ * redactor used to replace tokens classified as PHI from any kind of file. 
+ * @author britt fitch BF19
+ *
+ */
+//TODO: refactor
+public class Redactor implements Runnable
+{
+    private static Logger log    =  Logger.getLogger(Redactor.class);
+
+	private String dirInput;
+	private String dirOutput;
+	private String tableSuffix = "_test"; //always uses "_test" tables.
+	
+	private FeatureMatrixDAO dao;
+
+    public Redactor() throws Exception
+    {
+        this(ScrubberProperties.getDirInputTest(), ScrubberProperties.getDirOuputTest());
+    }
+
+	public Redactor(String dirInput, String dirOutput) throws Exception
+	{
+		this.dirInput  = dirInput;
+		this.dirOutput = dirOutput;
+
+		dao = new FeatureMatrixDAO(tableSuffix);
+	}
+	
+	/**
+	 * @param args
+	 * @throws Exception 
+	 */
+	public static void main(String[] args) throws Exception
+	{
+        Redactor runner = new Redactor();
+        runner.run();
+	}
+
+	public void run()
+	{
+		try
+		{
+			File inDir = new File(dirInput);
+			
+			if (!inDir.exists())
+			{
+				inDir.createNewFile();
+			}
+			
+			File[] files = inDir.listFiles();
+			
+			for (File f : files)
+			{
+				if (f.isDirectory())
+				{
+					continue;
+				}
+				
+				log.debug("Redacting : " + f.getName());
+				
+				//read infile
+				String str = reader2String(new InputStreamReader(new FileInputStream(f)));
+				
+			    //make outfile
+				FileWriter writer = new FileWriter(new File(dirOutput + File.separatorChar + f.getName()));
+				
+		    	//get PHI to redact
+		    	List<CaseFeature> phiList = dao.selectClassifiedAsPHI(f.getName());
+		    	
+		    	for (CaseFeature cf : phiList)
+		    	{
+		    		str = str.substring(0, cf.getStartIdx()) + "xxx" +str.substring(cf.getEndIdx());
+		    	}
+		    	
+				writer.write( str + "\n");
+		    	writer.flush();
+			    writer.close();
+			}
+		}
+		catch (Exception e)
+		{
+			log.error("Unknown error during redaction",e);
+		}
+	}
+	
+	private static String reader2String(Reader reader) throws IOException
+    {
+	    StringBuffer strBuffer = new StringBuffer();
+	    char[] buf = new char[10000];
+	    int charsRead;
+	    try 
+	    {
+	    	while ((charsRead = reader.read(buf)) >= 0) 
+	    	{
+	    		strBuffer.append(buf, 0, charsRead);
+	    	}
+	    } 
+	    finally 
+	    {
+	      reader.close();
+	    }
+	    return strBuffer.toString();
+	  }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,211 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.templates;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+
+import java.io.*;
+import java.util.HashMap;
+
+/**
+* Replaces configuration tokens in TEMPLATE.* files with values taken from
+* scrubber.properties and writes the result next to the template.
+*
+* @author Andrew McMurry, MS
+*         <p/>
+*         With primary support from Children's Hospital Informatics Program @
+*         Harvard-MIT Health Sciences and Technology and
+*         <p/>
+*         Secondary support from the Harvard Medical School
+*         Center for BioMedical Informatics
+*         <p/>
+*         PHD candidate, Boston University Bioinformatics
+*         Member, I2b2 National Center for Biomedical Computing
+*         <p/>
+*         All works licensed under LGPL
+*         <p/>
+*         User: andy
+*         Date: 6/20/12
+*         Time: 12:18 AM
+*/
+public class TemplateFileProcessor
+{
+    private static Logger log    =  Logger.getLogger(TemplateFileProcessor.class);
+
+    public static String TEMPLATE_FILE_PREFIX = "TEMPLATE.";
+
+    /** Prints command line usage to stdout. */
+    public static void showUsage()
+    {
+        System.out.println("This will process a TEMPLATE.file using information from scrubber.properties");
+        System.out.println("[Usage]");
+        System.out.println();
+        System.out.println("[Default = all known templates]");
+        System.out.println("java TemplateFileProcessor");
+        System.out.println();
+        System.out.println("[Specified template]");
+        System.out.println("java TemplateFileProcessor TEMPLATE.file");
+        System.out.println();
+    }
+
+    /**
+     * @param args empty for usage, "ALL" for every known template, or the
+     *             path of a single TEMPLATE.* file
+     */
+    public static void main(String[] args)
+    {
+        try
+        {
+            if(args.length==0)
+            {
+                showUsage();
+            }
+            else
+            {
+                if("ALL".equalsIgnoreCase(args[0]))
+                {
+                    System.out.println("Processing ALL known templates.");
+                    TemplateFileProcessor.processTemplatesAllKnown();
+                }
+                else
+                {
+                    File templateFor = new File(args[0]);
+
+                    System.out.println("Processing template for "+ templateFor.getAbsolutePath());
+                    TemplateFileProcessor.processTemplate(templateFor);
+                }
+            }
+        }
+        catch(Exception e)
+        {
+            System.out.println("Could not process template: "+ e.getMessage());
+        }
+    }
+
+    /**
+     * Processes the UIMA reader templates configured in scrubber.properties.
+     * @throws IOException when a template cannot be read or written
+     */
+    public static void processTemplatesAllKnown() throws IOException
+    {
+        log.info("Processing UIMA reader templates.");
+
+        processTemplate("desc/reader", ScrubberProperties.getUimaReaderFileTrain());
+        processTemplate("desc/reader", ScrubberProperties.getUimaReaderFileTest());
+        processTemplate("desc/reader", ScrubberProperties.getUimaReaderFilePublications());
+    }
+
+    /**
+     * Processes directory/TEMPLATE.filename using the property token map.
+     * @throws IOException when the template cannot be read or written
+     */
+    public static void processTemplate(String directory, String filename) throws IOException
+    {
+        String SLASH   = ScrubberProperties.SLASH;
+
+        processTemplate(new File(directory + SLASH + TEMPLATE_FILE_PREFIX + filename));
+    }
+
+    /**
+     * Processes a template into its default output file (template path with
+     * the TEMPLATE. prefix removed).
+     * @throws IOException when the template cannot be read or written
+     */
+    public static File processTemplate(File templateFileAbsolutePath) throws IOException
+    {
+        return processTemplate(templateFileAbsolutePath, getOutputFile(templateFileAbsolutePath));
+    }
+
+    /**
+     * Processes a template using the token map from scrubber.properties.
+     * @throws IOException when the template cannot be read or written
+     */
+    public static File processTemplate(File templateFileAbsolutePath, File outputFileAbsolutePath) throws IOException
+    {
+        return processTemplate(templateFileAbsolutePath, outputFileAbsolutePath, ScrubberProperties.asTokenMap());
+    }
+
+    /**
+     * Reads the template, substitutes every key with its value and writes the
+     * result to the output file.
+     * @param replacements token-to-value map; tokens are matched literally
+     * @return the output file that was written
+     * @throws IOException when the template cannot be read or written
+     */
+    public static File processTemplate(File templateFileAbsolutePath, File outputFileAbsolutePath, HashMap<String, String> replacements) throws IOException
+    {
+        String contents = read(templateFileAbsolutePath);
+
+        for(String key: replacements.keySet())
+        {
+            String value = replacements.get(key);
+
+            log.debug("Template is replacing: "+ key + " with "+ value);
+
+            //literal replacement: replaceAll() treated the token as a regex and
+            //the value's '$' and '\' as group references, which corrupts the
+            //output or throws for property values containing those characters
+            contents = contents.replace(key, value);
+        }
+
+        write(outputFileAbsolutePath, contents);
+
+        return outputFileAbsolutePath;
+    }
+
+    /**
+     * @return the template path with the literal "TEMPLATE." prefix removed
+     */
+    public static File getOutputFile(File templateFile)
+    {
+        //literal replace: replaceAll() interpreted the '.' in "TEMPLATE." as a
+        //regex wildcard, so e.g. "TEMPLATEX" would also have matched
+        return new File(templateFile.getAbsolutePath().replace(TEMPLATE_FILE_PREFIX, ""));
+    }
+
+     /**
+     * Convenience method, read all file contents
+     * (uses the platform default charset via FileReader - TODO confirm intended)
+     *
+     * @param filename
+     * @return file contents as string
+     * @throws IOException
+     */
+    public static String read(final File filename) throws IOException
+    {
+        log.debug("Reading template from " + filename.getAbsolutePath());
+
+        return read(new FileReader(filename));
+    }
+
+    /**
+     * Convenience method, read all contents; the reader is always closed.
+     *
+     * @param reader
+     * @return file contents as string
+     * @throws IOException
+     */
+    public static String read(final Reader reader) throws IOException
+    {
+    	try
+    	{
+    	    int len;
+
+            final StringBuilder contents = new StringBuilder();
+
+            final char[] buf = new char[1024];
+
+        	while((len = reader.read(buf)) > 0)
+        	{
+        	   contents.append(buf, 0, len);
+        	}
+
+        	return contents.toString();
+    	}
+    	finally
+    	{
+    	    reader.close();
+    	}
+    }
+
+    /**
+    * Convenience method, write file contents
+    *
+    * @param filename destination file, overwritten
+    * @param contents text to write
+    * @throws IOException
+    */
+    public static void write(final File filename, final String contents) throws IOException
+    {
+      log.debug("Writing contents to " + filename.getAbsolutePath());
+
+      //open outside try: if the stream cannot be created there is nothing to
+      //close (the old code dereferenced a null writer in finally in that case)
+      PrintWriter output = new PrintWriter(new FileOutputStream(filename), false);
+
+      try
+      {
+          output.write(contents);
+      }
+      finally
+      {
+          output.close();
+      }
+    }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,67 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.uima.annotator;
+
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
+
+/**
+ * Base class for scrubber annotators; provides null-safe helpers for reading
+ * array-valued UIMA configuration parameters.
+ *
+ * @author BF19
+ *
+ */
+public abstract class BaseAnnotator extends JCasAnnotator_ImplBase
+{
+	/**
+	   * Reads a String-array configuration parameter, falling back to the
+	   * supplied default when the parameter is missing or empty.
+	   * @param context UIMA context holding the configuration
+	   * @param param name of the configuration parameter
+	   * @param defaultValue returned when the parameter is null or empty
+	   * @return the String[] parameter value, or defaultValue
+	   */
+	  protected static String[] safeGetConfigParameterStringArrayValue(UimaContext context, String param, String[] defaultValue) 
+	  {
+	    String[] array = (String[]) context.getConfigParameterValue(param);
+	    if (array != null && array.length > 0) 
+	    {
+	      return array;
+	    }
+	    return defaultValue;
+	  }
+	  
+	  /**
+	   * Reads a Boolean-array configuration parameter, falling back to the
+	   * supplied default when the parameter is missing or empty.
+	   * @param context UIMA context holding the configuration
+	   * @param param name of the configuration parameter
+	   * @param defaultValue returned when the parameter is null or empty
+	   * @return the Boolean[] parameter value, or defaultValue
+	   */
+	  protected static Boolean[] safeGetConfigParameterBooleanArrayValue(UimaContext context, String param, Boolean[] defaultValue) 
+	  {
+	    Boolean[] array = (Boolean[]) context.getConfigParameterValue(param);
+	    if (array != null && array.length > 0) 
+	    {
+	      return array;
+	    }
+	    return defaultValue;
+	  }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,262 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.uima.annotator;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.spin.scrubber.uima.dao.BaseDAO;
+import org.spin.scrubber.uima.type.OntologyMatch;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * Annotator that looks WordToken and Chunk covered text up in database-backed
+ * dictionary tables and adds an OntologyMatch for every hit.
+ */
+public class DictionaryAnnotator extends BaseAnnotator
+{
+    private static Logger  log    =  Logger.getLogger(DictionaryAnnotator.class);
+    private static boolean DEBUG = log.isDebugEnabled();
+
+	private Connection  conn  = null;
+	private String[]    lookupQueryArray;
+	
+	/**
+	 * Reads the "lookupQuery" string-array parameter and opens the scrubber
+	 * database connection used by lookup().
+	 */
+	public void initialize(UimaContext aContext) throws ResourceInitializationException 
+	{
+        log.info("initialize() ...");
+
+		this.lookupQueryArray = safeGetConfigParameterStringArrayValue(aContext, "lookupQuery", new String[] {});
+
+		try
+		{
+			conn = BaseDAO.getConnectionToScrubber();
+		} 
+		catch (Exception e)
+		{
+			log.fatal("Failed to initialize", e);
+			throw new ResourceInitializationException(ResourceInitializationException.ANNOTATOR_INITIALIZATION_FAILED, new Object[] { "database connection" });
+		}
+
+        log.info("initialize() is done.");
+	}
+	
+	/**
+	 * Runs the dictionary lookup over all WordToken and Chunk annotations in
+	 * the CAS.
+	 */
+	public void process(JCas aJCas) throws AnalysisEngineProcessException 
+	{
+		log.debug("starting dict: " + new Date(System.currentTimeMillis()));
+		
+	    // get document text from JCas
+        if(DEBUG) log.debug("Doc:"+ aJCas.getDocumentText());
+
+	    processWordTokens(aJCas);
+	    processChunks(aJCas);
+	    
+	    if(DEBUG) log.debug("Finish dict: " + new Date(System.currentTimeMillis()));
+	}
+	
+	/**
+	 * Looks up every whitespace/punctuation-separated piece of each WordToken
+	 * and adds an OntologyMatch spanning the token for every dictionary hit.
+	 */
+	private void processWordTokens(JCas aJCas)
+	{
+		Iterator<Annotation> annotIt = aJCas.getAnnotationIndex(WordToken.type).iterator();
+	    while(annotIt.hasNext())
+	    {
+	    	Annotation annot = annotIt.next();
+	    	if (annot.getType().getShortName().equals("WordToken"))
+	    	{
+	    	    String[] tokens = annot.getCoveredText().split("\\s+|\\.|\\,");
+	    	    
+	    	    for (String s : tokens)
+	    	    {
+	    	    	if (s.trim().length()==0)
+	    	    	{
+	    	    		continue;
+	    	    	}
+	    	    	
+	    	    	Set<String> results = lookup(s);
+	    	    	for (String r : results)
+	    	    	{
+	    	    		String[] matchArray = r.split("\\|");
+	    	    		OntologyMatch match = new OntologyMatch(aJCas);
+	    	    		match.setBegin(annot.getBegin());
+	    	    		match.setEnd(annot.getEnd());
+	    	    		match.setCode(matchArray[0]);
+	    	    		match.setOntology(matchArray[1]);
+	    	    		match.addToIndexes();
+	    	    		//TODO: right now this depends on WordToken annotations. this should be changed so that the dictionary could be run on its own. w/o having to follow the pos tagger
+	    	    		//TODO: add ability to lookup in flat files as well.
+	    	    	}
+	    	    }
+	    	}
+	    }
+	}
+	
+	/**
+	 * Looks up multi-word permutations of each Chunk's covered text and adds
+	 * an OntologyMatch at the permutation's position for every dictionary hit.
+	 */
+	private void processChunks(JCas aJCas)
+	{
+		Iterator<Annotation> annotIt = aJCas.getAnnotationIndex(Chunk.type).iterator();
+	    while(annotIt.hasNext())
+	    {
+	    	//intentionally processes every chunk type, not only NPs
+	    	Chunk annot = (Chunk)annotIt.next();
+	    	Set<String> tokens = generatePermutations(annot.getCoveredText());
+	    	
+	    	for (String s : tokens)
+	    	{
+	    		if (s.trim().length()==0)
+	    		{
+	    			continue;
+	    		}
+	    		
+	    		Set<String> results = lookup(s);
+	    		if (results.isEmpty())
+	    		{
+	    			continue;
+	    		}
+	    		
+	    		//permutations are rebuilt with single spaces, so a term may not
+	    		//occur verbatim in the covered text (e.g. newline between words);
+	    		//previously indexOf() == -1 silently produced begin = annot.begin - 1
+	    		int offset = annot.getCoveredText().indexOf(s);
+	    		if (offset < 0)
+	    		{
+	    			log.debug("permutation not found verbatim in chunk, skipping: " + s);
+	    			continue;
+	    		}
+	    		
+	    		for (String r : results)
+	    		{
+	    			String[] matchArray = r.split("\\|");
+	    			OntologyMatch match = new OntologyMatch(aJCas);
+	    			match.setBegin(annot.getBegin() + offset);
+	    			match.setEnd(match.getBegin() + s.length());
+	    			match.setCode(matchArray[0]);
+	    			match.setOntology(matchArray[1]);
+	    			match.addToIndexes();
+	    			//TODO: right now this depends on Chunk annotations. this should be changed so that the dictionary could be run on its own. w/o having to follow the pos tagger
+	    			//TODO: add ability to lookup in flat files as well.
+	    		}
+	    	}
+	    }
+	}
+	
+	/**
+	 * implements a sliding window by getting all substrings of a string of words 
+	 * from both the start of the string and from the end of the string,
+	 * skipping strings that are only 1 word because that case will already be handled by the wordtoken annotations.
+	 * @param token
+	 * @return multi-word substrings anchored at the start or end of the phrase
+	 */
+	protected Set<String> generatePermutations(String token)
+	{
+		Set<String> results = new HashSet<String>();
+		
+		String[] tokens = token.split("\\s+|\\.|\\,");
+		int len = tokens.length;
+		
+		if (len<=1)
+		{
+			return results;
+		}
+		
+		//get permutations going FORWARD through the phrase
+		for (int level=len-1; level>=0; level--)
+		{
+			String term = "";
+			for (int iteration=0; iteration<=level; iteration++)
+			{
+				term = (term + " " + tokens[iteration]).trim();
+			}
+			if (term.length()>0 && term.contains(" "))
+			{
+				results.add(term);
+			}
+		}
+		
+		//get permutations going BACKWARD through the phrase
+		for (int level=0; level<=len-1; level++)
+		{
+			String term = "";
+			for (int iteration=level; iteration<=len-1; iteration++)
+			{
+				term = (term + " " + tokens[iteration]).trim();
+			}
+			if (term.length()>0 && term.contains(" "))
+			{
+				results.add(term);
+			}
+		}
+		
+		return results;
+	}
+	
+	/**
+	 * accepts token, looks up in subset of umls, returns a mapping of cui to ontology for this token.
+	 * @param token
+	 * @return Set<"cui|ontology">
+	 */
+	private Set<String> lookup(String token)
+	{
+		Set<String> results = new HashSet<String>();
+		
+		for (String sql : lookupQueryArray)
+		{
+			//java.sql.PreparedStatement: the previous cast to
+			//com.mysql.jdbc.PreparedStatement was vendor-specific and fails with
+			//any other JDBC driver or a pooled connection wrapper
+			java.sql.PreparedStatement ps = null;
+			ResultSet rs = null;
+			try 
+			{
+				ps = conn.prepareStatement(sql);
+				ps.setString(1, token);
+				rs = ps.executeQuery();
+				
+				while (rs.next())
+				{
+					String code = rs.getString("code");
+					String ontology = rs.getString("ontology");
+					results.add(code+"|"+ontology);
+				}
+			}
+			catch (SQLException e)
+			{
+				log.error("ERROR: failed selecting DICTIONARY matches for token: " + token, e);
+			}
+			finally
+			{
+				BaseDAO.closeRSPS(rs, ps);
+			}
+		}
+		
+		return results;
+	}
+	
+	/** Releases the shared database connection. */
+	public void destroy()
+	{
+        super.destroy();
+
+        log.info("Closing connection.");
+		BaseDAO.closeConnection(conn);
+
+        log.info("Done.");
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,218 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.uima.annotator;
+
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceAccessException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.spin.scrubber.uima.type.OntologyMatch;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * this class borrows much of its structure & flow from UIMA sandbox RegExAnnotator. 
+ * the main difference is that this allows the usage of an external flat file where you can specify the name of the regex that was matched.
+ * for our uses, all regex matches can be of the same type, but we need to have a name of each one in to make identifying matches & maintenance of regex easier.
+ * 
+ * @author BF19
+ *
+ */
+public class RegexAnnotator extends BaseAnnotator
+{
+    private static Logger log    =  Logger.getLogger(RegexAnnotator.class);
+    private static boolean DEBUG =  log.isDebugEnabled();
+
+    private List<String> nameList;
+	private List<String> typeList;
+	private List<Pattern> patternList;
+
+	public void initialize(UimaContext aContext) throws ResourceInitializationException
+	{
+        log.info("initialize() ... ");
+
+		super.initialize(aContext);
+
+		nameList = new ArrayList<String>();
+		typeList = new ArrayList<String>();
+		patternList = new ArrayList<Pattern>();
+		
+		try
+		{
+            //TODO: refactor into constants
+			String[] fileNameArray = safeGetConfigParameterStringArrayValue(getContext(), "Filenames", new String[] {});
+			Boolean[] caseSensitiveArray = safeGetConfigParameterBooleanArrayValue(getContext(), "CaseSensitiveFile", new Boolean[] {});
+
+			for (int i=0; i<fileNameArray.length; i++)
+			{
+				String file = fileNameArray[i];
+				boolean caseSensitive = caseSensitiveArray[i];
+				
+			    InputStream in = getContext().getResourceAsStream(file);
+			    if (in != null) 
+			    {
+			    	try 
+			        {
+			          // get buffered reader
+			          BufferedReader reader = new BufferedReader(new InputStreamReader(in));
+			
+			          //initialize
+			          String line = null;
+			          String recentType = null;
+			          String recentName = null;
+			          
+			          //read lines from file
+			          while ((line = reader.readLine()) != null) 
+			          {
+			        	  //if line is a comment, skip it.
+			        	  if (line.startsWith("//")
+			            		|| line.length() <= 0 
+			            		|| Character.isWhitespace(line.charAt(0))) 
+			        	  {
+			        		  continue;
+			        	  }
+			            
+			        	  //if line is annotation type, add to typeList
+			              if (line.startsWith("%"))
+			              {
+			            	  recentType=line.substring(1).trim();
+			              }
+			              else if (line.startsWith("#"))
+			              {
+			            	  recentName=line.substring(1).trim();
+			              }
+			              else 
+			              {
+			            	  //check for case sensitivity 
+			            	  if (!caseSensitive)
+			            	  {
+			            		  line = "(?i)"+line;
+			            	  }
+			            	  
+			            	  //compile regex
+			            	  Pattern p = Pattern.compile(line);
+			            	  //make sure pattern doesn not match the empty string - otherwise infinite loops
+			            	  if (p.matcher("").matches())
+			            	  {
+			            		  throw new ResourceInitializationException("regex_matches_empty_string", new Object[] { line });
+			            	  }
+			            	  
+			            	  //add type, name, pattern tuple 
+			            	  patternList.add(p);
+			            	  nameList.add(recentName);
+			            	  typeList.add(recentType);
+			              }
+			          }
+			        }
+			    	finally 
+			    	{
+			    		if (in != null) 
+				        {
+			    			in.close();
+				        }
+			        }
+			    }	
+			}
+			
+			//make sure at least 1 pattern
+		    if (patternList.size()<=1)
+		    {
+		    	throw new ResourceInitializationException(AnnotatorConfigurationException.ONE_PARAM_REQUIRED, new Object[] { "Patterns, Pattern File" });
+		    }
+		    
+		    //verify all 3 lists have the same size. if not, then the input file is not properly sturctured and we are missing one of {type, name, regex}
+		    if (patternList.size()!=nameList.size()
+		    		|| patternList.size()!=typeList.size())
+		    {
+		    	throw new ResourceInitializationException(AnnotatorConfigurationException.ONE_PARAM_REQUIRED, new Object[] { "Pattern File missing 1 of [type,name,regex]" });
+		    }
+		}
+		catch (ResourceAccessException e) 
+    	{
+    		throw new ResourceInitializationException(e);
+    	} 
+    	catch (IOException e) 
+    	{
+    		throw new ResourceInitializationException(e);
+    	}
+
+        log.info("initialize() is done");
+	}
+
+	  public void process(JCas aJCas) throws AnalysisEngineProcessException 
+	  {
+        if(DEBUG) log.debug("starting regex: " + new Date(System.currentTimeMillis()));
+
+	    try 
+	    {
+	      String docText = aJCas.getDocumentText();
+	      for (int i=0; i<patternList.size(); i++)
+	      {
+	    	  long start = System.currentTimeMillis();
+	    	  Matcher matcher = patternList.get(i).matcher(docText);
+	    	  while (matcher.find()) 
+	          {
+	            	String matched = matcher.group();
+	            	int mStartPos = matcher.start();
+	            	int mEndPos = matcher.end();
+
+                    if(DEBUG) log.debug("RegEx match found: [" + matched + "]");
+
+                    // create Annotation in CAS
+	                OntologyMatch match = new OntologyMatch(aJCas);
+                   {
+                        match.setBegin(mStartPos);
+                        match.setEnd(mEndPos);
+                        match.setCode(nameList.get(i));
+                        match.setOntology("regex");
+                        match.addToIndexes();
+                   }
+	          }
+
+              if(DEBUG)
+                  log.debug("RegEx : [" + nameList.get(i) + "] took " + (System.currentTimeMillis()-start) + " millis");
+	      }
+	    } 
+	    catch (Exception e) 
+	    {
+	    	throw new AnalysisEngineProcessException(e);
+	    }
+	    
+	    log.debug("Finish regex: " + new Date(System.currentTimeMillis()));
+	  }
+
+    @Override
+    public void destroy()
+    {
+        super.destroy();
+        log.info("Done.");
+    }
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/TFAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/TFAnnotator.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/TFAnnotator.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/TFAnnotator.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,165 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.uima.annotator;
+
+import org.apache.ctakes.typesystem.type.syntax.NumToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.spin.scrubber.uima.dao.BaseDAO;
+import org.spin.scrubber.uima.dao.TfDAO;
+import org.spin.scrubber.uima.type.Calculation;
+
+import java.sql.Connection;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Map;
+
+public class TFAnnotator extends BaseAnnotator
+{
+    private static Logger log    =  Logger.getLogger(TFAnnotator.class);
+    private static boolean DEBUG =  log.isDebugEnabled();
+
+	private Connection conn = null;
+	private String lookupQuery = null; //TODO: is it better to embed the dao logic in the annotator so its more compartmentalized or to call the dao at the risk of making the project less modular? 
+	private Map<String,Integer> pubsTFMap = null;
+	
+	public void initialize(UimaContext aContext) throws ResourceInitializationException 
+	{
+        log.info("initialize()...");
+
+	    lookupQuery = (String) aContext.getConfigParameterValue("lookupQuery");
+				
+		try
+		{
+			//select all pub token/cnt/pos
+			pubsTFMap = new TfDAO().selectPubTFMap();
+			
+			//create db connection for later use
+			conn = BaseDAO.getConnectionToScrubber();
+		} 
+		catch (Exception e)
+		{
+			e.printStackTrace();
+			throw new ResourceInitializationException(ResourceInitializationException.ANNOTATOR_INITIALIZATION_FAILED, new Object[] { "database connection" });
+		}
+
+        log.info("initialize() is done...");
+	}
+	
+	public void process(JCas aJCas) throws AnalysisEngineProcessException 
+	{
+		if(DEBUG) log.debug("Starting tf: " + new Date(System.currentTimeMillis()));
+
+		processWordTokens(aJCas);
+		processNumTokens(aJCas);
+
+        if(DEBUG) log.debug("Finish tf: " + new Date(System.currentTimeMillis()));
+	}
+
+    //TODO: refactor
+	private void processWordTokens(JCas aJCas)
+	{
+		Iterator<Annotation> annotIt = aJCas.getAnnotationIndex(WordToken.type).iterator();
+	    while(annotIt.hasNext())
+	    {
+	    	Annotation annot = annotIt.next();
+	    	if (annot.getType().getShortName().equals("WordToken"))
+	    	{
+	    		createCalculationAnnotation(aJCas, annot);
+	    	}
+	    }
+	}
+
+    //TODO: refactor
+	private void processNumTokens(JCas aJCas)
+	{
+		Iterator<Annotation> annotIt = aJCas.getAnnotationIndex(NumToken.type).iterator();
+	    while(annotIt.hasNext())
+	    {
+	    	Annotation annot = annotIt.next();
+	    	if (annot.getType().getShortName().equals("NumToken"))
+	    	{
+	    		createCalculationAnnotation(aJCas, annot);
+	    	}
+	    }
+	}
+	
+	/**
+	 * lower case the annotation text and look it up in the pub map both with and without the part of speech.  
+	 * calculate the 2 term frequencies. 
+	 * @param aJCas
+	 * @param annot
+	 */
+	private void createCalculationAnnotation(JCas aJCas, Annotation annot)
+	{
+		String s = annot.getCoveredText().toLowerCase().trim();
+
+    	if (s.trim().length()==0)
+    	{
+    		return;
+    	}
+
+    	//TODO: move the feature name to the xml descriptor.
+    	//get Features... //TODO: there must be a better way to do this...
+    	String pos=null;
+	      	Feature posFeat = annot.getCAS().getTypeSystem().getFeatureByFullName("edu.mayo.bmi.uima.core.type.BaseToken:partOfSpeech");
+  		if (posFeat!=null)
+  			pos = annot.getFeatureValueAsString(posFeat);	
+    	
+		//update all_pubs features
+		int pubTermPosCnt = (pubsTFMap.get(s+"|"+pos)==null) ? 0 : pubsTFMap.get(s+"|"+pos);
+		int pubTermCnt = (pubsTFMap.get(s)==null) ? 0 : pubsTFMap.get(s);
+
+		double pubTotalCnt = Double.valueOf(Integer.toString(pubsTFMap.get("totalPubCount")));
+
+		double hamWithPos    = -1*Math.log10(pubTermPosCnt/pubTotalCnt);
+		double hamWithoutPos = -1*Math.log10(pubTermCnt/pubTotalCnt);
+
+		Calculation match = new Calculation(aJCas);
+
+		match.setBegin(annot.getBegin());
+		match.setEnd(annot.getEnd());		
+		match.setCalculationName("tf_ham_with_pos");
+		match.setCalculationValue(Double.toString(hamWithPos));
+		match.addToIndexes();
+		
+		match = new Calculation(aJCas);
+		match.setBegin(annot.getBegin());
+		match.setEnd(annot.getEnd());
+		match.setCalculationName("tf_ham_without_pos");
+		match.setCalculationValue(Double.toString(hamWithoutPos));
+		match.addToIndexes();
+	}
+	
+	public void destroy()
+	{
+        super.destroy();
+
+        log.info("Closing connection.");
+		BaseDAO.closeConnection(conn);
+
+        log.info("Done.");
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/TFAnnotator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message