Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 3C79810F23 for ; Sun, 7 Jul 2013 19:23:35 +0000 (UTC) Received: (qmail 41537 invoked by uid 500); 7 Jul 2013 19:23:35 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 41506 invoked by uid 500); 7 Jul 2013 19:23:35 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 41499 invoked by uid 99); 7 Jul 2013 19:23:35 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 07 Jul 2013 19:23:35 +0000 X-ASF-Spam-Status: No, hits=-1998.0 required=5.0 tests=ALL_TRUSTED,FB_GET_MEDS X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 07 Jul 2013 19:23:30 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 124FC23889ED; Sun, 7 Jul 2013 19:23:10 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1500511 [3/6] - in /ctakes/sandbox/ctakes-scrubber-deid/src: ./ main/ main/java/ main/java/org/ main/java/org/apache/ main/java/org/apache/uima/ main/java/org/apache/uima/examples/ main/java/org/spin/ main/java/org/spin/scrubber/ main/java... 
Date: Sun, 07 Jul 2013 19:23:07 -0000 To: commits@ctakes.apache.org From: brittfitch@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20130707192310.124FC23889ED@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,55 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ ******************************************************************************/ +package org.spin.scrubber.protege.beans; + +import com.thoughtworks.xstream.annotations.XStreamAlias; +import com.thoughtworks.xstream.annotations.XStreamAsAttribute; + +@XStreamAlias("classMention") +public class ClassMention +{ + @XStreamAlias("id") + @XStreamAsAttribute + private String id; + + private MentionClass mentionClass; + + public ClassMention(String id, MentionClass mentionClass) + { + this.setId(id); + this.setMentionClass(mentionClass); + } + public String getId() + { + return id; + } + public void setId(String id) + { + this.id = id; + } + public MentionClass getMentionClass() + { + return mentionClass; + } + public void setMentionClass(MentionClass mentionClass) + { + this.mentionClass = mentionClass; + } + +} Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/ClassMention.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,44 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + ******************************************************************************/ +package org.spin.scrubber.protege.beans; + +import com.thoughtworks.xstream.annotations.XStreamAlias; +import com.thoughtworks.xstream.annotations.XStreamAsAttribute; + +@XStreamAlias("mention") +public class Mention +{ + @XStreamAlias("id") + @XStreamAsAttribute + private String id; + + public Mention(String id) + { + this.setId(id); + } + public String getId() + { + return id; + } + + public void setId(String id) + { + this.id = id; + } +} Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Mention.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,61 @@ 
+/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + ******************************************************************************/ +package org.spin.scrubber.protege.beans; + +import com.thoughtworks.xstream.annotations.XStreamAlias; +import com.thoughtworks.xstream.annotations.XStreamAsAttribute; +import com.thoughtworks.xstream.annotations.XStreamConverter; +import com.thoughtworks.xstream.converters.extended.ToAttributedValueConverter; + +@XStreamAlias("mentionClass") +@XStreamConverter(value=ToAttributedValueConverter.class, strings={"name"}) +public class MentionClass +{ + @XStreamAlias("id") + @XStreamAsAttribute + private String id; + + private String name; + + public MentionClass(String id, String name) + { + this.setId(id); + this.setName(name); + } + + public String getId() + { + return id; + } + + public void setId(String id) + { + this.id = id; + } + + public String getName() + { + return name; + } + + public void setName(String name) + { + this.name = name; + } +} Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/MentionClass.java 
------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,60 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ ******************************************************************************/ +package org.spin.scrubber.protege.beans; + +import com.thoughtworks.xstream.annotations.XStreamAlias; +import com.thoughtworks.xstream.annotations.XStreamAsAttribute; + +@XStreamAlias("span") +public class Span +{ + @XStreamAlias("start") + @XStreamAsAttribute + private String start; + + @XStreamAlias("end") + @XStreamAsAttribute + private String end; + + public Span(String start, String end) + { + this.setStart(start); + this.setEnd(end); + } + + public String getStart() + { + return start; + } + + public void setStart(String start) + { + this.start = start; + } + + public String getEnd() + { + return end; + } + + public void setEnd(String end) + { + this.end = end; + } +} Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Span.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,171 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + ******************************************************************************/ +/** + * + */ +package org.spin.scrubber.protege.ontology; + +import com.thoughtworks.xstream.XStream; +import org.apache.log4j.Logger; +import org.spin.scrubber.beans.CaseFeature; +import org.spin.scrubber.protege.beans.*; +import org.spin.scrubber.uima.dao.FeatureMatrixDAO; + +import java.io.File; +import java.io.FileWriter; +import java.util.List; + +/** + * @author britt fitch BF19 + * + * This class is used to help bootstrap a local annotation effort. + * This class can be used to read classifications from "_test" tables that were generated using the out of the box training model. + * These classifications are then converted to protege xml format and can be loaded into the protoge ontology tool. + * The loaded annotations can then be corrected/removed/added and re-exported for the purpose of generating new annotated training data. + */ +public class ProtegeOntologyGenerator implements Runnable +{ + private static Logger log = Logger.getLogger(ProtegeOntologyGenerator.class); + + private String tableSuffix = "_test"; //ProtegeOntologyGenerator always uses "_test" tables. 
+ private String outDir; + + public ProtegeOntologyGenerator(String outDir) + { + this.setOutDir(outDir); + } + + public static void main(String[] args) + { + if (args.length != 1) + { + System.out.println("USAGE: ProtegeOntologyGenerator "); + } + else + { + ProtegeOntologyGenerator pog = new ProtegeOntologyGenerator(args[0]); + pog.run(); + } + } + + /** + * Read from feature_matrix + * All tokens that were not classified as N/A + * Take all those PHI and create XML representation in Protege format + */ + public void run() + { + log.info("Running Protege Ontology Generator..."); + + try + { + XStream xstream = new XStream(); + xstream.autodetectAnnotations(true); + + //get list of distinct filenames from feature_matrix + FeatureMatrixDAO dao = new FeatureMatrixDAO(tableSuffix); + List fileList = dao.selectDistinctFilenames(); + + //init id for annotations + String baseMentionId = "nci_all_init_Instance_"; + int incrementingMentionId = 60000; + + for (String filename : fileList) + { + //get features for the current filename + Annotations annots = new Annotations(); + List cfList = dao.selectPHICaseFeaturesByFilename(filename); + + //create Annotation object for each feature for the current filename. 
+ for(CaseFeature c : cfList) + { + String mentionId = baseMentionId + incrementingMentionId++; + + annots.setTextSource(c.getFilename_short()); + + Annotation annot = new Annotation(); + annot.setMention(new Mention(mentionId)); + annot.setAnnotator(new Annotator("nci_all_init_Instance_4", "britt fitch, hms")); + annot.setSpan(new Span(Integer.toString(c.getStartIdx()), Integer.toString(c.getEndIdx()))); + annot.setSpannedText(c.getToken()); + + ClassMention cm = new ClassMention(mentionId, new MentionClass(c.getClassified_as().toLowerCase(), c.getClassified_as().toLowerCase())); + + annots.getAnnotList().add(annot); + annots.getCmList().add(cm); + } + + //get xml representation of pojos + String xml = xstream.toXML(annots); + + //write xml annotation import file to outDir + File file = new File(this.getOutDir() + File.separatorChar + filename + ".xml"); + file.createNewFile(); + FileWriter writer = new FileWriter(file); + writer.write(xml); + writer.flush(); + writer.close(); + } + } + catch(Exception e) + { + log.error("failed to generate protege xml import files.", e); + } + } + + + public String getOutDir() + { + return outDir; + } + + public void setOutDir(String outDir) + { + this.outDir = outDir; + } + + /** + * method generates a protege xml annotation import file based on the example xml input supplied with the scrubber project. 
+ */ + @Deprecated + public void runTest() + { + XStream xstream = new XStream(); + xstream.autodetectAnnotations(true); + + Annotations annots = new Annotations(); + annots.setTextSource("testcase.xml"); + + String mentionId = "nci_all_init_Instance_80000"; + + Annotation annot = new Annotation(); + annot.setMention(new Mention(mentionId)); + annot.setAnnotator(new Annotator("nci_all_init_Instance_4", "britt fitch, hms")); + annot.setSpan(new Span("170", "173")); + annot.setSpannedText("Doe"); + + ClassMention cm = new ClassMention(mentionId, new MentionClass("contact", "contact")); + + annots.getAnnotList().add(annot); + annots.getCmList().add(cm); + + String xml = xstream.toXML(annots); + System.out.println(xml); + } +} Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/ontology/ProtegeOntologyGenerator.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,204 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + ******************************************************************************/ +/** + * + */ +package org.spin.scrubber.publications; + +import org.apache.log4j.Logger; +import org.spin.scrubber.ScrubberProperties; +import org.spin.scrubber.uima.dao.PubDAO; + +import java.io.File; +import java.io.FileOutputStream; +import java.util.List; +import java.util.Map; + +/** + * @author britt fitch (bf19) + * @link http://www.gnu.org/licenses/lgpl.html + * + * to run: + * java -classpath .:lucy.jar:uber-lucy-0.0.1-SNAPSHOT.jar edu.harvard.cbmi.lucy.PubExtractor outputDir + */ +public class PublicationsExtractorJDBC implements Runnable +{ + private static Logger log = Logger.getLogger(PublicationsExtractorJDBC.class); + + public String dirInputPublicationsTXT; + + public PublicationsExtractorJDBC() + { + this(ScrubberProperties.getDirInputPublicationsTXT()); + } + + public PublicationsExtractorJDBC(String dirInputPublicationsTXT) + { + this.dirInputPublicationsTXT = dirInputPublicationsTXT; + } + + public static void main(String[] args) throws Exception + { + PublicationsExtractorJDBC runner = new PublicationsExtractorJDBC(); + runner.run(); + } + + public void run() + { + extractAllFilterSet("all"); + } + + public void extractAllFilterSet(String subDirName) + { + try + { + File outDir = new File(dirInputPublicationsTXT + File.separator + subDirName); + + if(!outDir.exists()) outDir.mkdir(); 
+ + //purge dir, if has contents. + for (File f : outDir.listFiles()) + { + f.delete(); + } + + //get pubs + PubDAO dao = new PubDAO(); + FileOutputStream output = null; + List pubIdList = dao.selectAllPubIds(); + for (Integer pubId : pubIdList) + { + Map pubMap = dao.selectAllPubContentById(pubId); + + //write output files + for (Integer id : pubMap.keySet()) + { + log.info("writing file for pubId = " + subDirName + "/" + id); + String body = pubMap.get(id).trim(); + + //ignore files we know have 'no content' + if (body.length()<1 + || body.equals("(To access the full article, please see PDF)") + || body.equals("\"To access the full article, please see PDF\"") + || body.startsWith("None declared.")) + { + continue; + } + + File outFile = new File(outDir.getAbsoluteFile() + File.separator + id + ".txt"); + output = new FileOutputStream(outFile); + output.write(body.getBytes()); + output.close(); + } + } + } + catch(Exception e) + { + log.error("extracting "+subDirName+" filter set ", e); + } + } + + //TODO: Deprecated + + @Deprecated + public void extractRandomFilterSet(String subDirName) + { + try + { + PubDAO dao = new PubDAO(); + FileOutputStream output = null; + Map pubMap = dao.selectRandomPubContent(); + + File outDir = new File(dirInputPublicationsTXT + File.separator + subDirName); + + //create subdir, if not already existing + outDir.mkdir(); + + //purge dir, if has contents. 
+ for (File f : outDir.listFiles()) + { + f.delete(); + } + + //write output files + for (Integer id : pubMap.keySet()) + { + System.out.println("INFO: writing file for pubId = " + subDirName + "/" + id); + String body = pubMap.get(id).trim(); + + //ignore files we know have 'no content' + if (body.length()<1 + || body.equals("(To access the full article, please see PDF)") + || body.equals("\"To access the full article, please see PDF\"")) + { + continue; + } + + File outFile = new File(outDir.getAbsoluteFile() + File.separator + id + ".txt"); + output = new FileOutputStream(outFile); + output.write(body.getBytes()); + output.close(); + } + } + catch(Exception e) + { + System.out.println("ERROR: extracting "+subDirName+" filter set: " + e.getMessage()); + e.printStackTrace(); + } + } + + @Deprecated + public void extractFilterSet(String[] keywordFragList, String subDirName) + { + try + { + PubDAO dao = new PubDAO(); + FileOutputStream output = null; + Map pubMap = dao.selectPubContentByLikeKeywords(keywordFragList); + + //purge dir, if has contents. 
+ File outDir = new File(dirInputPublicationsTXT + File.separator + subDirName); + for (File f : outDir.listFiles()) + { + f.delete(); + } + + //create subdir, if not already existing + outDir.mkdir(); + + //write output files + for (Integer id : pubMap.keySet()) + { + System.out.println("INFO: writing file for pubId = " + subDirName + "/" + id); + String body = pubMap.get(id).trim(); + + File outFile = new File(outDir.getAbsoluteFile() + File.separator + id + ".txt"); + output = new FileOutputStream(outFile); + output.write(body.getBytes()); + output.close(); + } + } + catch(Exception e) + { + System.out.println("ERROR: extracting "+subDirName+" filter set: " + e.getMessage()); + e.printStackTrace(); + } + } + +} Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsExtractorJDBC.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,344 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + ******************************************************************************/ +/** + * + */ +package org.spin.scrubber.publications; + +import org.apache.log4j.Logger; +import org.spin.scrubber.ScrubberProperties; +import org.spin.scrubber.uima.dao.PubDAO; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.EntityResolver; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathFactory; +import java.io.File; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +/** + * @author bf19 + * + * accepts 1 command line param which is a dir containing open access publications + * @link http://www.ncbi.nlm.nih.gov/pmc/tools/ftp/ + */ +public class PublicationsParserOpenAccessXML implements Runnable +{ + private static Logger log = Logger.getLogger(PublicationsParserOpenAccessXML.class); + + private String dirInputPublicationsXML; + + public PublicationsParserOpenAccessXML() + { + this(ScrubberProperties.getDirInputPublicationsXML()); + } + + public PublicationsParserOpenAccessXML(String 
dirInputPublicationsXML) + { + this.dirInputPublicationsXML = dirInputPublicationsXML; + } + + /** + * @param args + */ + public static void main(String[] args) + { + PublicationsParserOpenAccessXML parser = new PublicationsParserOpenAccessXML(); + parser.run(); + } + + public void run() + { + try + { + //extract fields + File inDir = new File(dirInputPublicationsXML); + List files = getFileList(new ArrayList(), inDir); + PubDAO dao = new PubDAO(); + + for (File f : files) + { + log.debug("Parsing : " + f.getName()); + + //read infile + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + builder.setEntityResolver(new EntityResolver() { + public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { + if (systemId.contains("dtd")) { + return new InputSource(new StringReader("")); + } else { + return null; + } + } + }); + Document doc = builder.parse(f); + XPathFactory xPathfactory = XPathFactory.newInstance(); + XPath xpath = xPathfactory.newXPath(); + + XPathExpression journalTitleXP = xpath.compile("//journal-title"); + XPathExpression articleTitleXP = xpath.compile("//article-title"); + XPathExpression articleIdPMCXP = xpath.compile("//article-id[@pub-id-type='pmc']"); + XPathExpression articleIdPMIDXP = xpath.compile("//article-id[@pub-id-type='pmid']"); + XPathExpression articleIdDOIXP = xpath.compile("//article-id[@pub-id-type='doi']"); + XPathExpression keywordXP = xpath.compile("//kwd"); + XPathExpression bodyXP = xpath.compile("//body//p"); + XPathExpression authorsXP = xpath.compile("//contrib[@contrib-type='author']"); + XPathExpression refsXP = xpath.compile("//ref-list//name"); + + //read all matching nodes + NodeList jtitleList = (NodeList) journalTitleXP.evaluate(doc, XPathConstants.NODESET); + NodeList atitleList = (NodeList) articleTitleXP.evaluate(doc, XPathConstants.NODESET); + NodeList pmcList = (NodeList) 
articleIdPMCXP.evaluate(doc, XPathConstants.NODESET); + NodeList pmidList = (NodeList) articleIdPMIDXP.evaluate(doc, XPathConstants.NODESET); + NodeList doiList = (NodeList) articleIdDOIXP.evaluate(doc, XPathConstants.NODESET); + NodeList keywordList = (NodeList) keywordXP.evaluate(doc, XPathConstants.NODESET); + NodeList bodyList = (NodeList) bodyXP.evaluate(doc, XPathConstants.NODESET); + NodeList authorList = (NodeList) authorsXP.evaluate(doc, XPathConstants.NODESET); + NodeList refList = (NodeList) refsXP.evaluate(doc, XPathConstants.NODESET); + + //populate pojo + Pub pub = new Pub(); + pub.setJtitle(jtitleList); + pub.setAtitle(atitleList); + pub.setPmc(pmcList); + pub.setPmid(pmidList); + pub.setDoi(doiList); + pub.setKeywords(keywordList); + pub.setAuthors(authorList); + pub.setRefs(refList); + pub.setBody(bodyList); + + //insert records + int id = dao.insertPub(pub.getJtitle(), pub.getAtitle(), pub.getPmc(), pub.getPmid(), pub.getDoi(), pub.getBody(), f.getAbsolutePath()); + if (id > 0) + { + for (Person p : pub.getAuthors()) + { + dao.insertAuthor(id, p.getSurname(), p.getGivenName()); + } + for (Person p : pub.getRefs()) + { + dao.insertRef(id, p.getSurname(), p.getGivenName()); + } + for (String s : pub.getKeywords()) + { + dao.insertKeyword(id, s); + } + } + } + } + catch (Exception e) + { + log.error("unknown error parsing xml: ", e); + } + } + + /** + * recursively get files from directory structure. 
+ */ + public List getFileList(List files, File file) + { + if (!file.isDirectory()) + { + files.add(file); + } + else if(file.isDirectory()) + { + for(File f : file.listFiles()) + { + getFileList(files, f); + } + } + + return files; + } + + class Pub + { + String jtitle; + String atitle; + String pmc; + String pmid; + String doi; + String body = ""; + List keywords = new ArrayList(); + List authors = new ArrayList(); + List refs = new ArrayList(); + public String getJtitle() + { + return jtitle; + } + public void setJtitle(NodeList list) + { + for (int k=0; k getKeywords() + { + return keywords; + } + public void setKeywords(NodeList list) + { + for (int k=0; k getAuthors() + { + return authors; + } + public void setAuthors(NodeList list) + { + for (int k=0; k 0) ? elem.getElementsByTagName("surname").item(0).getTextContent().trim() : null; + String fname = (elem.getElementsByTagName("given-names").getLength() > 0) ? elem.getElementsByTagName("given-names").item(0).getTextContent().trim() : null; + authors.add(new Person(lname, fname)); + } + } + } + public List getRefs() + { + return refs; + } + public void setRefs(NodeList list) + { + for (int k=0; k 0) ? elem.getElementsByTagName("surname").item(0).getTextContent().trim() : null; + String fname = (elem.getElementsByTagName("given-names").getLength() > 0) ? 
elem.getElementsByTagName("given-names").item(0).getTextContent().trim() : null; + refs.add(new Person(lname, fname)); + } + } + } + } + class Person + { + String surname; + String givenName; + public Person(String surname, String givenName) + { + this.surname = surname; + this.givenName = givenName; + } + public String getSurname() + { + return surname; + } + public void setSurname(String surname) + { + this.surname = surname; + } + public String getGivenName() + { + return givenName; + } + public void setGivenName(String givenName) + { + this.givenName = givenName; + } + + } +} Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/publications/PublicationsParserOpenAccessXML.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,143 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + ******************************************************************************/ +package org.spin.scrubber.redactor; + +import org.apache.log4j.Logger; +import org.spin.scrubber.beans.CaseFeature; +import org.spin.scrubber.uima.dao.FeatureMatrixDAO; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathFactory; +import java.io.File; +import java.io.FileWriter; +import java.util.List; + +/** + * redactor used to replace tokens classified as PHI from i2b2 formatted xml + * @author britt fitch BF19 + * + */ +public class I2B2XMLRedactor implements Runnable +{ + + private static Logger log = Logger.getLogger(I2B2XMLRedactor.class); + + private String in; + private String out; + private String tableSuffix = "_test"; //always uses "_test" tables. 
+ private FeatureMatrixDAO dao; + + public I2B2XMLRedactor(String in, String out) throws Exception + { + this.in = in; + this.out = out; + + dao = new FeatureMatrixDAO(tableSuffix); + } + + /** + * @param args + * @throws Exception + */ + public static void main(String[] args) throws Exception + { + if (args.length!=2) + { + System.out.println("USAGE:\t\t Redactor inDir outDir"); + } + + I2B2XMLRedactor runner = new I2B2XMLRedactor(args[0], args[1]); + runner.run(); + } + + public void run() + { + try + { + File inDir = new File(in); + + if (!inDir.exists()) + { + inDir.createNewFile(); + } + + File[] files = inDir.listFiles(); + + for (File f : files) + { + if (f.isDirectory()) + { + continue; + } + + System.out.println("Redactor for: " + f.getName()); + + //read infile + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + Document doc = builder.parse(f); + XPathFactory xPathfactory = XPathFactory.newInstance(); + XPath xpath = xPathfactory.newXPath(); + XPathExpression expr = xpath.compile("//TEXT"); + + //read all matching nodes + NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET); + + FileWriter writer = null; + + //for each node in a file, write out to a flat txt file of the same name. 
+ for (int i=0; i phiList = dao.selectClassifiedAsPHITest(fname); + + for (CaseFeature cf : phiList) + { + txt = txt.substring(0, cf.getStartIdx()) + "xxx"+cf.getClassified_as()+"xxx" + txt.substring(cf.getEndIdx()); + } + + writer.write( txt + "\n"); + writer.flush(); + writer.close(); + } + } + } + catch (Exception e) + { + log.error("Unknown error redacting XXXX from the i2b2 XML text.", e); + } + } +} Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/I2B2XMLRedactor.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,133 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + ******************************************************************************/ +package org.spin.scrubber.redactor; + +import org.apache.log4j.Logger; +import org.spin.scrubber.ScrubberProperties; +import org.spin.scrubber.beans.CaseFeature; +import org.spin.scrubber.uima.dao.FeatureMatrixDAO; + +import java.io.*; +import java.util.List; +
/**
 * Redactor used to replace tokens classified as PHI in any kind of text file.
 * <p>
 * Every regular file directly inside the input directory is read, the spans the
 * DAO reports as PHI are replaced by a fixed marker, and the result is written to
 * a file of the same name in the output directory. Sub-directories are skipped.
 *
 * @author britt fitch BF19
 */
//TODO: refactor
public class Redactor implements Runnable
{
    private static final Logger log = Logger.getLogger(Redactor.class);

    /** Text written in place of every redacted span. */
    private static final String REDACTION_MARK = "xxx";

    private String dirInput;
    private String dirOutput;
    private String tableSuffix = "_test"; //always uses "_test" tables.

    private FeatureMatrixDAO dao;

    /**
     * Creates a redactor using the test input/output directories from ScrubberProperties.
     *
     * @throws Exception if the DAO cannot be created
     */
    public Redactor() throws Exception
    {
        this(ScrubberProperties.getDirInputTest(), ScrubberProperties.getDirOuputTest());
    }

    /**
     * @param dirInput  directory holding the files to redact
     * @param dirOutput directory the redacted copies are written to
     * @throws Exception if the DAO cannot be created
     */
    public Redactor(String dirInput, String dirOutput) throws Exception
    {
        this.dirInput = dirInput;
        this.dirOutput = dirOutput;

        dao = new FeatureMatrixDAO(tableSuffix);
    }

    /**
     * @param args unused
     * @throws Exception if the redactor cannot be constructed
     */
    public static void main(String[] args) throws Exception
    {
        Redactor runner = new Redactor();
        runner.run();
    }

    /**
     * Redacts every regular file in the input directory. Failures are logged, not rethrown;
     * the first failure stops the run.
     */
    public void run()
    {
        try
        {
            File inDir = new File(dirInput);
            File[] files = inDir.listFiles();

            // listFiles() returns null when the path is missing or is not a directory.
            // The previous code created an empty *file* at that path and then failed
            // with a NullPointerException while iterating.
            if (files == null)
            {
                log.error("Input directory does not exist or is not a directory: " + inDir.getAbsolutePath());
                return;
            }

            for (File f : files)
            {
                if (f.isDirectory())
                {
                    continue;
                }

                log.debug("Redacting : " + f.getName());

                //read infile (platform default charset, same as the writer below)
                String str = reader2String(new InputStreamReader(new FileInputStream(f)));

                //get PHI to redact
                List<CaseFeature> phiList = dao.selectClassifiedAsPHI(f.getName());
                String redacted = redact(str, phiList);

                //make outfile; opened last so a failed lookup does not leave an empty file behind
                FileWriter writer = new FileWriter(new File(dirOutput + File.separatorChar + f.getName()));
                try
                {
                    writer.write(redacted + "\n");
                    writer.flush();
                }
                finally
                {
                    writer.close();
                }
            }
        }
        catch (Exception e)
        {
            log.error("Unknown error during redaction",e);
        }
    }

    /**
     * Replaces every PHI span of {@code text} with the redaction marker.
     * <p>
     * The spans are applied against the ORIGINAL text in ascending start order. Replacing
     * them one after another on the already-modified string (as was done before) shifts
     * every later offset whenever a span is not exactly as long as the marker.
     * NOTE(review): assumes getStartIdx()/getEndIdx() are offsets into the unmodified
     * file text - confirm against FeatureMatrixDAO.
     *
     * @param text    original file contents
     * @param phiList spans to redact; may be null or empty
     * @return the redacted text
     */
    private static String redact(String text, List<CaseFeature> phiList)
    {
        if (phiList == null || phiList.isEmpty())
        {
            return text;
        }

        // Sort a copy so the caller's list is left untouched.
        List<CaseFeature> ordered = new java.util.ArrayList<CaseFeature>(phiList);
        java.util.Collections.sort(ordered, new java.util.Comparator<CaseFeature>()
        {
            public int compare(CaseFeature a, CaseFeature b)
            {
                int x = a.getStartIdx();
                int y = b.getStartIdx();
                return (x < y) ? -1 : ((x == y) ? 0 : 1);
            }
        });

        StringBuilder out = new StringBuilder(text.length());
        int cursor = 0; // first character of the original text not yet emitted

        for (CaseFeature cf : ordered)
        {
            // Clamp to the text and to what has already been consumed so that
            // overlapping or duplicate spans cannot re-emit redacted characters.
            int start = Math.min(Math.max(cf.getStartIdx(), cursor), text.length());
            int end = Math.min(cf.getEndIdx(), text.length());

            if (end <= cursor || start > end)
            {
                continue; // already covered by an earlier span, or malformed
            }

            out.append(text, cursor, start).append(REDACTION_MARK);
            cursor = end;
        }

        out.append(text, cursor, text.length());
        return out.toString();
    }

    /**
     * Reads the reader to exhaustion and closes it.
     *
     * @param reader source to drain; always closed on return
     * @return everything that was read
     * @throws IOException if reading fails
     */
    private static String reader2String(Reader reader) throws IOException
    {
        StringBuilder contents = new StringBuilder();
        char[] buf = new char[10000];
        int charsRead;
        try
        {
            while ((charsRead = reader.read(buf)) >= 0)
            {
                contents.append(buf, 0, charsRead);
            }
        }
        finally
        {
            reader.close();
        }
        return contents.toString();
    }
}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/redactor/Redactor.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,211 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + ******************************************************************************/ +package org.spin.scrubber.templates; + +import org.apache.log4j.Logger; +import org.spin.scrubber.ScrubberProperties; + +import java.io.*; +import java.util.HashMap; + +/** +* @author Andrew McMurry, MS +*

+* With primary support from Children's Hospital Informatics Program @ +* Harvard-MIT Health Sciences and Technology and +*

+* Secondary support from the Harvard Medical School +* Center for BioMedical Informatics +*

+* PHD candidate, Boston University Bioinformatics +* Member, I2b2 National Center for Biomedical Computing +*

+* All works licensed under LGPL +*

+* User: andy +* Date: 6/20/12 +* Time: 12:18 AM +*/
public class TemplateFileProcessor
{
    private static Logger log = Logger.getLogger(TemplateFileProcessor.class);

    /** File-name prefix that marks a file as a template; it is removed to form the output name. */
    public static String TEMPLATE_FILE_PREFIX = "TEMPLATE.";

    /** Prints command-line usage to standard out. */
    public static void showUsage()
    {
        System.out.println("This will process a TEMPLATE.file using information from scrubber.properties");
        System.out.println("[Usage]");
        System.out.println();
        System.out.println("[Default = all known templates]");
        System.out.println("java TemplateFileProcessor");
        System.out.println();
        System.out.println("[Specified template]");
        System.out.println("java TemplateFileProcessor TEMPLATE.file");
        System.out.println();
    }

    /**
     * Command-line entry point: no argument prints the usage, "ALL" (any case) processes
     * every known template, anything else is taken as the path of one template file.
     * NOTE(review): the usage text advertises "all known templates" as the no-argument
     * default, but no arguments only prints the usage - confirm which one is intended.
     *
     * @param args optional single argument, see above
     */
    public static void main(String[] args)
    {
        try
        {
            if(args.length==0)
            {
                showUsage();
            }
            else if("ALL".equalsIgnoreCase(args[0]))
            {
                System.out.println("Processing ALL known templates.");
                TemplateFileProcessor.processTemplatesAllKnown();
            }
            else
            {
                File templateFor = new File(args[0]);

                System.out.println("Processing template for "+ templateFor.getAbsolutePath());
                TemplateFileProcessor.processTemplate(templateFor);
            }
        }
        catch(Exception e)
        {
            System.out.println("Could not process template: "+ e.getMessage());
            // keep the stack trace somewhere; the console line above only shows the message
            log.error("Could not process template", e);
        }
    }

    /**
     * Processes the train, test and publications UIMA reader templates under "desc/reader".
     *
     * @throws IOException if a template cannot be read or its output cannot be written
     */
    public static void processTemplatesAllKnown() throws IOException
    {
        log.info("Processing UIMA reader templates.");

        processTemplate("desc/reader", ScrubberProperties.getUimaReaderFileTrain());
        processTemplate("desc/reader", ScrubberProperties.getUimaReaderFileTest());
        processTemplate("desc/reader", ScrubberProperties.getUimaReaderFilePublications());
    }

    /**
     * Processes the template stored as {@code directory/TEMPLATE.filename}.
     *
     * @param directory directory containing the template
     * @param filename  name of the generated file, without the template prefix
     * @throws IOException if the template cannot be read or its output cannot be written
     */
    public static void processTemplate(String directory, String filename) throws IOException
    {
        String SLASH = ScrubberProperties.SLASH;

        processTemplate(new File(directory + SLASH + TEMPLATE_FILE_PREFIX + filename));
    }

    /**
     * Processes a template, writing to the path returned by {@link #getOutputFile(File)}.
     *
     * @param templateFileAbsolutePath template to read
     * @return the file that was written
     * @throws IOException if the template cannot be read or its output cannot be written
     */
    public static File processTemplate(File templateFileAbsolutePath) throws IOException
    {
        return processTemplate(templateFileAbsolutePath, getOutputFile(templateFileAbsolutePath));
    }

    /**
     * Processes a template using the token map supplied by ScrubberProperties.
     *
     * @param templateFileAbsolutePath template to read
     * @param outputFileAbsolutePath   file to write
     * @return the file that was written
     * @throws IOException if the template cannot be read or its output cannot be written
     */
    public static File processTemplate(File templateFileAbsolutePath, File outputFileAbsolutePath) throws IOException
    {
        return processTemplate(templateFileAbsolutePath, outputFileAbsolutePath, ScrubberProperties.asTokenMap());
    }

    /**
     * Reads the template, applies every replacement and writes the result.
     *
     * @param templateFileAbsolutePath template to read
     * @param outputFileAbsolutePath   file to write
     * @param replacements             token to value map; entries with a null value are skipped
     * @return the file that was written
     * @throws IOException if the template cannot be read or its output cannot be written
     */
    public static File processTemplate(File templateFileAbsolutePath, File outputFileAbsolutePath, HashMap<String, String> replacements) throws IOException
    {
        String contents = read(templateFileAbsolutePath);

        for(String key: replacements.keySet())
        {
            String value = replacements.get(key);

            if (value == null)
            {
                log.warn("Template token has no value, leaving it in place: " + key);
                continue;
            }

            log.debug("Template is replacing: "+ key + " with "+ value);

            // The value is plain text (typically a path), so it must not be interpreted as a
            // replacement template: an unquoted '\' or '$' (e.g. C:\data) corrupts the output
            // or makes replaceAll throw.
            // NOTE(review): the key is still applied as a regular expression, as before;
            // confirm whether asTokenMap() produces literal tokens.
            contents = contents.replaceAll(key, java.util.regex.Matcher.quoteReplacement(value));
        }

        write(outputFileAbsolutePath, contents);

        return outputFileAbsolutePath;
    }

    /**
     * Derives the output path by removing every occurrence of the template prefix from the
     * template's absolute path.
     *
     * @param templateFile template file
     * @return the corresponding output file
     */
    public static File getOutputFile(File templateFile)
    {
        // Literal replace: with replaceAll the '.' of "TEMPLATE." is a wildcard and would
        // also strip e.g. "TEMPLATES" followed by any character.
        return new File(templateFile.getAbsolutePath().replace(TEMPLATE_FILE_PREFIX, ""));
    }

    /**
     * Convenience method, read all file contents (platform default charset).
     *
     * @param filename file to read
     * @return file contents as string
     * @throws IOException if the file cannot be opened or read
     */
    public static String read(final File filename) throws IOException
    {
        log.debug("Reading template from " + filename.getAbsolutePath());

        return read(new FileReader(filename));
    }

    /**
     * Convenience method, read all contents and close the reader.
     *
     * @param reader source to drain; always closed on return
     * @return contents as string
     * @throws IOException if reading fails
     */
    public static String read(final Reader reader) throws IOException
    {
        try
        {
            int len;

            final StringBuilder contents = new StringBuilder();

            final char[] buf = new char[1024];

            while((len = reader.read(buf)) > 0)
            {
                contents.append(buf, 0, len);
            }

            return contents.toString();
        }
        finally
        {
            reader.close();
        }
    }

    /**
     * Convenience method, write file contents, replacing any existing file.
     *
     * @param filename file to write
     * @param contents text to write
     * @throws IOException if the file cannot be opened or the write fails
     */
    public static void write(final File filename, final String contents) throws IOException
    {
        log.debug("Writing contents to " + filename.getAbsolutePath());

        PrintWriter output = null;

        try
        {
            output = new PrintWriter(new FileOutputStream(filename), false);

            output.write(contents);

            // PrintWriter never throws on write; checkError() flushes and reports failures.
            if (output.checkError())
            {
                throw new IOException("Failed writing " + filename.getAbsolutePath());
            }
        }
        finally
        {
            // output is still null when the file could not be opened
            if (output != null)
            {
                output.close();
            }
        }
    }
}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/templates/TemplateFileProcessor.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,67 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ ******************************************************************************/ +/** + * + */ +package org.spin.scrubber.uima.annotator; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.annotator.AnnotatorContextException; +
/**
 * Base class for the scrubber annotators; provides helpers that read array-valued
 * configuration parameters with a fallback.
 *
 * @author BF19
 */
public abstract class BaseAnnotator extends JCasAnnotator_ImplBase
{
    /**
     * Reads a String-array configuration parameter.
     *
     * @param context      UIMA context to read the parameter from
     * @param param        name of the configuration parameter
     * @param defaultValue value to fall back to
     * @return the configured array, or {@code defaultValue} when the parameter is unset or empty
     */
    protected static String[] safeGetConfigParameterStringArrayValue(UimaContext context, String param, String[] defaultValue)
    {
        final String[] configured = (String[]) context.getConfigParameterValue(param);
        final boolean usable = configured != null && configured.length != 0;
        return usable ? configured : defaultValue;
    }

    /**
     * Reads a Boolean-array configuration parameter.
     *
     * @param context      UIMA context to read the parameter from
     * @param param        name of the configuration parameter
     * @param defaultValue value to fall back to
     * @return the configured array, or {@code defaultValue} when the parameter is unset or empty
     */
    protected static Boolean[] safeGetConfigParameterBooleanArrayValue(UimaContext context, String param, Boolean[] defaultValue)
    {
        final Boolean[] configured = (Boolean[]) context.getConfigParameterValue(param);
        final boolean usable = configured != null && configured.length != 0;
        return usable ? configured : defaultValue;
    }
}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/BaseAnnotator.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java?rev=1500511&view=auto ============================================================================== ---
ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,262 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ ******************************************************************************/ +package org.spin.scrubber.uima.annotator; + +import com.mysql.jdbc.PreparedStatement; + +import org.apache.ctakes.typesystem.type.syntax.Chunk; +import org.apache.ctakes.typesystem.type.syntax.WordToken; +import org.apache.log4j.Logger; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.spin.scrubber.uima.dao.BaseDAO; +import org.spin.scrubber.uima.type.OntologyMatch; + +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.Date; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +
/**
 * Annotator that looks up word tokens and multi-word chunk phrases with the configured
 * SQL queries and records every hit as an {@link OntologyMatch}.
 * <p>
 * The queries come from the "lookupQuery" configuration parameter. Each one is executed
 * with the candidate text bound to its first placeholder, and its "code" and "ontology"
 * columns are read from the result.
 */
public class DictionaryAnnotator extends BaseAnnotator
{
    private static Logger log = Logger.getLogger(DictionaryAnnotator.class);
    private static boolean DEBUG = log.isDebugEnabled();

    /** Delimiters used to break covered text into words: whitespace, '.' and ','. */
    private static final String TOKEN_SPLIT_REGEX = "\\s+|\\.|\\,";

    private Connection conn = null;
    private String[] lookupQueryArray;

    /**
     * Reads the lookup queries and opens the database connection.
     *
     * @param aContext UIMA context supplying the "lookupQuery" parameter
     * @throws ResourceInitializationException if the connection cannot be opened
     */
    public void initialize(UimaContext aContext) throws ResourceInitializationException
    {
        log.info("initialize() ...");

        // Let the framework base class store the context so getContext() is usable.
        super.initialize(aContext);

        this.lookupQueryArray = safeGetConfigParameterStringArrayValue(aContext, "lookupQuery", new String[] {});

        try
        {
            conn = BaseDAO.getConnectionToScrubber();
        }
        catch (Exception e)
        {
            log.fatal("Failed to initialize", e);
            // pass the cause along so the real failure is visible to the caller
            throw new ResourceInitializationException(ResourceInitializationException.ANNOTATOR_INITIALIZATION_FAILED, new Object[] { "database connection" }, e);
        }

        log.info("initialize() is done.");
    }

    /**
     * Runs the dictionary lookup over the WordToken and Chunk annotations of the CAS.
     *
     * @param aJCas CAS to annotate
     * @throws AnalysisEngineProcessException declared by the framework contract
     */
    public void process(JCas aJCas) throws AnalysisEngineProcessException
    {
        log.debug("starting dict: " + new Date(System.currentTimeMillis()));

        // get document text from JCas
        if(DEBUG) log.debug("Doc:"+ aJCas.getDocumentText());

        processWordTokens(aJCas);
        processChunks(aJCas);

        if(DEBUG) log.debug("Finish dict: " + new Date(System.currentTimeMillis()));
    }

    /**
     * Looks up each word of every WordToken annotation; matches span the whole token.
     */
    private void processWordTokens(JCas aJCas)
    {
        //TODO: right now this depends on WordToken annotations. this should be changed so that the dictionary could be run on its own. w/o having to follow the pos tagger
        //TODO: add ability to lookup in flat files as well.
        Iterator<Annotation> annotIt = aJCas.getAnnotationIndex(WordToken.type).iterator();
        while(annotIt.hasNext())
        {
            Annotation annot = annotIt.next();
            if (!annot.getType().getShortName().equals("WordToken"))
            {
                continue;
            }

            for (String s : annot.getCoveredText().split(TOKEN_SPLIT_REGEX))
            {
                if (s.trim().length()==0)
                {
                    continue;
                }

                addMatches(aJCas, lookup(s), annot.getBegin(), annot.getEnd());
            }
        }
    }

    /**
     * Looks up every multi-word prefix and suffix phrase of each Chunk annotation.
     */
    private void processChunks(JCas aJCas)
    {
        //TODO: right now this depends on Chunk annotations. this should be changed so that the dictionary could be run on its own. w/o having to follow the pos tagger
        //TODO: add ability to lookup in flat files as well.
        Iterator<Annotation> annotIt = aJCas.getAnnotationIndex(Chunk.type).iterator();
        while(annotIt.hasNext())
        {
            Chunk annot = (Chunk)annotIt.next();
            String covered = annot.getCoveredText();

            for (String s : generatePermutations(covered))
            {
                if (s.trim().length()==0)
                {
                    continue;
                }

                Set<String> results = lookup(s);
                if (results.isEmpty())
                {
                    continue;
                }

                // The phrase is re-joined with single spaces, so it is not always a literal
                // substring of the chunk (punctuation, repeated whitespace). indexOf then
                // returns -1, which used to yield a span starting one character BEFORE the
                // chunk; fall back to the chunk's own span instead.
                int offset = covered.indexOf(s);
                int begin = (offset >= 0) ? annot.getBegin() + offset : annot.getBegin();
                int end = (offset >= 0) ? begin + s.length() : annot.getEnd();

                addMatches(aJCas, results, begin, end);
            }
        }
    }

    /**
     * Adds one OntologyMatch over [begin, end) for every "code|ontology" entry.
     */
    private void addMatches(JCas aJCas, Set<String> results, int begin, int end)
    {
        for (String r : results)
        {
            // limit 2 keeps a trailing empty ontology, so index 1 exists for "code|"
            String[] matchArray = r.split("\\|", 2);
            OntologyMatch match = new OntologyMatch(aJCas);
            match.setBegin(begin);
            match.setEnd(end);
            match.setCode(matchArray[0]);
            match.setOntology(matchArray.length > 1 ? matchArray[1] : "");
            match.addToIndexes();
        }
    }

    /**
     * implements a sliding window by getting all substrings of a string of words
     * from both the start of the string and from the end of the string,
     * skipping strings that are only 1 word because that case will already be handled by the wordtoken annotations.
     *
     * @param token phrase to expand
     * @return every prefix and suffix phrase of two or more words, words joined by one space
     */
    protected Set<String> generatePermutations(String token)
    {
        Set<String> results = new HashSet<String>();

        String[] tokens = token.split(TOKEN_SPLIT_REGEX);
        int len = tokens.length;

        if (len<=1)
        {
            return results;
        }

        //get permutations going FORWARD through the phrase
        for (int level=len-1; level>=0; level--)
        {
            String term = "";
            for (int iteration=0; iteration<=level; iteration++)
            {
                term = (term + " " + tokens[iteration]).trim();
            }
            if (term.length()>0 && term.contains(" "))
            {
                results.add(term);
            }
        }

        //get permutations going BACKWARD through the phrase
        for (int level=0; level<=len-1; level++)
        {
            String term = "";
            for (int iteration=level; iteration<=len-1; iteration++)
            {
                term = (term + " " + tokens[iteration]).trim();
            }
            if (term.length()>0 && term.contains(" "))
            {
                results.add(term);
            }
        }

        return results;
    }

    /**
     * Runs every configured lookup query for the token. A failing query is logged and
     * skipped; the remaining queries still run.
     *
     * @param token text to look up
     * @return set of "code|ontology" strings, empty when nothing matched
     */
    private Set<String> lookup(String token)
    {
        Set<String> results = new HashSet<String>();

        for (String sql : lookupQueryArray)
        {
            // Scoped per query so a failure never re-closes the previous query's handles.
            PreparedStatement ps = null;
            ResultSet rs = null;

            try
            {
                // NOTE(review): the cast ties this class to the MySQL driver's statement
                // class; kept because the parameter type of BaseDAO.closeRSPS is not visible here.
                ps = (PreparedStatement) conn.prepareStatement(sql);
                ps.setString(1, token);
                rs = ps.executeQuery();

                if(rs!=null)
                {
                    while (rs.next())
                    {
                        String code = rs.getString("code");
                        String ontology = rs.getString("ontology");
                        results.add(code+"|"+ontology);
                    }
                }
            }
            catch (SQLException e)
            {
                log.error("ERROR: failed selecting DICTIONARY matches for token: " + token, e);
            }
            finally
            {
                BaseDAO.closeRSPS(rs, ps);
            }
        }

        return results;
    }

    /** Closes the database connection opened in initialize. */
    public void destroy()
    {
        super.destroy();

        log.info("Closing connection.");
        BaseDAO.closeConnection(conn);

        log.info("Done.");
    }
}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/DictionaryAnnotator.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java?rev=1500511&view=auto ============================================================================== --- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java (added) +++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/RegexAnnotator.java Sun Jul 7 19:23:05 2013 @@ -0,0 +1,218 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + ******************************************************************************/ +package org.spin.scrubber.uima.annotator; + +import org.apache.log4j.Logger; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceAccessException; +import org.apache.uima.resource.ResourceInitializationException; +import org.spin.scrubber.uima.type.OntologyMatch; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * this class borrows much of its structure & flow from UIMA sandbox RegExAnnotator. + * the main difference is that this allows the usage of an external flat file where you can specify the name of the regex that was matched. + * for our uses, all regex matches can be of the same type, but we need to have a name of each one in to make identifying matches & maintenance of regex easier. 
+ * + * @author BF19 + * + */ +public class RegexAnnotator extends BaseAnnotator +{ + private static Logger log = Logger.getLogger(RegexAnnotator.class); + private static boolean DEBUG = log.isDebugEnabled(); + + private List nameList; + private List typeList; + private List patternList; + + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + log.info("initialize() ... "); + + super.initialize(aContext); + + nameList = new ArrayList(); + typeList = new ArrayList(); + patternList = new ArrayList(); + + try + { + //TODO: refactor into constants + String[] fileNameArray = safeGetConfigParameterStringArrayValue(getContext(), "Filenames", new String[] {}); + Boolean[] caseSensitiveArray = safeGetConfigParameterBooleanArrayValue(getContext(), "CaseSensitiveFile", new Boolean[] {}); + + for (int i=0; i pubsTFMap = null; + + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + log.info("initialize()..."); + + lookupQuery = (String) aContext.getConfigParameterValue("lookupQuery"); + + try + { + //select all pub token/cnt/pos + pubsTFMap = new TfDAO().selectPubTFMap(); + + //create db connection for later use + conn = BaseDAO.getConnectionToScrubber(); + } + catch (Exception e) + { + e.printStackTrace(); + throw new ResourceInitializationException(ResourceInitializationException.ANNOTATOR_INITIALIZATION_FAILED, new Object[] { "database connection" }); + } + + log.info("initialize() is done..."); + } + + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + if(DEBUG) log.debug("Starting tf: " + new Date(System.currentTimeMillis())); + + processWordTokens(aJCas); + processNumTokens(aJCas); + + if(DEBUG) log.debug("Finish tf: " + new Date(System.currentTimeMillis())); + } + + //TODO: refactor + private void processWordTokens(JCas aJCas) + { + Iterator annotIt = aJCas.getAnnotationIndex(WordToken.type).iterator(); + while(annotIt.hasNext()) + { + Annotation annot = annotIt.next(); + 
if (annot.getType().getShortName().equals("WordToken")) + { + createCalculationAnnotation(aJCas, annot); + } + } + } + + //TODO: refactor + private void processNumTokens(JCas aJCas) + { + Iterator annotIt = aJCas.getAnnotationIndex(NumToken.type).iterator(); + while(annotIt.hasNext()) + { + Annotation annot = annotIt.next(); + if (annot.getType().getShortName().equals("NumToken")) + { + createCalculationAnnotation(aJCas, annot); + } + } + } + + /** + * lower case the annotation text and look it up in the pub map both with and without the part of speech. + * calculate the 2 term frequencies. + * @param aJCas + * @param annot + */ + private void createCalculationAnnotation(JCas aJCas, Annotation annot) + { + String s = annot.getCoveredText().toLowerCase().trim(); + + if (s.trim().length()==0) + { + return; + } + + //TODO: move the feature name to the xml descriptor. + //get Features... //TODO: there must be a better way to do this... + String pos=null; + Feature posFeat = annot.getCAS().getTypeSystem().getFeatureByFullName("edu.mayo.bmi.uima.core.type.BaseToken:partOfSpeech"); + if (posFeat!=null) + pos = annot.getFeatureValueAsString(posFeat); + + //update all_pubs features + int pubTermPosCnt = (pubsTFMap.get(s+"|"+pos)==null) ? 0 : pubsTFMap.get(s+"|"+pos); + int pubTermCnt = (pubsTFMap.get(s)==null) ? 
0 : pubsTFMap.get(s); + + double pubTotalCnt = Double.valueOf(Integer.toString(pubsTFMap.get("totalPubCount"))); + + double hamWithPos = -1*Math.log10(pubTermPosCnt/pubTotalCnt); + double hamWithoutPos = -1*Math.log10(pubTermCnt/pubTotalCnt); + + Calculation match = new Calculation(aJCas); + + match.setBegin(annot.getBegin()); + match.setEnd(annot.getEnd()); + match.setCalculationName("tf_ham_with_pos"); + match.setCalculationValue(Double.toString(hamWithPos)); + match.addToIndexes(); + + match = new Calculation(aJCas); + match.setBegin(annot.getBegin()); + match.setEnd(annot.getEnd()); + match.setCalculationName("tf_ham_without_pos"); + match.setCalculationValue(Double.toString(hamWithoutPos)); + match.addToIndexes(); + } + + public void destroy() + { + super.destroy(); + + log.info("Closing connection."); + BaseDAO.closeConnection(conn); + + log.info("Done."); + } +} Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/annotator/TFAnnotator.java ------------------------------------------------------------------------------ svn:mime-type = text/plain