incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1443507 - in /incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser: MaxentParserWrapper.java ParserWrapper.java util/CommandLineParserUtil.java util/TreeUtils.java
Date Thu, 07 Feb 2013 14:39:52 GMT
Author: tmill
Date: Thu Feb  7 14:39:51 2013
New Revision: 1443507

URL: http://svn.apache.org/viewvc?rev=1443507&view=rev
Log:


Modified:
    incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java
    incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ParserWrapper.java
    incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/CommandLineParserUtil.java
    incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java

Modified: incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java?rev=1443507&r1=1443506&r2=1443507&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java (original)
+++ incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java Thu Feb  7 14:39:51 2013
@@ -20,119 +20,43 @@ package org.apache.ctakes.constituency.p
 
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.FileReader;
 import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
 
-//import opennlp.tools.lang.english.TreebankParser; // no longer part of OpenNLP as of 1.5
-import opennlp.model.AbstractModel;
-import opennlp.model.MaxentModel;
-import opennlp.tools.chunker.Chunker;
-import opennlp.tools.chunker.ChunkerME;
 import opennlp.tools.cmdline.parser.ParserTool;
-import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.parser.AbstractBottomUpParser;
-import opennlp.tools.parser.ChunkContextGenerator;
 import opennlp.tools.parser.Parse;
-import opennlp.tools.parser.ParserChunkerSequenceValidator;
 import opennlp.tools.parser.ParserModel;
 import opennlp.tools.parser.chunking.Parser;
-import opennlp.tools.parser.lang.en.HeadRules;
-import opennlp.tools.postag.POSDictionary;
-import opennlp.tools.postag.POSTagger;
-import opennlp.tools.postag.POSTaggerME;
-import opennlp.tools.postag.TagDictionary;
 import opennlp.tools.util.Span;
 
-import org.apache.log4j.Logger;
-import org.apache.uima.cas.FSIterator;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.cas.FSArray;
-import org.apache.uima.jcas.cas.StringArray;
-import org.apache.uima.jcas.tcas.Annotation;
-
+import org.apache.ctakes.constituency.parser.util.TreeUtils;
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
-import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
-import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
 import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
 import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
 import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.StringArray;
 
 public class MaxentParserWrapper implements ParserWrapper {
 
 	Parser parser = null;
-	private boolean useTagDictionary = true;
-	private boolean useCaseSensitiveTagDictionary = true;
 	private String parseStr = "";
 	Logger logger = Logger.getLogger(this.getClass().getName());
 	
 	public MaxentParserWrapper(String dataDir) {
 		try {
-			//parser = TreebankParser.getParser(dataDir, useTagDictionary, useCaseSensitiveTagDictionary, AbstractBottomUpParser.defaultBeamSize, AbstractBottomUpParser.defaultAdvancePercentage);
-			
 			File d = new File(dataDir);
-			
-			MaxentModel buildModel = null;
-			MaxentModel checkModel = null;
-			POSTagger posTagger = null;
-			Chunker chunker = null;
-			HeadRules headRules = null;
 
 			if (!d.isDirectory()) {
 				FileInputStream fis = new FileInputStream(d);
 				ParserModel model = new ParserModel(fis);
 				parser = new Parser(model, AbstractBottomUpParser.defaultBeamSize, AbstractBottomUpParser.defaultAdvancePercentage);
-			} else {
-				// This branch is for handling models built with OpenNLp 1.4
-				// Once the models are rebuilt using OpenNLP 1.5 this code should be removed
-				// @see TreebankParser.java in OpenNLP 1.4
-				{
-					File f = new File(d, "build.bin.gz"); // TODO consider moving these literals to an XML file or properties file
-					buildModel = new opennlp.maxent.io.SuffixSensitiveGISModelReader(f).getModel();
-				}
-				
-				{
-					File f = new File(d, "check.bin.gz");
-					checkModel = new opennlp.maxent.io.SuffixSensitiveGISModelReader(f).getModel();
-				}
-				
-				{
-					File f = new File(d, "pos.model.bin");
-					//File f = new File(d, "tag.bin.gz");
-					MaxentModel posModel = new opennlp.maxent.io.SuffixSensitiveGISModelReader(f).getModel();
-					if (useTagDictionary) {
-						File td = new File(d, "tagdict");
-						TagDictionary tagDictionary = new POSDictionary(td.getAbsolutePath()); //null;
-						posTagger = new POSTaggerME((AbstractModel) posModel, tagDictionary);
-					} else {
-						// f = new File(d, "dict.bin.gz");
-						Dictionary dictionary = null; // new Dictionary();
-						posTagger = new POSTaggerME((AbstractModel) posModel, dictionary);
-
-					}
-				}
-				
-				
-				{
-					File f = new File(d, "chunk.bin.gz");
-					MaxentModel chunkModel = new opennlp.maxent.io.SuffixSensitiveGISModelReader(f).getModel();
-					chunker = new ChunkerME(chunkModel);
-				}
-			
-				{
-					FileReader fr = new FileReader(new File(d, "head_rules"));
-					headRules = new HeadRules(fr);
-				}
-
-				parser = new Parser(buildModel, checkModel, posTagger, chunker, headRules); //TreebankParser.getParser(modelFileOrDirname, useTagDictionary, useCaseSensitiveTagDictionary, AbstractBottomUpParser.defaultBeamSize, AbstractBottomUpParser.defaultAdvancePercentage);
 			}
-			
-			
 		} catch (IOException e) {
 			e.printStackTrace();
 		}
@@ -151,158 +75,30 @@ public class MaxentParserWrapper impleme
 	 * For some reason the built-in tokenizer does not like that.
 	 */
 	@Override
-	public void createAnnotations(JCas jcas) {
+	public void createAnnotations(JCas jcas) throws AnalysisEngineProcessException {
 		String docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
 		logger.info("Started processing: " + docId);
 		// iterate over sentences
 		FSIterator iterator = jcas.getAnnotationIndex(Sentence.type).iterator();
-		// Map from indices in the parsed string to indices in the sofa
-		HashMap<Integer,Integer> indexMap = new HashMap<Integer,Integer>();
 		Parse parse = null;
 		
 		while(iterator.hasNext()){
 			Sentence sentAnnot = (Sentence) iterator.next();
-//			if(parser == null){
-//				sentAnnot.setParse("Parser not initialized properly.");
-//			}
 			if(sentAnnot.getCoveredText().length() == 0){
 				continue;
 			}
-			indexMap.clear();
-//			if(sentAnnot.getBegin() == 5287){
-//				System.err.println("At the beginning point...");
-//			}
-			FSArray termArray = getTerminals(jcas, sentAnnot);
-//			if(termArray.size() == 0){
-//				System.err.println("Array ofl ength 0");
-//			}
-			String sentStr = getSentence(termArray, indexMap);
-			StringBuffer parseBuff = new StringBuffer();
+			FSArray termArray = TreeUtils.getTerminals(jcas, sentAnnot);
+			String sentStr = TreeUtils.getSentence(termArray);
 			if(sentStr.length() == 0){
-//				System.err.println("String of length 0");
-				parseBuff.append("");
 				parse = null;
 			}else{
 				parse = ParserTool.parseLine(sentStr, parser, 1)[0];
-				parse.show(parseBuff);
 			}
-//			Span span = parse.getSpan();
-			parseStr = parseBuff.toString();
-			TopTreebankNode top = new TopTreebankNode(jcas, sentAnnot.getBegin(), sentAnnot.getEnd());
-			top.setTreebankParse(parseBuff.toString());
-			top.setTerminals(termArray);
-			top.setParent(null);
-			if(parse != null) recursivelyCreateStructure(jcas, top, parse, top, indexMap);
+			TopTreebankNode top = TreeUtils.buildAlignedTree(jcas, parse, sentAnnot);
+			top.addToIndexes();
 		}
-		logger.info("Done parsing: " + docId);
+//		logger.info("Done parsing: " + docId);
 	}
 
-	private void recursivelyCreateStructure(JCas jcas, TreebankNode parent, Parse parse, TopTreebankNode root, Map<Integer,Integer> imap){
-		String[] typeParts;
-		if(parse.getType().startsWith("-")){
-			// check for dash at the start (for escaped types like -RRB- and so forth that cannot take function tags anyways)
-			typeParts = new String[]{parse.getType()};
-		}else{
-			typeParts = parse.getType().split("-");
-		}
-		parent.setNodeType(typeParts[0]);
-		parent.setNodeValue(typeParts[0]);
-		parent.setLeaf(parse.getChildCount() == 0);
-		StringArray tags = new StringArray(jcas, typeParts.length-1);
-		for(int i = 1; i < typeParts.length; i++){
-			tags.set(i-1, typeParts[i]);
-		}
-		parent.setNodeTags(tags);
-		// This is not part of the MiPacq/SHARP type system, but it is hopefully being added. 
-		parent.setHeadIndex(parse.getHeadIndex());
-		
-		Parse[] subtrees = parse.getChildren();
-		FSArray children = new FSArray(jcas, subtrees.length);
-		
-		for(int i = 0; i < subtrees.length; i++){
-			Parse subtree = subtrees[i];
-			Span span = subtree.getSpan();
-			if(subtree.getChildCount() > 0){
-				try{
-					TreebankNode child = new TreebankNode(jcas, root.getBegin() + imap.get(span.getStart()), root.getBegin() + imap.get(span.getEnd()));
-					child.setParent(parent);
-					children.set(i, child);
-					recursivelyCreateStructure(jcas, child, subtree, root, imap);
-				}catch(NullPointerException e){
-					System.err.println("MaxentParserWrapper Error: " + e);
-				}
-			}else{
-				TerminalTreebankNode term = root.getTerminals(subtree.getHeadIndex());
-				children.set(i,term);
-				term.setParent(parent);
-			}
-//			children.set(i, child);
-		}
-		parent.setChildren(children);
-		parent.addToIndexes();
-	}
-	
-	private String getSentence(FSArray termArray, Map<Integer,Integer> imap){
-		StringBuffer sent = new StringBuffer();
-		int offset = 0;
-		
-		for(int i = 0; i < termArray.size(); i++){
-			TerminalTreebankNode ttn = (TerminalTreebankNode) termArray.get(i);
-			String word = ttn.getNodeType();
-			word = word.replaceAll("\\s", "");
-			if(i == 0) offset = ttn.getBegin();
-			else if(word.length() == 0) continue;
-			else sent.append(" ");
 
-			sent.append(word);
-//			imap.put(sent.length()-ttn.getNodeType().length(), ttn.getBegin()-offset);
-//			imap.put(sent.length(), ttn.getEnd()-offset);
-			imap.put(sent.length()-word.length(), ttn.getBegin()-offset);
-			imap.put(sent.length(), ttn.getEnd()-offset);
-		}
-		
-		return sent.toString();
-	}
-	
-	private FSArray getTerminals(JCas jcas, Sentence sent){
-		ArrayList<BaseToken> wordList = new ArrayList<BaseToken>();
-		FSIterator<Annotation> iterator = jcas.getAnnotationIndex(BaseToken.type).subiterator(sent);
-		while(iterator.hasNext()){
-			BaseToken w = (BaseToken)iterator.next();
-			if(w instanceof NewlineToken) continue;
-			wordList.add(w);
-		}
-		
-		FSArray terms = new FSArray(jcas, wordList.size());
-		for(int i = 0; i < wordList.size(); i++){
-			BaseToken w = (BaseToken) wordList.get(i);
-			TerminalTreebankNode ttn = new TerminalTreebankNode(jcas, w.getBegin(), w.getEnd());
-			ttn.setChildren(null);
-			ttn.setIndex(i);
-			ttn.setTokenIndex(i);
-			ttn.setLeaf(true);
-			ttn.setNodeTags(null);
-			if(w instanceof PunctuationToken){
-				String tokStr = w.getCoveredText();
-				if(tokStr.equals("(") || tokStr.equals("[")){
-					ttn.setNodeType("-LRB-");
-				}else if(tokStr.equals(")") || tokStr.equals("]")){
-					ttn.setNodeType("-RRB-");
-				}else if(tokStr.equals("{")){
-					ttn.setNodeType("-LCB-");
-				}else if(tokStr.equals("}")){
-					ttn.setNodeType("-RCB-");
-				}else{
-					ttn.setNodeType(w.getCoveredText());
-				}
-			}else{
-				ttn.setNodeType(w.getCoveredText());
-			}
-			ttn.setNodeValue(ttn.getNodeType());
-			ttn.addToIndexes();
-			terms.set(i, ttn);
-		}
-		
-		return terms;
-	}
 }

Modified: incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ParserWrapper.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ParserWrapper.java?rev=1443507&r1=1443506&r2=1443507&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ParserWrapper.java (original)
+++ incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ParserWrapper.java Thu Feb  7 14:39:51 2013
@@ -18,10 +18,11 @@
  */
 package org.apache.ctakes.constituency.parser;
 
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.FSIterator;
 import org.apache.uima.jcas.JCas;
 
 public interface ParserWrapper {
 	public String getParseString(FSIterator tokens);
-	public void createAnnotations(JCas jcas);
+	public void createAnnotations(JCas jcas) throws AnalysisEngineProcessException;
 }

Modified: incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/CommandLineParserUtil.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/CommandLineParserUtil.java?rev=1443507&r1=1443506&r2=1443507&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/CommandLineParserUtil.java (original)
+++ incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/CommandLineParserUtil.java Thu Feb  7 14:39:51 2013
@@ -54,5 +54,4 @@ public class CommandLineParserUtil {
 			System.out.println(parse.getTreebankParse());
 		}
 	}
-
 }

Modified: incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java?rev=1443507&r1=1443506&r2=1443507&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java (original)
+++ incubator/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java Thu Feb  7 14:39:51 2013
@@ -21,13 +21,25 @@ package org.apache.ctakes.constituency.p
 import opennlp.tools.parser.Parse;
 
 
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
 import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
 import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
 import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
 import org.apache.ctakes.utils.tree.SimpleTree;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.StringArray;
+import org.apache.uima.jcas.tcas.Annotation;
 
+import java.util.HashMap;
 import java.util.List;
 import java.util.ArrayList;
+import java.util.Map;
 
 public class TreeUtils {
 
@@ -157,5 +169,125 @@ public class TreeUtils {
 		top = (TopTreebankNode) cur;
 		return top;
 	}
+
+	public static TopTreebankNode buildAlignedTree(JCas jcas, Parse parse, Sentence sent) throws AnalysisEngineProcessException {
+		TopTreebankNode root = new TopTreebankNode(jcas, sent.getBegin(), sent.getEnd());
+		FSArray termArray = TreeUtils.getTerminals(jcas, sent);
+		
+		StringBuffer parseBuff = new StringBuffer();
+		
+		TopTreebankNode top = new TopTreebankNode(jcas, sent.getBegin(), sent.getEnd());
+		top.setTreebankParse(parseBuff.toString());
+		top.setTerminals(termArray);
+		top.setParent(null);
+		if(parse != null) recursivelyCreateStructure(jcas, top, parse, top);
+	
+		return root;
+	}
+	
+	public static FSArray getTerminals(JCas jcas, Sentence sent){
+		ArrayList<BaseToken> wordList = new ArrayList<BaseToken>();
+		FSIterator<Annotation> iterator = jcas.getAnnotationIndex(BaseToken.type).subiterator(sent);
+		while(iterator.hasNext()){
+			BaseToken w = (BaseToken)iterator.next();
+			if(w instanceof NewlineToken) continue;
+			wordList.add(w);
+		}
+		
+		FSArray terms = new FSArray(jcas, wordList.size());
+		for(int i = 0; i < wordList.size(); i++){
+			BaseToken w = wordList.get(i);
+			TerminalTreebankNode ttn = new TerminalTreebankNode(jcas, w.getBegin(), w.getEnd());
+			ttn.setChildren(null);
+			ttn.setIndex(i);
+			ttn.setTokenIndex(i);
+			ttn.setLeaf(true);
+			ttn.setNodeTags(null);
+			if(w instanceof PunctuationToken){
+				String tokStr = w.getCoveredText();
+				if(tokStr.equals("(") || tokStr.equals("[")){
+					ttn.setNodeType("-LRB-");
+				}else if(tokStr.equals(")") || tokStr.equals("]")){
+					ttn.setNodeType("-RRB-");
+				}else if(tokStr.equals("{")){
+					ttn.setNodeType("-LCB-");
+				}else if(tokStr.equals("}")){
+					ttn.setNodeType("-RCB-");
+				}else{
+					ttn.setNodeType(w.getCoveredText());
+				}
+			}else{
+				ttn.setNodeType(w.getCoveredText());
+			}
+			ttn.setNodeValue(ttn.getNodeType());
+			ttn.addToIndexes();
+			terms.set(i, ttn);
+		}
+		
+		return terms;
+	}
+	
+	public static String getSentence(FSArray termArray){
+		StringBuffer sent = new StringBuffer();
+		int offset = 0;
+		
+		for(int i = 0; i < termArray.size(); i++){
+			TerminalTreebankNode ttn = (TerminalTreebankNode) termArray.get(i);
+			String word = ttn.getNodeType();
+			word = word.replaceAll("\\s", "");
+			if(i == 0) offset = ttn.getBegin();
+			else if(word.length() == 0) continue;
+			else sent.append(" ");
+
+			sent.append(word);
+		}		
+		return sent.toString();
+	}
+	
+	private static void recursivelyCreateStructure(JCas jcas, TreebankNode parent, Parse parse, TopTreebankNode root) throws AnalysisEngineProcessException{
+		String[] typeParts;
+		if(parse.getType().startsWith("-")){
+			// check for dash at the start (for escaped types like -RRB- and so forth that cannot take function tags anyways)
+			typeParts = new String[]{parse.getType()};
+		}else{
+			typeParts = parse.getType().split("-");
+		}
+		parent.setNodeType(typeParts[0]);
+		parent.setNodeValue(typeParts[0]);
+		parent.setLeaf(parse.getChildCount() == 0);
+		StringArray tags = new StringArray(jcas, typeParts.length-1);
+		for(int i = 1; i < typeParts.length; i++){
+			tags.set(i-1, typeParts[i]);
+		}
+		parent.setNodeTags(tags);
+		parent.setHeadIndex(parse.getHeadIndex());
+		
+		Parse[] subtrees = parse.getChildren();
+		FSArray children = new FSArray(jcas, subtrees.length);
+		
+		for(int i = 0; i < subtrees.length; i++){
+			Parse subtree = subtrees[i];
+			if(subtree.getChildCount() > 0){
+				try{
+					TreebankNode child = new TreebankNode(jcas);
+					child.setParent(parent);
+					children.set(i, child);
+					recursivelyCreateStructure(jcas, child, subtree, root);
+				}catch(NullPointerException e){
+					System.err.println("MaxentParserWrapper Error: " + e);
+					throw new AnalysisEngineProcessException();
+				}
+			}else{
+				TerminalTreebankNode term = root.getTerminals(subtree.getHeadIndex());
+				children.set(i,term);
+				term.setParent(parent);
+			}
+		}
+		// after we've built up all the children we can fill in the span of the parent.
+		parent.setBegin(((TreebankNode)children.get(0)).getBegin());
+		parent.setEnd(((TreebankNode)children.get(subtrees.length-1)).getEnd());
+		parent.setChildren(children);
+//		parent.addToIndexes();
+	}
 }
 



Mime
View raw message