lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c.@apache.org
Subject svn commit: r1479234 [3/15] - in /lucene/dev/branches/lucene4956: dev-tools/idea/.idea/ dev-tools/idea/lucene/analysis/arirang/ lucene/analysis/ lucene/analysis/arirang/ lucene/analysis/arirang/src/ lucene/analysis/arirang/src/java/ lucene/analysis/ari...
Date Sun, 05 May 2013 03:40:06 GMT
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordEntry.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordEntry.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordEntry.java Sun May  5 03:39:51 2013
@@ -0,0 +1,95 @@
+package org.apache.lucene.analysis.kr.morph;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A dictionary entry: the word itself, an array of single-character feature
+ * flags addressed by the {@code IDX_*} constants, and the list of compound
+ * parts attached to the word.
+ */
+public class WordEntry {
+
+	// Index constants for the feature array.
+	public static final int IDX_NOUN = 0;
+	public static final int IDX_VERB = 1;
+	public static final int IDX_BUSA = 2;
+	public static final int IDX_DOV = 3;
+	public static final int IDX_BEV = 4;
+	public static final int IDX_NE = 5;
+	public static final int IDX_ADJ = 6; // adjective
+	public static final int IDX_NPR = 7;  // noun classification (M:Measure)
+	public static final int IDX_CNOUNX = 8; 
+	public static final int IDX_REGURA = 9;
+	
+	/**
+	 * The word.
+	 */
+	private String word;
+	
+	/**
+	 * Features of the word; may be null when the entry was built without them.
+	 */
+	private char[] features;
+	
+	/**
+	 * Compound parts of the word; starts out as an empty list.
+	 */
+	private List<CompoundEntry> compounds = new ArrayList<CompoundEntry>();
+	
+	/** Creates an entry with no word and no features. */
+	public WordEntry() {
+		
+	}
+	
+	/**
+	 * Creates an entry for the given word without features.
+	 * @param word the word
+	 */
+	public WordEntry(String word) {
+		this.word = word;
+	}
+	
+	/**
+	 * Creates an entry for the given word with the given features.
+	 * @param word the word
+	 * @param cs the feature array (stored by reference, not copied)
+	 */
+	public WordEntry(String word, char[] cs) {
+		this.word = word;
+		this.features = cs;
+	}
+	
+	/**
+	 * Creates an entry for the given word with the given compound parts.
+	 * The parameter stays a raw {@code List} so existing callers keep compiling.
+	 * @param word the word
+	 * @param c the compound parts (stored by reference, not copied)
+	 */
+	@SuppressWarnings("unchecked")
+	public WordEntry(String word, List c) {
+		this.word = word;
+		this.compounds = c;
+	}
+	
+	public void setWord(String w) {
+		this.word = w;
+	}
+	
+	public String getWord() {
+		return this.word;
+	}
+	
+	public void setFeatures(char[] cs) {
+		this.features = cs;
+	}
+	
+	/**
+	 * Returns the feature flag at the given position.
+	 * @param index one of the {@code IDX_*} constants
+	 * @return the flag, or {@code '0'} when no features are set or the index
+	 *         lies outside the feature array
+	 */
+	public char getFeature(int index) {
+		// ">=" (not ">") is required: index == length is already out of range.
+		if(features==null||index<0||index>=features.length) return '0';
+		return features[index];
+	}
+	
+	public char[] getFeatures() {
+		return this.features;
+	}
+	
+	public void setCompounds(List<CompoundEntry> c) {
+		this.compounds = c;
+	}
+	
+	public List<CompoundEntry> getCompounds() {
+		return this.compounds;
+	}
+	
+}

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordSpaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordSpaceAnalyzer.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordSpaceAnalyzer.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordSpaceAnalyzer.java Sun May  5 03:39:51 2013
@@ -0,0 +1,589 @@
+package org.apache.lucene.analysis.kr.morph;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.kr.utils.DictionaryUtil;
+import org.apache.lucene.analysis.kr.utils.MorphUtil;
+import org.apache.lucene.analysis.kr.utils.SyllableUtil;
+import org.apache.lucene.analysis.kr.utils.VerbUtil;
+
+public class WordSpaceAnalyzer {
+
+	/** Morphological analyzer used to analyze candidate phrases. */
+	private MorphAnalyzer morphAnal;
+	
+	/**
+	 * Creates the analyzer with its own {@link MorphAnalyzer}, configured with
+	 * exact-compound mode switched off.
+	 */
+	public WordSpaceAnalyzer() {
+		MorphAnalyzer analyzer = new MorphAnalyzer();
+		analyzer.setExactCompound(false);
+		this.morphAnal = analyzer;
+	}
+	
+	/**
+	 * Splits an unspaced input string into phrases.
+	 * <p>
+	 * Walks the input one character at a time, collects candidate analyses for
+	 * the text starting at {@code wStart}, and hands them to
+	 * {@code validationAndAppend}. A result of 1 moves the scan to the end of
+	 * the accepted phrase; a result of -1 restarts it from a position tracked
+	 * per phrase end in {@code fCounter}. Any text left over after the loop is
+	 * appended as a single noun phrase.
+	 *
+	 * @param input the text to analyze
+	 * @return the phrases collected in the output
+	 * @throws MorphException declared by the dictionary and analyzer calls used here
+	 */
+	public List<AnalysisOutput> analyze(String input)  throws MorphException {
+
+		// NOTE(review): 'stack' is never read in this method.
+		List stack = new ArrayList();
+		
+		WSOutput output = new WSOutput();
+		
+		// start index of the phrase currently being built
+		int wStart = 0;
+		
+		// NOTE(review): 'sgCount' is assigned below but never read.
+		int sgCount = -9;
+		
+		// maps a phrase-end position to the index the scan was last restarted from
+		Map<Integer, Integer> fCounter = new HashMap();
+		
+		for(int i=0;i<input.length();i++) {						
+			
+			char[] f = SyllableUtil.getFeature(input.charAt(i));
+			
+			// On the last character the literal "X" is used as the lookup prefix.
+			String prefix = i==input.length()-1 ? "X" : input.substring(wStart,i+2);					
+			Iterator iter = DictionaryUtil.findWithPrefix(prefix);
+			
+			List<AnalysisOutput> candidates = new ArrayList();		
+			
+			WordEntry entry = null;
+					
+			if(input.charAt(i)=='있' || input.charAt(i)=='없' || input.charAt(i)=='앞') {
+				addSingleWord(input.substring(wStart,i), candidates);
+				
+								
+			// If the next syllable is contained in a word of two or more syllables and this
+			// is not the last syllable, this is probably not a word-spacing position.
+			// Spacing is still possible when this could be a single word such as an adverb,
+			// determiner or interjection, but in that case the next syllable is examined
+			// (the original comment ends here).
+			} else if(i!= input.length()-1 && iter.hasNext()) { 
+				// Do nothing.
+				sgCount = i;
+			} else if(!iter.hasNext() && 
+					(entry=DictionaryUtil.getBusa(input.substring(wStart,i+1)))!=null) { 				
+				candidates.add(buildSingleOutput(entry));
+				
+			// If the current syllable may be where a josa (particle) or eomi (ending) starts...
+			} else if(f[SyllableUtil.IDX_EOGAN]=='1'||f[SyllableUtil.IDX_JOSA1]=='1'){				
+				if(f[SyllableUtil.IDX_JOSA1]=='1') 
+					candidates.addAll(anlysisWithJosa(input.substring(wStart), i-wStart));
+
+				if(f[SyllableUtil.IDX_EOGAN]=='1') 
+					candidates.addAll(anlysisWithEomi(input.substring(wStart), i-wStart));
+			}
+	
+			// Sort so that the most likely candidates come first.
+			Collections.sort(candidates, new WSOuputComparator());
+			
+			// Add the longest word as a single word.
+			appendSingleWord(candidates);
+			
+			// Handle words that failed analysis (compound-noun check).
+			analysisCompouns(candidates);
+			
+			// Sort again so that the most likely candidates come first.
+			Collections.sort(candidates, new WSOuputComparator());			
+			
+			int reseult = validationAndAppend(output, candidates, input);
+			if(reseult==1) {
+				// accepted: continue right after the phrase (the loop's i++ lands on getLastEnd())
+				i = output.getLastEnd()-1;
+				wStart = output.getLastEnd();
+			} else if(reseult==-1) {
+				// rejected: restart from the last phrase end, one index later on each repeat
+				Integer index = fCounter.get(output.getLastEnd());
+				if(index==null) index = output.getLastEnd();
+				else index = index + 1;
+				i = index;
+				wStart = output.getLastEnd();
+				fCounter.put(output.getLastEnd(), index);				
+			}
+
+		}
+		
+		// If analysis failed, return the original (remaining) string as-is.
+		if(output.getLastEnd()<input.length()) {
+			
+			String source = input.substring(output.getLastEnd());
+			int score = DictionaryUtil.getWord(source)==null ? AnalysisOutput.SCORE_ANALYSIS : AnalysisOutput.SCORE_CORRECT;
+			AnalysisOutput o =new AnalysisOutput(source,null,null,PatternConstants.POS_NOUN,
+					PatternConstants.PTN_N,score);
+			
+			o.setSource(source);
+			output.getPhrases().add(o);
+			morphAnal.confirmCNoun(o);
+			
+		}
+
+		return output.getPhrases();
+	}
+	
+	/**
+	 * Analyzes a phrase that ends with a josa (particle).
+	 * @param snipt text starting at the current phrase start
+	 * @param js index in {@code snipt} where the josa may begin
+	 * @return the candidates found; empty when {@code js < 1} or no valid josa end exists
+	 * @throws MorphException declared by the analyzer and dictionary calls used here
+	 */
+	private List<AnalysisOutput> anlysisWithJosa(String snipt, int js) throws MorphException {
+
+		List<AnalysisOutput> found = new ArrayList<AnalysisOutput>();
+		if(js<1) {
+			return found;
+		}
+
+		int josaEnd = findJosaEnd(snipt, js);
+		if(josaEnd==-1) {
+			return found; // not a valid josa
+		}
+
+		String phrase = snipt.substring(0,josaEnd);
+
+		// Move the stem/josa split point leftwards; stop after the first josa head
+		// syllable whose JOSA2 flag is '0'.
+		int split = phrase.length()-1;
+		while(split>0) {
+			char[] flags = SyllableUtil.getFeature(phrase.charAt(split));
+			if(flags[SyllableUtil.IDX_JOSA1]=='1') {
+				morphAnal.analysisWithJosa(phrase.substring(0,split),phrase.substring(split),found);
+			}
+			if(flags[SyllableUtil.IDX_JOSA2]=='0') {
+				break;
+			}
+			split--;
+		}
+
+		// A one-character phrase is added as a plain noun candidate.
+		if(phrase.length()==1) {
+			found.add(new AnalysisOutput(phrase,null,null,PatternConstants.POS_NOUN,
+					 PatternConstants.PTN_N,AnalysisOutput.SCORE_ANALYSIS));
+		}
+
+		fillSourceString(phrase, found);
+
+		return found;
+	}
+	
+	/**
+	 * Starting at the first josa syllable, looks at the following syllables that
+	 * can appear in the second-or-later position of a josa and finds the longest
+	 * josa.
+	 * @param snipt text starting at the current phrase start
+	 * @param jstart index of the first josa syllable (the character before it is read as well)
+	 * @return the index just past the josa, or -1 when no josa is accepted
+	 * @throws MorphException declared by the dictionary calls used here
+	 */
+	private int findJosaEnd(String snipt, int jstart) throws MorphException {
+
+		// [것을] never forms a noun.
+		if(snipt.charAt(jstart-1)=='것'&&(snipt.charAt(jstart)=='을')) {
+			return jstart+1;
+		}
+
+		// Handles cases such as 사랑스러운, 자랑스러운.
+		if(snipt.length()>jstart+2&&snipt.charAt(jstart+1)=='스') {
+			char[] jamo = MorphUtil.decompose(snipt.charAt(jstart+2));
+			if(jamo.length>=2&&jamo[0]=='ㄹ'&&jamo[1]=='ㅓ') {
+				return -1;
+			}
+		}
+
+		// Find the last syllable that may still belong to the josa.
+		int last = jstart;
+		int scan = jstart+1;
+		while(scan<snipt.length()
+				&& SyllableUtil.getFeature(snipt.charAt(scan))[SyllableUtil.IDX_JOSA2]!='0') {
+			last = scan;
+			scan++;
+		}
+
+		// Try the longest span first and shrink until a josa is accepted.
+		for(int end=last;end>=jstart;end--) {
+			String josa = snipt.substring(jstart,end+1);
+			if(!DictionaryUtil.existJosa(josa)) continue;
+			if(findNounWithinStr(snipt,end,end+2)) continue;
+			if(isNounPart(snipt,jstart)) continue;
+			return end+1;
+		}
+
+		return -1;
+
+	}
+	
+	/**
+	 * Stores the source string on every candidate, for later calculations and
+	 * for showing the original text.
+	 * @param source the source string to store
+	 * @param candidates the candidates to update
+	 */
+	private void fillSourceString(String source, List<AnalysisOutput> candidates) {
+
+		Iterator<AnalysisOutput> it = candidates.iterator();
+		while(it.hasNext()) {
+			it.next().setSource(source);
+		}
+
+	}
+	
+	/**
+	 * Adds a single-word candidate for the source of the first candidate.
+	 * The first entry of the list is expected to be the longest one.
+	 * @param candidates the candidate list; may be extended by one entry
+	 * @throws MorphException declared by the dictionary and analyzer calls used here
+	 */
+	private void appendSingleWord(List<AnalysisOutput> candidates) throws MorphException {
+
+		if(candidates.size()==0) {
+			return;
+		}
+
+		AnalysisOutput head = candidates.get(0);
+		String source = head.getSource();
+
+		// A dictionary hit (excluding verbs) is added directly.
+		WordEntry entry = DictionaryUtil.getWordExceptVerb(source);
+		if(entry!=null) {
+			candidates.add(buildSingleOutput(entry));
+			return;
+		}
+
+		// Skip when the head pattern lies in (PTN_VM, PTN_VMXMJ].
+		if(head.getPatn()>PatternConstants.PTN_VM&&
+				head.getPatn()<=PatternConstants.PTN_VMXMJ) {
+			return;
+		}
+
+		// Sources shorter than five characters are not tried as compound nouns.
+		if(source.length()<5) {
+			return;
+		}
+
+		AnalysisOutput noun = new AnalysisOutput(source,null,null,PatternConstants.POS_NOUN,
+				 PatternConstants.PTN_N,AnalysisOutput.SCORE_ANALYSIS);
+		noun.setSource(source);
+		morphAnal.confirmCNoun(noun);
+		if(noun.getScore()==AnalysisOutput.SCORE_CORRECT) {
+			candidates.add(noun);
+		}
+	}
+	
+	/**
+	 * Adds {@code source} to the candidates as a single word: the dictionary
+	 * entry when one exists (verbs excluded), otherwise a noun output that has
+	 * been passed through the compound-noun check.
+	 * @param source the text to add
+	 * @param candidates the list that receives the new candidate
+	 * @throws MorphException declared by the dictionary and analyzer calls used here
+	 */
+	private void addSingleWord(String source, List<AnalysisOutput> candidates) throws MorphException {
+
+		WordEntry entry = DictionaryUtil.getWordExceptVerb(source);
+		if(entry!=null) {
+			candidates.add(buildSingleOutput(entry));
+			return;
+		}
+
+		AnalysisOutput noun = new AnalysisOutput(source,null,null,PatternConstants.POS_NOUN,
+				 PatternConstants.PTN_N,AnalysisOutput.SCORE_ANALYSIS);
+		noun.setSource(source);
+		morphAnal.confirmCNoun(noun);
+		candidates.add(noun);
+
+	}
+	
+	/**
+	 * Analyzes a phrase that ends with an eomi (verb ending).
+	 * <p>
+	 * Determines the end of the ending, separates a leading part in front of
+	 * the verb when the dictionary prefix lookup allows it, analyzes the
+	 * verb+ending part and, when a leading part exists, attaches it to every
+	 * candidate as compound-noun entries.
+	 *
+	 * @param snipt text starting at the current phrase start
+	 * @param estart index in {@code snipt} where the ending may begin
+	 * @return the candidates found (possibly empty)
+	 * @throws MorphException declared by the dictionary and analyzer calls used here
+	 */
+	private List anlysisWithEomi(String snipt, int estart) throws MorphException {
+
+		List<AnalysisOutput> candidates = new ArrayList();
+		
+		int eend = findEomiEnd(snipt,estart);		
+
+		// Separate the noun in front of the verb: move vstart left while the
+		// dictionary still has words starting with snipt[i, estart).
+		int vstart = 0;
+		for(int i=estart-1;i>=0;i--) {	
+			Iterator iter = DictionaryUtil.findWithPrefix(snipt.substring(i,estart)); 
+			if(iter.hasNext()) vstart=i;
+			else break;
+		}
+			
+		if(snipt.length()>eend &&
+				DictionaryUtil.findWithPrefix(snipt.substring(vstart,eend+1)).hasNext()) 
+			return candidates;	// If the text up to the next syllable is still part of a word, do not split.
+		
+		// text in front of the verb, or null when the verb starts at index 0
+		String pvword = null;
+		if(vstart!=0) pvword = snipt.substring(0,vstart);
+			
+		while(true) { // The ending position was pushed back because of ㄹ/ㅁ/ㄴ; if the result is not a verb+ending form, shorten the phrase end by one.
+			String input = snipt.substring(vstart,eend);
+			anlysisWithEomiDetail(input, candidates);				
+			if(candidates.size()==0) break;		
+			if(("ㄹ".equals(candidates.get(0).getEomi()) ||
+					"ㅁ".equals(candidates.get(0).getEomi()) ||
+					"ㄴ".equals(candidates.get(0).getEomi())) &&
+					eend>estart+1 && candidates.get(0).getPatn()!=PatternConstants.PTN_VM &&
+					candidates.get(0).getPatn()!=PatternConstants.PTN_NSM
+					) {
+				// NOTE(review): candidates is not cleared before the next pass, so results accumulate.
+				eend--;
+			}else if(pvword!=null&&candidates.get(0).getPatn()>=PatternConstants.PTN_VM&& // In a noun + predicate phrase, the predicate phrase never forms a word by itself.
+					candidates.get(0).getPatn()<=PatternConstants.PTN_VMXMJ && DictionaryUtil.getWord(input)!=null){
+				candidates.clear();
+				break;
+			}else if(pvword!=null&&VerbUtil.verbSuffix(candidates.get(0).getStem())
+					&&DictionaryUtil.getNoun(pvword)!=null){ // Handle noun + verbalizing suffix + ending.
+				candidates.clear();
+				anlysisWithEomiDetail(snipt.substring(0,eend), candidates);
+				pvword=null;
+				break;				
+			} else {
+				break;
+			}
+		}
+						
+		if(candidates.size()>0&&pvword!=null) {
+			AnalysisOutput o =new AnalysisOutput(pvword,null,null,PatternConstants.POS_NOUN,
+					PatternConstants.PTN_N,AnalysisOutput.SCORE_ANALYSIS);	
+			morphAnal.confirmCNoun(o);
+			
+			// When no compound parts were produced, use the leading word itself as the only part.
+			List<CompoundEntry> cnouns = o.getCNounList();
+			if(cnouns.size()==0) {
+				boolean is = DictionaryUtil.getWordExceptVerb(pvword)!=null;
+				cnouns.add(new CompoundEntry(pvword,0,is));
+			} 
+			
+			for(AnalysisOutput candidate : candidates) {
+				candidate.getCNounList().addAll(cnouns);
+				candidate.getCNounList().add(new CompoundEntry(candidate.getStem(),0,true));
+				candidate.setStem(pvword+candidate.getStem()); // Needed so that WSOutput handles the compound noun correctly.
+			}
+			
+		}
+
+		fillSourceString(snipt.substring(0,eend), candidates);
+	
+		return candidates;
+	}
+	
+	/**
+	 * Runs the verb+ending analysis over {@code input}, trying successively
+	 * longer endings taken from the right-hand side.
+	 * @param input the text to analyze (its last character is read, so it must not be empty)
+	 * @param candidates the list that receives the results
+	 * @throws MorphException declared by the analyzer calls used here
+	 */
+	private void anlysisWithEomiDetail(String input, List<AnalysisOutput> candidates ) 
+	throws MorphException {
+
+		int len = input.length();
+
+		// Try the whole input with an empty ending when the last syllable carries
+		// one of the YNPNA / YNPLA / YNPMA flags.
+		char[] lastFlags = SyllableUtil.getFeature(input.charAt(len-1));
+		boolean wholeInput = lastFlags[SyllableUtil.IDX_YNPNA]=='1'
+				|| lastFlags[SyllableUtil.IDX_YNPLA]=='1'
+				|| lastFlags[SyllableUtil.IDX_YNPMA]=='1';
+		if(wholeInput) {
+			morphAnal.analysisWithEomi(input,"",candidates);
+		}
+
+		// Move the split point leftwards; stop after the first ending whose head
+		// syllable has the EOMI2 flag '0'.
+		int split = len-1;
+		while(split>0) {
+			char[] headFlags = SyllableUtil.getFeature(input.charAt(split));
+			morphAnal.analysisWithEomi(input.substring(0,split),input.substring(split),candidates);
+			if(headFlags[SyllableUtil.IDX_EOMI2]=='0') {
+				break;
+			}
+			split--;
+		}
+
+	}
+	
+	/**
+	 * Starting at the first syllable of the ending, examines the following
+	 * syllables that can be used in an ending and finds the longest ending.
+	 * @param snipt text starting at the current phrase start
+	 * @param estart index of the first syllable of the ending
+	 * @return {@code estart + jend + 1}, where {@code jend} is the offset of the
+	 *         last accepted ending syllable (0 when none beyond the first is accepted)
+	 * @throws MorphException declared by the dictionary calls used here
+	 */
+	private int findEomiEnd(String snipt, int estart) throws MorphException {
+		
+		int jend = 0;
+		
+		// When the first syllable decomposes into three jamo ending in ㄴ/ㄹ/ㅂ,
+		// that syllable is replaced by 은/을/습 in the tail that is examined.
+		String tail = null;
+		char[] chr = MorphUtil.decompose(snipt.charAt(estart));
+		if(chr.length==3 && (chr[2]=='ㄴ')) {
+			tail = '은'+snipt.substring(estart+1);
+		}else if(chr.length==3 && (chr[2]=='ㄹ')) {
+			tail = '을'+snipt.substring(estart+1);			
+		}else if(chr.length==3 && (chr[2]=='ㅂ')) {
+			tail = '습'+snipt.substring(estart+1);
+		}else {
+			tail = snipt.substring(estart);
+		}				
+
+		// Find the last syllable that can still be part of the ending
+		// (stops at the first syllable whose IDX_EOGAN flag is '0').
+		int start = 0;
+		for(int i=1;i<tail.length();i++) {
+			char[] f = SyllableUtil.getFeature(tail.charAt(i));	
+			if(f[SyllableUtil.IDX_EOGAN]=='0') break;
+			start = i;				
+		}
+					
+		for(int i=start;i>0;i--) { // Even if nothing is found, one syllable must always be returned.
+			String str = tail.substring(0,i+1);	
+			char[] chrs = MorphUtil.decompose(tail.charAt(i));	
+			if(DictionaryUtil.existEomi(str) || 
+					(i<2&&chrs.length==3&&(chrs[2]=='ㄹ'||chrs[2]=='ㅁ'||chrs[2]=='ㄴ'))) { // No predicate has consecutive ㅁ/ㄹ/ㄴ; to be verified against the dictionary.
+				jend = i;
+				break;
+			}
+		}
+		
+		return estart+jend+1;
+		
+	}
+	
+	/**
+	 * After validation, appends the top-ranked candidate to the result.
+	 * 
+	 * @param output the phrases accepted so far; may have its last phrase removed
+	 * @param candidates the sorted candidates; the first entry (and possibly the second) is removed
+	 * @param input the complete input string being analyzed
+	 * @return 0 when there are no candidates, 1 when a candidate was appended,
+	 *         -1 when the candidate was rejected
+	 * @throws MorphException declared by the dictionary calls used here
+	 */
+	private int validationAndAppend(WSOutput output, List<AnalysisOutput> candidates, String input)
+	throws MorphException {
+		
+		if(candidates.size()==0) return 0;
+		
+		// o: best candidate; po: the previously accepted phrase, or null when there is none
+		AnalysisOutput o = candidates.remove(0);		
+		AnalysisOutput po = output.getPhrases().size()>0 ?  output.getPhrases().get(output.getPhrases().size()-1) : null;
+		
+		// the part of the candidate's source that follows its stem
+		String ejend = o.getSource().substring(o.getStem().length());
+		
+		// NOTE(review): 'chrs' and 'pjend' are computed but never read in this method.
+		char[] chrs = po!=null&&po.getStem().length()>0 ? MorphUtil.decompose(po.getStem().charAt(po.getStem().length()-1)) : null;
+		String pjend = po!=null&&po.getStem().length()>0 ? po.getSource().substring(po.getStem().length()) : null;
+		
+		// ja: final jamo of the previous phrase's ending, for the VM/VMCM/VMXM patterns
+		char ja = 'x'; // arbitrary placeholder character
+		if(po!=null&&(po.getPatn()==PatternConstants.PTN_VM||po.getPatn()==PatternConstants.PTN_VMCM||po.getPatn()==PatternConstants.PTN_VMXM)) {		
+			char[] chs = MorphUtil.decompose(po.getEomi().charAt(po.getEomi().length()-1));
+			if(chs.length==3) ja=chs[2];
+			else if(chs.length==1) ja=chs[0];			
+		}
+		
+		// index in input just past the candidate
+		int nEnd = output.getLastEnd()+o.getSource().length();
+		
+		// features of the character following the candidate, or null at the end of input
+		char[] f = nEnd<input.length() ? SyllableUtil.getFeature(input.charAt(nEnd)) : null;			
+		
+		// NOTE(review): when 'o' is replaced below, 'ejend' and 'nEnd' still refer to
+		// the first candidate — confirm this is intended.
+		// Cases like 밥먹고 are possible, but 먹고 is not a noun.
+		if(po!=null&&po.getPatn()==PatternConstants.PTN_N&&candidates.size()>0&&  
+				o.getPatn()==PatternConstants.PTN_VM&&candidates.get(0).getPatn()==PatternConstants.PTN_N) {
+			o = candidates.remove(0); 			
+		}else if(po!=null&&po.getPatn()>=PatternConstants.PTN_VM&&candidates.size()>0&&
+				candidates.get(0).getPatn()==PatternConstants.PTN_N&&
+				(ja=='ㄴ'||ja=='ㄹ')) { // prevents splitting into 다녀가ㄴ / 사,람(e)
+			o = candidates.remove(0);
+		}
+		
+		//=============================================
+		if(o.getPos()==PatternConstants.POS_NOUN && MorphUtil.hasVerbOnly(o.getStem())) {		
+			output.removeLast();		
+			return -1;
+		}else if(nEnd<input.length() && f[SyllableUtil.IDX_JOSA1]=='1' 
+			&& DictionaryUtil.getNoun(o.getSource())!=null) {
+			return -1;
+		}else if(nEnd<input.length() && o.getScore()==AnalysisOutput.SCORE_ANALYSIS 
+			&& DictionaryUtil.findWithPrefix(ejend+input.charAt(nEnd)).hasNext()) { // prevents the split 루씬하ㄴ 글형태소분석기
+			return -1;	
+		}else if(po!=null&&po.getPatn()==PatternConstants.PTN_VM&&"ㅁ".equals(po.getEomi())&&
+				o.getStem().equals("하")) { // prevents splitting into 다짐 합니다
+			output.removeLast();
+			return -1;	
+		}else if(po!=null&&po.getPatn()==PatternConstants.PTN_N&&VerbUtil.verbSuffix(o.getStem())&&
+				!"있".equals(o.getStem())) { // handles 사랑받다, 사랑스러운; however 있 does not combine with the preceding word
+			output.removeLast();
+			return -1;			
+		} else {	
+			output.addPhrase(o);				
+		}
+				
+		return 1;
+	}
+	
+	
+	private AnalysisOutput buildSingleOutput(WordEntry entry) {
+		
+		char pos = PatternConstants.POS_NOUN;
+		
+		int ptn = PatternConstants.PTN_N;
+		
+		if(entry.getFeature(WordEntry.IDX_NOUN)=='0') {
+			pos = PatternConstants.POS_AID;
+			ptn = PatternConstants.PTN_AID;
+		}
+		
+		AnalysisOutput o = new AnalysisOutput(entry.getWord(),null,null,pos,
+				ptn,AnalysisOutput.SCORE_CORRECT);
+		
+		o.setSource(entry.getWord());
+		
+		return o;
+	}
+	
+	/**
+	 * Decides, per candidate, whether to try a compound-noun decomposition and
+	 * performs it through {@code morphAnal.confirmCNoun}.
+	 * <p>
+	 * Candidates already scored SCORE_CORRECT are left alone; the first such
+	 * candidate whose verb suffix is "하" stops the scan. Other candidates are
+	 * checked when their pattern is at most PTN_VM and their stem is longer
+	 * than two characters, except plain nouns once a correct non-NJ candidate
+	 * has been seen.
+	 *
+	 * @param candidates the candidates to inspect, in ranked order
+	 * @throws MorphException declared by the analyzer call used here
+	 */
+	private void analysisCompouns(List<AnalysisOutput> candidates) throws MorphException {
+		
+		// true once a SCORE_CORRECT candidate with a pattern other than PTN_NJ was seen
+		boolean correct = false;
+		for(AnalysisOutput o:candidates) {
+			
+			if(o.getScore()==AnalysisOutput.SCORE_CORRECT) {
+				if(o.getPatn()!=PatternConstants.PTN_NJ) correct=true;
+				// "활성화해" analyzed successfully as [활성화(N),하(t),어야(e)]; prevent it
+				// from being decomposed as [활성/화해].
+				if("하".equals(o.getVsfx())) break; 
+				continue;
+			}
+
+			if(o.getPatn()<=PatternConstants.PTN_VM&&o.getStem().length()>2) {
+				 if(!(correct&&o.getPatn()==PatternConstants.PTN_N)) morphAnal.confirmCNoun(o);
+			}
+		}
+		
+	}
+	
+	/**
+	 * Looks for the first syllable at or after {@code es} that can start a josa
+	 * and reports whether the text from {@code ws} up to that syllable is a
+	 * dictionary word.
+	 * @param str	the whole string being analyzed
+	 * @param ws	start position of the word to look up
+	 * @param es	position from which the josa-start syllable is searched
+	 * @return true when such a syllable exists and the text before it is in the dictionary
+	 * @throws MorphException declared by the dictionary calls used here
+	 */
+	private boolean findNounWithinStr(String str, int ws, int es) throws MorphException {
+
+		if(str.length()<es) {
+			return false;
+		}
+
+		int pos = es;
+		while(pos<str.length()) {
+			char[] flags = SyllableUtil.getFeature(str.charAt(pos));
+			if(flags[SyllableUtil.IDX_JOSA1]=='1') {
+				return DictionaryUtil.getWord(str.substring(ws,pos))!=null;
+			}
+			pos++;
+		}
+
+		return false;
+	}
+	
+	/**
+	 * Currently always returns false: the unconditional guard at the top
+	 * disables the dictionary lookup below it.
+	 * @param str the text to inspect
+	 * @param jstart index of the josa start within {@code str}
+	 * @return false
+	 * @throws MorphException declared by the (disabled) dictionary lookup
+	 */
+	private boolean isNounPart(String str, int jstart) throws MorphException  {
+
+		// The check is switched off; everything below is intentionally kept but never runs.
+		if(true) return false;
+
+		int from = jstart-1;
+		while(from>=0) {
+			String span = str.substring(from,jstart+1);
+			if(DictionaryUtil.getWordExceptVerb(span)!=null) {
+				return true;
+			}
+			from--;
+		}
+
+		return false;
+
+	}
+	
+	/**
+	 * Debug helper: prints every phrase of the output with its score on one
+	 * line of standard output, followed by the marker {@code <==}.
+	 * @param output the output whose phrases are printed
+	 */
+	private void printCandidate(WSOutput output) {
+
+		Iterator<AnalysisOutput> it = output.getPhrases().iterator();
+		while(it.hasNext()) {
+			AnalysisOutput phrase = it.next();
+			System.out.print(phrase.toString()+"("+phrase.getScore()+")| ");
+		}
+		System.out.println("<==");
+
+	}	
+}

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/tagging/Tagger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/tagging/Tagger.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/tagging/Tagger.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/tagging/Tagger.java Sun May  5 03:39:51 2013
@@ -0,0 +1,317 @@
+package org.apache.lucene.analysis.kr.tagging;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
+import org.apache.lucene.analysis.kr.morph.MorphException;
+import org.apache.lucene.analysis.kr.morph.PatternConstants;
+import org.apache.lucene.analysis.kr.utils.ConstraintUtil;
+import org.apache.lucene.analysis.kr.utils.FileUtil;
+import org.apache.lucene.analysis.kr.utils.KoreanEnv;
+import org.apache.lucene.analysis.kr.utils.StringUtil;
+import org.apache.lucene.analysis.kr.utils.Trie;
+
+
+/**
+ * Selects the best one among several morphological analysis results.
+ * This is meant to be called sentence by sentence.
+ */
+public class Tagger {
+		
+	// Grammar rules keyed by prefix; loaded on the first getGR() call.
+	private static Trie<String, String[]> occurrences;
+	
+	// Key passed to KoreanEnv to obtain the tagger dictionary location.
+	private static final String tagDicLoc = "tagger.dic";
+	
+	// Rule field value for which the word and ending checks are skipped.
+	private static final String NILL = "NILL";
+	
+	// Rule field value for which the pattern checks are skipped.
+	private static final String NOPATN = "0";
+	
+	// Result of the most recent tagging() call, used as the preceding phrase for the next one.
+	private AnalysisOutput po;
+	
+	/**
+	 * Selects the best analysis for a phrase when no following phrase is known.
+	 * @param psource surface form of the phrase
+	 * @param pmorphs candidate analyses of the phrase
+	 * @return the selected analysis, or null when {@code pmorphs} is null or empty
+	 * @throws MorphException when the tagger dictionary cannot be read
+	 */
+	public AnalysisOutput tagging(String psource, List<AnalysisOutput> pmorphs)  throws MorphException {
+					
+		return tagging(psource, null, pmorphs, null);
+		
+	}
+	
+	/**
+	 * Selects the best analysis for a phrase, using the following phrase when
+	 * one is given. The selection is remembered and serves as the preceding
+	 * phrase on the next call.
+	 * @param psource surface form of the phrase
+	 * @param rsource surface form of the following phrase; may be null
+	 * @param pmorphs candidate analyses of the phrase
+	 * @param rmorphs candidate analyses of the following phrase; may be null
+	 * @return the selected analysis, or null when {@code pmorphs} is null or empty
+	 * @throws MorphException when the tagger dictionary cannot be read
+	 */
+	public AnalysisOutput tagging(String psource, String rsource, List<AnalysisOutput> pmorphs, List<AnalysisOutput> rmorphs)  throws MorphException {
+
+		// The result is always one of pmorphs, so there is nothing to select without
+		// them, whatever rmorphs holds. (Previously a null or empty pmorphs combined
+		// with a non-empty rmorphs ran into an exception inside lookupBest.)
+		if(pmorphs==null||pmorphs.isEmpty()) return null;
+	
+		po = lookupBest(psource, rsource, pmorphs, rmorphs);
+		
+		po.setSource(psource);
+
+		return po;
+		
+	}
+	
+	/**
+	 * Picks the best of the candidate analyses.
+	 * <p>
+	 * A single candidate is returned as-is. Otherwise the following phrase is
+	 * consulted first (when given), then the preceding phrase (when one has
+	 * been recorded); if neither yields a match the first candidate is returned.
+	 * @param psource surface form of the phrase
+	 * @param rsource surface form of the following phrase
+	 * @param pmorphs candidate analyses of the phrase; must not be empty
+	 * @param rmorphs candidate analyses of the following phrase; may be null
+	 * @return the selected analysis
+	 * @throws MorphException when the tagger dictionary cannot be read
+	 */
+	private AnalysisOutput lookupBest(String psource,String rsource, List<AnalysisOutput> pmorphs, List<AnalysisOutput> rmorphs)  throws MorphException {
+
+		if(pmorphs.size()==1) {
+			return pmorphs.get(0);
+		}
+
+		// 1. rules driven by the following phrase
+		if(rmorphs!=null&&rmorphs.size()!=0) {
+			AnalysisOutput byNext = lookupBestByRWord(psource, rsource, pmorphs, rmorphs);
+			if(byNext!=null) {
+				return byNext;
+			}
+		}
+
+		// 2. rules driven by the preceding phrase
+		if(po!=null) {
+			AnalysisOutput byPrevious = lookupBestByPWord(psource, pmorphs);
+			if(byPrevious!=null) {
+				return byPrevious;
+			}
+		}
+
+		return pmorphs.get(0);
+	}
+	
+	/**
+	 * Decides the current phrase from the preceding phrase ({@code po}), which
+	 * must not be null.
+	 * <p>
+	 * For every candidate the rules keyed by the surface form and then by the
+	 * candidate's stem are tried. When nothing matches, the entries collected
+	 * for removal are dropped from {@code rmorphs} while more than one remains.
+	 * @param rsource surface form of the current phrase
+	 * @param rmorphs candidate analyses of the current phrase
+	 * @return the first candidate accepted by a rule, or null
+	 * @throws MorphException when the tagger dictionary cannot be read
+	 */
+	private AnalysisOutput lookupBestByPWord(String rsource, List<AnalysisOutput> rmorphs)  throws MorphException {
+
+		List<AnalysisOutput> rejected = new ArrayList<AnalysisOutput>();
+
+		Iterator<AnalysisOutput> candidates = rmorphs.iterator();
+		while(candidates.hasNext()) {
+			AnalysisOutput candidate = candidates.next();
+
+			// rules keyed by the surface form
+			AnalysisOutput hit = selectBest(getGR("F"+rsource+"^W"), po.getSource(), rsource, po, candidate, true, rejected);
+			if(hit!=null) {
+				return hit;
+			}
+
+			// rules keyed by the stem
+			hit = selectBest(getGR("F"+candidate.getStem()+"^S"), po.getSource(), rsource, po, candidate, true, rejected);
+			if(hit!=null) {
+				return hit;
+			}
+		}
+
+		for(AnalysisOutput drop : rejected) {
+			if(rmorphs.size()>1) {
+				rmorphs.remove(drop);
+			}
+		}
+
+		return null;
+
+	}
+	
+	/**
+	 * Decides the current phrase from the following phrase, which must not be null.
+	 * <p>
+	 * Following-phrase candidates are used only while they are scored
+	 * SCORE_CORRECT. For each pair of candidates the rules keyed by the surface
+	 * form, by the current candidate's josa/eomi, and by its stem are tried in
+	 * that order. When nothing matches, the entries collected for removal are
+	 * dropped from {@code pmorphs} while more than one remains.
+	 * @param psource surface form of the current phrase
+	 * @param rsource surface form of the following phrase
+	 * @param pmorphs candidate analyses of the current phrase
+	 * @param rmorphs candidate analyses of the following phrase
+	 * @return the first current-phrase candidate accepted by a rule, or null
+	 * @throws MorphException when the tagger dictionary cannot be read
+	 */
+	private AnalysisOutput lookupBestByRWord(String psource, String rsource, List<AnalysisOutput> pmorphs, List<AnalysisOutput> rmorphs)  throws MorphException {
+		
+		List<AnalysisOutput> removes = new ArrayList<AnalysisOutput>();
+		
+		for(AnalysisOutput rmorph : rmorphs) {
+			
+			if(rmorph.getScore()!=AnalysisOutput.SCORE_CORRECT) break;
+			
+			for(AnalysisOutput pmorph : pmorphs) {						
+			
+				// rules keyed by the surface form of the current phrase
+				Iterator<String[]> iterw = getGR("R"+psource+"^W/");
+				
+				// josa of the current candidate, or its eomi when it has no josa
+				String pend = pmorph.getJosa();
+				if(pend==null) pend = pmorph.getEomi();
+				
+				AnalysisOutput best = selectBest(iterw, psource, rsource, pmorph, rmorph, false, removes);
+				if(best!=null) return best;	
+								
+				// rules keyed by the ending only
+				Iterator<String[]> iters = getGR("R"+NILL+"/"+pend+"/");	
+				best = selectBest(iters, psource, rsource, pmorph, rmorph, false, removes);
+				if(best!=null) return best;	
+				
+				// rules keyed by the stem
+				iters = getGR("R"+pmorph.getStem()+"^S/");	
+				best = selectBest(iters, psource, rsource, pmorph, rmorph, false, removes);
+				if(best!=null) return best;					
+				
+			}
+						
+		}		
+		
+		for(AnalysisOutput morph : removes) {
+			if(pmorphs.size()>1) pmorphs.remove(morph);
+		}
+		
+		return null;
+		
+	}
+	
+	/**
+	 * Walks the given rules until one accepts the pair of analyses.
+	 * <p>
+	 * When a rule does not match and its seventh field is "1", {@code pmorph}
+	 * is recorded in {@code removes} (once) and the walk stops.
+	 * NOTE(review): {@code pmorph} is recorded even when {@code rear} is true — confirm.
+	 * @param iter the rules to try
+	 * @param psource surface form of the front phrase
+	 * @param rsource surface form of the rear phrase
+	 * @param pmorph analysis of the front phrase
+	 * @param rmorph analysis of the rear phrase
+	 * @param rear true to return {@code rmorph} on a match, false to return {@code pmorph}
+	 * @param removes collects analyses to be removed by the caller
+	 * @return the accepted analysis, or null when no rule matched
+	 */
+	private AnalysisOutput selectBest(Iterator<String[]> iter, String psource, String rsource, 
+			AnalysisOutput pmorph, AnalysisOutput rmorph, boolean rear, List removes) {
+
+		while(iter.hasNext()) {
+
+			String[] rule = iter.next();
+
+			if(checkGrammer(rule, psource, rsource, pmorph, rmorph, rear)) {
+				return rear ? rmorph : pmorph;
+			}
+
+			if(!"1".equals(rule[6])) {
+				continue;
+			}
+
+			if(!removes.contains(pmorph)) {
+				removes.add(pmorph);
+			}
+			return null;
+		}
+
+		return null;
+
+	}
+	
+	/**
+	 * Checks one grammar rule against a front/rear pair of analyses.
+	 * <p>
+	 * Fields 0-2 of the rule constrain the front phrase (word, ending, pattern)
+	 * and fields 3-5 the rear phrase. A word or ending field equal to NILL and a
+	 * pattern field equal to NOPATN are not checked. The front word is checked
+	 * only when {@code depFront} is true, the rear word only when it is false.
+	 * @param values the rule fields
+	 * @param psource surface form of the front phrase
+	 * @param rsource surface form of the rear phrase
+	 * @param pmorph analysis of the front phrase
+	 * @param rmorph analysis of the rear phrase
+	 * @param depFront selects which of the two word checks applies
+	 * @return true when every applicable check passes
+	 */
+	private boolean checkGrammer(String[] values, String psource, String rsource, AnalysisOutput pmorph, AnalysisOutput rmorph, boolean depFront) {
+
+		// josa of each phrase, falling back to its eomi
+		String frontEnd = pmorph.getJosa();
+		if(frontEnd==null) frontEnd = pmorph.getEomi();
+
+		String rearEnd = rmorph.getJosa();
+		if(rearEnd==null) rearEnd = rmorph.getEomi();
+
+		return (!depFront || NILL.equals(values[0]) || checkWord(psource,values[0],pmorph)) // front word
+				&& (NILL.equals(values[1]) || checkEomi(values[1], frontEnd))                // front ending
+				&& (NOPATN.equals(values[2]) || checkPattern(values[2], pmorph.getPatn()))  // front pattern
+				&& (depFront || NILL.equals(values[3]) || checkWord(rsource,values[3],rmorph)) // rear word
+				&& (NILL.equals(values[4]) || checkEomi(values[4], rearEnd))                 // rear ending
+				&& (NOPATN.equals(values[5]) || checkPattern(values[5], rmorph.getPatn())); // rear pattern
+
+	}
+	
+	/**
+	 * Checks a word constraint of the form {@code w1,w2,...^T}. When the type
+	 * {@code T} is "S" the stem of the analysis is compared, otherwise the
+	 * surface form.
+	 * @param source surface form of the phrase
+	 * @param value the constraint
+	 * @param morph analysis of the phrase
+	 * @return true when one of the listed words equals the compared text
+	 */
+	private boolean checkWord(String source, String value, AnalysisOutput morph) {
+
+		String[] parts = StringUtil.split(value,"^");
+		String[] words = StringUtil.split(parts[0],",");
+
+		String target = "S".equals(parts[1]) ? morph.getStem() : source;
+
+		for(String word : words) {
+			if(word.equals(target)) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+	
+	/**
+	 * Checks whether the ending is one of the comma-separated endings in {@code value}.
+	 * @param value comma-separated list of allowed endings
+	 * @param rend the ending to look for; may be null
+	 * @return true when the ending is listed
+	 */
+	private boolean checkEomi(String value, String rend) {
+
+		for(String allowed : StringUtil.split(value,",")) {
+			if(allowed.equals(rend)) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+	
+	/**
+	 * Checks a pattern constraint, a comma-separated list whose items are "E"
+	 * (any eomi phrase), "J" (any josa noun phrase or a plain noun) or a
+	 * pattern number.
+	 * @param value the constraint
+	 * @param ptn the pattern of the analysis
+	 * @return true when one of the items accepts the pattern
+	 */
+	private boolean checkPattern(String value, int ptn) {
+
+		String ptnText = String.valueOf(ptn);
+
+		for(String item : StringUtil.split(value,",")) {
+
+			boolean eomiClass = "E".equals(item)&&ConstraintUtil.isEomiPhrase(ptn);
+			if(eomiClass) {
+				return true;
+			}
+
+			boolean josaClass = "J".equals(item)&&
+					(ConstraintUtil.isJosaNounPhrase(ptn)||ptn==PatternConstants.PTN_N);
+			if(josaClass) {
+				return true;
+			}
+
+			if(item.equals(ptnText)) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+	
+	/**
+	 * Returns the grammar rules whose key starts with the given prefix, loading
+	 * the tagger dictionary first when it has not been loaded yet.
+	 * @param prefix the key prefix
+	 * @return an iterator over the matching rules
+	 * @throws MorphException when the tagger dictionary cannot be read
+	 */
+	public static synchronized Iterator<String[]> getGR(String prefix) throws MorphException {
+
+		if(occurrences==null) {
+			loadTaggerDic();
+		}
+
+		return occurrences.getPrefixedBy(prefix);
+	}
+	
+	/**
+	 * Loads the tagger dictionary into {@code occurrences}.
+	 * <p>
+	 * Every line must consist of four ':'-separated fields; other lines are
+	 * skipped. The trie key is the first field followed by a key built from the
+	 * second and third fields (swapped for "F" rules); the value is the
+	 * '/'-separated content of fields two to four.
+	 * <p>
+	 * The trie is built in a local variable and published only after the whole
+	 * file has been read, so a failed load leaves {@code occurrences} unset and
+	 * the next {@code getGR} call retries instead of using a partial dictionary.
+	 *
+	 * @throws MorphException when the dictionary cannot be read or parsed
+	 */
+	private static synchronized void loadTaggerDic() throws MorphException {
+		
+		Trie<String, String[]> loaded = new Trie<String, String[]>(true);
+		
+		try {
+			
+			List<String> strs = FileUtil.readLines(KoreanEnv.getInstance().getValue(tagDicLoc), "UTF-8");
+			
+			for(String str : strs) {
+				if(str==null) continue;
+				str = str.trim();
+				String[] syls = StringUtil.split(str,":");
+				if(syls.length!=4) continue;
+				
+				// key: everything up to the last '/' of one field (slash kept) followed by
+				// everything before the last '/' of the other field
+				String key = null;				
+				if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
+				else key = syls[1].substring(0,syls[1].lastIndexOf("/")+1) + syls[2].substring(0,syls[2].lastIndexOf("/"));
+				
+				String[] patns = StringUtil.split(syls[1]+"/"+syls[2]+"/"+syls[3],"/");
+				
+				loaded.add(syls[0]+key, patns);
+				
+			}			
+			
+		} catch (Exception e) {
+			// NOTE(review): the cause is reduced to its message here; chain it if
+			// MorphException offers a (String, Throwable) constructor.
+			throw new MorphException("Fail to read the tagger dictionary.("+tagDicLoc+")\n"+e.getMessage());
+		}
+		
+		occurrences = loaded;
+	}
+		
+}

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java Sun May  5 03:39:51 2013
@@ -0,0 +1,165 @@
+package org.apache.lucene.analysis.kr.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.kr.morph.PatternConstants;
+
+/**
+ * Lookup tables and predicates that decide which words, endings, particles
+ * and analysis patterns are allowed to combine.
+ */
+public class ConstraintUtil {
+
+	/** Nouns that can form a compound with "화해", e.g. "민족화해". */
+	private static final Map<String, String> hahes = new HashMap<String, String>();
+	static {
+		hahes.put("민족", "Y");
+		hahes.put("동서", "Y");
+		hahes.put("남북", "Y");
+	}
+
+	/** The bare consonants ㄴ, ㄹ, ㅁ tested by {@link #isNLM(String)}. */
+	private static final Map<String, String> eomiPnouns = new HashMap<String, String>();
+	static {
+		eomiPnouns.put("ㄴ", "Y");
+		eomiPnouns.put("ㄹ", "Y");
+		eomiPnouns.put("ㅁ", "Y");
+	}
+
+	/** Patterns accepted by {@link #isEomiPhrase(int)}. */
+	private static final Map<Integer, Integer> PTN_MLIST = new HashMap<Integer, Integer>();
+	static {
+		PTN_MLIST.put(PatternConstants.PTN_NSM, PatternConstants.PTN_NSM);
+		PTN_MLIST.put(PatternConstants.PTN_NSMXM, PatternConstants.PTN_NSMXM);
+		PTN_MLIST.put(PatternConstants.PTN_NJCM, PatternConstants.PTN_NJCM);
+		PTN_MLIST.put(PatternConstants.PTN_VM, PatternConstants.PTN_VM);
+		PTN_MLIST.put(PatternConstants.PTN_VMCM, PatternConstants.PTN_VMCM);
+		PTN_MLIST.put(PatternConstants.PTN_VMXM, PatternConstants.PTN_VMXM);
+		PTN_MLIST.put(PatternConstants.PTN_NVM, PatternConstants.PTN_NVM);
+	}
+
+	/** Patterns accepted by {@link #isJosaNounPhrase(int)}. */
+	private static final Map<Integer, Integer> PTN_JLIST = new HashMap<Integer, Integer>();
+	static {
+		PTN_JLIST.put(PatternConstants.PTN_NJ, PatternConstants.PTN_NJ);
+		PTN_JLIST.put(PatternConstants.PTN_NSMJ, PatternConstants.PTN_NSMJ);
+		PTN_JLIST.put(PatternConstants.PTN_VMJ, PatternConstants.PTN_VMJ);
+		PTN_JLIST.put(PatternConstants.PTN_VMXMJ, PatternConstants.PTN_VMXMJ);
+	}
+
+	/** Words ending in "것"; not read by any method of this class. */
+	private static final Map<String, String> WORD_GUKS = new HashMap<String, String>();
+	static {
+		WORD_GUKS.put("날것", "Y");
+		WORD_GUKS.put("들것", "Y");
+		WORD_GUKS.put("별것", "Y");
+		WORD_GUKS.put("찰것", "Y");
+		WORD_GUKS.put("탈것", "Y");
+		WORD_GUKS.put("하잘것", "Y");
+	}
+
+	/** Particles that cannot follow a syllable that has a final consonant. */
+	private static final Map<String, String> JOSA_TWO = new HashMap<String, String>();
+	static {
+		JOSA_TWO.put("가", "Y");
+		JOSA_TWO.put("는", "Y");
+		JOSA_TWO.put("다", "Y");
+		JOSA_TWO.put("나", "Y");
+		JOSA_TWO.put("니", "Y");
+		JOSA_TWO.put("고", "Y");
+		JOSA_TWO.put("라", "Y");
+		JOSA_TWO.put("와", "Y");
+		JOSA_TWO.put("랑", "Y");
+		JOSA_TWO.put("를", "Y");
+		JOSA_TWO.put("며", "Y");
+		JOSA_TWO.put("든", "Y");
+		JOSA_TWO.put("야", "Y");
+		JOSA_TWO.put("여", "Y");
+	}
+
+	/** Particles that cannot follow a syllable that has no final consonant. */
+	private static final Map<String, String> JOSA_THREE = new HashMap<String, String>();
+	static {
+		JOSA_THREE.put("과", "Y");
+		JOSA_THREE.put("은", "Y");
+		JOSA_THREE.put("아", "Y");
+		JOSA_THREE.put("으", "Y");
+		JOSA_THREE.put("을", "Y");
+	}
+
+	/**
+	 * Tells whether the given noun is registered as combinable with "화해".
+	 *
+	 * @param key the noun preceding "화해"
+	 * @return true if the noun is in the table
+	 */
+	public static boolean canHaheCompound(String key) {
+		return hahes.containsKey(key);
+	}
+
+	/**
+	 * Checks whether an ending is ㄴ, ㄹ or ㅁ, or ends in a syllable whose
+	 * third decomposed element (the final consonant) is one of them.
+	 *
+	 * @param eomi the ending to test; may be null or empty
+	 * @return true only when one of the two conditions above holds
+	 */
+	public static boolean isNLM(String eomi) {
+
+		if(eomi==null || "".equals(eomi)) return false;
+
+		if(eomiPnouns.containsKey(eomi)) return true;
+
+		char[] chrs = MorphUtil.decompose(eomi.charAt(eomi.length()-1));
+		if(chrs.length==3 && eomiPnouns.containsKey(Character.toString(chrs[2]))) return true;
+
+		// Previously this fell through to "return true", so every non-empty
+		// ending was accepted regardless of its final consonant.
+		return false;
+
+	}
+
+	/**
+	 * @param ptn an analysis pattern constant
+	 * @return true if the pattern is one of the ending-phrase patterns in PTN_MLIST
+	 */
+	public static boolean isEomiPhrase(int ptn) {
+		return PTN_MLIST.containsKey(ptn);
+	}
+
+	/**
+	 * @param ptn an analysis pattern constant
+	 * @return true if the pattern is one of the particle-phrase patterns in PTN_JLIST
+	 */
+	public static boolean isJosaNounPhrase(int ptn) {
+		return PTN_JLIST.containsKey(ptn);
+	}
+
+	/**
+	 * @param ptn an analysis pattern constant
+	 * @return true if the pattern equals {@code PatternConstants.PTN_ADVJ}
+	 */
+	public static boolean isJosaAdvPhrase(int ptn) {
+		return PatternConstants.PTN_ADVJ==ptn;
+	}
+
+	/**
+	 * @param ptn an analysis pattern constant
+	 * @return true if the pattern equals {@code PTN_ADVJ} or {@code PTN_AID}
+	 */
+	public static boolean isAdvPhrase(int ptn) {
+		return PatternConstants.PTN_ADVJ==ptn || PatternConstants.PTN_AID==ptn;
+	}
+
+	/**
+	 * @param josa a particle
+	 * @return true if the particle cannot follow a syllable with a final consonant
+	 */
+	public static boolean isTwoJosa(String josa) {
+		return JOSA_TWO.containsKey(josa);
+	}
+
+	/**
+	 * @param josa a particle
+	 * @return true if the particle cannot follow a syllable without a final consonant
+	 */
+	public static boolean isThreeJosa(String josa) {
+		return JOSA_THREE.containsKey(josa);
+	}
+}

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/DictionaryUtil.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/DictionaryUtil.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/DictionaryUtil.java Sun May  5 03:39:51 2013
@@ -0,0 +1,308 @@
+package org.apache.lucene.analysis.kr.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.kr.morph.CompoundEntry;
+import org.apache.lucene.analysis.kr.morph.MorphException;
+import org.apache.lucene.analysis.kr.morph.WordEntry;
+
+public class DictionaryUtil {
+	
+	private static Trie<String,WordEntry> dictionary;
+	
+	private static HashMap josas;
+	
+	private static HashMap eomis;
+	
+	private static HashMap prefixs;
+	
+	private static HashMap suffixs;
+	
+	private static HashMap<String,WordEntry> uncompounds;
+	
+	private static HashMap<String, String> cjwords;
+	
+	/**
+	 * 사전을 로드한다.
+	 */
+	public synchronized static void loadDictionary() throws MorphException {
+		
+		dictionary = new Trie<String, WordEntry>(true);
+		List<String> strList = null;
+		List<String> compounds = null;
+		try {
+			strList = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_DICTIONARY),"UTF-8");
+			strList.addAll(FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_EXTENSION),"UTF-8"));
+			compounds = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_COMPOUNDS),"UTF-8");			
+		} catch (IOException e) {			
+			new MorphException(e.getMessage(),e);
+		} catch (Exception e) {
+			new MorphException(e.getMessage(),e);
+		}
+		if(strList==null) throw new MorphException("dictionary is null");;
+		
+		for(String str:strList) {
+			String[] infos = StringUtil.split(str,",");
+			if(infos.length!=2) continue;
+			infos[1] = infos[1].trim();
+			if(infos[1].length()==6) infos[1] = infos[1].substring(0,5)+"000"+infos[1].substring(5);
+			
+			WordEntry entry = new WordEntry(infos[0].trim(),infos[1].trim().toCharArray());
+			dictionary.add(entry.getWord(), entry);
+		}
+		
+		for(String compound: compounds) {		
+			String[] infos = StringUtil.split(compound,":");
+			if(infos.length!=2) continue;
+			WordEntry entry = new WordEntry(infos[0].trim(),"20000000X".toCharArray());
+			entry.setCompounds(compoundArrayToList(infos[1], StringUtil.split(infos[1],",")));
+			dictionary.add(entry.getWord(), entry);
+		}
+	}
+	
+	public static Iterator findWithPrefix(String prefix) throws MorphException {
+		if(dictionary==null) loadDictionary();
+		return dictionary.getPrefixedBy(prefix);
+	}
+
+	public static WordEntry getWord(String key) throws MorphException {		
+		if(dictionary==null) loadDictionary();
+		if(key.length()==0) return null;
+		
+		return (WordEntry)dictionary.get(key);
+	}
+	
+	public static WordEntry getWordExceptVerb(String key) throws MorphException {		
+		WordEntry entry = getWord(key);		
+		if(entry==null) return null;
+		
+		if(entry.getFeature(WordEntry.IDX_NOUN)=='1'||
+				entry.getFeature(WordEntry.IDX_BUSA)=='1') return entry;
+		return null;
+	}
+	
+	public static WordEntry getNoun(String key) throws MorphException {	
+
+		WordEntry entry = getWord(key);
+		if(entry==null) return null;
+		
+		if(entry.getFeature(WordEntry.IDX_NOUN)=='1') return entry;
+		return null;
+	}
+	
+	public static WordEntry getCNoun(String key) throws MorphException {	
+
+		WordEntry entry = getWord(key);
+		if(entry==null) return null;
+
+		if(entry.getFeature(WordEntry.IDX_NOUN)=='1' || entry.getFeature(WordEntry.IDX_NOUN)=='2') return entry;
+		return null;
+	}
+	
+	public static WordEntry getVerb(String key) throws MorphException {
+		
+		WordEntry entry = getWord(key);	
+		if(entry==null) return null;
+
+		if(entry.getFeature(WordEntry.IDX_VERB)=='1') {
+			return entry;
+		}
+		return null;
+	}
+	
+	public static WordEntry getAdverb(String key) throws MorphException {
+		WordEntry entry = getWord(key);
+		if(entry==null) return null;
+
+		if(entry.getFeature(WordEntry.IDX_BUSA)=='1') return entry;
+		return null;
+	}
+	
+	public static WordEntry getBusa(String key) throws MorphException {
+		WordEntry entry = getWord(key);
+		if(entry==null) return null;
+
+		if(entry.getFeature(WordEntry.IDX_BUSA)=='1'&&entry.getFeature(WordEntry.IDX_NOUN)=='0') return entry;
+		return null;
+	}
+	
+	public static WordEntry getIrrVerb(String key, char irrType) throws MorphException {
+		WordEntry entry = getWord(key);
+		if(entry==null) return null;
+
+		if(entry.getFeature(WordEntry.IDX_VERB)=='1'&&
+				entry.getFeature(WordEntry.IDX_REGURA)==irrType) return entry;
+		return null;
+	}
+	
+	public static WordEntry getBeVerb(String key) throws MorphException {
+		WordEntry entry = getWord(key);
+		if(entry==null) return null;
+		
+		if(entry.getFeature(WordEntry.IDX_BEV)=='1') return entry;
+		return null;
+	}
+	
+	public static WordEntry getDoVerb(String key) throws MorphException {
+		WordEntry entry = getWord(key);
+		if(entry==null) return null;
+		
+		if(entry.getFeature(WordEntry.IDX_DOV)=='1') return entry;
+		return null;
+	}
+	
+	public static WordEntry getUncompound(String key) throws MorphException {
+		
+		try {
+			if(uncompounds==null) {
+				uncompounds = new HashMap();
+				List<String> lines = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_UNCOMPOUNDS),"UTF-8");	
+				for(String compound: lines) {		
+					String[] infos = StringUtil.split(compound,":");
+					if(infos.length!=2) continue;
+					WordEntry entry = new WordEntry(infos[0].trim(),"90000X".toCharArray());
+					entry.setCompounds(compoundArrayToList(infos[1], StringUtil.split(infos[1],",")));
+					uncompounds.put(entry.getWord(), entry);
+				}			
+			}	
+		}catch(Exception e) {
+			throw new MorphException(e);
+		}
+		return uncompounds.get(key);
+	}
+	
+	public static String getCJWord(String key) throws MorphException {
+		
+		try {
+			if(cjwords==null) {
+				cjwords = new HashMap();
+				List<String> lines = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_CJ),"UTF-8");	
+				for(String cj: lines) {		
+					String[] infos = StringUtil.split(cj,":");
+					if(infos.length!=2) continue;
+					cjwords.put(infos[0], infos[1]);
+				}			
+			}	
+		}catch(Exception e) {
+			throw new MorphException(e);
+		}
+		return cjwords.get(key);
+		
+	}
+	
+	public static boolean existJosa(String str) throws MorphException {
+		if(josas==null) {
+			josas = new HashMap();
+			readFile(josas,KoreanEnv.FILE_JOSA);
+		}	
+		if(josas.get(str)==null) return false;
+		else return true;
+	}
+	
+	public static boolean existEomi(String str)  throws MorphException {
+		if(eomis==null) {
+			eomis = new HashMap();
+			readFile(eomis,KoreanEnv.FILE_EOMI);
+		}
+
+		if(eomis.get(str)==null) return false;
+		else return true;
+	}
+	
+	public static boolean existPrefix(String str)  throws MorphException {
+		if(prefixs==null) {
+			prefixs = new HashMap();
+			readFile(prefixs,KoreanEnv.FILE_PREFIX);
+		}
+
+		if(prefixs.get(str)==null) return false;
+		else return true;
+	}
+	
+	public static boolean existSuffix(String str)  throws MorphException {
+		if(suffixs==null) {
+			suffixs = new HashMap();
+			readFile(suffixs,KoreanEnv.FILE_SUFFIX);
+		}
+
+		if(suffixs.get(str)!=null) return true;
+		
+		return false;
+	}
+	
+	/**
+	 * ㄴ,ㄹ,ㅁ,ㅂ과 eomi 가 결합하여 어미가 될 수 있는지 점검한다.
+	 * @param s
+	 * @param end
+	 * @return
+	 */
+	public static String combineAndEomiCheck(char s, String eomi) throws MorphException {
+	
+		if(eomi==null) eomi="";
+
+		if(s=='ㄴ') eomi = "은"+eomi;
+		else if(s=='ㄹ') eomi = "을"+eomi;
+		else if(s=='ㅁ') eomi = "음"+eomi;
+		else if(s=='ㅂ') eomi = "습"+eomi;
+		else eomi = s+eomi;
+
+		if(existEomi(eomi)) return eomi;		
+
+		return null;
+		
+	}
+	
+	/**
+	 * 
+	 * @param map
+	 * @param type	1: josa, 2: eomi
+	 * @throws MorphException
+	 */
+	private static synchronized void readFile(HashMap map, String dic) throws MorphException {		
+		
+		String path = KoreanEnv.getInstance().getValue(dic);
+
+		try{
+			List<String> line = FileUtil.readLines(path,"UTF-8");
+			for(int i=1;i<line.size();i++) {
+				map.put(line.get(i).trim(), line.get(i));
+			}
+		}catch(IOException e) {
+ 		    throw new MorphException(e.getMessage(),e);
+		} catch (Exception e) {
+ 		    throw new MorphException(e.getMessage(),e);
+		}
+	}
+	
+	private static List compoundArrayToList(String source, String[] arr) {
+		List list = new ArrayList();
+		for(String str: arr) {
+			CompoundEntry ce = new CompoundEntry(str);
+			ce.setOffset(source.indexOf(str));
+			list.add(ce);
+		}
+		return list;
+	}
+}
+

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/EomiUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/EomiUtil.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/EomiUtil.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/EomiUtil.java Sun May  5 03:39:51 2013
@@ -0,0 +1,665 @@
+package org.apache.lucene.analysis.kr.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
+import org.apache.lucene.analysis.kr.morph.MorphException;
+import org.apache.lucene.analysis.kr.morph.PatternConstants;
+
+public class EomiUtil {
+
+	
+	/** Result marker "0" (failure, by name); not referenced in the visible part of this class. */
+	public static final String RESULT_FAIL = "0";
+	
+	/** Result marker "1" (success, by name); not referenced in the visible part of this class. */
+	public static final String RESULT_SUCCESS = "1";
+	
+	/** Suffix strings, by name verb-forming suffixes; where they are used is outside this excerpt. */
+	public static final String[] verbSuffix = {
+		  "이","하","되","스럽","스러우","시키","있","없","같","당하","만하","드리","받","나","내"
+	};
+	
+	/**
+	 * Splits off the longest ending (eomi) at the tail of a term.
+	 * <p>
+	 * Split points are tried from the end of the term towards its start. Every
+	 * split point that matches overwrites the previous result, so the match
+	 * found furthest to the left (the longest ending) wins.
+	 *
+	 * @param term the surface form to split
+	 * @return a two-element array: [0] the stem (the whole term if nothing
+	 *         matched), [1] the ending (null if nothing matched)
+	 * @throws MorphException if the ending dictionary cannot be read
+	 */
+	public static String[] longestEomi(String term) throws MorphException  {
+		
+		String[] result = new String[2];
+		result[0] = term;
+		
+		String stem;
+		String eomi;
+		char[] efeature;
+		
+		for(int i=term.length();i>0;i--) {
+			
+			stem = term.substring(0,i);
+		
+			if(i!=term.length()) {
+				eomi = term.substring(i);
+				efeature  = SyllableUtil.getFeature(eomi.charAt(0));
+			} else {
+				efeature = SyllableUtil.getFeature(stem.charAt(i-1));
+				eomi="";
+			}
+
+			if(SyllableUtil.isAlpanumeric(stem.charAt(i-1))) break;
+			
+			char[] jasos = MorphUtil.decompose(stem.charAt(i-1));
+	
+			if(!"".equals(eomi)&&!DictionaryUtil.existEomi(eomi)) {
+				// The tail is not a known ending: nothing to record at this split point.
+			} else if(jasos.length>2&&
+					(jasos[2]=='ㄴ'||jasos[2]=='ㄹ'||jasos[2]=='ㅁ'||jasos[2]=='ㅂ')&&
+					DictionaryUtil.combineAndEomiCheck(jasos[2], eomi)!=null) {
+				// The final consonant of the last stem syllable belongs to the ending:
+				// strip it from the stem and report it as the ending.
+				result[0] = stem.substring(0,i-1)+Character.toString(MorphUtil.makeChar(stem.charAt(i-1), 0));
+				result[1] = Character.toString(jasos[2]);
+			}else if((stem.endsWith("하")&&"여".equals(eomi))||
+					(stem.endsWith("가")&&"거라".equals(eomi))||
+					(stem.endsWith("오")&&"너라".equals(eomi))) {
+				result[0] = stem;
+				result[1] = eomi;
+			}else if(jasos.length==2&&(!stem.endsWith("아")&&!stem.endsWith("어"))&&
+					(jasos[1]=='ㅏ'||jasos[1]=='ㅓ'||jasos[1]=='ㅔ'||jasos[1]=='ㅐ')&&
+					(DictionaryUtil.combineAndEomiCheck('어', eomi)!=null)) {
+				result[0] = stem;
+				result[1] = "어"+eomi;
+			}else if(jasos.length>=2&&
+					(jasos[1]=='ㅘ'||jasos[1]=='ㅝ'||jasos[1]=='ㅕ'||jasos[1]=='ㅐ'||jasos[1]=='ㅒ')&&
+					(DictionaryUtil.combineAndEomiCheck('어', eomi)!=null)) {
+				// The length guard above keeps a character that decomposes into fewer
+				// than two elements from causing an out-of-bounds read of jasos[1].
+				String end = "";
+				if(jasos[1]=='ㅘ')
+					end=MorphUtil.makeChar(stem.charAt(i-1), 8, 0)+"아";
+				else if(jasos[1]=='ㅝ')
+					end=MorphUtil.makeChar(stem.charAt(i-1), 13, 0)+"어";
+				else if(jasos[1]=='ㅕ')
+					end=Character.toString(MorphUtil.makeChar(stem.charAt(i-1), 6, 0));
+				else if(jasos[1]=='ㅐ')
+					end=MorphUtil.makeChar(stem.charAt(i-1), 0, 0)+"어";
+				else if(jasos[1]=='ㅒ')
+					end=MorphUtil.makeChar(stem.charAt(i-1), 20, 0)+"애";
+				
+				if(jasos.length==3) {
+					end = end.substring(0,end.length()-1)+MorphUtil.replaceJongsung(end.charAt(end.length()-1),stem.charAt(i-1));
+				}
+				
+				if(stem.length()<2) result[0] = end;
+				else result[0] = stem.substring(0,stem.length()-1)+end;
+				result[1] = eomi;
+				
+			}else if(efeature!=null&&efeature[SyllableUtil.IDX_EOMI1]!='0'&&
+				DictionaryUtil.existEomi(eomi)) {
+				if(!(((jasos.length==2&&jasos[0]=='ㄹ')||(jasos.length==3&&jasos[2]=='ㄹ'))&&eomi.equals("러"))) { // the ㄹ-irregular case is excluded
+					result[0] = stem;
+					result[1] = eomi;
+				}
+			}
+
+			// Stop scanning when the IDX_EOMI2 feature of the examined syllable is '0'.
+			if(efeature!=null&&efeature[SyllableUtil.IDX_EOMI2]=='0') break;
+		}
+
+		return result;
+		
+	}
+	
+	/**
+	 * Analyzes the pre-final endings (e.g. 시, 었, 겠) at the tail of a stem.
+	 *
+	 * @param stem the stem that may still carry pre-final endings
+	 * @return a two-element array; [0] starts out as the unsplit stem. When a
+	 *         pre-final ending is found, the remaining stem and the ending are
+	 *         handed to {@code setPomiResult} (defined outside this excerpt;
+	 *         presumably it stores them in [0] and [1])
+	 * @throws MorphException if a dictionary lookup fails
+	 */
+	public static String[] splitPomi(String stem) throws MorphException  {
+
+		String[] results = new String[2];
+		results[0] = stem;
+
+		if(stem==null||stem.length()==0||"있".equals(stem)) return results;
+	
+		char[] chrs = stem.toCharArray();
+		int len = chrs.length;
+		String pomi = "";
+		int index = len-1;
+	
+		char[] jaso = MorphUtil.decompose(chrs[index]);
+		if(chrs[index]!='시'&&chrs[index]!='ㅆ'&&jaso[jaso.length-1]!='ㅆ') return results;  // no pre-final ending found
+		
+		// The two literals below were mis-encoded (mojibake) and have been restored to 겠.
+		if(chrs[index]=='겠') {
+			pomi = "겠";
+			setPomiResult(results,stem.substring(0,index),pomi);
+			// NOTE(review): jaso still holds the decomposition of the syllable examined
+			// before --index; confirm whether chrs[index] was meant to be decomposed first.
+			if(--index<=0||
+					(chrs[index]!='시'&&chrs[index]!='ㅆ'&&jaso[jaso.length-1]!='ㅆ')) 
+				return results; // start of the stem reached, or no further pre-final ending
+			jaso = MorphUtil.decompose(chrs[index]);
+		}
+
+		if(chrs[index]=='었') { // 시었, ㅆ었, 었
+			pomi = chrs[index]+pomi;
+			setPomiResult(results,stem.substring(0,index),pomi);
+			if(--index<=0||
+					(chrs[index]!='시'&&chrs[index]!='ㅆ'&&jaso[jaso.length-1]!='ㅆ')) 
+				return results; // start of the stem reached, or no further pre-final ending
+			jaso = MorphUtil.decompose(chrs[index]);
+		}
+
+		if(chrs[index]=='였'){
+			pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
+			if(index>0&&chrs[index-1]=='하') 
+				stem = stem.substring(0,index);
+			else
+				 stem = stem.substring(0,index)+"이";
+			setPomiResult(results,stem,pomi);
+		}else if(chrs[index]=='셨'){
+				pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
+				stem = stem.substring(0,index);
+				setPomiResult(results,stem,"시"+pomi);
+		}else if(chrs[index]=='았'||chrs[index]=='었') {
+			pomi = chrs[index]+pomi;
+			setPomiResult(results,stem.substring(0,index),pomi);
+			if(--index<=0||
+					(chrs[index]!='시'&&chrs[index]!='으')) return results; // start of the stem reached, or no further pre-final ending
+			jaso = MorphUtil.decompose(chrs[index]);
+		}else if(jaso.length==3&&jaso[2]=='ㅆ') {
+		
+			 if(jaso[0]=='ㅎ'&&jaso[1]=='ㅐ') {
+				pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
+				stem = stem.substring(0,index)+"하";
+			}else if(jaso[0]!='ㅇ'&&(jaso[1]=='ㅏ'||jaso[1]=='ㅓ'||jaso[1]=='ㅔ'||jaso[1]=='ㅐ')) {
+				pomi = "었"+pomi;
+				stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index], 0);
+			}else if(jaso[0]!='ㅇ'&&(jaso[1]=='ㅙ')) {
+				pomi = "었"+pomi;
+				stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],11, 0);
+			} else if(jaso[1]=='ㅘ') {
+				pomi = MorphUtil.replaceJongsung('아',chrs[index])+pomi;
+				stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],8, 0);
+			} else if(jaso[1]=='ㅝ') {
+				pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
+				stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],13, 0);
+			} else if(jaso[1]=='ㅕ') {
+				pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
+				stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],20, 0);
+			} else if(jaso[1]=='ㅐ') {
+				pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
+				stem = stem.substring(0,index);
+			} else if(jaso[1]=='ㅒ') {
+				pomi = MorphUtil.replaceJongsung('애',chrs[index])+pomi;
+				stem = stem.substring(0,index);
+			} else {
+				pomi = "었"+pomi;
+			}
+			setPomiResult(results,stem,pomi);
+			if(chrs[index]!='시'&&chrs[index]!='으') return results; // no further pre-final ending
+			jaso = MorphUtil.decompose(chrs[index]);
+		}
+
+		char[] nChrs = null;
+		if(index>0) nChrs = MorphUtil.decompose(chrs[index-1]);
+		else nChrs = new char[2];
+
+		if(nChrs.length==2&&chrs[index]=='시'&&(chrs.length<=index+1||
+				(chrs.length>index+1&&chrs[index+1]!='셨'))) {
+			if(DictionaryUtil.getWord(results[0])!=null) return results;  // some words contain '시' themselves, e.g. 성가시다/도시다/들쑤시다
+			pomi = chrs[index]+pomi;
+			setPomiResult(results,stem.substring(0,index),pomi);
+			if(--index==0||chrs[index]!='으') return results; // start of the stem reached, or no further pre-final ending
+			jaso = MorphUtil.decompose(chrs[index]);
+		}
+		
+		if(index>0) nChrs = MorphUtil.decompose(chrs[index-1]);
+		else nChrs = new char[2];
+		if(chrs.length>index+1&&nChrs.length==3&&(chrs[index+1]=='셨'||chrs[index+1]=='시')&&chrs[index]=='으') {
+			pomi = chrs[index]+pomi;
+			setPomiResult(results,stem.substring(0,index),pomi);
+		}
+	
+		return results;
+	}
+	
+	/**
+	 * 불규칙 용언의 원형을 구한다.
+	 * @param output
+	 * @return
+	 * @throws MorphException
+	 */
+	public static List irregular(AnalysisOutput output) throws MorphException {
+		
+		List results = new ArrayList();
+	
+		if(output.getStem()==null||output.getStem().length()==0) 
+			return results;		
+		
+		String ending = output.getEomi();
+		if(output.getPomi()!=null) ending = output.getPomi();
+		
+		List<String[]> irrs = new ArrayList();
+		
+		irregularStem(irrs,output.getStem(),ending);
+		irregularEnding(irrs,output.getStem(),ending);
+		irregularAO(irrs,output.getStem(),ending);
+
+		try {
+			for(String[] irr: irrs) {
+				AnalysisOutput result = output.clone();
+				result.setStem(irr[0]);
+				if(output.getPatn()==PatternConstants.PTN_VM) {
+					if(output.getPomi()==null) result.setEomi(irr[1]);
+					else result.setPomi(irr[1]);
+				}	
+				results.add(result);
+			}				
+		} catch (CloneNotSupportedException e) {
+			throw new MorphException(e.getMessage(),e);
+		}
+				
+		return results;
+		
+	}
+	
+	/**
+	 * Adds candidates for irregular conjugations in which only the stem changes.
+	 * Each candidate is a String[] of {restored stem, unchanged ending, irregular type}.
+	 * The numeric argument of {@code MorphUtil.makeChar(ch, n)} presumably selects
+	 * the final consonant to place on the syllable (7=ㄷ, 19=ㅅ, 17=ㅂ, 8=ㄹ, 27=ㅎ
+	 * in the standard Hangul final-consonant order) - TODO confirm against MorphUtil.
+	 *
+	 * @param results list that receives the candidates
+	 * @param stem the surface stem; its last syllable is inspected
+	 * @param ending the ending; its first character is inspected
+	 */
+	private static void irregularStem(List results, String stem, String ending) {	
+
+		char feCh = ending.charAt(0);
+		char[] fechJaso =  MorphUtil.decompose(feCh);
+		char ls = stem.charAt(stem.length()-1);
+		char[] lsJaso = MorphUtil.decompose(ls);
+	
+		if(feCh=='아'||feCh=='어'||feCh=='으') {
+			if(lsJaso[lsJaso.length-1]=='ㄹ') { // ㄷ-irregular: the surface stem ends in ㄹ
+				results.add(
+						new String[]{stem.substring(0,stem.length()-1)+
+								MorphUtil.makeChar(stem.charAt(stem.length()-1),7)
+								,ending
+								,String.valueOf(PatternConstants.IRR_TYPE_DI)});
+			} else if(lsJaso.length==2) { // ㅅ-irregular: the surface stem has no final consonant
+				results.add(
+						new String[]{stem.substring(0,stem.length()-1)+
+								MorphUtil.makeChar(stem.charAt(stem.length()-1),19)
+								,ending
+								,String.valueOf(PatternConstants.IRR_TYPE_SI)});				
+			}			
+		}
+		
+		if((fechJaso[0]=='ㄴ'||fechJaso[0]=='ㄹ'||fechJaso[0]=='ㅁ'||	feCh=='오'||feCh=='시')
+				&&(ls=='우')) { // ㅂ-irregular
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+							MorphUtil.makeChar(stem.charAt(stem.length()-1),17)
+							,ending
+							,String.valueOf(PatternConstants.IRR_TYPE_BI)});				
+		}
+		
+		if((fechJaso[0]=='ㄴ'||fechJaso[0]=='ㅂ'||fechJaso[0]=='ㅅ'||	feCh=='오')
+				&&(lsJaso.length==2)) { // ㄹ dropped from the stem
+
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+							MorphUtil.makeChar(stem.charAt(stem.length()-1),8)
+							,ending
+							,String.valueOf(PatternConstants.IRR_TYPE_LI)});			
+		}
+		
+		if(lsJaso.length==2
+				&&(fechJaso[0]=='ㄴ'||fechJaso[0]=='ㄹ'||fechJaso[0]=='ㅁ'||fechJaso[0]=='ㅂ'||
+					lsJaso[1]=='ㅏ'||lsJaso[1]=='ㅓ'||lsJaso[1]=='ㅑ'||lsJaso[1]=='ㅕ')
+					&&!"나".equals(stem)) { // ㅎ-irregular; [낳다] is excluded because it is not ㅎ-irregular
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+							MorphUtil.makeChar(stem.charAt(stem.length()-1),27)
+							,ending
+							,String.valueOf(PatternConstants.IRR_TYPE_HI)});			
+		}		
+	}
+	
+	/**
+	 * 어미만 변하는 경우
+	 * @param results
+	 * @param stem
+	 * @param ending
+	 */
+	private static void irregularEnding(List results, String stem, String ending) {
+		if(ending.startsWith("ㅆ")) return;
+		
+		char feCh = ending.charAt(0);
+		char ls = stem.charAt(stem.length()-1);
+
+		if(feCh=='러'&&ls=='르') { // '러' 불규칙
+			results.add(
+					new String[]{stem
+							,"어"+ending.substring(1)
+							,String.valueOf(PatternConstants.IRR_TYPE_RO)});				
+		} else if("라".equals(ending)&&"가거".equals(stem)) { // '거라' 불규칙
+			results.add( 
+					new String[]{stem.substring(0,stem.length()-1)
+							,"어라"
+							,String.valueOf(PatternConstants.IRR_TYPE_GU)});							
+		} else if("라".equals(ending)&&"오너".equals(stem)) { // '너라' 불규칙
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)
+							,"어라"
+							,String.valueOf(PatternConstants.IRR_TYPE_NU)});			
+		}
+		
+		if("여".equals(ending)&&ls=='하') { // '여' 불규칙
+			results.add(
+					new String[]{stem
+							,"어"
+							,String.valueOf(PatternConstants.IRR_TYPE_NU)});				
+		}
+	}
+	
+	/**
+	 * Adds candidates for conjugations in which both the stem and the ending change
+	 * (a vowel of the ending was contracted into the last stem syllable).
+	 * Each candidate is a String[] of {restored stem, ending rebuilt by
+	 * {@code makeTesnseEomi} from "아"/"어" and the given ending, irregular type}.
+	 * The branches are tested in order and only the first matching one applies.
+	 * The numeric arguments of {@code MorphUtil.makeChar} presumably are Hangul
+	 * vowel / final-consonant indices (e.g. 8=ㅗ, 13=ㅜ, 18=ㅡ, 20=ㅣ as vowel;
+	 * 17=ㅂ, 27=ㅎ as final consonant) - TODO confirm against MorphUtil.
+	 *
+	 * @param results list that receives the candidates
+	 * @param stem the surface stem; its last one or two syllables are inspected
+	 * @param ending the surface ending
+	 */
+	private static void irregularAO(List results, String stem, String ending) {
+		
+		char ls = stem.charAt(stem.length()-1);
+		char[] lsJaso = MorphUtil.decompose(ls);
+		
+		// Nothing to do when the last character does not decompose into at least two elements.
+		if(lsJaso.length<2) return;
+		
+		if(lsJaso[1]=='ㅘ') {
+			if(stem.endsWith("도와")||stem.endsWith("고와")) { // ㅂ-irregular of '곱다', '돕다'
+				results.add(
+						new String[]{stem.substring(0,stem.length()-2)+
+								MorphUtil.makeChar(stem.charAt(stem.length()-2),17) // + 'ㅂ'
+								,makeTesnseEomi("아",ending)
+								,String.valueOf(PatternConstants.IRR_TYPE_BI)});					
+			}else { // '와' contraction
+				results.add(
+						new String[]{stem.substring(0,stem.length()-1)+
+								MorphUtil.makeChar(stem.charAt(stem.length()-1),8,0) // consonant + ㅗ
+								,makeTesnseEomi("아",ending)
+								,String.valueOf(PatternConstants.IRR_TYPE_WA)});				
+			}
+		} else if(stem.endsWith("퍼")) {
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+							MorphUtil.makeChar(stem.charAt(stem.length()-1),18,0) // consonant + ㅡ
+							,makeTesnseEomi("어",ending)
+							,String.valueOf(PatternConstants.IRR_TYPE_WA)});	
+		} else if(lsJaso[1]=='ㅝ') {
+			if(stem.length()>=2) // ㅂ-irregular
+				results.add(
+					new String[]{stem.substring(0,stem.length()-2)+
+						MorphUtil.makeChar(stem.charAt(stem.length()-2),17) // + 'ㅂ'
+					   ,makeTesnseEomi("어",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_BI)});	
+
+			// Added in addition to the ㅂ-irregular candidate above, not instead of it.
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+							MorphUtil.makeChar(stem.charAt(stem.length()-1),13,0) // consonant + vowel index 13 (presumably ㅜ; the original comment said ㅗ)
+							,makeTesnseEomi("어",ending)
+							,String.valueOf(PatternConstants.IRR_TYPE_WA)});	
+		} else if(stem.length()>=2&&ls=='라') {
+			char[] ns = MorphUtil.decompose(stem.charAt(stem.length()-2));
+			if(ns.length==3&&ns[2]=='ㄹ') { // 르-irregular
+				results.add(
+						new String[]{stem.substring(0,stem.length()-2)+
+							MorphUtil.makeChar(stem.charAt(stem.length()-2),0) + "르"
+						   ,makeTesnseEomi("아",ending)
+						   ,String.valueOf(PatternConstants.IRR_TYPE_RO)});					
+			}			
+		} else if(stem.length()>=2&&ls=='러') {
+			char[] ns = MorphUtil.decompose(stem.charAt(stem.length()-2));
+			if(stem.charAt(stem.length()-2)=='르') { // 러-irregular
+				results.add(
+						new String[]{stem.substring(0,stem.length()-1)
+						   ,makeTesnseEomi("어",ending)
+						   ,String.valueOf(PatternConstants.IRR_TYPE_LO)});	
+			} else if(ns.length==3&&ns[2]=='ㄹ') { // 르-irregular
+				results.add(
+						new String[]{stem.substring(0,stem.length()-2)+
+							MorphUtil.makeChar(stem.charAt(stem.length()-2),0) + "르"
+						   ,makeTesnseEomi("어",ending)
+						   ,String.valueOf(PatternConstants.IRR_TYPE_RO)});	
+			}
+		} else if(stem.endsWith("펴")||stem.endsWith("켜")) {
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+						MorphUtil.makeChar(stem.charAt(stem.length()-1),20,0)
+					   ,makeTesnseEomi("어",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_EI)});	
+		} else if(stem.endsWith("해")) {
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+						MorphUtil.makeChar(stem.charAt(stem.length()-1),0,0)
+					   ,makeTesnseEomi("어",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_EI)});				
+		} else if(lsJaso.length==2&&lsJaso[1]=='ㅏ') {
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+						MorphUtil.makeChar(stem.charAt(stem.length()-1),18,0)
+					   ,makeTesnseEomi("어",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_UO)});	
+		} else if(lsJaso.length==2&&lsJaso[1]=='ㅓ') {
+			// 으 dropped from the stem
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+						MorphUtil.makeChar(stem.charAt(stem.length()-1),18,0)
+					   ,makeTesnseEomi("어",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_UO)});	
+			//	 아-irregular: the stem is kept as it is
+			results.add(
+					new String[]{stem
+					   ,makeTesnseEomi("어",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_AH)});	
+		} else if(lsJaso[1]=='ㅕ') {
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+						MorphUtil.makeChar(stem.charAt(stem.length()-1),20,0)
+					   ,makeTesnseEomi("어",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_EI)});	
+		} else if(lsJaso[1]=='ㅙ') {
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+						MorphUtil.makeChar(stem.charAt(stem.length()-1),11,0)
+					   ,makeTesnseEomi("어",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_OE)});	
+		} else if(lsJaso[1]=='ㅐ') {
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+						MorphUtil.makeChar(stem.charAt(stem.length()-1),0,27)
+					   ,makeTesnseEomi("아",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_HI)});
+		} else if(lsJaso[1]=='ㅒ') {
+			results.add(
+					new String[]{stem.substring(0,stem.length()-1)+
+						MorphUtil.makeChar(stem.charAt(stem.length()-1),2,27)
+					   ,makeTesnseEomi("아",ending)
+					   ,String.valueOf(PatternConstants.IRR_TYPE_HI)});							
+		}
+		
+	}
+	
+	/**
+	 * 시제 선어미말을 만들어서 반환한다.
+	 * @param preword  '아' 또는 '어'
+	 * @param endword  어미[선어미말을 포함]
+	 * @return '았' 또는 '었'을 만들어서 반환한다.
+	 */
+	public static String makeTesnseEomi(String preword, String endword) {
+
+		if(preword==null||preword.length()==0) return endword;
+		if(endword==null||endword.length()==0) return preword;
+
+		if(endword.charAt(0)=='ㅆ') {
+			return preword.substring(0,preword.length()-1)+
+				MorphUtil.makeChar(preword.charAt(preword.length()-1),20)+endword.substring(1,endword.length());		
+		} else if(endword.charAt(0)=='ㄴ') {
+			return preword.substring(0,preword.length()-1)+
+			MorphUtil.makeChar(preword.charAt(preword.length()-1),4)+endword.substring(1,endword.length());
+		} else if(endword.charAt(0)=='ㄹ') {
+			return preword.substring(0,preword.length()-1)+
+			MorphUtil.makeChar(preword.charAt(preword.length()-1),8)+endword.substring(1,endword.length());	
+		} else if(endword.charAt(0)=='ㅁ') {
+			return preword.substring(0,preword.length()-1)+
+			MorphUtil.makeChar(preword.charAt(preword.length()-1),16)+endword.substring(1,endword.length());					
+		} else if(endword.charAt(0)=='ㅂ') {
+			return preword.substring(0,preword.length()-1)+
+			MorphUtil.makeChar(preword.charAt(preword.length()-1),17)+endword.substring(1,endword.length());
+		}
+		return preword+endword;		
+	}
+	
+	
+ 
+   /**
+    * '음/기' + '이' + 어미, '에서/부터/에서부터' + '이' + 어미 인지 조사한다.
+    * @param stem
+    * @return
+    */
+   public static boolean endsWithEEomi(String stem) {
+	   int len = stem.length();
+	   if(len<2||!stem.endsWith("이")) return false;
+	  
+	   char[] jasos = MorphUtil.decompose(stem.charAt(len-2));
+	   if(jasos.length==3&&jasos[2]=='ㅁ')
+		   return true;
+	   else {
+		   int index = stem.lastIndexOf("기");
+		   if(index==-1) index = stem.lastIndexOf("에서");
+		   if(index==-1) index = stem.lastIndexOf("부터");
+		   if(index==-1) return false;
+		   return true;
+	   }
+   }
+   
+	/**
+	 * Stores a stem / pomi pair into the first two slots of {@code results}.
+	 *
+	 * @param results output array; index 0 receives the stem, index 1 the pomi,
+	 *                so it needs at least two elements
+	 * @param stem    value written to {@code results[0]}
+	 * @param pomi    value written to {@code results[1]} (presumably the
+	 *                pre-final ending -- confirm against callers)
+	 */
+	private static void setPomiResult(String[] results,String stem, String pomi ) {
+		results[0] = stem;
+		results[1] = pomi;
+	}	
+	
+	/**
+	 * Tells whether the feature table of a syllable has the flag set that
+	 * corresponds to the given consonant (one of ㄴ, ㄹ, ㅁ, ㅂ).
+	 * <p>
+	 * The features of {@code ech} are always looked up first through
+	 * {@code SyllableUtil.getFeature}; the consonant then selects which flag(s)
+	 * are compared against {@code '1'}:
+	 * ㄴ uses IDX_YNPNA or IDX_YNPLN, ㄹ uses IDX_YNPLA, ㅁ uses IDX_YNPMA and
+	 * ㅂ uses IDX_YNPBA.
+	 *
+	 * @param ech the syllable whose features are inspected
+	 * @param lch the consonant to test for
+	 * @return true when the selected flag is '1'; false for any other consonant
+	 * @throws MorphException declared for the feature lookup
+	 */
+	public static boolean IsNLMBSyl(char ech, char lch) throws MorphException {
+
+		final char[] flags = SyllableUtil.getFeature(ech);
+
+		boolean matches = false;
+		if (lch == 'ㄴ') {
+			matches = flags[SyllableUtil.IDX_YNPNA] == '1'
+				|| flags[SyllableUtil.IDX_YNPLN] == '1';
+		} else if (lch == 'ㄹ') {
+			matches = flags[SyllableUtil.IDX_YNPLA] == '1';
+		} else if (lch == 'ㅁ') {
+			matches = flags[SyllableUtil.IDX_YNPMA] == '1';
+		} else if (lch == 'ㅂ') {
+			matches = flags[SyllableUtil.IDX_YNPBA] == '1';
+		}
+
+		return matches;
+	}
+	
+	/**
+	 * Splits off the ending (eomi) from a stem candidate.
+	 * <p>
+	 * The cases are tried in this order and the first match wins:
+	 * <ol>
+	 * <li>the last syllable carries a final ㄴ/ㄹ/ㅁ/ㅂ that, combined with
+	 *     {@code end}, is accepted by the dictionary: the consonant is moved
+	 *     to the front of the ending;</li>
+	 * <li>the last syllable is '해' and "어"+end is a known ending: the stem
+	 *     gets '하' and the ending becomes "어"+end;</li>
+	 * <li>the last syllable is '히' and "이"+end is a known ending: the stem
+	 *     gets '하' and the ending becomes "이"+end;</li>
+	 * <li>the ending '아/어' has been dropped after a syllable whose vowel is
+	 *     ㅏ/ㅓ/ㅔ/ㅐ and whose initial is not ㅇ: the stem is kept as is;</li>
+	 * <li>the stem ends with '하' and the ending is exactly "여": the ending
+	 *     is normalized to "어";</li>
+	 * <li>the last syllable has no final consonant and one of the vowels
+	 *     ㅘ/ㅙ/ㅝ/ㅕ/ㅐ/ㅒ (contracted variants of '아/어'): the syllable is
+	 *     expanded and the trailing character moves to the ending;</li>
+	 * <li>{@code end} is non-empty and is itself a known ending: stem and
+	 *     ending are returned unchanged.</li>
+	 * </ol>
+	 *
+	 * @param stem the stem candidate; an empty stem yields no split
+	 * @param end  the ending candidate following the stem
+	 * @return a two-element array {stem, ending}; both elements are null when
+	 *         no case applies, when the stem is empty, or when its last
+	 *         character decomposes into a single element
+	 * @throws MorphException declared for the syllable feature / dictionary lookups
+	 */
+	public static String[] splitEomi(String stem, String end) throws MorphException {
+
+		String[] strs = new String[2];
+		int strlen = stem.length();
+		if(strlen==0) return strs;
+
+		char estem = stem.charAt(strlen-1);
+		char[] chrs = MorphUtil.decompose(estem);
+		if(chrs.length==1) return strs; // not Hangul
+
+		if((chrs.length==3)&&(chrs[2]=='ㄴ'||chrs[2]=='ㄹ'||chrs[2]=='ㅁ'||chrs[2]=='ㅂ')&&
+				EomiUtil.IsNLMBSyl(estem,chrs[2])&&
+				DictionaryUtil.combineAndEomiCheck(chrs[2], end)!=null) {
+			// The final consonant belongs to the ending: prepend it to the ending
+			// and rebuild the last stem syllable with code 0 (presumably "no
+			// final consonant" -- confirm against MorphUtil.makeChar).
+			strs[1] = Character.toString(chrs[2]);
+			if(end.length()>0) strs[1] += end;
+			strs[0] = stem.substring(0,strlen-1) + MorphUtil.makeChar(estem, 0);
+		} else if(estem=='해'&&DictionaryUtil.existEomi("어"+end)) {
+			strs[0] = stem.substring(0,strlen-1)+"하";
+			strs[1] = "어"+end;
+		} else if(estem=='히'&&DictionaryUtil.existEomi("이"+end)) {
+			strs[0] = stem.substring(0,strlen-1)+"하";
+			strs[1] = "이"+end;
+		} else if(chrs[0]!='ㅇ'&&
+				(chrs[1]=='ㅏ'||chrs[1]=='ㅓ'||chrs[1]=='ㅔ'||chrs[1]=='ㅐ')&&
+				(chrs.length==2 || SyllableUtil.getFeature(estem)[SyllableUtil.IDX_YNPAH]=='1')&&
+				(DictionaryUtil.combineAndEomiCheck('어', end)!=null)) {
+
+			// The stem is kept untouched; "어" is only re-inserted in front of the
+			// ending when the last syllable has no final consonant.
+			strs[0] = stem;
+			if(chrs.length==2) strs[1] = "어"+end;
+			else strs[1] = end;
+		} else if(stem.endsWith("하")&&"여".equals(end)) {
+			strs[0] = stem;
+			strs[1] = "어";
+		} else if((chrs.length==2)&&(chrs[1]=='ㅘ'||chrs[1]=='ㅙ'||chrs[1]=='ㅝ'||chrs[1]=='ㅕ'||chrs[1]=='ㅐ'||chrs[1]=='ㅒ')&&
+				(DictionaryUtil.combineAndEomiCheck('어', end)!=null)) {
+
+			// Local, single-threaded buffer: StringBuilder avoids the needless
+			// synchronization of StringBuffer.
+			StringBuilder sb = new StringBuilder();
+
+			if(strlen>1) sb.append(stem.substring(0,strlen-1));
+
+			// Expand the contracted syllable into a rebuilt stem syllable followed
+			// by a separate '아'/'어'/'애' based character.
+			if(chrs[1]=='ㅘ')
+				sb.append(MorphUtil.makeChar(estem, 8, 0)).append(MorphUtil.replaceJongsung('아',estem));
+			else if(chrs[1]=='ㅝ')
+				sb.append(MorphUtil.makeChar(estem, 13, 0)).append(MorphUtil.replaceJongsung('어',estem));
+			else if(chrs[1]=='ㅙ')
+				sb.append(MorphUtil.makeChar(estem, 11, 0)).append(MorphUtil.replaceJongsung('어',estem));
+			else if(chrs[1]=='ㅕ')
+				sb.append(Character.toString(MorphUtil.makeChar(estem, 20, 0))).append(MorphUtil.replaceJongsung('어',estem));
+			else if(chrs[1]=='ㅐ')
+				sb.append(MorphUtil.makeChar(estem, 0, 0)).append(MorphUtil.replaceJongsung('어',estem));
+			else if(chrs[1]=='ㅒ')
+				sb.append(MorphUtil.makeChar(estem, 20, 0)).append(MorphUtil.replaceJongsung('애',estem));
+
+			// The last character of the expanded form moves to the ending;
+			// everything before it is the stem.
+			String expanded = sb.toString();
+			int cut = expanded.length()-1;
+			strs[0] = expanded.substring(0,cut);
+			strs[1] = expanded.substring(cut)+end;
+
+		} else if(!"".equals(end)&&DictionaryUtil.existEomi(end)) {
+			strs = new String[]{stem, end};
+		}
+
+		return strs;
+	}
+}



Mime
View raw message