uima-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Debbie Zhang" <debbie.d.zh...@gmail.com>
Subject RE: Read file name in an annotator
Date Mon, 14 Jul 2014 09:03:08 GMT
Hi Ravi,

Thank you very much for the sample code. However, in my case, the PEAR file will be deployed
to a different system. Therefore, I have no access to "file.getAbsoluteFile().toURL().toString())".

I searched the uima-user mailing list archives and found an old post which was sent by Marshall
Schor last year:
http://mail-archives.apache.org/mod_mbox/uima-user/201205.mbox/%3C4FA095E1.8070102@schor.com%3E
In that post, cTakes was suggested, so I downloaded it and tried to use org.apache.ctakes.typesystem.type.structured.DocumentID,
which is defined by cTakes. However, I can't get it to work.

typeSystemDescriptor.xml:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Type system descriptor.
  Imports the cTAKES type system by name and declares one local annotation
  type, uima.TestThirdPartyLib, derived from uima.tcas.Annotation.
-->
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>typeSystemDescriptor</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <imports>
    <import name="org.apache.ctakes.typesystem.types.TypeSystem"/>
  </imports>
  <types>
    <typeDescription>
      <name>uima.TestThirdPartyLib</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
    </typeDescription>
  </types>
</typeSystemDescription>

TestThirdPartyLib.xml (the descriptor for my annotator, which uses org.apache.ctakes.typesystem.type.structured.DocumentID
as input):
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Primitive analysis engine descriptor for annotators.TestThirdPartyLibDescriptor.
  The embedded type system imports the cTAKES type system by name and declares
  uima.TestThirdPartyLib. The capability section lists the cTAKES DocumentID
  type as input and uima.TestThirdPartyLib as output.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
  <annotatorImplementationName>annotators.TestThirdPartyLibDescriptor</annotatorImplementationName>
  <analysisEngineMetaData>
    <name>TestThirdPartyLibDescriptor</name>
    <description/>
    <version>1.0</version>
    <vendor/>
    <configurationParameters/>
    <configurationParameterSettings/>
    <typeSystemDescription>
      <imports>
        <import name="org.apache.ctakes.typesystem.types.TypeSystem"/>
      </imports>
      <types>
        <typeDescription>
          <name>uima.TestThirdPartyLib</name>
          <description/>
          <supertypeName>uima.tcas.Annotation</supertypeName>
        </typeDescription>
      </types>
    </typeSystemDescription>
    <typePriorities/>
    <fsIndexCollection/>
    <capabilities>
      <capability>
        <inputs>
          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.structured.DocumentID</type>
        </inputs>
        <outputs>
          <type allAnnotatorFeatures="true">uima.TestThirdPartyLib</type>
        </outputs>
        <languagesSupported/>
      </capability>
    </capabilities>
    <operationalProperties>
      <modifiesCas>true</modifiesCas>
      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
  <resourceManagerConfiguration/>
</analysisEngineDescription>

TestThirdPartyLibDescriptor.java:
package annotators;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.LinkedList;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import uima.TestThirdPartyLib;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.AnalysisComponent;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.resource.ResourceInitializationException;

/**
 * Test annotator: prints the ID of every {@link DocumentID} found in the CAS
 * indexes, then adds one {@link TestThirdPartyLib} annotation to the CAS.
 */
public class TestThirdPartyLibDescriptor extends JCasAnnotator_ImplBase
{
	/**
	 * Delegates to the base-class initialization; no extra setup is done here.
	 *
	 * @param aContext the UIMA context handed in by the framework
	 * @throws ResourceInitializationException if the base-class initialization fails
	 * @see AnalysisComponent#initialize(UimaContext)
	 */
	public void initialize(UimaContext aContext) throws ResourceInitializationException {
		super.initialize(aContext);
	}

	/**
	 * Runs {@link #test(JCas)} on the given CAS and prints a fixed marker line.
	 *
	 * @param aJCas the CAS to process
	 * @see JCasAnnotator_ImplBase#process(JCas)
	 */
	public void process(JCas aJCas) {
		test(aJCas);

		System.out.println("Say something");
	}

	/**
	 * Prints the document ID of each indexed {@link DocumentID} and then indexes a
	 * new {@link TestThirdPartyLib} annotation.
	 *
	 * @param aJCas the CAS to read from and write to
	 */
	private void test(JCas aJCas)
	{
		JFSIndexRepository indexes = aJCas.getJFSIndexRepository();
		FSIterator<TOP> documentIDIterator = indexes.getAllIndexedFS(DocumentID.type);
		// If no DocumentID was added to the indexes before this annotator runs,
		// the loop body never executes and nothing is printed.
		while (documentIDIterator.hasNext()) {
			DocumentID documentIDAnnotation = (DocumentID) documentIDIterator.next();
			String documentID = documentIDAnnotation.getDocumentID();
			System.out.println("DocumentID: "+documentID);
		}

		// Create the output annotation; begin/end offsets are not set here.
		TestThirdPartyLib annotation = new TestThirdPartyLib(aJCas);
		annotation.addToIndexes();
	}
}

TestMain.java

import uima.*;

import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.jcas.JCas;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.XMLInputSource;

import java.io.File;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Pattern;

/**
 * Stand-alone driver: builds the analysis engine described by
 * {@code desc/TestThirdPartyLibDescriptor.xml} and runs it once for every
 * regular file found directly inside the {@code data} directory.
 */
public class TestMain {

	/**
	 * Reads a whole text file into a string. The file is decoded by
	 * {@link FileReader}, i.e. with the default charset.
	 *
	 * @param infile the file to read
	 * @return the complete content of the file
	 * @throws Exception if the file cannot be opened or read
	 */
	static String readFile(File infile) throws Exception
	{
		BufferedReader reader = new BufferedReader(new FileReader(infile));
		try {
			StringBuilder fileData = new StringBuilder();
			char[] buf = new char[1024];
			int numRead;
			while ((numRead = reader.read(buf)) != -1) {
				fileData.append(buf, 0, numRead);
			}
			return fileData.toString();
		} finally {
			// Close the reader even when read() throws, so the file handle is not leaked.
			reader.close();
		}
	}

	/**
	 * Loads the analysis engine descriptor, then feeds the text of each file in
	 * the {@code data} directory through the engine, resetting the CAS after each
	 * document. Any exception is printed to standard error and ends the run.
	 *
	 * @param args ignored
	 * @throws Exception declared for callers; exceptions raised inside are caught and printed
	 */
	public static void main(String[] args) throws Exception
	{
		try {
			System.out.println("Say something");
			File aeFile = new File("desc/TestThirdPartyLibDescriptor.xml");
			XMLInputSource in = new XMLInputSource(aeFile);
			ResourceSpecifier specifier =
					UIMAFramework.getXMLParser().parseResourceSpecifier(in);
			AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(specifier);

			JCas jcas = ae.newJCas();
			File inputFileFolder = new File("data");
			File[] entries = inputFileFolder.listFiles();
			if (entries == null) {
				// listFiles() returns null when the path is not a directory or cannot be read.
				System.err.println("Input folder is missing or not readable: "
						+ inputFileFolder.getAbsolutePath());
				return;
			}
			for (final File fileEntry : entries) {
				if (fileEntry.isDirectory()) {
					continue;
				}
				String filecontent = TestMain.readFile(fileEntry);

				// NOTE(review): only the document text is put into the CAS here. No
				// DocumentID feature structure is created before ae.process(), so an
				// annotator that iterates over DocumentID instances finds none unless
				// something else adds one. Confirm where the ID is meant to come from.
				jcas.setDocumentText(filecontent);
				ae.process(jcas);

				// Clear the CAS so it can be reused for the next file.
				jcas.reset();
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

It seems silly to use cTakes just to get the file name. However, I really
need to get the file name as it is the only way to identify a file. Can anyone tell me what
I did wrong so org.apache.ctakes.typesystem.type.structured.DocumentID doesn't work?

Any help or suggestions will be greatly appreciated! Thank you!

Regards,

Debbie Zhang
 
> -----Original Message-----
> From: Ravindra [mailto:ravindra.bajpai@gmail.com]
> Sent: Thursday, 10 July 2014 9:39 PM
> To: user@uima.apache.org
> Cc: thomas.ginter@utah.edu
> Subject: Re: Read file name in an annotator
> 
> Maybe this helps -
> 
>     // Also store location of source document in CAS. This information
> is critical
>     // if CAS Consumers will need to know where the original document
> contents are located.
>     // For example, the Semantic Search CAS Indexer writes this
> information into the
>     // search index that it creates, which allows applications that use
> the search index to
>     // locate the documents that satisfy their semantic queries.
>     SourceDocumentInformation srcDocInfo = new
> SourceDocumentInformation(jcas);
>     srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString());
>     srcDocInfo.setOffsetInSource(0);
>     srcDocInfo.setDocumentSize((int) file.length());
>     srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
>     srcDocInfo.addToIndexes();
> 
> 
> followed by
>    // retrieve the filename of the input file from the CAS
>     FSIterator it =
> jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
>     File outFile = null;
>     if (it.hasNext()) {
>       SourceDocumentInformation fileLoc = (SourceDocumentInformation)
> it.next();
>       File inFile;
>       try {
>         inFile = new File(new URL(fileLoc.getUri()).getPath());
>         String outFileName = inFile.getName();
>         if (fileLoc.getOffsetInSource() > 0) {
>           outFileName += ("_" + fileLoc.getOffsetInSource());
>         }
>         outFileName += ".xmi";
>         outFile = new File(mOutputDir, outFileName);
>         modelFileName = mOutputDir.getAbsolutePath() + "/" +
> inFile.getName() + ".ecore";
>       } catch (MalformedURLException e1) {
>         // invalid URL, use default processing below
>       }
>     }
> 
> look for SourceDocumentInformation in the examples
> 
> 
> --
> Ravi.
> *''We do not inherit the earth from our ancestors, we borrow it from
> our children.'' PROTECT IT !*
> 
> 
> On Thu, Jul 10, 2014 at 4:49 PM, Debbie Zhang <debbie.d.zhang@gmail.com>
> wrote:
> 
> > Thanks Thomas. May I ask if there is any sample code of UIMA readers
> > that can provide file name information for developing annotation? I
> > was looking on the internet today, but couldn't find one. Thanks
> again
> > for your help - much appreciated!
> >
> > Regards,
> >
> > Debbie Zhang
> >
> > > -----Original Message-----
> > > From: Thomas Ginter [mailto:thomas.ginter@utah.edu]
> > > Sent: Thursday, 10 July 2014 5:00 AM
> > > To: user@uima.apache.org
> > > Subject: Re: Read file name in an annotator
> > >
> > > Hi Debbie,
> > >
> > > The file name is not provided by default in UIMA although I believe
> > > the UIMA FileReader does populate a SourceDocumentInformation
> > > annotation with this information.  Our group has a set of readers
> > > that populate our own annotation type to provide location data and
> > > other meta- information for each record (CAS) being processed.  In
> > > short you will be better off writing your reader to provide that
> information for you.
> > >
> > > Thanks,
> > >
> > > Thomas Ginter
> > > 801-448-7676
> > > thomas.ginter@utah.edu
> > >
> > >
> > >
> > >
> > > On Jul 9, 2014, at 5:41, Debbie Zhang <debbie.d.zhang@gmail.com>
> wrote:
> > >
> > > > Hi,
> > > >
> > > > Can anyone tell me how to read the file name in an annotator
> using
> > > the
> > > > > JCas? It seems the DocumentAnnotation doesn't contain the file name.
> > > > Thank you!
> > > >
> > > > Best regards,
> > > >
> > > > Debbie Zhang
> >
> >
> >

Mime
View raw message