uima-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Peter Klügl <peter.klu...@averbis.com>
Subject Re: Ruta problem with spaces and special chars
Date Tue, 04 Jul 2017 11:34:09 GMT
Hi,


the visibility concept in ruta is coverage-based which means that 
something that starts or ends with something invisible is also invisible.

In your first test, the offsets result in ": hell" and " hell" and 
therefore the inlined rule with DetectedValue does not match. If the 
offsets are adapted to the comment in the test, Attribute will not 
match. In both cases the reason is that the annotation starts with something 
invisible, the whitespace. You can fix this by making whitespace visible, e.g., by 
adding this line:

                 + "BOOLEAN located;\n" //
                 + "RETAINTYPE(WS);"//
                 + "BLOCK(doc) Document{} {\n" //

Usually, I'd rather recommend trimming the annotation if the whitespaces 
are not important.


In the second test, the problem is the same.

Best,

Peter

Am 04.07.2017 um 09:00 schrieb Josep María Formentí Serra:
> Thanks Peter. It's just a class, I put the code here and sent you directly
>
> ============================================================================
> import static
> org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
> import static org.junit.Assert.assertEquals;
>
> import java.io.IOException;
> import java.net.URISyntaxException;
>
> import org.antlr.runtime.RecognitionException;
> import org.apache.uima.UIMAException;
> import org.apache.uima.UIMAFramework;
> import org.apache.uima.analysis_engine.AnalysisEngine;
> import org.apache.uima.analysis_engine.AnalysisEngineDescription;
> import org.apache.uima.cas.Type;
> import org.apache.uima.cas.text.AnnotationFS;
> import org.apache.uima.fit.factory.JCasFactory;
> import org.apache.uima.fit.util.CasUtil;
> import org.apache.uima.jcas.JCas;
> import org.apache.uima.resource.metadata.TypeSystemDescription;
> import org.apache.uima.ruta.descriptor.RutaBuildOptions;
> import org.apache.uima.ruta.descriptor.RutaDescriptorFactory;
> import org.apache.uima.ruta.descriptor.RutaDescriptorInformation;
> import org.apache.uima.ruta.engine.RutaEngine;
> import org.junit.Test;
>
> public class RutaAnnotatorTest {
>
>      // We try to create a Detection because an Attribute (" hello")
> contains a
>      // detected value ("hello")
>      @Test
>      public void testSpaceProblem() throws UIMAException, IOException,
> RecognitionException, URISyntaxException {
>          // Prepare data
>          String str = "attr: hello";
>          String rutaRule = "PACKAGE ruta;\n" //
>                  + "DECLARE Detection;\n" //
>                  + "DECLARE DetectedValue;\n" //
>                  + "DECLARE Attribute;\n" //
>                  + "BOOLEAN located;\n" //
>                  + "BLOCK(doc) Document{} {\n" //
>                  + "     Document{ -> located = false};\n" //
>                  + "        a1:Attribute{} -> {" //
>                  + "            d1:DetectedValue{ -> located = true};" //
>                  + "        };" //
>                  + "        Document{located -> CREATE(Detection)};\n"//
>                  + "}\n"; //
>
>          // Prepare CAS
>          RutaDescriptorFactory factory = new RutaDescriptorFactory();
>          RutaDescriptorInformation descriptorInformation =
> factory.parseDescriptorInformation(rutaRule);
>          RutaBuildOptions options = new RutaBuildOptions();
>          TypeSystemDescription typeSystemDescription =
> factory.createTypeSystemDescription("", descriptorInformation,
>                  options, null);
>
>          JCas cas = JCasFactory.createJCas(typeSystemDescription);
>          cas.setDocumentText(str);
>
>          Type attrType = CasUtil.getAnnotationType(cas.getCas(),
> "ruta.Anonymous.Attribute");
>          AnnotationFS attr = cas.getCas().createAnnotation(attrType, 4, 10);
>          cas.addFsToIndexes(attr);
>
>          Type detectedValueType = CasUtil.getAnnotationType(cas.getCas(),
> "ruta.Anonymous.DetectedValue");
>          AnnotationFS detectedValue =
> cas.getCas().createAnnotation(detectedValueType, 5, 10);
>          cas.addFsToIndexes(detectedValue);
>
>          // Execute Ruta
>          AnalysisEngineDescription ruta =
> createEngineDescription(RutaEngine.class, RutaEngine.PARAM_RULES, rutaRule);
>          AnalysisEngine pipe = UIMAFramework.produceAnalysisEngine(ruta);
>          pipe.process(cas);
>
>          // Validate result
>          Type detectionType = CasUtil.getAnnotationType(cas.getCas(),
> "ruta.Anonymous.Detection");
>          assertEquals(1, CasUtil.select(cas.getCas(), detectionType).size());
>      }
>
>      // We try to create a Detection because an Attribute ("\" hello\"")
> contains a
>      // detected value ("llo")
>      @Test
>      public void testQuoteProblem() throws UIMAException, IOException,
> RecognitionException, URISyntaxException {
>          // Prepare data
>          String str = "attr: \" hello\"";
>          String rutaRule = "PACKAGE ruta;\n" //
>                  + "DECLARE Detection;\n" //
>                  + "DECLARE DetectedValue;\n" //
>                  + "DECLARE Attribute;\n" //
>                  + "BOOLEAN located;\n" //
>                  + "BLOCK(doc) Document{} {\n" //
>                  + "     Document{ -> located = false};\n" //
>                  + "        a1:Attribute{} -> {" //
>                  + "            d1:DetectedValue{ -> located = true};" //
>                  + "        };" //
>                  + "        Document{located -> CREATE(Detection)};\n"//
>                  + "}\n"; //
>
>          // Prepare CAS
>          RutaDescriptorFactory factory = new RutaDescriptorFactory();
>          RutaDescriptorInformation descriptorInformation =
> factory.parseDescriptorInformation(rutaRule);
>          RutaBuildOptions options = new RutaBuildOptions();
>          TypeSystemDescription typeSystemDescription =
> factory.createTypeSystemDescription("", descriptorInformation,
>                  options, null);
>
>          JCas cas = JCasFactory.createJCas(typeSystemDescription);
>          cas.setDocumentText(str);
>
>          Type attrType = CasUtil.getAnnotationType(cas.getCas(),
> "ruta.Anonymous.Attribute");
>          AnnotationFS attr = cas.getCas().createAnnotation(attrType, 5, 12);
>          cas.addFsToIndexes(attr);
>
>          Type detectedValueType = CasUtil.getAnnotationType(cas.getCas(),
> "ruta.Anonymous.DetectedValue");
>          AnnotationFS detectedValue =
> cas.getCas().createAnnotation(detectedValueType, 9, 12);
>          cas.addFsToIndexes(detectedValue);
>
>          // Execute Ruta
>          AnalysisEngineDescription ruta =
> createEngineDescription(RutaEngine.class, RutaEngine.PARAM_RULES, rutaRule);
>          AnalysisEngine pipe = UIMAFramework.produceAnalysisEngine(ruta);
>          pipe.process(cas);
>
>          // Validate result
>          Type detectionType = CasUtil.getAnnotationType(cas.getCas(),
> "ruta.Anonymous.Detection");
>          assertEquals(1, CasUtil.select(cas.getCas(), detectionType).size());
>      }
>
> }
> ============================================================================
>
> 2017-07-03 20:50 GMT+02:00 Peter Klügl <peter.kluegl@averbis.com>:
>
>> Hi,
>>
>>
>> I think this mailing list does not allow mail attachments, at least I do
>> not see any.
>>
>>
>> Can you upload the tests anywhere and post the links here? Or you can send
>> the test directly to me. Or you can create a Jira issue and attach them
>> there: https://issues.apache.org/jira/browse/UIMA-5474?jql=project%
>> 20%3D%20UIMA%20AND%20component%20%3D%20Ruta
>>
>>
>> Best,
>>
>>
>> Peter
>>
>>
>>
>> Am 03.07.2017 um 14:47 schrieb Josep María Formentí Serra:
>>
>>> Hi,
>>>
>>>    We've experimented some problems applying rules in texts that contains
>>> spaces or special chars, in texts that contains some spaces or special
>>> chars the rules are not applied properly.
>>>
>>>    As example of this problems I attach 2 tests, these tests are using a
>>> simplification of the kind of rules that we are using in our project.
>>>
>>> Best,
>>>    JM
>>>
>>
>


Mime
View raw message