nifi-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NIFI-2380) ExtractEmailAttachments processor should support TNEF files (aka winmail.dat)
Date Tue, 06 Sep 2016 12:44:21 GMT

    [ https://issues.apache.org/jira/browse/NIFI-2380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15467325#comment-15467325
] 

ASF GitHub Bot commented on NIFI-2380:
--------------------------------------

Github user trixpan commented on a diff in the pull request:

    https://github.com/apache/nifi/pull/817#discussion_r77626391
  
    --- Diff: nifi-nar-bundles/nifi-email-bundle/nifi-email-processors/src/main/java/org/apache/nifi/processors/email/ExtractTNEFAttachments.java
---
    @@ -0,0 +1,202 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.nifi.processors.email;
    +
    +import org.apache.commons.lang3.StringUtils;
    +import org.apache.nifi.annotation.behavior.EventDriven;
    +import org.apache.nifi.annotation.behavior.InputRequirement;
    +import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
    +import org.apache.nifi.annotation.behavior.SideEffectFree;
    +import org.apache.nifi.annotation.behavior.SupportsBatching;
    +import org.apache.nifi.annotation.behavior.WritesAttribute;
    +import org.apache.nifi.annotation.behavior.WritesAttributes;
    +import org.apache.nifi.annotation.documentation.CapabilityDescription;
    +import org.apache.nifi.annotation.documentation.Tags;
    +import org.apache.nifi.components.PropertyDescriptor;
    +import org.apache.nifi.flowfile.FlowFile;
    +import org.apache.nifi.flowfile.attributes.CoreAttributes;
    +import org.apache.nifi.logging.ComponentLog;
    +import org.apache.nifi.processor.AbstractProcessor;
    +import org.apache.nifi.processor.ProcessContext;
    +import org.apache.nifi.processor.ProcessSession;
    +import org.apache.nifi.processor.ProcessorInitializationContext;
    +import org.apache.nifi.processor.Relationship;
    +import org.apache.nifi.processor.exception.FlowFileHandlingException;
    +import org.apache.nifi.processor.io.InputStreamCallback;
    +import org.apache.nifi.processor.io.OutputStreamCallback;
    +import org.apache.nifi.stream.io.BufferedInputStream;
    +import org.apache.poi.hmef.Attachment;
    +import org.apache.poi.hmef.HMEFMessage;
    +
    +import java.io.IOException;
    +import java.io.InputStream;
    +import java.io.OutputStream;
    +import java.util.ArrayList;
    +import java.util.Collections;
    +import java.util.HashMap;
    +import java.util.HashSet;
    +import java.util.List;
    +import java.util.Map;
    +import java.util.Properties;
    +import java.util.Set;
    +
    +
    +@SupportsBatching
    +@EventDriven
    +@SideEffectFree
    +@Tags({"split", "email"})
    +@InputRequirement(Requirement.INPUT_REQUIRED)
    +@CapabilityDescription("Extract attachments from a mime formatted email file, splitting
them into individual flowfiles.")
    +@WritesAttributes({
    +        @WritesAttribute(attribute = "filename ", description = "The filename of the
attachment"),
    +        @WritesAttribute(attribute = "email.tnef.attachment.parent.filename ", description
= "The filename of the parent FlowFile"),
    +        @WritesAttribute(attribute = "email.tnef.attachment.parent.uuid", description
= "The UUID of the original FlowFile.")})
    +
    +public class ExtractTNEFAttachments extends AbstractProcessor {
    +    public static final String ATTACHMENT_ORIGINAL_FILENAME = "email.tnef.attachment.parent.filename";
    +    public static final String ATTACHMENT_ORIGINAL_UUID = "email.tnef.attachment.parent.uuid";
    +
    +    public static final Relationship REL_ATTACHMENTS = new Relationship.Builder()
    +            .name("attachments")
    +            .description("Each individual attachment will be routed to the attachments
relationship")
    +            .build();
    +    public static final Relationship REL_ORIGINAL = new Relationship.Builder()
    +            .name("original")
    +            .description("The original file")
    +            .build();
    +    public static final Relationship REL_FAILURE = new Relationship.Builder()
    +            .name("failure")
    +            .description("Flowfiles that could not be parsed")
    +            .build();
    +    private Set<Relationship> relationships;
    +    private List<PropertyDescriptor> descriptors;
    +
    +
    +    @Override
    +    protected void init(final ProcessorInitializationContext context) {
    +        final Set<Relationship> relationships = new HashSet<>();
    +        relationships.add(REL_ATTACHMENTS);
    +        relationships.add(REL_ORIGINAL);
    +        relationships.add(REL_FAILURE);
    +        this.relationships = Collections.unmodifiableSet(relationships);
    +
    +        final List<PropertyDescriptor> descriptors = new ArrayList<>();
    +
    +        this.descriptors = Collections.unmodifiableList(descriptors);
    +    }
    +
    +    @Override
    +    public void onTrigger(final ProcessContext context, final ProcessSession session)
{
    +        final ComponentLog logger = getLogger();
    +        final FlowFile originalFlowFile = session.get();
    +        if (originalFlowFile == null) {
    +            return;
    +        }
    +        final List<FlowFile> attachmentsList = new ArrayList<>();
    +        final List<FlowFile> invalidFlowFilesList = new ArrayList<>();
    +        final List<FlowFile> originalFlowFilesList = new ArrayList<>();
    +
    +        session.read(originalFlowFile, new InputStreamCallback() {
    +                @Override
    +                public void process(final InputStream rawIn) throws IOException {
    +                    try (final InputStream in = new BufferedInputStream(rawIn)) {
    +                        Properties props = new Properties();
    +
    +                        HMEFMessage hmefMessage = null;
    +
    +                            // This will trigger an exception in case content is not
a TNEF.
    +                            hmefMessage = new HMEFMessage(in);
    +
    +                        // Add otiginal flowfile (may revert later on in case of errors)
//
    +                        originalFlowFilesList.add(originalFlowFile);
    +
    +                        if (hmefMessage != null) {
    +                             // Attachments isn empty, proceeding.
    +                            if (! hmefMessage.getAttachments().isEmpty()) {
    +                                final String originalFlowFileName = originalFlowFile.getAttribute(CoreAttributes.FILENAME.key());
    +                                try {
    +                                    for (final Attachment attachment : hmefMessage.getAttachments())
{
    +                                        FlowFile split = session.create(originalFlowFile);
    +                                        final Map<String, String> attributes =
new HashMap<>();
    +                                        if (StringUtils.isNotBlank(attachment.getLongFilename()))
{
    +                                            attributes.put(CoreAttributes.FILENAME.key(),
attachment.getFilename());
    +                                        }
    +
    +                                        String parentUuid = originalFlowFile.getAttribute(CoreAttributes.UUID.key());
    +                                        attributes.put(ATTACHMENT_ORIGINAL_UUID, parentUuid);
    +                                        attributes.put(ATTACHMENT_ORIGINAL_FILENAME,
originalFlowFileName);
    +
    +                                        // TODO: Extract Mime Type (HMEF doesn't seem
to be able to get this info.
    +
    +                                        split = session.append(split, new OutputStreamCallback()
{
    +                                            @Override
    +                                            public void process(OutputStream out) throws
IOException {
    +                                                out.write(attachment.getContents());
    +                                            }
    +                                        });
    +                                        split = session.putAllAttributes(split, attributes);
    +                                        attachmentsList.add(split);
    +                                    }
    +                                } catch (FlowFileHandlingException e) {
    +                                    // Something went wrong
    +                                    // Removing splits that may have been created
    +                                    session.remove(attachmentsList);
    +                                    // Removing the original flow from its list
    +                                    originalFlowFilesList.remove(originalFlowFile);
    +                                    logger.error("Flowfile {} triggered error {} while
processing message removing generated FlowFiles from sessions", new Object[]{originalFlowFile,
e});
    +                                    invalidFlowFilesList.add(originalFlowFile);
    +                                }
    +                            }
    +                        }
    +                    }catch (Exception e) {
    +                        // Another error hit...
    +                        // Removing the original flow from its list
    +                        originalFlowFilesList.remove(originalFlowFile);
    +                        logger.error("Could not parse the flowfile {} as an email, treating
as failure", new Object[]{originalFlowFile, e});
    +                        // Message is invalid or triggered an error during parsing
    +                        invalidFlowFilesList.add(originalFlowFile);
    +                    }
    +                }
    +        });
    +
    +        session.transfer(attachmentsList, REL_ATTACHMENTS);
    +
    +        // As per above code, originalFlowfile may be routed to invalid or
    +        // original depending on RFC2822 compliance.
    +        session.transfer(invalidFlowFilesList, REL_FAILURE);
    +        session.transfer(originalFlowFilesList, REL_ORIGINAL);
    +
    +        if (attachmentsList.size() > 10) {
    +            logger.info("Split {} into {} files", new Object[]{originalFlowFile, attachmentsList.size()});
    +        } else if (attachmentsList.size() > 1){
    +            logger.info("Split {} into {} files: {}", new Object[]{originalFlowFile,
attachmentsList.size(), attachmentsList});
    +        }
    --- End diff --
    
    It is based on SplitContent
    
    https://github.com/apache/nifi/blob/1bd2cf0d09a7111bcecffd0f473aa71c25a69845/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/SplitContent.java#L292
    
    But I agree the code is hard to grasp, as there are two completely separate tests occurring
within the same if-then-else statement.
    
    Addressed.
    



> ExtractEmailAttachments processor should support TNEF files (aka winmail.dat)
> -----------------------------------------------------------------------------
>
>                 Key: NIFI-2380
>                 URL: https://issues.apache.org/jira/browse/NIFI-2380
>             Project: Apache NiFi
>          Issue Type: Improvement
>    Affects Versions: 1.0.0
>            Reporter: Andre
>            Assignee: Andre
>             Fix For: 1.1.0
>
>
> During the review of NIFI-1899, Dan Marshall highlighted some use cases for email processing
that have not been addressed as part of the initial development cycle.
> One of these use cases was the decoding of Microsoft Transport Neutral Encoding Files
(TNEF). 
> This type of attachment is popularly known as winmail.dat and uses a non-RFC-compliant
structure to transfer attachments across different Microsoft Outlook clients.
> Given the prevalence of Outlook and the issues with winmail.dat files, it would be nice
to be able to decode TNEF as we currently do with MIME attachments.
> Permalink to Dan's comments: http://mail-archives.apache.org/mod_mbox/nifi-dev/201607.mbox/%3C1468716836729-12827.post%40n7.nabble.com%3E



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Mime
View raw message