xerces-c-users mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Li, PingShan \(Kansas City\)" <pingshan...@securepassage.com>
Subject RE: question about xerces memory usage
Date Wed, 17 Oct 2007 19:19:04 GMT
Thank you for your email. 

I am posting the main function I modified in DOMCount.cpp to clarify my question:

1. I only parse the file using Xerces once.
2. All the initialization and release code is the same in DOMCount.cpp, I did not change it.
3. The code I added only reads the file and appends its contents to a string, over and over, in order to trigger an exception.

My question is this: I would expect no difference in the number of iterations between
running the following code as it is and running only the added code with everything
else commented out. My real point is that even if I use Xerces and release it as
directed in the sample code provided by the Xerces project, it may still have an
impact on memory usage in other parts of my project.

Thanks

PingShan Li

int main(int argC, char* argV[])
{
    // Check command line and extract arguments.
    if (argC < 2)
    {
        usage();
        return 1;
    }

    const char*                xmlFile = 0;
    AbstractDOMParser::ValSchemes valScheme = AbstractDOMParser::Val_Auto;
    bool                       doNamespaces       = false;
    bool                       doSchema           = false;
    bool                       schemaFullChecking = false;
    bool                       doList = false;
    bool                       errorOccurred = false;
    bool                       recognizeNEL = false;
    bool                       printOutEncounteredEles = false;
    char                       localeStr[64];
    memset(localeStr, 0, sizeof localeStr);

    int argInd;
    for (argInd = 1; argInd < argC; argInd++)
    {
        // Break out on first parm not starting with a dash
        if (argV[argInd][0] != '-')
            break;

        // Watch for special case help request
        if (!strcmp(argV[argInd], "-?"))
        {
            usage();
            return 2;
        }
         else if (!strncmp(argV[argInd], "-v=", 3)
              ||  !strncmp(argV[argInd], "-V=", 3))
        {
            const char* const parm = &argV[argInd][3];

            if (!strcmp(parm, "never"))
                valScheme = AbstractDOMParser::Val_Never;
            else if (!strcmp(parm, "auto"))
                valScheme = AbstractDOMParser::Val_Auto;
            else if (!strcmp(parm, "always"))
                valScheme = AbstractDOMParser::Val_Always;
            else
            {
                XERCES_STD_QUALIFIER cerr << "Unknown -v= value: " << parm <<
XERCES_STD_QUALIFIER
endl;
                return 2;
            }
        }
         else if (!strcmp(argV[argInd], "-n")
              ||  !strcmp(argV[argInd], "-N"))
        {
            doNamespaces = true;
        }
         else if (!strcmp(argV[argInd], "-s")
              ||  !strcmp(argV[argInd], "-S"))
        {
            doSchema = true;
        }
         else if (!strcmp(argV[argInd], "-f")
              ||  !strcmp(argV[argInd], "-F"))
        {
            schemaFullChecking = true;
        }
         else if (!strcmp(argV[argInd], "-l")
              ||  !strcmp(argV[argInd], "-L"))
        {
            doList = true;
        }
         else if (!strcmp(argV[argInd], "-special:nel"))
        {
            // turning this on will lead to non-standard compliance behaviour
            // it will recognize the unicode character 0x85 as new line character
            // instead of regular character as specified in XML 1.0
            // do not turn this on unless really necessary

             recognizeNEL = true;
        }
         else if (!strcmp(argV[argInd], "-p")
              ||  !strcmp(argV[argInd], "-P"))
        {
            printOutEncounteredEles = true;
        }
         else if (!strncmp(argV[argInd], "-locale=", 8))
        {
             // Get out the end of line
             strcpy(localeStr, &(argV[argInd][8]));
        }			
         else
        {
            XERCES_STD_QUALIFIER cerr << "Unknown option '" << argV[argInd]
                 << "', ignoring it\n" << XERCES_STD_QUALIFIER endl;
        }
    }

    //
    //  There should be only one and only one parameter left, and that
    //  should be the file name.
    //
    if (argInd != argC - 1)
    {
        usage();
        return 1;
    }

    // Initialize the XML4C system
    try
    {
        if (strlen(localeStr))
        {
            XMLPlatformUtils::Initialize(localeStr);
        }
        else
        {
            XMLPlatformUtils::Initialize();
        }

        if (recognizeNEL)
        {
            XMLPlatformUtils::recognizeNEL(recognizeNEL);
        }
    }

    catch (const XMLException& toCatch)
    {
         XERCES_STD_QUALIFIER cerr << "Error during initialization! :\n"
              << StrX(toCatch.getMessage()) << XERCES_STD_QUALIFIER endl;
         return 1;
    }

    // Instantiate the DOM parser.
    static const XMLCh gLS[] = { chLatin_L, chLatin_S, chNull };
    DOMImplementation *impl = DOMImplementationRegistry::getDOMImplementation(gLS);
    DOMBuilder        *parser =
((DOMImplementationLS*)impl)->createDOMBuilder(DOMImplementationLS::MODE_SYNCHRONOUS, 0);

    parser->setFeature(XMLUni::fgDOMNamespaces, doNamespaces);
    parser->setFeature(XMLUni::fgXercesSchema, doSchema);
    parser->setFeature(XMLUni::fgXercesSchemaFullChecking, schemaFullChecking);

    if (valScheme == AbstractDOMParser::Val_Auto)
    {
        parser->setFeature(XMLUni::fgDOMValidateIfSchema, true);
    }
    else if (valScheme == AbstractDOMParser::Val_Never)
    {
        parser->setFeature(XMLUni::fgDOMValidation, false);
    }
    else if (valScheme == AbstractDOMParser::Val_Always)
    {
        parser->setFeature(XMLUni::fgDOMValidation, true);
    }

    // enable datatype normalization - default is off
    parser->setFeature(XMLUni::fgDOMDatatypeNormalization, true);

    // And create our error handler and install it
    DOMCountErrorHandler errorHandler;
    parser->setErrorHandler(&errorHandler);

    //
    //  Get the starting time and kick off the parse of the indicated
    //  file. Catch any exceptions that might propogate out of it.
    //
    unsigned long duration;

    bool more = true;
    XERCES_STD_QUALIFIER ifstream fin;

    // the input is a list file
    if (doList)
        fin.open(argV[argInd]);

    if (fin.fail()) {
        XERCES_STD_QUALIFIER cerr <<"Cannot open the list file: " << argV[argInd]
<<
XERCES_STD_QUALIFIER endl;
        return 2;
    }

    while (more)
    {
        char fURI[1000];
        //initialize the array to zeros
        memset(fURI,0,sizeof(fURI));

        if (doList) {
            if (! fin.eof() ) {
                fin.getline (fURI, sizeof(fURI));
                if (!*fURI)
                    continue;
                else {
                    xmlFile = fURI;
                    XERCES_STD_QUALIFIER cerr << "==Parsing== " << xmlFile <<
XERCES_STD_QUALIFIER
endl;
                }
            }
            else
                break;
        }
        else {
            xmlFile = argV[argInd];
            more = false;
        }

        //reset error count first
        errorHandler.resetErrors();

        XERCES_CPP_NAMESPACE_QUALIFIER DOMDocument *doc = 0;

        try
        {
            // reset document pool
            parser->resetDocumentPool();

            const unsigned long startMillis = XMLPlatformUtils::getCurrentMillis();
            doc = parser->parseURI(xmlFile);
            const unsigned long endMillis = XMLPlatformUtils::getCurrentMillis();
            duration = endMillis - startMillis;
        }

        catch (const XMLException& toCatch)
        {
            XERCES_STD_QUALIFIER cerr << "\nError during parsing: '" << xmlFile
<< "'\n"
                 << "Exception message is:  \n"
                 << StrX(toCatch.getMessage()) << "\n" << XERCES_STD_QUALIFIER
endl;
            errorOccurred = true;
            continue;
        }
        catch (const DOMException& toCatch)
        {
            const unsigned int maxChars = 2047;
            XMLCh errText[maxChars + 1];

            XERCES_STD_QUALIFIER cerr << "\nDOM Error during parsing: '" << xmlFile
<< "'\n"
                 << "DOMException code is:  " << toCatch.code << XERCES_STD_QUALIFIER
endl;

            if (DOMImplementation::loadDOMExceptionMsg(toCatch.code, errText, maxChars))
                 XERCES_STD_QUALIFIER cerr << "Message is: " << StrX(errText)
<< XERCES_STD_QUALIFIER
endl;

            errorOccurred = true;
            continue;
        }
        catch (...)
        {
            XERCES_STD_QUALIFIER cerr << "\nUnexpected exception during parsing: '"
<< xmlFile <<
"'\n";
            errorOccurred = true;
            continue;
        }

        //
        //  Extract the DOM tree, get the list of all the elements and report the
        //  length as the count of elements.
        //
        if (errorHandler.getSawErrors())
        {
            XERCES_STD_QUALIFIER cout << "\nErrors occurred, no output available\n"
<<
XERCES_STD_QUALIFIER endl;
            errorOccurred = true;
        }
         else
        {
            unsigned int elementCount = 0;
            if (doc) {
                elementCount = countChildElements((DOMNode*)doc->getDocumentElement(),
printOutEncounteredEles);
                // test getElementsByTagName and getLength
                XMLCh xa[] = {chAsterisk, chNull};
                if (elementCount != doc->getElementsByTagName(xa)->getLength()) {
                    XERCES_STD_QUALIFIER cout << "\nErrors occurred, element count is
wrong\n" <<
XERCES_STD_QUALIFIER endl;
                    errorOccurred = true;
                }
            }

            // Print out the stats that we collected and time taken.
            XERCES_STD_QUALIFIER cout << xmlFile << ": " << duration <<
" ms ("
                 << elementCount << " elems)." << XERCES_STD_QUALIFIER endl;
        }
    }

    //
    //  Delete the parser itself.  Must be done prior to calling Terminate, below.
    //
    parser->release();

    // And call the termination method
    XMLPlatformUtils::Terminate();

     /////////////////////////////////////////////////////////////////////////////
     // Code added for testing
     std::string test;
     int stringSize( 0 );
     for ( int i = 0; i < 100; ++i )
     {
       _sleep( 10 );
       std::cout << i << " " << stringSize << std::endl;

       FILE *hFile = hFile = fopen( "C:\\Test\\test.xml", "rb" );
       if ( hFile )
       {
         // Get the file size so we can allocate our buffer.
         fseek( hFile, 0, SEEK_END );
         const int nLength = ftell( hFile );
         fseek( hFile, 0, SEEK_SET );
         char* pszBuffer = new char[ nLength + 1 ];
         fread( pszBuffer, sizeof( char ), nLength, hFile );
         pszBuffer[ nLength ] = '\0';
         test += std::string( pszBuffer );
         stringSize += nLength;
         delete [] pszBuffer;
         fclose( hFile );
       }
     }
     // End of code added for testing
     ////////////////////////////////////////////////////////////////////////////////

    if (doList)
        fin.close();

    if (errorOccurred)
        return 4;
    else
        return 0;
}




-----Original Message-----
From: Alberto Massari [mailto:amassari@datadirect.com] 
Sent: Wednesday, October 17, 2007 1:54 PM
To: c-users@xerces.apache.org
Subject: Re: question about xerces memory usage

Hi,
I don't get the point of this experiment: if v 2.7 can only parse 10 
times the 48Mb file, while v 2.8 can do it for 15 times, it means 2.8 
uses less memory for the same DOM tree (as I guess you are not 
releasing the DOM tree between the parse() operations, so keeping 
them all in memory). As for your added code, you are concatenating 
the input file in a std::string, so it makes sense that 16 times * 
48Mb = 768Mb crashes your application (btw, the fact that you have 
2Gb of memory doesn't imply that the program can find a contiguous 
chunk of memory of 800Mb).

Alberto

At 13.38 17/10/2007 -0500, Li, PingShan \(Kansas City\) wrote:
>We use Xerces in our C++ project to load XML file as DOM tree.
>
>We have one question related to the memory usage of Xerces C++ 
>version. I made small modification to
>the sample DOMCount project provided by Xerces to demonstrate the question.
>
>Operating system is Windows xp professional. Visual studio 2003 
>VC7.1 is used for the testing.
>
>
>
>The program is tested on a box with 2G RAM.
>
>
>
>Test.xml used in here is a 48M xml file.
>
>For xerces 2.7:
>
>If I add the following code to DOMCount.cpp, I can run 10 iterations 
>before I got "out of memory"
>exception. But if I commented out other code and only run the added 
>code, I can run up to 16
>iterations before I got "out of memory" exception. I would expect after
>"XMLPlatformUtils::Terminate()" is called, there should be no 
>difference on the number of iterations
>for the added code to get the "out of memory" exception.
>
>We used Process Explorer
>(http://www.microsoft.com/technet/sysinternals/utilities/processexplorer.mspx)
>to help us find out
>the memory usage of the program. The only thing came to our 
>attention is the virtual memory used by
>Xerces. Physical memory is released after 
>XMLPlatformUtils::Terminate, but virtual memory stays at the
>same level.
>
>Then I think I can try the same code with Xerces 2.8. To my 
>surprise, I can run up to 15 iterations
>before I got the out of memory exception. If I only run the added 
>code, it will throw out of memory
>exception on the 16th iteration.
>
>Is there anything that the 2.7 user need to pay attention to? Could 
>anybody please tell me why there
>is a difference on the number of iterations before I got the "out of 
>memory" exception in 2.7?
>
>Thank you
>
>PingShan Li
>
>
>     //
>     //  Delete the parser itself.  Must be done prior to calling 
> Terminate, below.
>     //
>
>     parser->release();
>
>     // And call the termination method
>     XMLPlatformUtils::Terminate();
>
>
> 
>/////////////////////////////////////////////////////////////////////////////
>     // Code added for testing
>     std::string test;
>     int stringSize( 0 );
>     for ( int i = 0; i < 100; ++i )
>     {
>       _sleep( 10 );
>       std::cout << i << " " << stringSize << std::endl;
>
>       FILE *hFile = hFile = fopen( "C:\\Test\\test.xml", "rb" );
>       if ( hFile )
>       {
>         // Get the file size so we can allocate our buffer.
>         fseek( hFile, 0, SEEK_END );
>         const int nLength = ftell( hFile );
>         fseek( hFile, 0, SEEK_SET );
>         char* pszBuffer = new char[ nLength + 1 ];
>         fread( pszBuffer, sizeof( char ), nLength, hFile );
>         pszBuffer[ nLength ] = '\0';
>         test += std::string( pszBuffer );
>         stringSize += nLength;
>         delete [] pszBuffer;
>         fclose( hFile );
>       }
>     }
> 
>//////////////////////////////////////////////////////////////////////////////// 
>
>
>
>
>
>
>No virus found in this outgoing message.
>Checked by AVG Free Edition.
>Version: 7.5.488 / Virus Database: 269.14.13/1075 - Release Date: 
>10/17/2007 9:38 AM
>


No virus found in this incoming message.
Checked by AVG Free Edition. 
Version: 7.5.488 / Virus Database: 269.14.13/1075 - Release Date: 10/17/2007 9:38 AM
 

No virus found in this outgoing message.
Checked by AVG Free Edition. 
Version: 7.5.488 / Virus Database: 269.14.13/1075 - Release Date: 10/17/2007 9:38 AM
 

Mime
View raw message