pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From build...@apache.org
Subject svn commit: r861229 [17/19] - in /websites/staging/pdfbox/trunk/content: ./ FontAwesome/ FontAwesome/css/ FontAwesome/docs/ FontAwesome/docs/assets/ FontAwesome/docs/assets/css/ FontAwesome/docs/assets/font/ FontAwesome/docs/assets/ico/ FontAwesome/doc...
Date Tue, 07 May 2013 19:46:01 GMT
Added: websites/staging/pdfbox/trunk/content/cookbook/pdfavalidation.html
==============================================================================
--- websites/staging/pdfbox/trunk/content/cookbook/pdfavalidation.html (added)
+++ websites/staging/pdfbox/trunk/content/cookbook/pdfavalidation.html Tue May  7 19:45:58 2013
@@ -0,0 +1,259 @@
+<!DOCTYPE html>
+<html lang="en">
+    
+    <!--
+     
+     Licensed to the Apache Software Foundation (ASF) under one or more
+     contributor license agreements.  See the NOTICE file distributed with
+     this work for additional information regarding copyright ownership.
+     The ASF licenses this file to You under the Apache License, Version 2.0
+     (the "License"); you may not use this file except in compliance with
+     the License.  You may obtain a copy of the License at
+     
+     http://www.apache.org/licenses/LICENSE-2.0
+     
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+     -->
+    
+  <head>
+    <title>Apache PDFBox | Cookbook - PDF/A Validation</title>
+
+    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+
+    <link href="/bootstrap/css/bootstrap.css" rel="stylesheet">
+    <link href="/bootstrap/css/bootstrap-responsive.css" rel="stylesheet">
+    <link href="/FontAwesome/css/font-awesome.css" rel="stylesheet">
+    <link href="/Iconic/iconic fill/iconic_fill.css" rel="stylesheet">
+    <link href="/css/pygments-github.css" rel="stylesheet">
+    <link href="/css/site.css" rel="stylesheet">
+        
+        
+
+    
+
+    
+    
+        <!-- Twitter Bootstrap and jQuery after this line. -->
+        <script src="http://code.jquery.com/jquery-latest.js"></script>
+        <script src="/bootstrap/js/bootstrap.js"></script>
+        <script>
+            $('.nav-collapse').collapse();
+        </script>
+  </head>
+  <body>
+
+    <div class="navbar navbar-fixed-top">
+      <div class="navbar-inner">
+          <a href="index.html"><img class="logo" src="/images/logo-head.gif"></a>
+      </div>
+    </div>
+
+    <header class="main" id="overview">
+        <div class="container">
+        </div>
+    </header>
+
+    <div class="container-fluid">
+        <div class="row-fluid">
+            <div class="span3">
+                <ul class="nav nav-list">
+                    <li class="nav-header">Apache PDFBox</li>
+                    <li><a href="/downloads.html">
+                        <i class="icon-chevron-right"></i>
+                    Downloads</a></li>
+                    <li><a href="/dependencies.html">
+                        <i class="icon-chevron-right"></i>
+                    Dependencies</a></li>
+                    <li><a href="/references.html">
+                        <i class="icon-chevron-right"></i>
+                        References</a></li>
+                <li class="nav-header">Community</li>
+                <li><a href="/support.html">
+                    <i class="icon-chevron-right"></i>
+                    Support
+                </a></li>
+                <li><a href="/mailinglists.html">
+                    <i class="icon-chevron-right"></i>
+                    Mailing Lists
+                </a></li>
+                <li><a href="/team.html">
+                    <i class="icon-chevron-right"></i>
+                    Project Team</a></li>
+                <li  class="nav-header">Documentation</li>
+                <li><a href="/architecture.html">
+                    <i class="icon-chevron-right"></i>
+                    Architecture</a></li>
+                <li><a href="/commandline/">
+                    <i class="icon-chevron-right"></i>
+                    Command Line Tools</a></li>
+                <li class="dropdown"><a  class="dropdown-toggle" data-toggle="dropdown" href="#">
+                    <i class="icon-chevron-right"></i>
+                    PDFBox Cookbook <b class="caret"></b></a>
+                    <ul class="dropdown-menu">
+                        <li><a href="/cookbook/documentcreation.html">
+                            <i class="icon-chevron-right"></i>
+                            Document Creation</a>
+                        </li>
+                        <li><a href="/cookbook/textextraction.html">
+                            <i class="icon-chevron-right"></i>
+                            Text Extraction</a>
+                        </li>
+                        <li><a href="/cookbook/pdfavalidation.html">
+                            <i class="icon-chevron-right"></i>
+                            PDF/A Validation</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithfonts.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Fonts</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithmetadata.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Metadata</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithattachments.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Attachments</a>
+                        </li>
+                    </ul>
+                </li>
+                <li  class="nav-header">For Developers</li>
+                <li><a href="/building.html">
+                    <i class="icon-chevron-right"></i>
+                    Building PDFBox</a></li>
+                <li><a href="/ideas.html">
+                    <i class="icon-chevron-right"></i>
+                    Ideas</a></li>
+                <li><a href="/codingconventions.html">
+                    <i class="icon-chevron-right"></i>
+                    Coding Conventions</a></li>
+                <li  class="nav-header">Apache Software Foundation</li>
+                <li><a href="http://www.apache.org/">
+                    <i class="icon-chevron-right"></i>
+                    Apache Software Foundation</a></li>
+                <li><a href="http://www.apache.org/foundation/thanks.html">
+                    <i class="icon-chevron-right"></i>
+                    ASF Sponsors</a></li>
+                <li><a href="http://www.apache.org/security/">
+                    <i class="icon-chevron-right"></i>
+                    Security</a></li>
+                </ul>
+            </div>
+            <div class="span9">
+                <p> <h2 id="pdfa-validation">PDF/A Validation</h2>
+<p>The Apache Preflight library is a Java tool that implements a parser compliant with the ISO-19005 specification (aka PDF/A-1).</p>
+<h3 id="check-compliance-with-pdfa-1b">Check Compliance with PDF/A-1b</h3>
+<p>This small sample shows how to check the compliance of a file with the PDF/A-1b specification.</p>
+<div class="codehilite"><pre><span class="n">ValidationResult</span> <span class="n">result</span> <span class="o">=</span> <span class="kc">null</span><span class="o">;</span>
+
+<span class="n">FileDataSource</span> <span class="n">fd</span> <span class="o">=</span> <span class="k">new</span> <span class="n">FileDataSource</span><span class="o">(</span><span class="n">args</span><span class="o">[</span><span class="mi">0</span><span class="o">]);</span>
+<span class="n">PreflightParser</span> <span class="n">parser</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PreflightParser</span><span class="o">(</span><span class="n">fd</span><span class="o">);</span>
+<span class="k">try</span> <span class="o">{</span>
+
+  <span class="cm">/* Parse the PDF file with PreflightParser that inherits from the NonSequentialParser.</span>
+<span class="cm">   * Some additional controls are present to check a set of PDF/A requirements. </span>
+<span class="cm">   * (Stream length consistency, EOL after some Keyword...)</span>
+<span class="cm">   */</span>
+  <span class="n">parser</span><span class="o">.</span><span class="na">parse</span><span class="o">();</span>
+
+  <span class="cm">/* Once the syntax validation is done, </span>
+<span class="cm">   * the parser can provide a PreflightDocument </span>
+<span class="cm">   * (that inherits from PDDocument) </span>
+<span class="cm">   * This document performs the rest of the PDF/A validation.</span>
+<span class="cm">   */</span>
+  <span class="n">PreflightDocument</span> <span class="n">document</span> <span class="o">=</span> <span class="n">parser</span><span class="o">.</span><span class="na">getPreflightDocument</span><span class="o">();</span>
+  <span class="n">document</span><span class="o">.</span><span class="na">validate</span><span class="o">();</span>
+
+  <span class="c1">// Get validation result</span>
+  <span class="n">result</span> <span class="o">=</span> <span class="n">document</span><span class="o">.</span><span class="na">getResult</span><span class="o">();</span>
+  <span class="n">document</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
+
+<span class="o">}</span> <span class="k">catch</span> <span class="o">(</span><span class="n">SyntaxValidationException</span> <span class="n">e</span><span class="o">)</span> <span class="o">{</span>
+  <span class="cm">/* the parse method can throw a SyntaxValidationException </span>
+<span class="cm">   * if the PDF file can&#39;t be parsed. In this case, the exception</span>
+<span class="cm">   * contains an instance of ValidationResult */</span>
+  <span class="n">result</span> <span class="o">=</span> <span class="n">e</span><span class="o">.</span><span class="na">getResult</span><span class="o">();</span>
+<span class="o">}</span>
+
+<span class="c1">// display validation result</span>
+<span class="k">if</span> <span class="o">(</span><span class="n">result</span><span class="o">.</span><span class="na">isValid</span><span class="o">())</span> <span class="o">{</span>
+  <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;The file &quot;</span> <span class="o">+</span> <span class="n">args</span><span class="o">[</span><span class="mi">0</span><span class="o">]</span> <span class="o">+</span> <span class="s">&quot; is a valid PDF/A-1b file&quot;</span><span class="o">);</span>
+<span class="o">}</span> <span class="k">else</span> <span class="o">{</span>
+  <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;The file &quot;</span> <span class="o">+</span> <span class="n">args</span><span class="o">[</span><span class="mi">0</span><span class="o">]</span> <span class="o">+</span> <span class="s">&quot; is not valid, error(s) :&quot;</span><span class="o">);</span>
+  <span class="k">for</span> <span class="o">(</span><span class="n">ValidationError</span> <span class="n">error</span> <span class="o">:</span> <span class="n">result</span><span class="o">.</span><span class="na">getErrorsList</span><span class="o">())</span> <span class="o">{</span>
+    <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">error</span><span class="o">.</span><span class="na">getErrorCode</span><span class="o">()</span> <span class="o">+</span> <span class="s">&quot; : &quot;</span> <span class="o">+</span> <span class="n">error</span><span class="o">.</span><span class="na">getDetails</span><span class="o">());</span>
+  <span class="o">}</span>
+<span class="o">}</span>
+</pre></div>
+
+
+<h3 id="categories-of-validation-error">Categories of Validation Error</h3>
+<p>If a validation fails, the ValidationResult object contains all causes of the failure.
+To make failures easier to understand, all error codes have the form X[.Y[.Z]], where:</p>
+<ul>
+<li>'X' is the category (ex : Font validation error...)</li>
+<li>'Y' represents a subsection of the category (ex : "Font with Glyph error")</li>
+<li>'Z' represents the cause of the error (ex : "Font with a missing Glyph")</li>
+</ul>
+<p>Subsection ('Y') and cause ('Z') may be missing, depending on how precisely the error can be identified.</p>
+<p>All categories are listed below (for detailed causes, see the constants in the PreflightConstants interface):</p>
+<table>
+<thead>
+<tr>
+<th>Category</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>1[.y[.z]]</td>
+<td>Syntax Error</td>
+</tr>
+<tr>
+<td>2[.y[.z]]</td>
+<td>Graphic Error</td>
+</tr>
+<tr>
+<td>3[.y[.z]]</td>
+<td>Font Error</td>
+</tr>
+<tr>
+<td>4[.y[.z]]</td>
+<td>Transparency Error</td>
+</tr>
+<tr>
+<td>5[.y[.z]]</td>
+<td>Annotation Error</td>
+</tr>
+<tr>
+<td>6[.y[.z]]</td>
+<td>Action Error</td>
+</tr>
+<tr>
+<td>7[.y[.z]]</td>
+<td>Metadata Error</td>
+</tr>
+</tbody>
+</table> </p>
+            </div>
+        </div>
+    </div>
+
+      <footer id="copyright">
+          <div class="row-fluid">
+              <div class="span3">
+                  <!-- nothing in here on purpose -->
+              </div>
+              
+              <div class="span9">
+                  <p>Copyright © 2013 The Apache Software Foundation, Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. <br/>
+                  Apache PDFBox, PDFBox, Apache, the Apache feather logo and the Apache PDFBox project logos are trademarks of The Apache Software Foundation.</p>
+              </div>
+          </div>
+      </footer>
+      
+  </body>
+</html>

Added: websites/staging/pdfbox/trunk/content/cookbook/textextraction.html
==============================================================================
--- websites/staging/pdfbox/trunk/content/cookbook/textextraction.html (added)
+++ websites/staging/pdfbox/trunk/content/cookbook/textextraction.html Tue May  7 19:45:58 2013
@@ -0,0 +1,278 @@
+<!DOCTYPE html>
+<html lang="en">
+    
+    <!--
+     
+     Licensed to the Apache Software Foundation (ASF) under one or more
+     contributor license agreements.  See the NOTICE file distributed with
+     this work for additional information regarding copyright ownership.
+     The ASF licenses this file to You under the Apache License, Version 2.0
+     (the "License"); you may not use this file except in compliance with
+     the License.  You may obtain a copy of the License at
+     
+     http://www.apache.org/licenses/LICENSE-2.0
+     
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+     -->
+    
+  <head>
+    <title>Apache PDFBox | Cookbook - Textextraction</title>
+
+    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+
+    <link href="/bootstrap/css/bootstrap.css" rel="stylesheet">
+    <link href="/bootstrap/css/bootstrap-responsive.css" rel="stylesheet">
+    <link href="/FontAwesome/css/font-awesome.css" rel="stylesheet">
+    <link href="/Iconic/iconic fill/iconic_fill.css" rel="stylesheet">
+    <link href="/css/pygments-github.css" rel="stylesheet">
+    <link href="/css/site.css" rel="stylesheet">
+        
+        
+
+    
+
+    
+    
+        <!-- Twitter Bootstrap and jQuery after this line. -->
+        <script src="http://code.jquery.com/jquery-latest.js"></script>
+        <script src="/bootstrap/js/bootstrap.js"></script>
+        <script>
+            $('.nav-collapse').collapse();
+        </script>
+  </head>
+  <body>
+
+    <div class="navbar navbar-fixed-top">
+      <div class="navbar-inner">
+          <a href="index.html"><img class="logo" src="/images/logo-head.gif"></a>
+      </div>
+    </div>
+
+    <header class="main" id="overview">
+        <div class="container">
+        </div>
+    </header>
+
+    <div class="container-fluid">
+        <div class="row-fluid">
+            <div class="span3">
+                <ul class="nav nav-list">
+                    <li class="nav-header">Apache PDFBox</li>
+                    <li><a href="/downloads.html">
+                        <i class="icon-chevron-right"></i>
+                    Downloads</a></li>
+                    <li><a href="/dependencies.html">
+                        <i class="icon-chevron-right"></i>
+                    Dependencies</a></li>
+                    <li><a href="/references.html">
+                        <i class="icon-chevron-right"></i>
+                        References</a></li>
+                <li class="nav-header">Community</li>
+                <li><a href="/support.html">
+                    <i class="icon-chevron-right"></i>
+                    Support
+                </a></li>
+                <li><a href="/mailinglists.html">
+                    <i class="icon-chevron-right"></i>
+                    Mailing Lists
+                </a></li>
+                <li><a href="/team.html">
+                    <i class="icon-chevron-right"></i>
+                    Project Team</a></li>
+                <li  class="nav-header">Documentation</li>
+                <li><a href="/architecture.html">
+                    <i class="icon-chevron-right"></i>
+                    Architecture</a></li>
+                <li><a href="/commandline/">
+                    <i class="icon-chevron-right"></i>
+                    Command Line Tools</a></li>
+                <li class="dropdown"><a  class="dropdown-toggle" data-toggle="dropdown" href="#">
+                    <i class="icon-chevron-right"></i>
+                    PDFBox Cookbook <b class="caret"></b></a>
+                    <ul class="dropdown-menu">
+                        <li><a href="/cookbook/documentcreation.html">
+                            <i class="icon-chevron-right"></i>
+                            Document Creation</a>
+                        </li>
+                        <li><a href="/cookbook/textextraction.html">
+                            <i class="icon-chevron-right"></i>
+                            Text Extraction</a>
+                        </li>
+                        <li><a href="/cookbook/pdfavalidation.html">
+                            <i class="icon-chevron-right"></i>
+                            PDF/A Validation</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithfonts.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Fonts</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithmetadata.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Metadata</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithattachments.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Attachments</a>
+                        </li>
+                    </ul>
+                </li>
+                <li  class="nav-header">For Developers</li>
+                <li><a href="/building.html">
+                    <i class="icon-chevron-right"></i>
+                    Building PDFBox</a></li>
+                <li><a href="/ideas.html">
+                    <i class="icon-chevron-right"></i>
+                    Ideas</a></li>
+                <li><a href="/codingconventions.html">
+                    <i class="icon-chevron-right"></i>
+                    Coding Conventions</a></li>
+                <li  class="nav-header">Apache Software Foundation</li>
+                <li><a href="http://www.apache.org/">
+                    <i class="icon-chevron-right"></i>
+                    Apache Software Foundation</a></li>
+                <li><a href="http://www.apache.org/foundation/thanks.html">
+                    <i class="icon-chevron-right"></i>
+                    ASF Sponsors</a></li>
+                <li><a href="http://www.apache.org/security/">
+                    <i class="icon-chevron-right"></i>
+                    Security</a></li>
+                </ul>
+            </div>
+            <div class="span9">
+                <p> <h2 id="textextraction">Textextraction</h2>
+<h3 id="extracting-text">Extracting Text</h3>
+<p>See class:org.apache.pdfbox.util.PDFTextStripper<br />
+See class:org.apache.pdfbox.searchengine.lucene.LucenePDFDocument<br />
+See command line app:ExtractText<br />
+</p>
+<p>One of the main features of PDFBox is its ability to quickly and accurately extract text 
+from a variety of PDF documents. This functionality is encapsulated in the 
+org.apache.pdfbox.util.PDFTextStripper and can be easily executed on the command line with 
+org.apache.pdfbox.ExtractText.</p>
+<h3 id="lucene-integration">Lucene Integration</h3>
+<p>Lucene is an open source text search library from the Apache Jakarta Project. In order for
+Lucene to be able to index a PDF document it must first be converted to text. PDFBox provides 
+a simple approach for adding PDF documents into a Lucene index.</p>
+<div class="codehilite"><pre><span class="n">Document</span> <span class="n">luceneDocument</span> <span class="o">=</span> <span class="n">LucenePDFDocument</span><span class="o">.</span><span class="na">getDocument</span><span class="o">(</span> <span class="o">...</span> <span class="o">);</span>
+</pre></div>
+
+
+<p>Now that you have a Lucene Document object, you can add it to the Lucene index just like 
+you would if it had been created from a text or HTML file. The LucenePDFDocument automatically 
+extracts a variety of metadata fields from the PDF to be added to the index, the javadoc 
+shows details on those fields. This approach is very simple and should be sufficient for 
+most users, if not then you can use some of the advanced text extraction techniques 
+described in the next section.</p>
+<h3 id="advanced-text-extraction">Advanced Text Extraction</h3>
+<p>Some applications will have complex text extraction requirements and neither the command 
+line application nor the LucenePDFDocument will be able to fulfill those requirements. 
+It is possible for users to utilize or extend the PDFTextStripper class to meet some of 
+these requirements.</p>
+<h4 id="limiting-the-extracted-text">Limiting The Extracted Text</h4>
+<p>There are several ways that we can limit the text that is extracted during the extraction 
+process. The simplest is to specify the range of pages that you want to be extracted. 
+For example, to only extract text from the second and third pages of the PDF document 
+you could do this:</p>
+<div class="codehilite"><pre><span class="n">PDFTextStripper</span> <span class="n">stripper</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDFTextStripper</span><span class="o">();</span>
+<span class="n">stripper</span><span class="o">.</span><span class="na">setStartPage</span><span class="o">(</span> <span class="mi">2</span> <span class="o">);</span>
+<span class="n">stripper</span><span class="o">.</span><span class="na">setEndPage</span><span class="o">(</span> <span class="mi">3</span> <span class="o">);</span>
+<span class="n">stripper</span><span class="o">.</span><span class="na">writeText</span><span class="o">(</span> <span class="o">...</span> <span class="o">);</span>
+</pre></div>
+
+
+<p>NOTE: The startPage and endPage properties of PDFTextStripper are 1 based and inclusive.</p>
+<p>If you wanted to start on page 2 and extract to the end of the document then you would just
+set the startPage property. By default all pages in the pdf document are extracted.</p>
+<p>It is also possible to limit the extracted text to be between two bookmarks in the page. 
+If you are not familiar with how to use bookmarks in PDFBox then you should review the 
+Bookmarks page. Similar to the startPage/endPage properties, PDFTextStripper also has 
+startBookmark/endBookmark properties. There are some caveats to be aware of when using this
+feature of the PDFTextStripper. Not all bookmarks point to a page in the current PDF document. </p>
+<p>The possible states of a bookmark are:</p>
+<ul>
+<li>null - The property was not set, this is the default.</li>
+<li>Points to page in the PDF - The property was set and points to a valid page in the PDF</li>
+<li>Bookmark does not point to anything - The property was set but the bookmark does not point to any page</li>
+<li>Bookmark points to external action - The property was set, but it points to a page in a different PDF or performs an action when activated</li>
+</ul>
+<p>The table below will describe how PDFBox behaves in the various scenarios:</p>
+<table>
+<thead>
+<tr>
+<th>Start Bookmark</th>
+<th>End Bookmark</th>
+<th>Result</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>null</td>
+<td>null</td>
+<td>This is the default, the properties have no effect on the text extraction.</td>
+</tr>
+<tr>
+<td>Points to a page in the PDF</td>
+<td>null</td>
+<td>Text extraction will begin on the page that this bookmark points to and go until the end of the document.</td>
+</tr>
+<tr>
+<td>null</td>
+<td>Points to a page in the PDF</td>
+<td>Text extraction will begin on the first page and stop at the end of the page that this bookmark points to.</td>
+</tr>
+<tr>
+<td>Bookmark does not point to anything</td>
+<td>null</td>
+<td>Because the PDFTextStripper cannot determine a start page based on the bookmark, it will start on the first page and go until the end of the document.</td>
+</tr>
+<tr>
+<td>null</td>
+<td>Bookmark does not point to anything</td>
+<td>Because the PDFTextStripper cannot determine an end page based on the bookmark, it will start on the first page and go until the end of the document.</td>
+</tr>
+<tr>
+<td>Bookmark does not point to anything</td>
+<td>Bookmark does not point to anything</td>
+<td>This is a special case! If the startBookmark and endBookmark are exactly the same then no text will be extracted. If they are different then it is not possible for the PDFTextStripper to determine the pages so it will include the entire document.</td>
+</tr>
+<tr>
+<td>Bookmark points to external action</td>
+<td>Bookmark points to external action</td>
+<td>If either the startBookmark or the endBookmark refer to an external page or execute an action then an OutlineNotLocalException will be thrown to indicate to the user that the bookmark is not valid.</td>
+</tr>
+</tbody>
+</table>
+<p>NOTE: PDFTextStripper will check both the startPage/endPage and the startBookmark/endBookmark to determine if text should be extracted from the current page.</p>
+<h4 id="external-glyph-list">External Glyph List</h4>
+<p>Some PDF files need to map between glyph names and Unicode values during text extraction. 
+PDFBox comes with an Adobe Glyph List, but you may encounter files with glyph names that 
+are not in that map. To use your own glyphlist file, supply the file name to the <code>glyphlist_ext</code> JVM property.</p>
+<h4 id="right-to-left-text">Right to Left Text</h4>
+<p>Extracting text in languages whose text goes from right to left (such as Arabic and Hebrew)
+in PDF files can result in text that is backwards. PDFBox can normalize and reverse the text
+if the ICU4J jar file has been placed on the classpath (it is an optional dependency). 
+Note that you should also enable sorting with either org.apache.pdfbox.util.PDFTextStripper 
+or org.apache.pdfbox.ExtractText to ensure accurate output.</p> </p>
+            </div>
+        </div>
+    </div>
+
+      <footer id="copyright">
+          <div class="row-fluid">
+              <div class="span3">
+                  <!-- nothing in here on purpose -->
+              </div>
+              
+              <div class="span9">
+                  <p>Copyright © 2013 The Apache Software Foundation, Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. <br/>
+                  Apache PDFBox, PDFBox, Apache, the Apache feather logo and the Apache PDFBox project logos are trademarks of The Apache Software Foundation.</p>
+              </div>
+          </div>
+      </footer>
+      
+  </body>
+</html>

Added: websites/staging/pdfbox/trunk/content/cookbook/workingwithattachments.html
==============================================================================
--- websites/staging/pdfbox/trunk/content/cookbook/workingwithattachments.html (added)
+++ websites/staging/pdfbox/trunk/content/cookbook/workingwithattachments.html Tue May  7 19:45:58 2013
@@ -0,0 +1,204 @@
+<!DOCTYPE html>
+<html lang="en">
+    
+    <!--
+     
+     Licensed to the Apache Software Foundation (ASF) under one or more
+     contributor license agreements.  See the NOTICE file distributed with
+     this work for additional information regarding copyright ownership.
+     The ASF licenses this file to You under the Apache License, Version 2.0
+     (the "License"); you may not use this file except in compliance with
+     the License.  You may obtain a copy of the License at
+     
+     http://www.apache.org/licenses/LICENSE-2.0
+     
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+     -->
+    
+  <head>
+    <title>Apache PDFBox | Cookbook - Working with Attachments</title>
+
+    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+
+    <link href="/bootstrap/css/bootstrap.css" rel="stylesheet">
+    <link href="/bootstrap/css/bootstrap-responsive.css" rel="stylesheet">
+    <link href="/FontAwesome/css/font-awesome.css" rel="stylesheet">
+    <link href="/Iconic/iconic fill/iconic_fill.css" rel="stylesheet">
+    <link href="/css/pygments-github.css" rel="stylesheet">
+    <link href="/css/site.css" rel="stylesheet">
+        
+        
+
+    
+
+    
+    
+        <!-- Twitter Bootstrap and jQuery after this line. -->
+        <script src="http://code.jquery.com/jquery-latest.js"></script>
+        <script src="/bootstrap/js/bootstrap.js"></script>
+        <script>
+            $('.nav-collapse').collapse();
+        </script>
+  </head>
+  <body>
+
+    <div class="navbar navbar-fixed-top">
+      <div class="navbar-inner">
+          <a href="index.html"><img class="logo" src="/images/logo-head.gif"></a>
+      </div>
+    </div>
+
+    <header class="main" id="overview">
+        <div class="container">
+        </div>
+    </header>
+
+    <div class="container-fluid">
+        <div class="row-fluid">
+            <div class="span3">
+                <ul class="nav nav-list">
+                    <li class="nav-header">Apache PDFBox</li>
+                    <li><a href="/downloads.html">
+                        <i class="icon-chevron-right"></i>
+                    Downloads</a></li>
+                    <li><a href="/dependencies.html">
+                        <i class="icon-chevron-right"></i>
+                    Dependencies</a></li>
+                    <li><a href="/references.html">
+                        <i class="icon-chevron-right"></i>
+                        References</a></li>
+                <li class="nav-header">Community</li>
+                <li><a href="/support.html">
+                    <i class="icon-chevron-right"></i>
+                    Support
+                </a></li>
+                <li><a href="/mailinglists.html">
+                    <i class="icon-chevron-right"></i>
+                    Mailing Lists
+                </a></li>
+                <li><a href="/team.html">
+                    <i class="icon-chevron-right"></i>
+                    Project Team</a></li>
+                <li  class="nav-header">Documentation</li>
+                <li><a href="/architecture.html">
+                    <i class="icon-chevron-right"></i>
+                    Architecture</a></li>
+                <li><a href="/commandline/">
+                    <i class="icon-chevron-right"></i>
+                    Command Line Tools</a></li>
+                <li class="dropdown"><a  class="dropdown-toggle" data-toggle="dropdown" href="#">
+                    <i class="icon-chevron-right"></i>
+                    PDFBox Cookbook <b class="caret"></b></a>
+                    <ul class="dropdown-menu">
+                        <li><a href="/cookbook/documentcreation.html">
+                            <i class="icon-chevron-right"></i>
+                            Document Creation</a>
+                        </li>
+                        <li><a href="/cookbook/textextraction.html">
+                            <i class="icon-chevron-right"></i>
+                            Text Extraction</a>
+                        </li>
+                        <li><a href="/cookbook/pdfavalidation.html">
+                            <i class="icon-chevron-right"></i>
+                            PDF/A Validation</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithfonts.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Fonts</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithmetadata.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Metadata</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithattachments.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Attachments</a>
+                        </li>
+                    </ul>
+                </li>
+                <li  class="nav-header">For Developers</li>
+                <li><a href="/building.html">
+                    <i class="icon-chevron-right"></i>
+                    Building PDFBox</a></li>
+                <li><a href="/ideas.html">
+                    <i class="icon-chevron-right"></i>
+                    Ideas</a></li>
+                <li><a href="/codingconventions.html">
+                    <i class="icon-chevron-right"></i>
+                    Coding Conventions</a></li>
+                <li  class="nav-header">Apache Software Foundation</li>
+                <li><a href="http://www.apache.org/">
+                    <i class="icon-chevron-right"></i>
+                    Apache Software Foundation</a></li>
+                <li><a href="http://www.apache.org/foundation/thanks.html">
+                    <i class="icon-chevron-right"></i>
+                    ASF Sponsors</a></li>
+                <li><a href="http://www.apache.org/security/">
+                    <i class="icon-chevron-right"></i>
+                    Security</a></li>
+                </ul>
+            </div>
+            <div class="span9">
+                <h2 id="working-with-attachments">Working with Attachments</h2>
+<h3 id="the-pdf-file-specification">The PDF File Specification</h3>
+<p>See package:org.apache.pdfbox.pdmodel.common.filespecification<br />
+See example:EmbeddedFiles<br />
+</p>
+<p>A PDF can contain references to external files via the file system or a URL to a remote 
+location. It is also possible to embed a binary file into a PDF document.</p>
+<p>There are two classes that can be used when referencing a file. PDSimpleFileSpecification 
+is a simple string reference to a file (e.g. "./movies/BigMovie.avi"). The simple file 
+specification does not allow for any parameters to be set. </p>
+<p>The PDComplexFileSpecification is more feature rich and allows for advanced settings on 
+the file reference.</p>
+<p>It is also possible to embed a file directly into a PDF. Instead of setting the file 
+attribute of the PDComplexFileSpecification, the EmbeddedFile attribute can be used.</p>
+<h3 id="adding-a-file-attachment">Adding a File Attachment</h3>
+<p>PDF documents can contain file attachments that are accessed from the Document-&gt;File Attachments 
+menu. PDFBox allows attachments to be added to and extracted from PDF documents. 
+Attachments are part of the named tree that is attached to the document catalog.</p>
+<div class="codehilite"><pre><span class="n">PDEmbeddedFilesNameTreeNode</span> <span class="n">efTree</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDEmbeddedFilesNameTreeNode</span><span class="o">();</span>
+
+<span class="c1">//first create the file specification, which holds the embedded file</span>
+<span class="n">PDComplexFileSpecification</span> <span class="n">fs</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDComplexFileSpecification</span><span class="o">();</span>
+<span class="n">fs</span><span class="o">.</span><span class="na">setFile</span><span class="o">(</span> <span class="s">&quot;Test.txt&quot;</span> <span class="o">);</span>
+<span class="n">InputStream</span> <span class="n">is</span> <span class="o">=</span> <span class="o">...;</span>
+<span class="n">PDEmbeddedFile</span> <span class="n">ef</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDEmbeddedFile</span><span class="o">(</span><span class="n">doc</span><span class="o">,</span> <span class="n">is</span> <span class="o">);</span>
+<span class="c1">//set some of the attributes of the embedded file</span>
+<span class="n">ef</span><span class="o">.</span><span class="na">setSubtype</span><span class="o">(</span> <span class="s">&quot;text/plain&quot;</span> <span class="o">);</span>
+<span class="n">ef</span><span class="o">.</span><span class="na">setSize</span><span class="o">(</span> <span class="n">data</span><span class="o">.</span><span class="na">length</span> <span class="o">);</span>
+<span class="n">ef</span><span class="o">.</span><span class="na">setCreationDate</span><span class="o">(</span> <span class="k">new</span> <span class="n">GregorianCalendar</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">fs</span><span class="o">.</span><span class="na">setEmbeddedFile</span><span class="o">(</span> <span class="n">ef</span> <span class="o">);</span>
+
+<span class="c1">//now add the entry to the embedded file tree and set in the document.</span>
+<span class="n">Map</span> <span class="n">efMap</span> <span class="o">=</span> <span class="k">new</span> <span class="n">HashMap</span><span class="o">();</span>
+<span class="n">efMap</span><span class="o">.</span><span class="na">put</span><span class="o">(</span> <span class="s">&quot;My first attachment&quot;</span><span class="o">,</span> <span class="n">fs</span> <span class="o">);</span>
+<span class="n">efTree</span><span class="o">.</span><span class="na">setNames</span><span class="o">(</span> <span class="n">efMap</span> <span class="o">);</span>
+<span class="c1">//attachments are stored as part of the &quot;names&quot; dictionary in the document catalog</span>
+<span class="n">PDDocumentNameDictionary</span> <span class="n">names</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDDocumentNameDictionary</span><span class="o">(</span> <span class="n">doc</span><span class="o">.</span><span class="na">getDocumentCatalog</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">names</span><span class="o">.</span><span class="na">setEmbeddedFiles</span><span class="o">(</span> <span class="n">efTree</span> <span class="o">);</span>
+<span class="n">doc</span><span class="o">.</span><span class="na">getDocumentCatalog</span><span class="o">().</span><span class="na">setNames</span><span class="o">(</span> <span class="n">names</span> <span class="o">);</span>
+</pre></div>
+            </div>
+        </div>
+    </div>
+
+      <footer id="copyright">
+          <div class="row-fluid">
+              <div class="span3">
+                  <!-- nothing in here on purpose -->
+              </div>
+              
+              <div class="span9">
+                  <p>Copyright © 2013 The Apache Software Foundation, Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. <br/>
+                  Apache PDFBox, PDFBox, Apache, the Apache feather logo and the Apache PDFBox project logos are trademarks of The Apache Software Foundation.</p>
+              </div>
+          </div>
+      </footer>
+      
+  </body>
+</html>

Added: websites/staging/pdfbox/trunk/content/cookbook/workingwithfonts.html
==============================================================================
--- websites/staging/pdfbox/trunk/content/cookbook/workingwithfonts.html (added)
+++ websites/staging/pdfbox/trunk/content/cookbook/workingwithfonts.html Tue May  7 19:45:58 2013
@@ -0,0 +1,324 @@
+<!DOCTYPE html>
+<html lang="en">
+    
+    <!--
+     
+     Licensed to the Apache Software Foundation (ASF) under one or more
+     contributor license agreements.  See the NOTICE file distributed with
+     this work for additional information regarding copyright ownership.
+     The ASF licenses this file to You under the Apache License, Version 2.0
+     (the "License"); you may not use this file except in compliance with
+     the License.  You may obtain a copy of the License at
+     
+     http://www.apache.org/licenses/LICENSE-2.0
+     
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+     -->
+    
+  <head>
+    <title>Apache PDFBox | Cookbook - Working with Fonts</title>
+
+    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+
+    <link href="/bootstrap/css/bootstrap.css" rel="stylesheet">
+    <link href="/bootstrap/css/bootstrap-responsive.css" rel="stylesheet">
+    <link href="/FontAwesome/css/font-awesome.css" rel="stylesheet">
+    <link href="/Iconic/iconic fill/iconic_fill.css" rel="stylesheet">
+    <link href="/css/pygments-github.css" rel="stylesheet">
+    <link href="/css/site.css" rel="stylesheet">
+        
+        
+
+    
+
+    
+    
+        <!-- Twitter Bootstrap and jQuery after this line. -->
+        <script src="http://code.jquery.com/jquery-latest.js"></script>
+        <script src="/bootstrap/js/bootstrap.js"></script>
+        <script>
+            $('.nav-collapse').collapse();
+        </script>
+  </head>
+  <body>
+
+    <div class="navbar navbar-fixed-top">
+      <div class="navbar-inner">
+          <a href="index.html"><img class="logo" src="/images/logo-head.gif"></a>
+      </div>
+    </div>
+
+    <header class="main" id="overview">
+        <div class="container">
+        </div>
+    </header>
+
+    <div class="container-fluid">
+        <div class="row-fluid">
+            <div class="span3">
+                <ul class="nav nav-list">
+                    <li class="nav-header">Apache PDFBox</li>
+                    <li><a href="/downloads.html">
+                        <i class="icon-chevron-right"></i>
+                    Downloads</a></li>
+                    <li><a href="/dependencies.html">
+                        <i class="icon-chevron-right"></i>
+                    Dependencies</a></li>
+                    <li><a href="/references.html">
+                        <i class="icon-chevron-right"></i>
+                        References</a></li>
+                <li class="nav-header">Community</li>
+                <li><a href="/support.html">
+                    <i class="icon-chevron-right"></i>
+                    Support
+                </a></li>
+                <li><a href="/mailinglists.html">
+                    <i class="icon-chevron-right"></i>
+                    Mailing Lists
+                </a></li>
+                <li><a href="/team.html">
+                    <i class="icon-chevron-right"></i>
+                    Project Team</a></li>
+                <li  class="nav-header">Documentation</li>
+                <li><a href="/architecture.html">
+                    <i class="icon-chevron-right"></i>
+                    Architecture</a></li>
+                <li><a href="/commandline/">
+                    <i class="icon-chevron-right"></i>
+                    Command Line Tools</a></li>
+                <li class="dropdown"><a  class="dropdown-toggle" data-toggle="dropdown" href="#">
+                    <i class="icon-chevron-right"></i>
+                    PDFBox Cookbook <b class="caret"></b></a>
+                    <ul class="dropdown-menu">
+                        <li><a href="/cookbook/documentcreation.html">
+                            <i class="icon-chevron-right"></i>
+                            Document Creation</a>
+                        </li>
+                        <li><a href="/cookbook/textextraction.html">
+                            <i class="icon-chevron-right"></i>
+                            Text Extraction</a>
+                        </li>
+                        <li><a href="/cookbook/pdfavalidation.html">
+                            <i class="icon-chevron-right"></i>
+                            PDF/A Validation</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithfonts.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Fonts</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithmetadata.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Metadata</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithattachments.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Attachments</a>
+                        </li>
+                    </ul>
+                </li>
+                <li  class="nav-header">For Developers</li>
+                <li><a href="/building.html">
+                    <i class="icon-chevron-right"></i>
+                    Building PDFBox</a></li>
+                <li><a href="/ideas.html">
+                    <i class="icon-chevron-right"></i>
+                    Ideas</a></li>
+                <li><a href="/codingconventions.html">
+                    <i class="icon-chevron-right"></i>
+                    Coding Conventions</a></li>
+                <li  class="nav-header">Apache Software Foundation</li>
+                <li><a href="http://www.apache.org/">
+                    <i class="icon-chevron-right"></i>
+                    Apache Software Foundation</a></li>
+                <li><a href="http://www.apache.org/foundation/thanks.html">
+                    <i class="icon-chevron-right"></i>
+                    ASF Sponsors</a></li>
+                <li><a href="http://www.apache.org/security/">
+                    <i class="icon-chevron-right"></i>
+                    Security</a></li>
+                </ul>
+            </div>
+            <div class="span9">
+                <h2 id="working-with-fonts">Working with Fonts</h2>
+<h3 id="standard-14-fonts">Standard 14 Fonts</h3>
+<p>The PDF specification states that a standard set of 14 fonts will always be available when consuming PDF documents. In PDFBox these are defined as constants in the PDType1Font class.</p>
+<table>
+<thead>
+<tr>
+<th>Standard Font</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PDType1Font.TIMES_ROMAN</td>
+<td>Times regular</td>
+</tr>
+<tr>
+<td>PDType1Font.TIMES_BOLD</td>
+<td>Times bold</td>
+</tr>
+<tr>
+<td>PDType1Font.TIMES_ITALIC</td>
+<td>Times italic</td>
+</tr>
+<tr>
+<td>PDType1Font.TIMES_BOLD_ITALIC</td>
+<td>Times bold italic</td>
+</tr>
+<tr>
+<td>PDType1Font.HELVETICA</td>
+<td>Helvetica regular</td>
+</tr>
+<tr>
+<td>PDType1Font.HELVETICA_BOLD</td>
+<td>Helvetica bold</td>
+</tr>
+<tr>
+<td>PDType1Font.HELVETICA_OBLIQUE</td>
+<td>Helvetica italic</td>
+</tr>
+<tr>
+<td>PDType1Font.HELVETICA_BOLD_OBLIQUE</td>
+<td>Helvetica bold italic</td>
+</tr>
+<tr>
+<td>PDType1Font.COURIER</td>
+<td>Courier</td>
+</tr>
+<tr>
+<td>PDType1Font.COURIER_BOLD</td>
+<td>Courier bold</td>
+</tr>
+<tr>
+<td>PDType1Font.COURIER_OBLIQUE</td>
+<td>Courier italic</td>
+</tr>
+<tr>
+<td>PDType1Font.COURIER_BOLD_OBLIQUE</td>
+<td>Courier bold italic</td>
+</tr>
+<tr>
+<td>PDType1Font.SYMBOL</td>
+<td>Symbol Set</td>
+</tr>
+<tr>
+<td>PDType1Font.ZAPF_DINGBATS</td>
+<td>Dingbat Typeface</td>
+</tr>
+</tbody>
+</table>
+<h3 id="hello-world-using-a-pdf-base-font">Hello World using a PDF base font</h3>
+<p>This small sample shows how to create a new document and print the text "Hello World" using one of the PDF base fonts.</p>
+<div class="codehilite"><pre><span class="c1">// Create a document and add a page to it</span>
+<span class="n">PDDocument</span> <span class="n">document</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDDocument</span><span class="o">();</span>
+<span class="n">PDPage</span> <span class="n">page</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDPage</span><span class="o">();</span>
+<span class="n">document</span><span class="o">.</span><span class="na">addPage</span><span class="o">(</span> <span class="n">page</span> <span class="o">);</span>
+
+<span class="c1">// Create a new font object selecting one of the PDF base fonts</span>
+<span class="n">PDFont</span> <span class="n">font</span> <span class="o">=</span> <span class="n">PDType1Font</span><span class="o">.</span><span class="na">HELVETICA_BOLD</span><span class="o">;</span>
+
+<span class="c1">// Start a new content stream which will &quot;hold&quot; the to be created content</span>
+<span class="n">PDPageContentStream</span> <span class="n">contentStream</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDPageContentStream</span><span class="o">(</span><span class="n">document</span><span class="o">,</span> <span class="n">page</span><span class="o">);</span>
+
+<span class="c1">// Define a text content stream using the selected font, moving the cursor and drawing the text &quot;Hello World&quot;</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">beginText</span><span class="o">();</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">setFont</span><span class="o">(</span> <span class="n">font</span><span class="o">,</span> <span class="mi">12</span> <span class="o">);</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">moveTextPositionByAmount</span><span class="o">(</span> <span class="mi">100</span><span class="o">,</span> <span class="mi">700</span> <span class="o">);</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">drawString</span><span class="o">(</span> <span class="s">&quot;Hello World&quot;</span> <span class="o">);</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">endText</span><span class="o">();</span>
+
+<span class="c1">// Make sure that the content stream is closed:</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
+
+<span class="c1">// Save the results and ensure that the document is properly closed:</span>
+<span class="n">document</span><span class="o">.</span><span class="na">save</span><span class="o">(</span> <span class="s">&quot;Hello World.pdf&quot;</span><span class="o">);</span>
+<span class="n">document</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
+</pre></div>
+
+
+<h3 id="hello-world-using-a-truetype-font">Hello World using a TrueType font</h3>
+<p>This small sample shows how to create a new document and print the text "Hello World" using a TrueType font.</p>
+<div class="codehilite"><pre><span class="c1">// Create a document and add a page to it</span>
+<span class="n">PDDocument</span> <span class="n">document</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDDocument</span><span class="o">();</span>
+<span class="n">PDPage</span> <span class="n">page</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDPage</span><span class="o">();</span>
+<span class="n">document</span><span class="o">.</span><span class="na">addPage</span><span class="o">(</span> <span class="n">page</span> <span class="o">);</span>
+
+<span class="c1">// Create a new font object by loading a TrueType font into the document</span>
+<span class="n">PDFont</span> <span class="n">font</span> <span class="o">=</span> <span class="n">PDTrueTypeFont</span><span class="o">.</span><span class="na">loadTTF</span><span class="o">(</span><span class="n">document</span><span class="o">,</span> <span class="s">&quot;Arial.ttf&quot;</span><span class="o">);</span>
+
+<span class="c1">// Start a new content stream which will &quot;hold&quot; the to be created content</span>
+<span class="n">PDPageContentStream</span> <span class="n">contentStream</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDPageContentStream</span><span class="o">(</span><span class="n">document</span><span class="o">,</span> <span class="n">page</span><span class="o">);</span>
+
+<span class="c1">// Define a text content stream using the selected font, moving the cursor and drawing the text &quot;Hello World&quot;</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">beginText</span><span class="o">();</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">setFont</span><span class="o">(</span> <span class="n">font</span><span class="o">,</span> <span class="mi">12</span> <span class="o">);</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">moveTextPositionByAmount</span><span class="o">(</span> <span class="mi">100</span><span class="o">,</span> <span class="mi">700</span> <span class="o">);</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">drawString</span><span class="o">(</span> <span class="s">&quot;Hello World&quot;</span> <span class="o">);</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">endText</span><span class="o">();</span>
+
+<span class="c1">// Make sure that the content stream is closed:</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
+
+<span class="c1">// Save the results and ensure that the document is properly closed:</span>
+<span class="n">document</span><span class="o">.</span><span class="na">save</span><span class="o">(</span> <span class="s">&quot;Hello World.pdf&quot;</span><span class="o">);</span>
+<span class="n">document</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
+</pre></div>
+
+
+<p>While it is recommended to embed all fonts for greatest portability, not all PDF producer 
+applications will do this. When displaying a PDF it is necessary to find an external font to use. 
+PDFBox will look for a mapping file to use when substituting fonts.</p>
+<p>PDFBox will load Resources/PDFBox_External_Fonts.properties off of the classpath to map font
+names to TTF font files. The UNKNOWN_FONT property in that file will tell PDFBox which font to 
+use when no mapping exists. </p>
+<h3 id="hello-world-using-a-postscript-type1-font">Hello World using a Postscript Type1 font</h3>
+<p>This small sample shows how to create a new document and print the text "Hello World" using a Postscript Type1 font.</p>
+<div class="codehilite"><pre><span class="c1">// Create a document and add a page to it</span>
+<span class="n">PDDocument</span> <span class="n">document</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDDocument</span><span class="o">();</span>
+<span class="n">PDPage</span> <span class="n">page</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDPage</span><span class="o">();</span>
+<span class="n">document</span><span class="o">.</span><span class="na">addPage</span><span class="o">(</span> <span class="n">page</span> <span class="o">);</span>
+
+<span class="c1">// Create a new font object by loading a Postscript Type 1 font into the document</span>
+<span class="n">PDFont</span> <span class="n">font</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDType1AfmPfbFont</span><span class="o">(</span><span class="n">document</span><span class="o">,</span><span class="s">&quot;cfm.afm&quot;</span><span class="o">);</span>
+
+<span class="c1">// Start a new content stream which will &quot;hold&quot; the to be created content</span>
+<span class="n">PDPageContentStream</span> <span class="n">contentStream</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDPageContentStream</span><span class="o">(</span><span class="n">document</span><span class="o">,</span> <span class="n">page</span><span class="o">);</span>
+
+<span class="c1">// Define a text content stream using the selected font, moving the cursor and drawing the text &quot;Hello World&quot;</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">beginText</span><span class="o">();</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">setFont</span><span class="o">(</span> <span class="n">font</span><span class="o">,</span> <span class="mi">12</span> <span class="o">);</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">moveTextPositionByAmount</span><span class="o">(</span> <span class="mi">100</span><span class="o">,</span> <span class="mi">700</span> <span class="o">);</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">drawString</span><span class="o">(</span> <span class="s">&quot;Hello World&quot;</span> <span class="o">);</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">endText</span><span class="o">();</span>
+
+<span class="c1">// Make sure that the content stream is closed:</span>
+<span class="n">contentStream</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
+
+<span class="c1">// Save the results and ensure that the document is properly closed:</span>
+<span class="n">document</span><span class="o">.</span><span class="na">save</span><span class="o">(</span> <span class="s">&quot;Hello World.pdf&quot;</span><span class="o">);</span>
+<span class="n">document</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
+</pre></div>
+            </div>
+        </div>
+    </div>
+
+      <footer id="copyright">
+          <div class="row-fluid">
+              <div class="span3">
+                  <!-- nothing in here on purpose -->
+              </div>
+              
+              <div class="span9">
+                  <p>Copyright © 2013 The Apache Software Foundation, Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. <br/>
+                  Apache PDFBox, PDFBox, Apache, the Apache feather logo and the Apache PDFBox project logos are trademarks of The Apache Software Foundation.</p>
+              </div>
+          </div>
+      </footer>
+      
+  </body>
+</html>

Added: websites/staging/pdfbox/trunk/content/cookbook/workingwithmetadata.html
==============================================================================
--- websites/staging/pdfbox/trunk/content/cookbook/workingwithmetadata.html (added)
+++ websites/staging/pdfbox/trunk/content/cookbook/workingwithmetadata.html Tue May  7 19:45:58 2013
@@ -0,0 +1,218 @@
+<!DOCTYPE html>
+<html lang="en">
+    
+    <!--
+     
+     Licensed to the Apache Software Foundation (ASF) under one or more
+     contributor license agreements.  See the NOTICE file distributed with
+     this work for additional information regarding copyright ownership.
+     The ASF licenses this file to You under the Apache License, Version 2.0
+     (the "License"); you may not use this file except in compliance with
+     the License.  You may obtain a copy of the License at
+     
+     http://www.apache.org/licenses/LICENSE-2.0
+     
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+     -->
+    
+  <head>
+    <title>Apache PDFBox | Cookbook - Working with Metadata</title>
+
+    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+
+    <link href="/bootstrap/css/bootstrap.css" rel="stylesheet">
+    <link href="/bootstrap/css/bootstrap-responsive.css" rel="stylesheet">
+    <link href="/FontAwesome/css/font-awesome.css" rel="stylesheet">
+    <link href="/Iconic/iconic fill/iconic_fill.css" rel="stylesheet">
+    <link href="/css/pygments-github.css" rel="stylesheet">
+    <link href="/css/site.css" rel="stylesheet">
+        
+        
+
+    
+
+    
+    
+        <!-- Twitter Bootstrap and jQuery after this line. -->
+        <script src="http://code.jquery.com/jquery-latest.js"></script>
+        <script src="/bootstrap/js/bootstrap.js"></script>
+        <script>
+            $('.nav-collapse').collapse();
+        </script>
+  </head>
+  <body>
+
+    <div class="navbar navbar-fixed-top">
+      <div class="navbar-inner">
+          <a href="index.html"><img class="logo" src="/images/logo-head.gif"></a>
+      </div>
+    </div>
+
+    <header class="main" id="overview">
+        <div class="container">
+        </div>
+    </header>
+
+    <div class="container-fluid">
+        <div class="row-fluid">
+            <div class="span3">
+                <ul class="nav nav-list">
+                    <li class="nav-header">Apache PDFBox</li>
+                    <li><a href="/downloads.html">
+                        <i class="icon-chevron-right"></i>
+                    Downloads</a></li>
+                    <li><a href="/dependencies.html">
+                        <i class="icon-chevron-right"></i>
+                    Dependencies</a></li>
+                    <li><a href="/references.html">
+                        <i class="icon-chevron-right"></i>
+                        References</a></li>
+                <li class="nav-header">Community</li>
+                <li><a href="/support.html">
+                    <i class="icon-chevron-right"></i>
+                    Support
+                </a></li>
+                <li><a href="/mailinglists.html">
+                    <i class="icon-chevron-right"></i>
+                    Mailing Lists
+                </a></li>
+                <li><a href="/team.html">
+                    <i class="icon-chevron-right"></i>
+                    Project Team</a></li>
+                <li  class="nav-header">Documentation</li>
+                <li><a href="/architecture.html">
+                    <i class="icon-chevron-right"></i>
+                    Architecture</a></li>
+                <li><a href="/commandline/">
+                    <i class="icon-chevron-right"></i>
+                    Command Line Tools</a></li>
+                <li class="dropdown"><a  class="dropdown-toggle" data-toggle="dropdown" href="#">
+                    <i class="icon-chevron-right"></i>
+                    PDFBox Cookbook <b class="caret"></b></a>
+                    <ul class="dropdown-menu">
+                        <li><a href="/cookbook/documentcreation.html">
+                            <i class="icon-chevron-right"></i>
+                            Document Creation</a>
+                        </li>
+                        <li><a href="/cookbook/textextraction.html">
+                            <i class="icon-chevron-right"></i>
+                            Text Extraction</a>
+                        </li>
+                        <li><a href="/cookbook/pdfavalidation.html">
+                            <i class="icon-chevron-right"></i>
+                            PDF/A Validation</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithfonts.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Fonts</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithmetadata.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Metadata</a>
+                        </li>
+                        <li><a href="/cookbook/workingwithattachments.html">
+                            <i class="icon-chevron-right"></i>
+                            Working with Attachments</a>
+                        </li>
+                    </ul>
+                </li>
+                <li  class="nav-header">For Developers</li>
+                <li><a href="/building.html">
+                    <i class="icon-chevron-right"></i>
+                    Building PDFBox</a></li>
+                <li><a href="/ideas.html">
+                    <i class="icon-chevron-right"></i>
+                    Ideas</a></li>
+                <li><a href="/codingconventions.html">
+                    <i class="icon-chevron-right"></i>
+                    Coding Conventions</a></li>
+                <li  class="nav-header">Apache Software Foundation</li>
+                <li><a href="http://www.apache.org/">
+                    <i class="icon-chevron-right"></i>
+                    Apache Software Foundation</a></li>
+                <li><a href="http://www.apache.org/foundation/thanks.html">
+                    <i class="icon-chevron-right"></i>
+                    ASF Sponsors</a></li>
+                <li><a href="http://www.apache.org/security/">
+                    <i class="icon-chevron-right"></i>
+                    Security</a></li>
+                </ul>
+            </div>
+            <div class="span9">
+                <h2 id="working-with-metadata">Working with Metadata</h2>
+<h3 id="introduction">Introduction</h3>
+<p>PDF documents can contain information describing the document itself or certain objects 
+within the document such as the author of the document or its creation date. 
+Basic information can be set and retrieved using the PDDocumentInformation object.</p>
+<p>In addition to that, more metadata can be retrieved using the XML metadata as described below.</p>
+<h3 id="getting-basic-metadata">Getting basic Metadata</h3>
+<p>To set or retrieve basic information about the document the PDDocumentInformation object 
+provides a high level API to that information:</p>
+<div class="codehilite"><pre><span class="n">PDDocumentInformation</span> <span class="n">info</span> <span class="o">=</span> <span class="n">document</span><span class="o">.</span><span class="na">getDocumentInformation</span><span class="o">();</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Page Count=&quot;</span> <span class="o">+</span> <span class="n">document</span><span class="o">.</span><span class="na">getNumberOfPages</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Title=&quot;</span> <span class="o">+</span> <span class="n">info</span><span class="o">.</span><span class="na">getTitle</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Author=&quot;</span> <span class="o">+</span> <span class="n">info</span><span class="o">.</span><span class="na">getAuthor</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Subject=&quot;</span> <span class="o">+</span> <span class="n">info</span><span class="o">.</span><span class="na">getSubject</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Keywords=&quot;</span> <span class="o">+</span> <span class="n">info</span><span class="o">.</span><span class="na">getKeywords</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Creator=&quot;</span> <span class="o">+</span> <span class="n">info</span><span class="o">.</span><span class="na">getCreator</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Producer=&quot;</span> <span class="o">+</span> <span class="n">info</span><span class="o">.</span><span class="na">getProducer</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Creation Date=&quot;</span> <span class="o">+</span> <span class="n">info</span><span class="o">.</span><span class="na">getCreationDate</span><span class="o">()</span> <span class="o">);</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Modification Date=&quot;</span> <span class="o">+</span> <span class="n">info</span><span class="o">.</span><span class="na">getModificationDate</span><span class="o">());</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span> <span class="s">&quot;Trapped=&quot;</span> <span class="o">+</span> <span class="n">info</span><span class="o">.</span><span class="na">getTrapped</span><span class="o">()</span> <span class="o">);</span>
+</pre></div>
+
+
+<h3 id="accessing-pdf-metadata">Accessing PDF Metadata</h3>
+<p>See class:org.apache.pdfbox.pdmodel.common.PDMetadata<br />
+See example:AddMetadataFromDocInfo<br />
+See Adobe Documentation:XMP Specification<br />
+</p>
+<p>PDF documents can have XML metadata associated with certain objects within a PDF document.
+For example, the following PD Model objects have the ability to contain metadata:</p>
+<div class="codehilite"><pre><span class="n">PDDocumentCatalog</span>
+<span class="n">PDPage</span>
+<span class="n">PDXObject</span>
+<span class="n">PDICCBased</span>
+<span class="n">PDStream</span>
+</pre></div>
+
+
+<p>The metadata that is stored in PDF objects conforms to the XMP specification; it is 
+recommended that you review that specification. Currently there is no high-level API for 
+managing the XML metadata; PDFBox uses standard Java InputStream/OutputStream to retrieve 
+or set the XML metadata.</p>
+<div class="codehilite"><pre><span class="n">PDDocument</span> <span class="n">doc</span> <span class="o">=</span> <span class="n">PDDocument</span><span class="o">.</span><span class="na">load</span><span class="o">(</span> <span class="o">...</span> <span class="o">);</span>
+<span class="n">PDDocumentCatalog</span> <span class="n">catalog</span> <span class="o">=</span> <span class="n">doc</span><span class="o">.</span><span class="na">getDocumentCatalog</span><span class="o">();</span>
+<span class="n">PDMetadata</span> <span class="n">metadata</span> <span class="o">=</span> <span class="n">catalog</span><span class="o">.</span><span class="na">getMetadata</span><span class="o">();</span>
+
+<span class="c1">//to read the XML metadata</span>
+<span class="n">InputStream</span> <span class="n">xmlInputStream</span> <span class="o">=</span> <span class="n">metadata</span><span class="o">.</span><span class="na">createInputStream</span><span class="o">();</span>
+
+<span class="c1">//or to write new XML metadata</span>
+<span class="n">InputStream</span> <span class="n">newXMPData</span> <span class="o">=</span> <span class="o">...;</span>
+<span class="n">PDMetadata</span> <span class="n">newMetadata</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PDMetadata</span><span class="o">(</span><span class="n">doc</span><span class="o">,</span> <span class="n">newXMPData</span><span class="o">,</span> <span class="kc">false</span> <span class="o">);</span>
+<span class="n">catalog</span><span class="o">.</span><span class="na">setMetadata</span><span class="o">(</span> <span class="n">newMetadata</span> <span class="o">);</span>
+</pre></div>
+            </div>
+        </div>
+    </div>
+
+      <footer id="copyright">
+          <div class="row-fluid">
+              <div class="span3">
+                  <!-- nothing in here on purpose -->
+              </div>
+              
+              <div class="span9">
+                  <p>Copyright © 2013 The Apache Software Foundation, Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. <br/>
+                  Apache PDFBox, PDFBox, Apache, the Apache feather logo and the Apache PDFBox project logos are trademarks of The Apache Software Foundation.</p>
+              </div>
+          </div>
+      </footer>
+      
+  </body>
+</html>



Mime
View raw message