Added: poi/site/publish/slideshow/how-to-shapes.html URL: http://svn.apache.org/viewvc/poi/site/publish/slideshow/how-to-shapes.html?rev=1423805&view=auto ============================================================================== --- poi/site/publish/slideshow/how-to-shapes.html (added) +++ poi/site/publish/slideshow/how-to-shapes.html Wed Dec 19 09:27:20 2012 @@ -0,0 +1,1015 @@ + + + + + + + + + +Busy Developers' Guide to HSLF drawing layer + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + +
+ Search Apache POI
+
+
+
+
+
+
+

Busy Developers' Guide to HSLF drawing layer

+
+
+ + + + +
+

Busy Developers' Guide to HSLF drawing layer

+
+ + +
+

Index of Features

+
+ + + + + +
+

Features

+
+ + + +
+

New Presentation

+
+ +
+    //create a new empty slide show
+    SlideShow ppt = new SlideShow();
+
+    //add first slide
+    Slide s1 = ppt.createSlide();
+
+    //add second slide
+    Slide s2 = ppt.createSlide();
+    
+    //save changes in a file
+    FileOutputStream out = new FileOutputStream("slideshow.ppt");
+    ppt.write(out);
+    out.close();
+                 
+ + + + +
+

How to retrieve or change slide size

+
+ +
+    SlideShow ppt = new SlideShow(new HSLFSlideShow("slideshow.ppt"));
+    //retrieve page size. Coordinates are expressed in points (72 dpi)
+    java.awt.Dimension pgsize = ppt.getPageSize();
+    int pgx = pgsize.width; //slide width
+    int pgy = pgsize.height; //slide height
+
+    //set new page size
+    ppt.setPageSize(new java.awt.Dimension(1024, 768));
+    //save changes 
+    FileOutputStream out = new FileOutputStream("slideshow.ppt");
+    ppt.write(out);
+    out.close();
+                  
+ + + + +
+

How to get shapes contained in a particular slide

+
+ +

+ The following code demonstrates how to iterate over shapes for each slide. +

+ +
+  SlideShow ppt = new SlideShow(new HSLFSlideShow("slideshow.ppt"));
+  //get slides 
+  Slide[] slide = ppt.getSlides();
+  for (int i = 0; i < slide.length; i++){
+    Shape[] sh = slide[i].getShapes();
+    for (int j = 0; j < sh.length; j++){
+      //name of the shape
+      String name = sh[j].getShapeName();
+
+      //shape's anchor which defines the position of this shape in the slide
+      java.awt.Rectangle anchor = sh[j].getAnchor();
+
+      if (sh[j] instanceof Line){
+        Line line = (Line)sh[j];
+        //work with Line
+      } else if (sh[j] instanceof AutoShape){
+        AutoShape shape = (AutoShape)sh[j];
+        //work with AutoShape
+      } else if (sh[j] instanceof TextBox){
+        TextBox shape = (TextBox)sh[j];
+        //work with TextBox
+      } else if (sh[j] instanceof Picture){
+        Picture shape = (Picture)sh[j];
+        //work with Picture
+      }
+    }
+  }
+                  
+ + + + +
+

Drawing a shape on a slide

+
+ +
+
Warning
+
+ To work with graphic objects HSLF uses Java2D classes + that may throw exceptions if a graphical environment is not available. If a graphical environment + is not available, you must tell Java that you are running in headless mode and + set the following system property: java.awt.headless=true + (either via the -Djava.awt.headless=true startup parameter or via System.setProperty("java.awt.headless", "true")). +
+
+ +

+ When you add a shape, you usually specify the dimensions of the shape and the position + of the upper left corner of the bounding box for the shape relative to the upper left + corner of the slide. Distances in the drawing layer are measured in points (72 points = 1 inch). +

+ +
+  SlideShow ppt = new SlideShow();
+
+  Slide slide = ppt.createSlide();
+
+  //Line shape
+  Line line = new Line();
+  line.setAnchor(new java.awt.Rectangle(50, 50, 100, 20));
+  line.setLineColor(new Color(0, 128, 0));
+  line.setLineStyle(Line.LINE_DOUBLE);
+  slide.addShape(line);
+
+  //TextBox
+  TextBox txt = new TextBox();
+  txt.setText("Hello, World!");
+  txt.setAnchor(new java.awt.Rectangle(300, 100, 300, 50));
+
+  //use RichTextRun to work with the text format
+  RichTextRun rt = txt.getTextRun().getRichTextRuns()[0];
+  rt.setFontSize(32);
+  rt.setFontName("Arial");
+  rt.setBold(true);
+  rt.setItalic(true);
+  rt.setUnderlined(true);
+  rt.setFontColor(Color.red);
+  rt.setAlignment(TextBox.AlignRight);
+
+  slide.addShape(txt);
+
+  //Autoshape
+  //32-point star
+  AutoShape sh1 = new AutoShape(ShapeTypes.Star32);
+  sh1.setAnchor(new java.awt.Rectangle(50, 50, 100, 200));
+  sh1.setFillColor(Color.red);
+  slide.addShape(sh1);
+
+  //Trapezoid
+  AutoShape sh2 = new AutoShape(ShapeTypes.Trapezoid);
+  sh2.setAnchor(new java.awt.Rectangle(150, 150, 100, 200));
+  sh2.setFillColor(Color.blue);
+  slide.addShape(sh2);
+
+  FileOutputStream out = new FileOutputStream("slideshow.ppt");
+  ppt.write(out);
+  out.close();
+                    
+                  
+ + + + +
+

How to work with pictures

+
+ + +

+ Currently, HSLF API supports the following types of pictures: +

+ +
    + +
  • Windows Metafiles (WMF)
  • + +
  • Enhanced Metafiles (EMF)
  • + +
  • JPEG Interchange Format
  • + +
  • Portable Network Graphics (PNG)
  • + +
  • Macintosh PICT
  • + +
+ + +
+  SlideShow ppt = new SlideShow(new HSLFSlideShow("slideshow.ppt"));
+
+  //extract all pictures contained in the presentation
+  PictureData[] pdata = ppt.getPictureData();
+  for (int i = 0; i < pdata.length; i++){
+    PictureData pict = pdata[i];
+
+    // picture data
+    byte[] data = pict.getData();
+
+    int type = pict.getType();
+    String ext;
+    switch (type){
+      case Picture.JPEG: ext=".jpg"; break;
+      case Picture.PNG: ext=".png"; break;
+      case Picture.WMF: ext=".wmf"; break;
+      case Picture.EMF: ext=".emf"; break;
+      case Picture.PICT: ext=".pict"; break;
+      default: continue;
+    }
+    FileOutputStream out = new FileOutputStream("pict_"+i + ext);
+      out.write(data);
+      out.close();
+
+  }
+
+  // add a new picture to this slideshow and insert it in a  new slide
+  int idx = ppt.addPicture(new File("clock.jpg"), Picture.JPEG);
+
+  Picture pict = new Picture(idx);
+
+  //set image position in the slide
+  pict.setAnchor(new java.awt.Rectangle(100, 100, 300, 200));
+
+  Slide slide = ppt.createSlide();
+  slide.addShape(pict);
+
+  //now retrieve pictures contained in the first slide and save them on disk
+  slide = ppt.getSlides()[0];
+  Shape[] sh = slide.getShapes();
+  for (int i = 0; i < sh.length; i++){
+    if (sh[i] instanceof Picture){
+      Picture pict = (Picture)sh[i];
+      PictureData pictData = pict.getPictureData();
+      byte[] data = pictData.getData();
+      int type = pictData.getType();
+      if (type == Picture.JPEG){
+        FileOutputStream out = new FileOutputStream("slide0_"+i+".jpg");
+        out.write(data);
+        out.close();
+      } else if (type == Picture.PNG){
+        FileOutputStream out = new FileOutputStream("slide0_"+i+".png");
+        out.write(data);
+        out.close();
+      }
+    }
+  }
+
+  FileOutputStream out = new FileOutputStream("slideshow.ppt");
+  ppt.write(out);
+  out.close();
+
+                    
+ + + + +
+

How to set slide title

+
+ +
+    SlideShow ppt = new SlideShow();
+    Slide slide = ppt.createSlide();
+    TextBox title = slide.addTitle();
+    title.setText("Hello, World!");
+    
+    //save changes 
+    FileOutputStream out = new FileOutputStream("slideshow.ppt");
+    ppt.write(out);
+    out.close();
+                  
+ +

+ Below is the equivalent code in PowerPoint VBA: +

+ +
+    Set myDocument = ActivePresentation.Slides(1)
+    myDocument.Shapes.AddTitle.TextFrame.TextRange.Text = "Hello, World!"
+                  
+ + + + +
+

How to modify background of a slide master

+
+ +
+        SlideShow ppt = new SlideShow();
+        SlideMaster master = ppt.getSlidesMasters()[0];
+
+        Fill fill = master.getBackground().getFill();
+        int idx = ppt.addPicture(new File("background.png"), Picture.PNG);
+        fill.setFillType(Fill.FILL_PICTURE);
+        fill.setPictureData(idx);
+                  
+ + + +
+

How to modify background of a slide

+
+ +
+        SlideShow ppt = new SlideShow();
+        Slide slide = ppt.createSlide();
+        
+        //This slide has its own background. 
+        //Without this line it will use master's background.
+        slide.setFollowMasterBackground(false);
+        Fill fill = slide.getBackground().getFill();
+        int idx = ppt.addPicture(new File("background.png"), Picture.PNG);
+        fill.setFillType(Fill.FILL_PATTERN);
+        fill.setPictureData(idx);
+                  
+ + + +
+

How to modify background of a shape

+
+ +
+        SlideShow ppt = new SlideShow();
+        Slide slide = ppt.createSlide();
+        
+        Shape shape = new AutoShape(ShapeTypes.Rectangle);
+        shape.setAnchor(new java.awt.Rectangle(100, 100, 200, 200));
+        Fill fill = shape.getFill();
+        fill.setFillType(Fill.FILL_SHADE);
+        fill.setBackgroundColor(Color.red);
+        fill.setForegroundColor(Color.green);
+        
+        slide.addShape(shape);
+                  
+ + + + +
+

How to create bulleted lists

+
+ +
+  SlideShow ppt = new SlideShow();
+
+  Slide slide = ppt.createSlide();
+
+  TextBox shape = new TextBox();
+  RichTextRun rt = shape.getTextRun().getRichTextRuns()[0];
+  shape.setText(
+          "January\r" +
+          "February\r" +
+          "March\r" +
+          "April");
+  rt.setFontSize(42);
+  rt.setBullet(true);
+  rt.setBulletOffset(0);  //bullet offset
+  rt.setTextOffset(50);   //text offset (should be greater than bullet offset)
+  rt.setBulletChar('\u263A'); //bullet character
+  slide.addShape(shape);
+
+  shape.setAnchor(new java.awt.Rectangle(50, 50, 500, 300));  //position of the text box in the slide
+  slide.addShape(shape);
+
+  FileOutputStream out = new FileOutputStream("bullets.ppt");
+  ppt.write(out);
+  out.close();
+                
+ + + + +
+

How to read hyperlinks from a slide show

+
+ +
+    FileInputStream is = new FileInputStream("slideshow.ppt");
+    SlideShow ppt = new SlideShow(is);
+    is.close();
+
+    Slide[] slide = ppt.getSlides();
+    for (int j = 0; j < slide.length; j++) {
+
+        //read hyperlinks from the text runs
+        TextRun[] txt = slide[j].getTextRuns();
+        for (int k = 0; k < txt.length; k++) {
+            String text = txt[k].getText();
+            Hyperlink[] links = txt[k].getHyperlinks();
+            if(links != null) for (int l = 0; l < links.length; l++) {
+                Hyperlink link = links[l];
+                String title = link.getTitle();
+                String address = link.getAddress();
+                String substring = text.substring(link.getStartIndex(), link.getEndIndex()-1); //in ppt end index is inclusive
+            }
+        }
+
+        //in PowerPoint you can assign a hyperlink to a shape without text,
+        //for example to a Line object. The code below demonstrates how to
+        //read such hyperlinks
+        Shape[] sh = slide[j].getShapes();
+        for (int k = 0; k < sh.length; k++) {
+            Hyperlink link = sh[k].getHyperlink();
+            if(link != null)  {
+                String title = link.getTitle();
+                String address = link.getAddress();
+            }
+        }
+    }
+                
+ + + + +
+

How to create tables

+
+ +
+      //table data              
+      String[][] data = {
+          {"INPUT FILE", "NUMBER OF RECORDS"},
+          {"Item File", "11,559"},
+          {"Vendor File", "300"},
+          {"Purchase History File", "10,000"},
+          {"Total # of requisitions", "10,200,038"}
+      };
+
+      SlideShow ppt = new SlideShow();
+
+      Slide slide = ppt.createSlide();
+      //create a table of 5 rows and 2 columns
+      Table table = new Table(5, 2);
+      for (int i = 0; i < data.length; i++) {
+          for (int j = 0; j < data[i].length; j++) {
+              TableCell cell = table.getCell(i, j);
+              cell.setText(data[i][j]);
+
+              RichTextRun rt = cell.getTextRun().getRichTextRuns()[0];
+              rt.setFontName("Arial");
+              rt.setFontSize(10);
+
+              cell.setVerticalAlignment(TextBox.AnchorMiddle);
+              cell.setHorizontalAlignment(TextBox.AlignCenter);
+          }
+      }
+
+      //set table borders
+      Line border = table.createBorder();
+      border.setLineColor(Color.black);
+      border.setLineWidth(1.0);
+      table.setAllBorders(border);
+
+      //set width of the 1st column
+      table.setColumnWidth(0, 300);
+      //set width of the 2nd column
+      table.setColumnWidth(1, 150);
+
+      slide.addShape(table);
+      table.moveTo(100, 100);
+
+      FileOutputStream out = new FileOutputStream("hslf-table.ppt");
+      ppt.write(out);
+      out.close();
+    
+                    
+ + + + + +
+

How to remove shapes from a slide

+
+ +
+
+        Shape[] shape = slide.getShapes();
+        for (int i = 0; i < shape.length; i++) {
+    
+            //remove the shape
+            boolean ok = slide.removeShape(shape[i]);
+            if(ok){
+              //the shape was removed. Do something.
+            }
+        }
+                    
+ + + + +
+

How to retrieve embedded OLE objects

+
+ +
+
+        Shape[] shape = slide.getShapes();
+        for (int i = 0; i < shape.length; i++) {
+            if (shape[i] instanceof OLEShape) {
+                OLEShape ole = (OLEShape) shape[i];
+                ObjectData data = ole.getObjectData();
+                String name = ole.getInstanceName();
+                if ("Worksheet".equals(name)) {
+                    HSSFWorkbook wb = new HSSFWorkbook(data.getData());
+                } else if ("Document".equals(name)) {
+                    HWPFDocument doc = new HWPFDocument(data.getData());
+                }
+            }
+        }
+                    
+ + + + + +
+

How to retrieve embedded sounds

+
+ +
+
+        FileInputStream is = new FileInputStream(args[0]);
+        SlideShow ppt = new SlideShow(is);
+        is.close();
+
+        SoundData[] sound = ppt.getSoundData();
+        for (int i = 0; i < sound.length; i++) {
+            //save *WAV sounds on disk
+            if(sound[i].getSoundType().equals(".WAV")){
+                FileOutputStream out = new FileOutputStream(sound[i].getSoundName());
+                out.write(sound[i].getData());
+                out.close();
+            }
+        }
+                    
+ + + + + +
+

How to create shapes of arbitrary geometry

+
+ +
+
+        SlideShow ppt = new SlideShow();
+        Slide slide = ppt.createSlide();
+
+        java.awt.geom.GeneralPath path = new java.awt.geom.GeneralPath();
+        path.moveTo(100, 100);
+        path.lineTo(200, 100);
+        path.curveTo(50, 45, 134, 22, 78, 133);
+        path.curveTo(10, 45, 134, 56, 78, 100);
+        path.lineTo(100, 200);
+        path.closePath();
+        
+        Freeform shape = new Freeform();
+        shape.setPath(path);
+        slide.addShape(shape);
+                    
+ + + + + +
+

How to draw into a slide using Graphics2D

+
+ +
+
Warning
+
+ The current implementation of the PowerPoint Graphics2D driver is not fully compliant with the java.awt.Graphics2D specification. + Some features, such as clipping and drawing of images, are not yet supported. +
+
+ +
+        SlideShow ppt = new SlideShow();
+        Slide slide = ppt.createSlide();
+
+        //draw a simple bar graph
+        //bar chart data. The first value is the bar color, the second is the width
+        Object[] def = new Object[]{
+            Color.yellow, new Integer(100),
+            Color.green, new Integer(150),
+            Color.gray, new Integer(75),
+            Color.red, new Integer(200),
+        };
+
+        //all objects are drawn into a shape group so we need to create one
+
+        ShapeGroup group = new ShapeGroup();
+        //define position of the drawing in the slide
+        Rectangle bounds = new java.awt.Rectangle(200, 100, 350, 300);
+        //if you want to draw in the entire slide area then define the anchor as follows:
+        //Dimension pgsize = ppt.getPageSize();
+        //java.awt.Rectangle bounds = new java.awt.Rectangle(0, 0, pgsize.width, pgsize.height);
+
+        group.setAnchor(bounds);
+        slide.addShape(group);
+
+        //draw a simple bar chart
+        Graphics2D graphics = new PPGraphics2D(group);
+        int x = bounds.x + 50, y = bounds.y + 50;
+        graphics.setFont(new Font("Arial", Font.BOLD, 10));
+        for (int i = 0, idx = 1; i < def.length; i+=2, idx++) {
+            graphics.setColor(Color.black);
+            int width = ((Integer)def[i+1]).intValue();
+            graphics.drawString("Q" + idx, x-20, y+20);
+            graphics.drawString(width + "%", x + width + 10, y + 20);
+            graphics.setColor((Color)def[i]);
+            graphics.fill(new Rectangle(x, y, width, 30));
+            y += 40;
+        }
+        graphics.setColor(Color.black);
+        graphics.setFont(new Font("Arial", Font.BOLD, 14));
+        graphics.draw(bounds);
+        graphics.drawString("Performance", x + 70, y + 40);
+
+        FileOutputStream out = new FileOutputStream("hslf-graphics2d.ppt");
+        ppt.write(out);
+        out.close();
+
+                   
+ + + + + +
+

Export PowerPoint slides into java.awt.Graphics2D

+
+ +

+ HSLF provides a way to export slides into images. You can capture slides into a java.awt.Graphics2D object (or any other) + and serialize it into a PNG or JPEG format. Please note, although HSLF attempts to render slides as close to PowerPoint as possible, + the output may look different from PowerPoint for the following reasons: +

+ +
    + +
  • Java2D renders fonts differently vs PowerPoint. There are always some differences in the way the font glyphs are painted
  • + +
  • HSLF uses java.awt.font.LineBreakMeasurer to break text into lines. PowerPoint may do it in a different way.
  • + +
  • If a font from the presentation is not available, then the JDK default font will be used.
  • + +
+ +

+ Current Limitations: +

+ +
    + +
  • Some types of shapes are not yet supported (WordArt, complex auto-shapes)
  • + +
  • Only Bitmap images (PNG, JPEG, DIB) can be rendered in Java
  • + +
+ +
+        FileInputStream is = new FileInputStream("slideshow.ppt");
+        SlideShow ppt = new SlideShow(is);
+        is.close();
+        
+        Dimension pgsize = ppt.getPageSize();
+
+        Slide[] slide = ppt.getSlides();
+        for (int i = 0; i < slide.length; i++) {
+
+            BufferedImage img = new BufferedImage(pgsize.width, pgsize.height, BufferedImage.TYPE_INT_RGB);
+            Graphics2D graphics = img.createGraphics();
+            //clear the drawing area
+            graphics.setPaint(Color.white);
+            graphics.fill(new Rectangle2D.Float(0, 0, pgsize.width, pgsize.height));
+
+            //render
+            slide[i].draw(graphics);
+
+            //save the output
+            FileOutputStream out = new FileOutputStream("slide-"  + (i+1) + ".png");
+            javax.imageio.ImageIO.write(img, "png", out);
+            out.close();
+        }
+
+                  
+ + + + + + +
+

How to extract Headers / Footers from an existing presentation

+
+ +
+
+          FileInputStream is = new FileInputStream("slideshow.ppt");
+          SlideShow ppt = new SlideShow(is);
+          is.close();
+          Slide[] slides = ppt.getSlides();
+
+          //presentation-scope headers / footers
+          HeadersFooters hdd = ppt.getSlideHeadersFooters();
+          if(hdd.isFooterVisible()) {
+              String footerText = hdd.getFooterText();
+          }
+
+          //per-slide headers / footers
+          for (int i=0; i < slides.length; i++){
+              HeadersFooters hdd2 = slides[i].getHeadersFooters();
+              if(hdd2.isFooterVisible()) {
+                  String footerText = hdd2.getFooterText();
+              }
+              if(hdd2.isUserDateVisible()) {
+                 String customDate = hdd2.getDateTimeText();
+              }
+              if(hdd2.isSlideNumberVisible()){
+                  int slideNUm = slides[i].getSlideNumber();
+              }
+
+          }
+                
+ + + +
+

How to set Headers / Footers

+
+ +
+
+          SlideShow ppt = new SlideShow();
+
+          //presentation-scope headers / footers
+          HeadersFooters hdd = ppt.getSlideHeadersFooters();
+          hdd.setSlideNumberVisible(true);
+          hdd.setFootersText("Created by POI-HSLF");
+                
+ + + + +
by Yegor Kozlov
+
+
+
+
+ + + + + + Propchange: poi/site/publish/slideshow/how-to-shapes.html ------------------------------------------------------------------------------ svn:executable = * Added: poi/site/publish/slideshow/index.html URL: http://svn.apache.org/viewvc/poi/site/publish/slideshow/index.html?rev=1423805&view=auto ============================================================================== --- poi/site/publish/slideshow/index.html (added) +++ poi/site/publish/slideshow/index.html Wed Dec 19 09:27:20 2012 @@ -0,0 +1,235 @@ + + + + + + + + + +POI-HSLF and and POI-XLSF - Java API To Access Microsoft Powerpoint Format Files + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + +
+ Search Apache POI
+
+
+
+
+
+
+

POI-HSLF and POI-XSLF - Java API To Access Microsoft Powerpoint Format Files

+
+
+ + + + + +
+

POI-HSLF

+
+ + + +

HSLF is the POI Project's pure Java implementation of the Powerpoint '97(-2007) file format.

+ +

HSLF provides a way to read, create or modify PowerPoint presentations. In particular, it provides: +

+ +
    + +
  • api for data extraction (text, pictures, embedded objects, sounds)
  • + +
  • usermodel api for creating, reading and modifying ppt files
  • + +
+ +
+
Note
+
+ This code currently lives in the + scratchpad area + of the POI SVN repository. + Ensure that you have the scratchpad jar or the scratchpad + build area in your classpath before experimenting with + this code - the main POI jar is not enough. +
+
+ +

The quick guide documentation provides + information on using this API. Comments and fixes gratefully accepted on the POI + dev mailing lists.

+ + + +
+

POI-XSLF

+
+ + +

+ XSLF is the POI Project's pure Java implementation of the PowerPoint 2007 OOXML (.pptx) file format. + Whilst HSLF and XSLF provide similar features, there is not a common interface across the two of them at this time. +

+ +

+ Please note that XSLF is still in early development and is subject to incompatible changes in the future. +

+ +

+ A quick guide is available in the XSLF Cookbook + +

+ + + +
by Avik Sengupta, Nick Burch, Yegor Kozlov
+
+
+
+
+ + + + + + Propchange: poi/site/publish/slideshow/index.html ------------------------------------------------------------------------------ svn:executable = * Added: poi/site/publish/slideshow/ppt-file-format.html URL: http://svn.apache.org/viewvc/poi/site/publish/slideshow/ppt-file-format.html?rev=1423805&view=auto ============================================================================== --- poi/site/publish/slideshow/ppt-file-format.html (added) +++ poi/site/publish/slideshow/ppt-file-format.html Wed Dec 19 09:27:20 2012 @@ -0,0 +1,579 @@ + + + + + + + + + +POI-HSLF - A Guide to the PowerPoint File Format + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + +
+ Search Apache POI
+
+
+
+
+
+
+

POI-HSLF - A Guide to the PowerPoint File Format

+
+
+ + + + + +
+

Records, Containers and Atoms

+
+ +

+ PowerPoint documents are made up of a tree of records. A record may + contain either other records (in which case it is a Container), + or data (in which case it's an Atom). A record can't hold both. +

+ +

+ PowerPoint documents don't have one overall container record. Instead, + there are a number of different container records to be found at + the top level. +

+ +

+ Any numbers or strings stored in the records are always stored in + Little Endian format (least important bytes first). This is the case + no matter what platform the file was written on - be that a + Little Endian or a Big Endian system. +

+ +

+ PowerPoint may have Escher (DDF) records embedded in it. These + are always held as the children of a PPDrawing record (record + type 1036). Escher records have the same format as PowerPoint + records. +

+ + + + +
+

Record Headers

+
+ +

+ All records, be they containers or atoms, have the same standard + 8 byte header. It is: +

+ +
    +
  • 1/2 byte container flag
  • + +
  • 1.5 byte option field
  • + +
  • 2 byte record type
  • + +
  • 4 byte record length
  • +
+ +

+ If the first byte of the header, BINARY_AND with 0x0f, is 0x0f, + then the record is a container. Otherwise, it's an atom. The rest + of the first two bytes are used to store the "options" for the + record. Most commonly, this is used to indicate the version of + the record, but the exact usage is record specific. +

+ +

+ The record type is a little endian number, which tells you what + kind of record you're dealing with. Each different kind of record + has its own value that gets stored here. PowerPoint records have + a type that's normally less than 6000 (decimal). Escher records + normally have a type between 0xF000 and 0xF1FF. +

+ +

+ The record length is another little endian number. For an atom, + it's the size of the data part of the record, i.e. the length + of the record less its 8 byte record header. For a + container, it's the size of all the records that are children of + this record. That means that the size of a container record is the + length, plus 8 bytes for its record header. +

+ + + + +
+

CurrentUserAtom, UserEditAtom and PersistPtrIncrementalBlock

+
+ +

+aka Records that care about the byte level position of other records +

+ +

+ A small number of records contain byte level position offsets to other + records. If you change the position of any records in the file, then + there's a good chance that you will need to update some of these + special records. +

+ +

+ First up, CurrentUserAtom. This is actually stored in a different + OLE2 (POIFS) stream to the main PowerPoint document. It contains + a few bits of information on who last edited the file. Most + importantly, at byte 8 of its contents, it stores (as a 32 bit + little endian number) the offset in the main stream to the most + recent UserEditAtom. +

+ +

+ The UserEditAtom contains two byte level offsets (again as 32 bit + little endian numbers). At byte 12 is the offset to the + PersistPtrIncrementalBlock associated with this UserEditAtom + (each UserEditAtom has one and only one PersistPtrIncrementalBlock). + At byte 8, there's the offset to the previous UserEditAtom. If this + is 0, then you're at the first one. +

+ +

+ Every time you do a non full save in PowerPoint, it tacks on another + UserEditAtom and another PersistPtrIncrementalBlock. The + CurrentUserAtom is updated to point to this new UserEditAtom, and the + new UserEditAtom points back to the previous UserEditAtom. You then + end up with a chain, starting from the CurrentUserAtom, linking + back through all the UserEditAtoms, until you reach the first one + from a full save. +

+ +
+/-------------------------------\
+| CurrentUserAtom (own stream)  |
+|   OffsetToCurrentEdit = 10562 |==\
+\-------------------------------/  |
+                                   |
+/==================================/
+|                                         /-----------------------------------\
+|                                         | PersistPtrIncrementalBlock @ 6144 |
+|                                         \-----------------------------------/
+|  /---------------------------------\                  |
+|  | UserEditAtom @ 6176             |                  |
+|  |   LastUserEditAtomOffset = 0    |                  |
+|  |   PersistPointersOffset =  6144 |==================/
+|  \---------------------------------/
+|                 |                       /-----------------------------------\
+|                 \====================\  | PersistPtrIncrementalBlock @ 8646 |
+|                                      |  \-----------------------------------/
+|  /---------------------------------\ |                |
+|  | UserEditAtom @ 8674             | |                |
+|  |   LastUserEditAtomOffset = 6176 |=/                |
+|  |   PersistPointersOffset =  8646 |==================/
+|  \---------------------------------/
+|                 |                       /------------------------------------\
+|                 \====================\  | PersistPtrIncrementalBlock @ 10538 |
+|                                      |  \------------------------------------/
+|  /---------------------------------\ |                |
+\==| UserEditAtom @ 10562            | |                |
+   |   LastUserEditAtomOffset = 8674 |=/                |
+   |   PersistPointersOffset = 10538 |==================/
+   \---------------------------------/
+
+ +

+ The PersistPtrIncrementalBlock contains byte offsets to all the + Slides, Notes, Documents and MasterSlides in the file. The first + PersistPtrIncrementalBlock will point to all the ones that + were present the first time the file was saved. Subsequent + PersistPtrIncrementalBlocks will contain pointers to all the ones + that were changed in that edit. To find the offset to a given + sheet in the latest version, start with the most recent + PersistPtrIncrementalBlock. If this knows about the sheet, use the + offset it has. If it doesn't, then work back through older + PersistPtrIncrementalBlocks until you find one which does, and + use that. +

+ +

+ Each PersistPtrIncrementalBlock can contain a number of entry + blocks. Each block holds information on a sequence of sheets. + Each block starts with a 32 bit little endian integer. Once read + into memory, the lower 20 bits contain the starting number for the + sequence of sheets to be described. The higher 12 bits contain + the count of the number of sheets described. Following that is + one 32 bit little endian integer for each sheet in the sequence, + the value being the offset to that sheet. If there is any data + left after parsing a block, then it corresponds to the next block. +

+ +
+hex on disk      decimal        description
+-----------      -------        -----------
+0000             0              No options
+7217             6002           Record type is 6002
+2000 0000        32             Length of data is 32 bytes
+0100 5000        5242881        Count is 5 (12 highest bits)
+                                Starting number is 1 (20 lowest bits)
+0000 0000        0              Sheet (1+0)=1 starts at offset 0
+900D 0000        3472           Sheet (1+1)=2 starts at offset 3472
+E403 0000        996            Sheet (1+2)=3 starts at offset 996
+9213 0000        5010           Sheet (1+3)=4 starts at offset 5010
+BE15 0000        5566           Sheet (1+4)=5 starts at offset 5566
+0900 1000        1048585        Count is 1 (12 highest bits)
+                                Starting number is 9 (20 lowest bits)
+4418 0000        6212           Sheet (9+0)=9 starts at offset 6212
+
+ + + + +
+

Paragraph and Text Styling

+
+ +

+ There are quite a number of records that affect the styling + of text, and a smaller number that are responsible for the + styling of paragraphs. +

+ +

+ By default, a given set of text will inherit paragraph and text + stylings from the appropriate master sheet. If anything differs + from the master sheet, then appropriate styling records will + follow the text record. +

+ +

+ +(We don't currently know enough about master sheet styling + to write about it) + +

+ +

+ Normally, PowerPoint will have one text record (TextBytesAtom + or TextCharsAtom) for every paragraph, with a preceding + TextHeaderAtom to describe what sort of paragraph it is. + If any of the stylings differ from the master's, then a + StyleTextPropAtom will follow the text record. This contains + the paragraph style information, and the styling information + for each section of the text which has a different style. + (More on StyleTextPropAtom later) +

+ +

+ For every font used, a FontEntityAtom must exist for that font. + The FontEntityAtoms live inside a FontCollection record, and + there's one of those inside the Environment record inside the + Document record. (More on Fonts to be discovered) + +

+ + + + +
+

StyleTextPropAtom

+
+ +

+ If the text or paragraph stylings for a given text record + differ from those of the appropriate master, then there will + be one of these records. +

+ +

+ This record is made up of two lists of lists. Firstly, + there's a list of paragraph stylings - each made up of the + number of characters it applies to, followed by the matching + styling elements. Following that is the equivalent for + character stylings. +

+ +

+ Each styling list (in either list) starts with the number + of characters it applies to, stored in a 4 byte little + endian number. If it is a paragraph styling, it will be + followed by a 2 byte number (of unknown use). After this is + a four byte number, which is a mask indicating which stylings + will follow. You then have an entry for each of the stylings + indicated in the mask. Finally, you move onto the next set + of stylings. +

+ +

+ Each styling has a specific mask flag to indicate its + presence. (The list may be found towards the top of + org.apache.poi.hslf.record.StyleTextPropAtom.java, and is + too long to sensibly include here). Each styling entry + will occur in the order of its mask value (so one with mask + 1 will come first, followed by the next highest mask value). + Depending on the styling, it is either made up of a 2 byte + or 4 byte numeric value. The meaning of the value will + depend on the styling (eg for font.size, it is the font + size in points). +

+ +

+ Some stylings are actually mask stylings. For these, the + value will be a 4 byte number. This is then processed as + a mask, to indicate a number of different sub-stylings. + The styling for bold/italic/underline is one such example. +

+ +
+hex on disk      decimal        description
+-----------      -------        -----------
+
+0000             0              No options
+A10F             4001           Record type is 4001
+8000 0000        128            Length of data is 128 bytes
+1E00 0000        30             The paragraph styling applies to 30 characters
+0000             0              Paragraph options are 0
+0018 0000        6144           0x0800=Text Alignment, 0x1000=Line Spacing
+0000             0              Text Alignment = Left
+5000             80             Line Spacing = 80
+
+1C00 0000        28             The paragraph styling applies to 28 characters
+0000             0              Paragraph options are 0
+0010 0000        4096           0x1000=Line Spacing
+5000             80             Line Spacing = 80
+
+1900 0000        25             The paragraph styling applies to 25 characters
+0000             0              Paragraph options are 0
+0018 0000        6144           0x0800=Text Alignment, 0x1000=Line Spacing
+0200             2              Text Alignment = Right
+5000             80             Line Spacing = 80
+
+6100 0000        97             The paragraph styling applies to 97 characters
+                                (includes final CR)
+0000             0              Paragraph options are 0
+0018 0000        6144           0x0800=Text Alignment, 0x1000=Line Spacing
+0000             0              Text Alignment = Left
+5000             80             Line Spacing = 80
+
+1E00 0000        30             The character styling applies to 30 characters
+0100 0200        131073         0x0001=Char Props Mask, 0x20000=Font Size
+0100             1              Char Props 0x0001=Bold
+1400             20             Font Size = 20
+
+1C00 0000        28             The character styling applies to 28 characters
+0200 0600        393218         0x0002=Char Props Mask, 0x20000=Font Size,  0x40000=Font Color
+0200             2              Char Props 0x0002=Italic
+1400             20             Font Size = 20
+0000 0005        83886080       Blue
+ 
+1900 0000        25             The character styling applies to 25 characters
+0000 0600        393216         0x20000=Font Size,  0x40000=Font Color
+1400             20             Font Size = 20
+FF33 00FE        4261426175     Red
+
+6000 0000        96             The character styling applies to 96 characters
+0400 0300        196612         0x0004=Char Props Mask, 0x10000=Font Index, 0x20000=Font Size
+0400             4              Char Props 0x0004=Underlined
+0100             1              Font Index = 1 (2nd Font in table)
+1800             24             Font Size = 24
+
+ + + + +
+

Fonts in PowerPoint

+
+ +

+ PowerPoint stores information about the fonts used in FontEntityAtoms, + which live inside Document.Environment.FontCollection. For every different + font used, a FontEntityAtom must exist for that font. There is always at + least one FontEntityAtom in Document.Environment.FontCollection, + which describes the default font. +

+ + + + +
+

FontEntityAtom

+
+ +

+ The instance field of the record header contains the zero based index of the + font. Font index entries in StyleTextPropAtoms will refer to their required + font via this index. +

+ +

+ The length of FontEntityAtoms is always 68 bytes. The first 64 bytes of + it hold the typeface name of the font to be used. This is stored as + a null-terminated string, and encoded as little endian unicode. (The + length of the string must not exceed 32 characters including the null + termination, so the typeface name cannot exceed 31 characters). +

+ + +

+ After the typeface name there are 4 bytes of bitmask flags. The details of these + can be found in the Windows API, under the LOGFONT structure. + The 65th byte is the output precision, which defines how closely the system chosen + font must match the requested font, in terms of height, width, pitch etc. + The 66th byte is the clipping precision, which defines how to clip characters + that occur partly outside the clipping region. + The 67th byte is the output quality, which defines how closely the system + must match the logical font's attributes to those of the physical font used. + The 68th (and final) byte is the pitch and family, which is used by the + system when matching fonts. +

+ + + +
by Nick Burch, Yegor Kozlov
+
+
+
+
+ + + + + + Propchange: poi/site/publish/slideshow/ppt-file-format.html ------------------------------------------------------------------------------ svn:executable = * Added: poi/site/publish/slideshow/quick-guide.html URL: http://svn.apache.org/viewvc/poi/site/publish/slideshow/quick-guide.html?rev=1423805&view=auto ============================================================================== --- poi/site/publish/slideshow/quick-guide.html (added) +++ poi/site/publish/slideshow/quick-guide.html Wed Dec 19 09:27:20 2012 @@ -0,0 +1,337 @@ + + + + + + + + + +POI-HSLF - A Quick Guide + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + +
+ Search Apache POI
+
+
+
+
+
+
+

POI-HSLF - A Quick Guide

+
+
+ + + + + +
+

Basic Text Extraction

+
+ +

For basic text extraction, make use of +org.apache.poi.hslf.extractor.PowerPointExtractor. It accepts a file or an input +stream. The getText() method can be used to get the text from the slides, and the getNotes() method can be used to get the text +from the notes. Finally, getText(true,true) will get the text +from both. +

+ + + + +
+

Specific Text Extraction

+
+ +

To get specific bits of text, first create an org.apache.poi.hslf.usermodel.SlideShow +(from an org.apache.poi.hslf.HSLFSlideShow, which accepts a file or an input +stream). Use getSlides() and getNotes() to get the slides and notes. +These can be queried to get their page ID (though they should be returned +in the right order).

+ +

You can then call getTextRuns() on these, to get +their blocks of text. (One TextRun normally holds all the text in a +given area of the page, eg in the title bar, or in a box). +From the TextRun, you can extract the text, and check +what type of text it is (eg Body, Title). You can also call +getRichTextRuns(), which will return the +RichTextRuns that make up the TextRun. A +RichTextRun is made up of a sequence of text, all having the +same character and paragraph formatting. +

+ + + + +
+

Poor Quality Text Extraction

+
+ +

If speed is the most important thing for you, you don't care + about getting duplicate blocks of text, you don't care about + getting text from master sheets, and you don't care about getting + old text, then + org.apache.poi.hslf.extractor.QuickButCruddyTextExtractor + might be of use.

+ +

QuickButCruddyTextExtractor doesn't use the normal record + parsing code, instead it uses a tree structure blind search + method to get all text holding records. You will get all the text, + including lots of text you normally wouldn't ever want. However, + you will get it back very very fast!

+ +

There are two ways of getting the text back. + getTextAsString() will return a single string with all + the text in it. getTextAsVector() will return a + vector of strings, one for each text record found in the file. +

+ + + + +
+

Changing Text

+
+ +

It is possible to change the text via + TextRun.setText(String) or + RichTextRun.setText(String). It is not yet possible + to add additional TextRuns or RichTextRuns.

+ +

When calling TextRun.setText(String), all + the text will end up with the same formatting. When calling + RichTextRun.setText(String), the text will retain + the old formatting of that RichTextRun. +

+ + + + +
+

Adding Slides

+
+ +

You may add new slides by calling + SlideShow.createSlide(), which will add a new slide + to the end of the SlideShow. It is not currently possible to + re-order slides, nor to add new text to slides (currently only + adding Escher objects to new slides is supported). +

+ + + + +
+

Guide to key classes

+
+ +
    + +
  • +org.apache.poi.hslf.HSLFSlideShow + Handles reading in and writing out files. Calls + org.apache.poi.hslf.record.Record to build a tree + of all the records in the file, which it allows access to. +
  • + +
  • +org.apache.poi.hslf.record.Record + Base class of all records. Also provides the main record generation + code, which will build up a tree of records for a file. +
  • + +
  • +org.apache.poi.hslf.usermodel.SlideShow + Builds up model entries from the records, and presents a user facing + view of the file +
  • + +
  • +org.apache.poi.hslf.model.Slide + A user facing view of a Slide in a slideshow. Allows you to get at the + Text of the slide, and at any drawing objects on it. +
  • + +
  • +org.apache.poi.hslf.model.TextRun + Holds all the Text in a given area of the Slide, and will + contain one or more RichTextRuns. +
  • + +
  • +org.apache.poi.hslf.usermodel.RichTextRun + Holds a run of text, all having the same character and + paragraph stylings. It is possible to modify text, and/or text stylings. +
  • + +
  • +org.apache.poi.hslf.extractor.PowerPointExtractor + Uses the model code to allow extraction of text from files +
  • + +
  • +org.apache.poi.hslf.extractor.QuickButCruddyTextExtractor + Uses the record code to extract all the text from files very fast, + but including deleted text (and other bits of Crud). +
  • + +
+ + + +
by Nick Burch
+
+
+
+
+ + + + + + Propchange: poi/site/publish/slideshow/quick-guide.html ------------------------------------------------------------------------------ svn:executable = * --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org For additional commands, e-mail: commits-help@poi.apache.org