pdfbox-users mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Moshe Liaks <ajli...@gmail.com>
Subject PDFBox - Read pdf file line by line using C#.Net
Date Mon, 16 Feb 2009 17:15:50 GMT
PDFBox - Read pdf file line by line using  C#.Net

Hi guys,

I use the code below to read a pdf file.
The code works fine. The problem is that I need to read the PDF
line by line rather than as "one big string".
I need this because the text is complex, and I have to apply some
filters to each line as it is read from the original.

How can I work around this?

Thanks in advance,
Aldo.

Note: The code below is C#.Net - Visual Studio 2008.

using System;
using System.IO;
using System.Windows.Forms;
using System.Collections;

using java.io;
using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System.Text;

namespace Pdf2Text
{
      /// <summary>
      /// Lets the user pick a PDF file, extracts its text with PDFBox and writes
      /// the text to "x.txt" in the initial directory.
      /// </summary>
      class Program
      {
        /// <summary>
        /// Entry point: shows an open-file dialog and converts the chosen PDF to text.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args)
        {
            string initialDir = @"C:\...\Pdf_Files\";
            string fileOut = initialDir + "x.txt";
            string fileIn;

            // Dispose the dialog once the choice has been read from it.
            using (OpenFileDialog ofd = new OpenFileDialog())
            {
                ofd.InitialDirectory = initialDir;

                // If the user cancels there is no input file, so stop here
                // instead of continuing with an empty path.
                if (ofd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                fileIn = ofd.FileName;
            }

            // Get file encoding
            System.Text.Encoding encIn = MyFileStream.GetFileEncoding(fileIn);
            System.Text.Encoding encOut = System.Text.Encoding.Unicode;

            // Read from PDF.
            WriteToFile(fileIn, fileOut, encIn, encOut);
        }

        /// <summary>
        /// Extracts the text of the PDF <paramref name="fileIn"/> and writes it,
        /// Unicode-normalized, to <paramref name="fileOut"/>.
        /// </summary>
        /// <param name="fileIn">Path of the PDF file to read.</param>
        /// <param name="fileOut">Path of the text file to create (overwritten if present).</param>
        /// <param name="encIn">Encoding whose name is passed on to <see cref="ParseUsingPDFBox"/>.</param>
        /// <param name="encOut">Encoding used for the output file.</param>
        public static void WriteToFile(string fileIn, string fileOut, Encoding encIn, Encoding encOut)
        {
            using (FileStream fs = new FileStream(fileOut, FileMode.Create, FileAccess.Write))
            {
                using (StreamWriter sw = new StreamWriter(fs, encOut))
                {
                    string text = ParseUsingPDFBox(fileIn, encIn.EncodingName);

                    // Normalize text.
                    text = text.Normalize();

                    sw.Write(text);
                }
            }
        }

        /// <summary>
        /// Extracts all text of a PDF file as a single string.
        /// </summary>
        /// <param name="input">Path of the PDF file.</param>
        /// <param name="encName">
        /// Kept so existing callers still compile; it is not used. The file is handed
        /// to PDFBox as raw bytes, and a character reader must not be placed on that
        /// stream because reading through it consumes bytes that the parser needs.
        /// </param>
        /// <returns>The text produced by <c>PDFTextStripper.getText</c>.</returns>
        public static string ParseUsingPDFBox(string input, string encName)
        {
            java.io.InputStream iStream = new java.io.FileInputStream(input);
            try
            {
                PDDocument doc = PDDocument.load(iStream);
                try
                {
                    PDFTextStripper stripper = new PDFTextStripper();
                    return stripper.getText(doc);
                }
                finally
                {
                    // Release the document even when text extraction throws.
                    doc.close();
                }
            }
            finally
            {
                iStream.close();
            }
        }

        /// <summary>
        /// Extracts the text of a PDF file and returns it split into lines, so that
        /// callers can filter the content one line at a time.
        /// </summary>
        /// <remarks>
        /// Lines are taken from the extracted text, not from the PDF file itself:
        /// reading the file with a line reader yields its raw bytes, not its text.
        /// </remarks>
        /// <param name="input">Path of the PDF file.</param>
        /// <param name="encName">Passed through to <see cref="ParseUsingPDFBox"/>.</param>
        /// <returns>
        /// The lines of the extracted text without their line terminators. Text that
        /// ends with a line terminator yields a trailing empty entry.
        /// </returns>
        public static string[] ParseLinesUsingPDFBox(string input, string encName)
        {
            string text = ParseUsingPDFBox(input, encName);

            // "\r\n" is listed first so a Windows line break is treated as one
            // separator rather than two.
            return text.Split(new string[] { "\r\n", "\n", "\r" }, StringSplitOptions.None);
        }
      }
}

Mime
View raw message