pdfbox-users mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Moshe Liaks <ajli...@gmail.com>
Subject PDFBox - Read pdf file line by line using C#.Net
Date Mon, 16 Feb 2009 17:15:50 GMT
PDFBox - Read pdf file line by line using  C#.Net

Hi guys,

I use the code below to read a PDF file.
The code works fine. The problem is that I need to read the PDF
line by line rather than as one big string.
I need this because the text is complex, and I have to
apply some filters to each line as it is read from the original.

How can I work around this?

Thanks in advance,

Note: The code below is C#.Net - Visual Studio 2008.

using System;
using System.IO;
using System.Windows.Forms;
using System.Collections;

using java.io;
using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System.Text;

namespace Pdf2Text
{
    /// <summary>
    /// Console utility that lets the user pick a PDF file, extracts its text
    /// with PDFBox and writes that text, one line at a time, to a text file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: asks for a PDF via an open-file dialog and converts it
        /// to <c>x.txt</c> in the initial directory.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread] // Windows Forms common dialogs expect a single-threaded apartment.
        static void Main(string[] args)
        {
            string initialDir = @"C:\...\Pdf_Files\";
            string fileOut = Path.Combine(initialDir, "x.txt");
            string fileIn;

            using (OpenFileDialog ofd = new OpenFileDialog())
            {
                ofd.InitialDirectory = initialDir;

                // Nothing to convert if the user cancels the dialog.
                if (ofd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                fileIn = ofd.FileName;
            }

            // Get file encoding (MyFileStream is declared elsewhere in the project).
            System.Text.Encoding encIn = MyFileStream.GetFileEncoding(fileIn);
            System.Text.Encoding encOut = System.Text.Encoding.Unicode;

            // Read from PDF.
            WriteToFile(fileIn, fileOut, encIn, encOut);
        }

        /// <summary>
        /// Extracts the text of a PDF and writes it line by line to a text file.
        /// </summary>
        /// <param name="fileIn">Path of the PDF file to read.</param>
        /// <param name="fileOut">Path of the text file to create (overwritten if it exists).</param>
        /// <param name="encIn">Encoding whose name is forwarded to <see cref="ParseUsingPDFBox"/>.</param>
        /// <param name="encOut">Encoding used for the output text file.</param>
        public static void WriteToFile(string fileIn, string fileOut, Encoding encIn, Encoding encOut)
        {
            // Extract first, so a failed extraction does not truncate an existing output file.
            string text = ParseUsingPDFBox(fileIn, encIn.EncodingName);

            // Normalize text.
            text = text.Normalize();

            using (FileStream fs = new FileStream(fileOut, FileMode.Create, FileAccess.Write))
            using (StreamWriter sw = new StreamWriter(fs, encOut))
            // Fully qualified: java.io also declares a StringReader.
            using (System.IO.StringReader lines = new System.IO.StringReader(text))
            {
                string line;
                while ((line = lines.ReadLine()) != null)
                {
                    // Each extracted line is available here on its own;
                    // apply any per-line filters before writing it out.
                    sw.WriteLine(line);
                }
            }
        }

        /// <summary>
        /// Loads a PDF with PDFBox and returns all of its text as a single string.
        /// </summary>
        /// <param name="input">Path of the PDF file to parse.</param>
        /// <param name="encName">
        /// Kept for backward compatibility; not used. The PDF is handed to PDFBox
        /// as a binary stream, so no text encoding is applied while reading the file.
        /// </param>
        /// <returns>The text produced by <c>PDFTextStripper.getText</c>.</returns>
        /// <remarks>
        /// To process the text line by line, split the returned string (for example
        /// with a <c>StringReader</c>). Wrapping the raw PDF bytes in an
        /// <c>InputStreamReader</c>/<c>LineNumberReader</c> yields gibberish, because
        /// that decodes the PDF file's binary content rather than its extracted text.
        /// </remarks>
        public static string ParseUsingPDFBox(string input, string encName)
        {
            java.io.InputStream iStream = new java.io.FileInputStream(input);
            try
            {
                PDDocument doc = PDDocument.load(iStream);
                try
                {
                    PDFTextStripper stripper = new PDFTextStripper();
                    return stripper.getText(doc);
                }
                finally
                {
                    // Release the resources held by the loaded document.
                    doc.close();
                }
            }
            finally
            {
                // Always release the file handle, even if loading or stripping fails.
                iStream.close();
            }
        }
    }
}

View raw message