Render .pdf to single Canvas using pdf.js and ImageData
Asked Answered
A

3

24

I am trying to read an entire .pdf Document using PDF.js and then render all the pages on a single canvas.

My idea: render each page onto a canvas, get the ImageData (context.getImageData()), clear the canvas, and then do the next page. I store all the ImageData objects in an array, and once all pages are in there I want to put all the ImageData objects from the array onto a single canvas.

// Question snippet: tries to render every page of the PDF onto the '#cv' canvas,
// capture each page as ImageData, and then stack all captured pages on the same canvas.
// NOTE(review): this is the non-working code the question is about; the notes below
// mark the spots the answers identify as the cause of the blank canvas.
var pdf = null;
PDFJS.disableWorker = true;
var pages = new Array();
    //Prepare some things
    var canvas = document.getElementById('cv');
    var context = canvas.getContext('2d');
    var scale = 1.5;
    PDFJS.getDocument(url).then(function getPdfHelloWorld(_pdf) {
        pdf = _pdf;
        //Render all the pages on a single canvas
        // NOTE(review): `i` is declared with var, so every getPage callback shares the
        // same variable; the callbacks run after the loop has already advanced it.
        for(var i = 1; i <= pdf.numPages; i ++){
            pdf.getPage(i).then(function getPage(page){
                var viewport = page.getViewport(scale);
                // assigning canvas.width / canvas.height also clears the canvas
                canvas.width = viewport.width;
                canvas.height = viewport.height;
                // NOTE(review): the value returned by render() is ignored, so the
                // getImageData() call below does not wait for rendering to finish.
                page.render({canvasContext: context, viewport: viewport});
                pages[i-1] = context.getImageData(0, 0, canvas.width, canvas.height);
                context.clearRect(0, 0, canvas.width, canvas.height);
                p.Out("pre-rendered page " + i);
            });
        }

    //Now we have all 'dem Pages in "pages" and need to render 'em out
    // NOTE(review): this section runs right after the getPage() calls were issued,
    // not after their callbacks have stored anything in `pages`.
    canvas.height = 0;
    var start = 0;
    for(var i = 0; i < pages.length; i++){
        // each resize below clears whatever was drawn in earlier iterations
        if(canvas.width < pages[i].width) canvas.width = pages[i].width;
        canvas.height = canvas.height + pages[i].height;
        context.putImageData(pages[i], 0, start);
        start += pages[i].height;
    }
    });

So from the way I understand things, this should work, right? When I run this I end up with a canvas that is big enough to contain all the pages of the PDF, but it doesn't show the PDF...

Thank you for helping.

Archaimbaud answered 11/3, 2013 at 14:25 Comment(0)
C
9

I can’t speak to the part of your code that renders the pdf into a canvas, but I do see some problems.

  • Every time you reset canvas.width or canvas.height, the canvas contents are automatically cleared. So in the top section, your clearRect is not needed because the canvas is already cleared by the canvas.width assignment before every page.render.
  • More importantly, in the bottom section, all your previous pdf drawings are cleared by every canvas resizing (oops!).
  • getImageData() returns an ImageData object. Its data property is an array where each pixel is represented by 4 consecutive elements (red, then green, then blue, then alpha), and the object also exposes width and height properties. So pages[i].width and pages[i].height are valid in themselves; the length of the data array alone, however, cannot be used to determine widths or heights.

So to get you started, I would start by changing your code to this (very, very untested!):

// Renders every page of the PDF, one after another, onto the '#cv' canvas,
// captures each rendered page as ImageData, and finally stacks all captured
// pages vertically on that same canvas.
var pdf = null;
PDFJS.disableWorker = true;
var pages = [];
//Prepare some things
var canvas = document.getElementById('cv');
var context = canvas.getContext('2d');
var scale = 1.5;
var canvasWidth = 0;   // width of the widest page rendered so far
var canvasHeight = 0;  // accumulated height of all pages rendered so far
var pageStarts = [];   // pageStarts[k] is the "Y" starting position of pages[k]

PDFJS.getDocument(url).then(function getPdfHelloWorld(_pdf) {
    pdf = _pdf;

    // Paint all captured pages; only called once the last page has been captured.
    function drawAllPages() {
        canvas.width = canvasWidth;
        canvas.height = canvasHeight;  // this auto-clears all canvas contents
        for (var k = 0; k < pages.length; k++) {
            context.putImageData(pages[k], 0, pageStarts[k]);
        }
    }

    // Render one page, capture it after render() has resolved, then move on to
    // the next page. Pages are processed strictly one at a time because they
    // all share the same canvas.
    function renderPage(pageNumber) {
        if (pageNumber > pdf.numPages) {
            drawAllPages();
            return;
        }
        pdf.getPage(pageNumber).then(function getPage(page) {
            var viewport = page.getViewport(scale);
            // changing canvas.width and/or canvas.height auto-clears the canvas
            canvas.width = viewport.width;
            canvas.height = viewport.height;

            // render() is asynchronous; some pdf.js builds expose the promise as
            // `.promise` on the returned task, others return a thenable directly.
            var renderTask = page.render({canvasContext: context, viewport: viewport});
            var rendered = renderTask.promise || renderTask;

            rendered.then(function () {
                pages[pageNumber - 1] = context.getImageData(0, 0, canvas.width, canvas.height);
                // track the width of the final display canvas
                if (canvas.width > canvasWidth) {
                    canvasWidth = canvas.width;
                }
                // this page starts where the previous pages end
                pageStarts[pageNumber - 1] = canvasHeight;
                // track the accumulated height of the final display canvas
                canvasHeight += canvas.height;
                p.Out("pre-rendered page " + pageNumber);
                renderPage(pageNumber + 1);
            });
        });
    }

    renderPage(1);
});

Alternatively, here’s a more traditional way of accomplishing your task:

Use a single “display” canvas and allow the user to “page through” each desired page.

Since you already start by drawing each page into a canvas, why not keep a separate, hidden canvas for each page. Then when the user wants to see page#6, you just copy the hidden canvas#6 onto your display canvas.

The Mozilla devs use this approach in their pdfJS demo here: http://mozilla.github.com/pdf.js/web/viewer.html

You can check out the code for the viewer here: http://mozilla.github.com/pdf.js/web/viewer.js

Caloric answered 11/3, 2013 at 19:16 Comment(5)
The clearing of content really turned out to be an issue thanks ^^Archaimbaud
@markE, I tried you solution but it didn't work. All I need is to use the Helloworld example to display the whole pdf pages (The pdf.js project is too complicated and doesn't suit my needs). Would you suggest some corrections?Valeta
Many/all browsers impose a max size limitation on canvas elements, so for sufficiently large PDFs, it won't work anyway. I've been struggling with this a whole lot lately, and the best solution IMO was, as you suggest, to show one page at a time.Charliecharline
Can anyone suggest how to use findcontroller for searching text with this example. Please suggest how to search text with this pdf.Primarily
Possible link but don't see mention of canvas element github.com/mozilla/pdf.js/blob/master/web/viewer.htmlSpelt
A
31

The PDF operations are asynchronous at all stages. This means you also need to wait for the promise returned by the final render call. If you do not wait for it, you will only get a blank canvas, because the rendering isn't finished before the loop continues to the next page.

Tip: I would also recommend that you use something other than getImageData, as it stores an uncompressed bitmap. For example, store a data-uri instead, which holds compressed data.

Here is a slightly different approach eliminating the for-loop and uses the promises better for this purpose:

LIVE FIDDLE

// Single off-screen canvas that every page is rendered to.
var canvas = document.createElement('canvas');
var ctx = canvas.getContext('2d');

// Rendered pages, and the number of the page currently being processed.
var pages = [];
var currentPage = 1;

// Specify a valid url here.
var url = 'path/to/document.pdf';

// Load the PDF document and hand it to iterate() once it is available.
PDFJS.getDocument(url).then(iterate);

/* To avoid too many levels, which easily happen when using chained promises,
   the function is separated and just referenced in the first promise callback
*/

/**
 * Renders the pages of `pdf` one after another on the shared off-screen
 * canvas and stores each rendered page as a data-uri in `pages`.
 *
 * Uses `canvas`, `ctx`, `pages` and `currentPage` from the enclosing scope and
 * calls `done()` (when such a function exists) after the last page is stored.
 *
 * @param {Object} pdf - loaded document; must provide `numPages` and `getPage(n)`.
 */
function iterate(pdf) {

    // init parsing of first page
    if (currentPage <= pdf.numPages) getPage();

    // main entry point/function for loop
    function getPage() {

        // when promise is returned do as usual
        pdf.getPage(currentPage).then(function(page) {

            var scale = 1.5;
            // NOTE(review): newer pdf.js releases appear to expect
            // getViewport({scale: scale}) - confirm against the version in use.
            var viewport = page.getViewport(scale);

            canvas.height = viewport.height;
            canvas.width = viewport.width;

            var renderContext = {
                canvasContext: ctx,
                viewport: viewport
            };

            // render() may return a thenable or a task that only exposes its
            // promise as `.promise`; accept both so `.then` is always available.
            var renderTask = page.render(renderContext);
            var rendered = renderTask.promise || renderTask;

            rendered.then(function() {

                // store compressed image data in array
                pages.push(canvas.toDataURL());

                if (currentPage < pdf.numPages) {
                    currentPage++;
                    getPage();        // get next page
                }
                else if (typeof done === 'function') {
                    done();           // call done() when all pages are parsed
                }
            });
        });
    }

}

When you then need to retrieve a page you simply create an image element and set the data-uri as source:

/**
 * Paints one stored page onto the canvas that `ctx` belongs to.
 *
 * @param {number} index - position of the page's data-uri inside `pages`.
 * @param {Function} callback - invoked, without arguments, after the page is drawn.
 */
function drawPage(index, callback) {
    var pageImage = new Image();

    // Loading the data-uri is asynchronous, so drawing happens in the load handler.
    pageImage.onload = function() {
        var target = ctx.canvas;

        // 'this' is the image that just finished loading; it is drawn at
        // position 0,0 and scaled to the canvas width and height.
        ctx.drawImage(this, 0, 0, target.width, target.height);

        // tell the caller we're done
        callback();
    };

    // setting the source starts loading the data-uri
    pageImage.src = pages[index];
}

Due to the image loading it will be asynchronous in nature as well which is why we need the callback. If you don't want the asynchronous nature then you could also do this step (creating and setting the image element) in the render promise above storing image elements instead of data-uris.

Hope this helps!

Alar answered 29/4, 2014 at 22:14 Comment(20)
Thanks very much for your response, Espistemex. I tried you solutions but didn't succeed. Due to my unfamiliarity with canvas, please enlighten me further. Questions: 1) I just add another statement "var url = 'example.pdf';" in front of your codes to specify the source of the pdf file. It that correct? 2) I have no idea how to use the second piece of your code (the img part). Would you elaborate more? Thanks.Valeta
@yltang52 I added a fiddle/demo. The url must be specified first with a valid relative or absolute url. I added more comment/info in the answer but perhaps the demo is even more clear as it shows what goes on.Alar
I tried your fiddle and it worked. Terrific! Further questions: 1) I changed the url to http://www.cyut.edu.tw/~yltang/example.pdf and it won't render. The file is actually there though. Do you mind giving it a try to see why your file worked but not mine? 2) How can I display the file one page after another vertically? 3) The solution works only on Firefox, right?Valeta
@yltang52 1) it's a CORS issue. The file need to be on the same server or allow CORS usage. I use a CORS proxy for the other file to work, I did the same with your file here: jsfiddle.net/epistemex/LUNaJ/3. This is a security mechanism in the browsers. 2) It's all about formatting, CSS, parent containers etc. 3) Works in Chrome/Opera as well, and a little bit faster too :) Hope this helps! I would open new questions though for the details in 2).Alar
Thanks a million, Epistemex. Since the hard part of the problem is solved, now I'll be working on the formatting issue. Let me know if you have the details of 2).Valeta
Do you have any suggestion about how to display pdf pages one by one vertically?Valeta
Thanks. Its takes lots of time to reading all the pages and then converting into images. I would like to draw image for the last read files. Can you please explain it ?Donetsk
What am I doing wrong here: #27724795Dickdicken
Thanks for your answer. I used it in my project while found it not working on IE9. the Mozilla/pdf.js examples do support IE9, so do you know what cause the code not working in IE9? Thanks again!Morrie
@FangCao any error messages in console and/or at what line do the code break?Alar
@KenFyrstenberg Thanks for your response. Here is the console log "uint8array is undefined in ie9"Morrie
@FangCao typed arrays are not supported in IE9 it turns out (caniuse.com/#search=typed%20arrays). This is related to the PDF.js code. You may get around with a polyfill but I would expect that to be very slow in this context.Alar
This is a brilliant solution K3N. However I tried you solution for http://infolab.stanford.edu/pub/papers/google.pdf in jsfiddle.net/LUNaJ/82 and it doesn't work. Why is this not working? Please help.Broadfaced
@K3N This is a brilliant solution. However I tried you solution for http://infolab.stanford.edu/pub/papers/google.pdf in jsfiddle.net/LUNaJ/82 and it doesn't work. Why is this not working? Please help.Broadfaced
This is by far the best answer I have read and it must be the right answer to the question. My case was to show multiple pdf files one after the other so I have done it using your solution. Thank you a lot you have saved my day.Petrology
This fiddle may have worked at one time but I have been unsuccessful at getting it to do anything but output "Generating Pages" and a progress bar that doesn't move. I tried with several different PDFs.Playmate
@MichaelKupietz it's due to cors and the cors-proxy being used. Updated with a different cors-proxy so until that goes down. See if the new update works.. In real-world you would of course use links that are either within your page's origin or at least which allows cors usage from your origin.Alar
Ah, thanks! This looks to be a big help for a project I'm working on.Playmate
i get error "Uncaught (in promise) TypeError: page.render(...).then is not a function" please advise? (in 2022 year)Postern
the fiddle links don't work anymore :(Vendee
C
9

I can’t speak to the part of your code that renders the pdf into a canvas, but I do see some problems.

  • Every time you reset canvas.width or canvas.height, the canvas contents are automatically cleared. So in the top section, your clearRect is not needed because the canvas is already cleared by the canvas.width assignment before every page.render.
  • More importantly, in the bottom section, all your previous pdf drawings are cleared by every canvas resizing (oops!).
  • getImageData() returns an ImageData object. Its data property is an array where each pixel is represented by 4 consecutive elements (red, then green, then blue, then alpha), and the object also exposes width and height properties. So pages[i].width and pages[i].height are valid in themselves; the length of the data array alone, however, cannot be used to determine widths or heights.

So to get you started, I would start by changing your code to this (very, very untested!):

// Renders every page of the PDF, one after another, onto the '#cv' canvas,
// captures each rendered page as ImageData, and finally stacks all captured
// pages vertically on that same canvas.
var pdf = null;
PDFJS.disableWorker = true;
var pages = [];
//Prepare some things
var canvas = document.getElementById('cv');
var context = canvas.getContext('2d');
var scale = 1.5;
var canvasWidth = 0;   // width of the widest page rendered so far
var canvasHeight = 0;  // accumulated height of all pages rendered so far
var pageStarts = [];   // pageStarts[k] is the "Y" starting position of pages[k]

PDFJS.getDocument(url).then(function getPdfHelloWorld(_pdf) {
    pdf = _pdf;

    // Paint all captured pages; only called once the last page has been captured.
    function drawAllPages() {
        canvas.width = canvasWidth;
        canvas.height = canvasHeight;  // this auto-clears all canvas contents
        for (var k = 0; k < pages.length; k++) {
            context.putImageData(pages[k], 0, pageStarts[k]);
        }
    }

    // Render one page, capture it after render() has resolved, then move on to
    // the next page. Pages are processed strictly one at a time because they
    // all share the same canvas.
    function renderPage(pageNumber) {
        if (pageNumber > pdf.numPages) {
            drawAllPages();
            return;
        }
        pdf.getPage(pageNumber).then(function getPage(page) {
            var viewport = page.getViewport(scale);
            // changing canvas.width and/or canvas.height auto-clears the canvas
            canvas.width = viewport.width;
            canvas.height = viewport.height;

            // render() is asynchronous; some pdf.js builds expose the promise as
            // `.promise` on the returned task, others return a thenable directly.
            var renderTask = page.render({canvasContext: context, viewport: viewport});
            var rendered = renderTask.promise || renderTask;

            rendered.then(function () {
                pages[pageNumber - 1] = context.getImageData(0, 0, canvas.width, canvas.height);
                // track the width of the final display canvas
                if (canvas.width > canvasWidth) {
                    canvasWidth = canvas.width;
                }
                // this page starts where the previous pages end
                pageStarts[pageNumber - 1] = canvasHeight;
                // track the accumulated height of the final display canvas
                canvasHeight += canvas.height;
                p.Out("pre-rendered page " + pageNumber);
                renderPage(pageNumber + 1);
            });
        });
    }

    renderPage(1);
});

Alternatively, here’s a more traditional way of accomplishing your task:

Use a single “display” canvas and allow the user to “page through” each desired page.

Since you already start by drawing each page into a canvas, why not keep a separate, hidden canvas for each page. Then when the user wants to see page#6, you just copy the hidden canvas#6 onto your display canvas.

The Mozilla devs use this approach in their pdfJS demo here: http://mozilla.github.com/pdf.js/web/viewer.html

You can check out the code for the viewer here: http://mozilla.github.com/pdf.js/web/viewer.js

Caloric answered 11/3, 2013 at 19:16 Comment(5)
The clearing of content really turned out to be an issue thanks ^^Archaimbaud
@markE, I tried you solution but it didn't work. All I need is to use the Helloworld example to display the whole pdf pages (The pdf.js project is too complicated and doesn't suit my needs). Would you suggest some corrections?Valeta
Many/all browsers impose a max size limitation on canvas elements, so for sufficiently large PDFs, it won't work anyway. I've been struggling with this a whole lot lately, and the best solution IMO was, as you suggest, to show one page at a time.Charliecharline
Can anyone suggest how to use findcontroller for searching text with this example. Please suggest how to search text with this pdf.Primarily
Possible link but don't see mention of canvas element github.com/mozilla/pdf.js/blob/master/web/viewer.htmlSpelt
C
0

You can pass the page number along to the promises, get that page's canvas data, and render the pages onto the canvas in the right order.

/**
 * Builds a step function for one page, suitable for chaining with `.then`.
 *
 * @param {Object} pdfDoc - loaded document providing `getPage(num)`.
 * @param {number} num - 1-based number of the page this step renders.
 * @returns {Function} function that, when called, fetches the page, passes it
 *     to `renderPage(page, localCanvas, num)` with a fresh canvas, and returns
 *     a promise for whatever `renderPage` returns.
 */
var renderPageFactory = function (pdfDoc, num) {
    return function () {

        var localCanvas = document.createElement('canvas');

        // Return renderPage's result so that, when it is a promise, the next
        // step in the chain waits for this page instead of starting right away.
        return pdfDoc.getPage(num).then((page) => renderPage(page, localCanvas, num));
    };
};

/**
 * Renders all pages of `pdfDoc` sequentially, in page order.
 *
 * @param {Object} pdfDoc - loaded document providing `numPages`.
 * @returns {Promise} settles after the step for the last page has settled, so
 *     callers can wait for completion or attach error handling.
 */
var renderPages = function (pdfDoc) {
    // A native promise seeds the chain, so no promise library is required.
    var renderedPage = Promise.resolve();
    for (var num = 1; num <= pdfDoc.numPages; num++) {
        // Wait for the previous page to render, then render the next
        renderedPage = renderedPage.then(renderPageFactory(pdfDoc, num));
    }
    return renderedPage;
};

// Start rendering; `pdf` is presumably the document object resolved by
// PDFJS.getDocument() - confirm at the call site.
renderPages(pdf);

Complete example

/**
 * Renders every page of a PDF, stacked vertically, onto a single canvas and
 * passes the result as a PNG data-uri to `$scope.printPOS`.
 *
 * Each page is rendered on its own temporary canvas, strictly one page after
 * another, and captured as ImageData. Only after the last page has finished
 * rendering is the target canvas resized and painted.
 *
 * @param {string} url - location of the PDF document.
 * @param {HTMLCanvasElement} canvas - target canvas; it is resized to hold all pages.
 * @returns {Promise<string>} resolves with the PNG data-uri of the combined
 *     canvas; rejects when loading or rendering fails.
 */
function renderPDF(url, canvas) {

    PDFJS.disableWorker = true;

    const context = canvas.getContext('2d');
    const scale = 1;

    const pages = [];       // ImageData of each page, in page order
    const pageStarts = [];  // pageStarts[k] is the "Y" starting position of pages[k]

    let canvasWidth = 256;  // minimum width; grows to the widest page so nothing is cropped
    let canvasHeight = 0;   // accumulated height of all pages captured so far

    // pdf.js may hand back a thenable or a task object that only exposes its
    // promise as `.promise`; normalise both to something with `.then`.
    function toPromise(task) {
        return (task && task.promise) || task;
    }

    // Render page `num` on a temporary canvas and capture it once rendering is done.
    function renderPage(pdfDoc, num) {
        return pdfDoc.getPage(num).then((page) => {
            const localCanvas = document.createElement('canvas');
            const ctx = localCanvas.getContext('2d');

            // NOTE(review): newer pdf.js releases appear to expect
            // getViewport({scale: scale}) - confirm against the version in use.
            const viewport = page.getViewport(scale);

            // changing canvas.width and/or canvas.height auto-clears the canvas
            localCanvas.width = viewport.width;
            localCanvas.height = viewport.height;

            const renderTask = page.render({canvasContext: ctx, viewport: viewport});

            return toPromise(renderTask).then(() => {
                pages.push(ctx.getImageData(0, 0, localCanvas.width, localCanvas.height));
                // this page starts where the previous pages end
                pageStarts.push(canvasHeight);
                canvasHeight += localCanvas.height;
                canvasWidth = Math.max(canvasWidth, localCanvas.width);
            });
        });
    }

    // Paint all captured pages onto the target canvas and hand over the image.
    function composePages() {
        canvas.width = canvasWidth;
        canvas.height = canvasHeight;  // this auto-clears all canvas contents
        for (let i = 0; i < pages.length; i++) {
            context.putImageData(pages[i], 0, pageStarts[i]);
        }

        const img = canvas.toDataURL("image/png");
        $scope.printPOS(img);
        return img;
    }

    return toPromise(PDFJS.getDocument(url)).then(function getPdfHelloWorld(pdfDoc) {
        let renderedPage = Promise.resolve();
        for (let num = 1; num <= pdfDoc.numPages; num++) {
            // Wait for the previous page to render, then render the next
            renderedPage = renderedPage.then(() => renderPage(pdfDoc, num));
        }
        return renderedPage.then(composePages);
    });
}
Cordate answered 10/3, 2018 at 3:7 Comment(1)
what is $q is it part or another library?Slacken

© 2022 - 2024 — McMap. All rights reserved.