save html output of page after execution of the page's javascript
Asked Answered
D

7

51

There is a site I am trying to scrape that first loads an HTML page, then uses JavaScript to modify the form input fields, and then POSTs the form. How can I get the final HTML output of the POSTed page?

I tried to do this with PhantomJS, but it seems to only have an option to render image files. Googling around suggests it should be possible, but I can't figure out how. My attempt:

// PhantomJS script: open a page, then save a screenshot and the page's HTML.
var page = require('webpage').create();
var fs = require('fs');
// The open callback takes no arguments, so the load status is not inspected.
page.open('https://www.somesite.com/page.aspx', function () {
    // Placeholder: evaluates an empty function, so nothing is changed by this call.
    page.evaluate(function(){

    });

    // Both outputs are produced inside the open callback, immediately after evaluate().
    page.render('export.png');
    fs.write('1.html', page.content, 'w');
    phantom.exit();
});

This code will be used for a client, and I can't expect him to install too many packages (Node.js, CasperJS, etc.).

Thanks

Dishevel answered 31/5, 2013 at 11:17 Comment(2)
Do you have to use Python? Is Java an option?Parik
You can also get the content of the page using document.documentElement.outerHTMLSidetrack
M
27

The output code you have is correct, but there is an issue with synchronicity. The output lines that you have are being executed before the page is done loading. You can tie into the onLoadFinished callback to find out when that happens. See the full code below.

    // PhantomJS script: save a screenshot and the HTML once the page load has finished.
    var page = require('webpage').create();
    var fs = require('fs');

    // The handler receives the load status. Only capture output on success, so a
    // failed load does not silently produce an empty 1.html and a zero exit code.
    page.onLoadFinished = function(status) {
      console.log("page load finished");
      if (status !== "success") {
        console.log("Unable to load the page");
        phantom.exit(1);
        return;
      }
      page.render('export.png');
      fs.write('1.html', page.content, 'w');
      phantom.exit();
    };

    page.open("http://www.google.com", function() {
      // Placeholder: evaluates an empty function in the page.
      page.evaluate(function() {
      });
    });

When using a site like Google, it can be deceiving because it loads so quickly that you can often execute a screengrab inline like you have it. Timing is a tricky thing in PhantomJS; sometimes I test with setTimeout to see if timing is an issue.

Marcela answered 14/8, 2015 at 22:53 Comment(1)
what about images and styles?Bayonne
E
4

When I copied your code directly, and changed the URL to www.google.com, it worked fine, with two files saved:

  • 1.html
  • export.png

Bear in mind that the files will be written to the location you run the script from, not where your .js file is located

Energize answered 31/5, 2013 at 11:28 Comment(0)
S
2

After 2 long days of struggling and frustration I finally got my similar issue solved. What did the trick was the waitfor.js example in PhantomJS' official website. Be happy!

"use strict";

/**
 * Poll a condition every 250ms until it becomes truthy or a timeout elapses.
 *
 * Once the condition has been observed truthy, `onReady` is invoked on the
 * following tick. If the timeout elapses first, a message is logged and the
 * script exits with `phantom.exit(1)`.
 *
 * @param {Function|string} testFx - Condition to poll: a function returning a
 *     truthy value, or a string of source code that is passed to eval().
 * @param {Function|string} onReady - Called (or eval()'d when a string) once
 *     the condition is fulfilled.
 * @param {number} [timeOutMillis] - Maximum time to wait in milliseconds; any
 *     falsy value (including 0) falls back to 3000.
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, //< Default Max Timeout is 3s
        start = new Date().getTime(),
        condition = false,
        interval = setInterval(function() {
            if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
                // Not timed out yet and condition not yet fulfilled: test again.
                // NOTE: string arguments go through eval(); never pass untrusted text.
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx());
                return;
            }

            // Stop polling first, so a throwing onReady (or a slow exit) can never
            // leave the timer alive and re-run this branch every 250ms.
            clearInterval(interval);

            if (!condition) {
                // Timed out while the condition was still 'false'.
                console.log("'waitFor()' timeout");
                phantom.exit(1);
                return;
            }

            // Condition fulfilled: report the elapsed time and hand over to the caller.
            console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
            if (typeof(onReady) === "string") {
                eval(onReady);
            } else {
                onReady();
            }
        }, 250); //< repeat check every 250ms
}


var page = require('webpage').create();

// Open Twitter on 'sencha' profile and, once the page has loaded, wait for the
// sign-in dropdown to become visible before exiting.
page.open("http://twitter.com/#!/sencha", function (status) {
    // Check for page load success
    if (status !== "success") {
        console.log("Unable to access network");
        // Exit explicitly with a failure code; without this call the script
        // would keep running after a failed load.
        phantom.exit(1);
        return;
    }

    // Wait for 'signin-dropdown' to be visible
    waitFor(function() {
        // Check in the page if a specific element is now visible
        return page.evaluate(function() {
            // Assumes the loaded page exposes jQuery as `$` -- TODO confirm for other sites.
            return $("#signin-dropdown").is(":visible");
        });
    }, function() {
       console.log("The sign-in dialog should be visible now.");
       phantom.exit();
    });
});
Seemaseeming answered 27/9, 2017 at 8:52 Comment(0)
F
0

I tried several approaches to a similar task, and I got the best results using Selenium.

Before that, I tried PhantomJS and Cheerio. PhantomJS was crashing too often while executing JS on the page.

Fahrenheit answered 5/7, 2014 at 16:22 Comment(0)
J
0

I'm using CasperJS to run tests with PhantomJS. I added this code to my tearDown function:

// CasperJS test suite whose tear-down step saves a screenshot and the page HTML.
var require = patchRequire(require);
var fs = require('fs');

casper.test.begin("My Test", {
    test: function runSuite(test){
        // test code

        // Mark the suite as done from the run() callback.
        casper.run(function onRunComplete(){
            test.done();
        });
    },
    tearDown: function dumpArtifacts(){
        var markup;

        casper.capture("export.png");
        // getHTML is called with no selector and the second argument set to true.
        markup = casper.getHTML(undefined, true);
        fs.write("1.html", markup, 'w');
    }
});

See docs for capture and getHTML.

Joelynn answered 11/4, 2015 at 1:13 Comment(0)
S
-2

One approach that comes to my mind, besides using a headless browser, is to simulate the AJAX calls and assemble the page post-process, request by request. This, however, is often tricky and should be used as a last resort, unless you really like to dig through JavaScript code.

Seineetmarne answered 15/4, 2014 at 5:6 Comment(0)
A
-10

This can easily be done with some PHP code and JavaScript: use fopen() and fwrite() on the PHP side, and this JavaScript to capture the source to save: var generatedSource = new XMLSerializer().serializeToString(document);

Acoustician answered 5/7, 2014 at 16:3 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.