How can I cache results from Playwright so that the next time the same URL is requested, the result is pulled directly from the cache?

const { chromium } = require('playwright');
const LRU = require('lru-cache');

// The `ttl` option is expressed in milliseconds, so one day has to be spelled
// out in full; a bare 86400 would expire entries after roughly 86 seconds.
const ONE_DAY_MS = 24 * 60 * 60 * 1000;

const cache = new LRU({
  max: 500, // Max number of entries in the cache
  ttl: ONE_DAY_MS, // Time to live for an entry in the cache, in milliseconds
});

/**
 * Returns the inner HTML of `<body>` for `url`, serving it from the in-memory
 * cache when an unexpired entry exists and rendering the page with Chromium
 * otherwise.
 *
 * The cache is consulted before any browser is launched, so a hit costs no
 * browser start-up at all.
 *
 * @param {string} url - Address of the page to render; also used as cache key.
 * @returns {Promise<string>} The page's `document.body.innerHTML`.
 * @throws Propagates any Playwright error (launch failure, navigation
 *   timeout, ...); the browser is closed before the error is rethrown.
 */
async function fetchHTML(url) {
  // Compare against undefined rather than relying on truthiness so that an
  // empty-string body still counts as a cache hit.
  const cachedHtml = cache.get(url);
  if (cachedHtml !== undefined) {
    console.log('Retrieved HTML from cache');
    return cachedHtml;
  }

  const browser = await chromium.launch();
  try {
    const context = await browser.newContext();
    const page = await context.newPage();
    page.setDefaultTimeout(10000);

    await page.goto(url);
    const html = await page.evaluate(() => document.body.innerHTML);

    console.log('Setting cache for ' + url);
    cache.set(url, html);
    console.log('Cached HTML length: ' + html.length);
    return html;
  } finally {
    // Always release the browser process, even when navigation fails.
    await browser.close();
  }
}

// fetchHTML is async, so each call must be awaited: without `await` the value
// is a pending Promise (whose `.length` is undefined), and the second call
// would start before the first one had stored anything in the cache.
(async () => {
  const url = 'https://example.com';

  const html = await fetchHTML(url); // first call renders the page
  console.log(html.length);

  const cachedHtml = await fetchHTML(url); // second call is a cache hit
  console.log(cachedHtml.length);
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

The basic idea for caching results with Playwright is to use route interception.

Here's an example:

const NodeCache = require('node-cache');
const { chromium } = require('playwright');

// In-memory response cache; node-cache expresses stdTTL in seconds.
const CACHE_TTL_SECONDS = 60 * 60; // one hour
const cache = new NodeCache({ stdTTL: CACHE_TTL_SECONDS });

/**
 * Loads `url` in a fresh Chromium page and returns the resulting HTML, routing
 * every GET request the page makes through the module-level `cache`.
 *
 * On a cache miss the request is performed once via `route.fetch()`, the
 * response is stored, and that same response is handed to the page. On a hit
 * the stored response is replayed without touching the network.
 *
 * @param {string} url - Address of the page to load.
 * @returns {Promise<string>} The full page HTML as reported by `page.content()`.
 * @throws Propagates any Playwright error (launch failure, navigation
 *   timeout, ...); the browser is closed before the error is rethrown.
 */
async function fetchWithCache(url) {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();

    // Key step: Set up route interception
    await page.route('**/*', async (route) => {
      const request = route.request();
      const requestUrl = request.url();

      // The cache key is the URL alone, so only GET requests can be replayed
      // safely; anything else (POST, PUT, ...) goes straight to the network.
      if (request.method() !== 'GET') {
        await route.continue();
        return;
      }

      // A single get() avoids the has()/get() race where the entry expires
      // between the two calls. Stored values are always objects.
      const cachedResponse = cache.get(requestUrl);
      if (cachedResponse !== undefined) {
        console.log('Serving from cache:', requestUrl);
        await route.fulfill(cachedResponse);
        return;
      }

      console.log('Fetching from network:', requestUrl);
      const response = await route.fetch();
      const responseData = {
        status: response.status(),
        headers: response.headers(),
        // Keep the raw bytes: decoding to text would corrupt images, fonts
        // and other binary resources when they are replayed later.
        body: await response.body(),
      };

      // Do not pin error responses in the cache for the whole TTL.
      if (response.ok()) {
        cache.set(requestUrl, responseData);
      }

      // Fulfill with the response already fetched; calling route.continue()
      // here would send the same request to the network a second time.
      await route.fulfill(responseData);
    });

    await page.goto(url);
    return await page.content();
  } finally {
    // Always release the browser process, even when navigation fails.
    await browser.close();
  }
}

// Usage: load the same URL twice and time each run; the second run should be
// served from the in-memory cache populated by the first.
(async () => {
  const url = 'https://example.com';

  console.time('First request');
  await fetchWithCache(url);
  console.timeEnd('First request');

  console.time('Second request (cached)');
  await fetchWithCache(url);
  console.timeEnd('Second request (cached)');
})().catch((err) => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});

Note 1: I only tested the Python implementation; the JavaScript implementation above has not been tested.

Note 2: This example uses in-memory caching. For persistence across script runs, consider using a disk-based cache.

Recommended topics

Hot tags