How can I cache results from playwright so the next time the same URL is called, it can pull directly from cache?
Asked Answered
A

1

6

I'm running Playwright behind a React app. My goal is to return the result immediately when the requested URL has already been fetched and cached.

The following is able to set and get cache. But the next time the same URL is requested, it acts like it's not saved in cache. What am I missing?

const { chromium } = require('playwright');
const LRU = require('lru-cache');

// Shared in-memory cache of fetched HTML.
// The TTL option is expressed in milliseconds; the previous value of 86400 is the
// number of SECONDS in a day, which made entries expire after roughly 86 seconds.
const ONE_DAY_MS = 24 * 60 * 60 * 1000;

const cache = new LRU({
  max: 500, // Max number of entries in the cache
  ttl: ONE_DAY_MS, // Time to live for an entry in the cache, in milliseconds
});

/**
 * Fetch the rendered body HTML of a page, serving repeat requests from the
 * module-level `cache`.
 *
 * The cache is consulted BEFORE a browser is launched, so a cache hit returns
 * without paying the browser start-up cost. On a miss the page is loaded with
 * Playwright, its body HTML is stored under `url`, and the browser is closed
 * even if navigation or evaluation throws.
 *
 * Callers must `await` the returned promise: a second call started before the
 * first one has resolved will not find a cache entry yet.
 *
 * @param {string} url - URL of the page to fetch; also used as the cache key.
 * @returns {Promise<string>} The page's `document.body.innerHTML`.
 */
async function fetchHTML(url) {
  // Compare against undefined instead of relying on truthiness, so that a
  // cached empty string still counts as a hit.
  const cachedHtml = cache.get(url);
  if (cachedHtml !== undefined) {
    console.log('Retrieved HTML from cache');
    return cachedHtml;
  }

  const browser = await chromium.launch();
  try {
    const context = await browser.newContext();
    const page = await context.newPage();
    page.setDefaultTimeout(10000);

    await page.goto(url);
    const html = await page.evaluate(() => document.body.innerHTML);

    console.log(`Setting cache for ${url}`);
    cache.set(url, html);
    return html;
  } finally {
    // Always release the browser process, including when goto/evaluate fails.
    await browser.close();
  }
}

// fetchHTML is async, so every call has to be awaited: otherwise `html` is a
// pending Promise (whose `.length` is undefined) and the second call starts
// before the first one has had a chance to populate the cache.
(async () => {
  const url = 'https://example.com';

  const html = await fetchHTML(url); // first call: loaded with the browser
  console.log(html.length);

  const cachedHtml = await fetchHTML(url); // second call: found in the cache
  console.log(cachedHtml.length);
})().catch((err) => {
  // Surface failures instead of leaving a floating rejected promise.
  console.error(err);
  process.exitCode = 1;
});

Aftersensation answered 28/12, 2022 at 23:32 Comment(0)
H
1

The basic idea for caching results with Playwright is to use route interception.

Here's an example:

const NodeCache = require('node-cache');
const { chromium } = require('playwright');

// In-memory response cache shared by every call; entries live for one hour.
const CACHE_TTL = 60 * 60;
const cache = new NodeCache({ stdTTL: CACHE_TTL });

/**
 * Load `url` in a fresh Chromium page and return the page's HTML, answering the
 * page's GET requests from the module-level `cache` whenever an entry exists.
 *
 * Every request the page makes goes through a route handler:
 *  - non-GET requests are passed through untouched, because the cache key is
 *    the URL alone and cannot tell different request bodies apart;
 *  - cached GET requests are fulfilled from the cache without touching the network;
 *  - uncached GET requests are fetched once, stored, and fulfilled with that
 *    same response.
 *
 * The browser is closed even if navigation throws.
 *
 * @param {string} url - URL of the page to open.
 * @returns {Promise<string>} The HTML returned by `page.content()`.
 */
async function fetchWithCache(url) {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();

    // Key step: set up route interception for every request of the page.
    await page.route('**/*', async (route) => {
      const request = route.request();
      const requestUrl = request.url();

      if (request.method() !== 'GET') {
        await route.continue();
        return;
      }

      // A single get() avoids the has()/get() window in which an entry could expire.
      const cachedResponse = cache.get(requestUrl);
      if (cachedResponse !== undefined) {
        console.log('Serving from cache:', requestUrl);
        await route.fulfill(cachedResponse);
        return;
      }

      console.log('Fetching from network:', requestUrl);
      const response = await route.fetch();
      const responseData = {
        status: response.status(),
        headers: response.headers(),
        // Keep the raw bytes: decoding to text would corrupt images, fonts, etc.
        body: await response.body(),
      };
      cache.set(requestUrl, responseData);

      // Fulfill with the response that was just fetched. Calling
      // route.continue() here would send the same request a second time.
      await route.fulfill(responseData);
    });

    await page.goto(url);
    return await page.content();
  } finally {
    await browser.close();
  }
}

// Usage: request the same URL twice and time each run; the second run is the
// one expected to be answered from the cache.
async function main() {
  const targetUrl = 'https://example.com';
  const timerLabels = ['First request', 'Second request (cached)'];

  for (const label of timerLabels) {
    console.time(label);
    await fetchWithCache(targetUrl);
    console.timeEnd(label);
  }
}

main();

Note 1: I have only tested the Python implementation; the JavaScript implementation above has not been tested.

Note 2: This example uses in-memory caching. For persistence across script runs, consider using a disk-based cache.

Handmaid answered 15/7, 2024 at 7:16 Comment(1)
I've implemented exactly that approach as a library that caches network requests on the filesystem: github.com/vitalets/playwright-network-cache – Tedtedd

© 2022 - 2025 — McMap. All rights reserved.