Updated version of @Mattwmaster58's answer that works with the latest version of selenium-wire
(5.1.0 at the time of this writing). Also adds support for nonce
attributes on inline script tags.
from lxml import html
from lxml.etree import ParserError
from lxml.html import builder
from seleniumwire import webdriver
from seleniumwire.request import Request, Response
from seleniumwire.thirdparty.mitmproxy.net.http import encoding as decoder
# JavaScript source used as the body of the <script> element that
# response_interceptor inserts into intercepted HTML pages.
SCRIPT_BODY_TO_INJECT = 'alert("injected")'
def has_mime_type(header: str, expected_type: str) -> bool:
    """Return whether a Content-Type header value names *expected_type*.

    Only the media type in front of the first ``;`` is compared, so
    parameters such as ``charset`` are ignored. The comparison is
    case-insensitive and tolerates whitespace around the media type, so
    ``"Text/HTML ; charset=utf-8"`` matches ``"text/html"``.

    Args:
        header: Raw value of a Content-Type header.
        expected_type: Media type to look for, e.g. ``"text/html"``.

    Returns:
        True when the header's media type equals *expected_type*.
    """
    media_type = header.partition(";")[0].strip()
    return media_type.lower() == expected_type.strip().lower()
def response_interceptor(request: Request, response: Response) -> None:
    """Insert an inline script at the top of <head> in HTML responses.

    Responses whose status is not 200, or whose Content-Type is missing or
    not ``text/html``, are left untouched. So are bodies that lxml cannot
    parse and documents without a head element. Otherwise the body is
    decoded according to Content-Encoding, parsed, modified, serialized
    from the parsed tree, re-encoded, and Content-Length is rewritten to
    match the new body.
    """
    mime_header = response.headers.get("Content-Type")
    if response.status_code != 200:
        return
    if not mime_header or not has_mime_type(mime_header, "text/html"):
        return

    content_encoding = response.headers.get("Content-Encoding", "identity")
    try:
        raw_markup = decoder.decode(response.body, content_encoding)
        document = html.fromstring(raw_markup)
    except ParserError:
        return

    # Reuse the nonce of an existing script so the injected inline script
    # carries the same nonce attribute.
    # https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/nonce
    nonce_carrier = document.find(".//script[@nonce]")
    if nonce_carrier is None:
        script_attrs = {}
    else:
        script_attrs = {"nonce": nonce_carrier.get("nonce")}

    try:
        new_script = builder.SCRIPT(SCRIPT_BODY_TO_INJECT, **script_attrs)
        document.head.insert(0, new_script)
    except IndexError:  # The document has no head element.
        return

    serialized = html.tostring(document.getroottree())
    response.body = decoder.encode(serialized, content_encoding)

    # Drop the old header first so the new value does not become a duplicate.
    del response.headers["Content-Length"]
    response.headers["Content-Length"] = str(len(response.body))
def main():
    """Open Firefox through selenium-wire and load a page with injection on.

    Registers ``response_interceptor`` on the driver before requesting
    https://example.com; the driver is closed when the ``with`` block exits.
    """
    with webdriver.Firefox() as driver:
        driver.response_interceptor = response_interceptor
        driver.get("https://example.com")


if __name__ == "__main__":
    main()
As an alternative to generating output with lxml
(which can alter the structure of the HTML), you can also use a regex to insert the tag and preserve existing formatting:
from lxml import html
from lxml.etree import ParserError
from lxml.html import builder
from mimeparse import parse_mime_type
from seleniumwire import webdriver
from seleniumwire.request import Request, Response
from seleniumwire.thirdparty.mitmproxy.net.http import encoding as decoder
import re
# JavaScript source used as the body of the injected <script> element.
SCRIPT_BODY_TO_INJECT = 'alert("injected")'

# Matches an opening <head> tag, with or without attributes (for example
# <head lang="en">). The empty capture group sits right after the tag and
# marks where the injected script is inserted. Requiring whitespace before
# any attributes keeps tags such as <header> from matching.
HEAD_TAG_RE = re.compile(r"<head(?:\s[^>]*)?>()", re.IGNORECASE)

# Matches an opening <script> tag that has no quoted src attribute, i.e. an
# inline script. The empty capture group sits right before the tag, so the
# injected script is inserted ahead of it. "." does not match newlines here,
# so an opening tag spread over several lines is not matched.
INLINE_SCRIPT_TAG_RE = re.compile(
    r"()<script\b(?:(?!\bsrc\b\s*=\s*['\"]).)*?>", re.IGNORECASE
)
def response_interceptor(request: Request, response: Response) -> None:
    """Splice an inline script into HTML responses without re-serializing.

    Only responses with status 200 and a ``text/html`` Content-Type are
    touched. The body is decoded (Content-Encoding, then the charset from
    Content-Type, defaulting to iso-8859-1). The script is inserted as text
    either right before the first inline ``<script>`` tag or right after the
    opening ``<head>`` tag, whichever comes first in the document. The rest
    of the markup is left exactly as received.

    The response is left unmodified when the body cannot be decoded with
    the declared charset, cannot be parsed by lxml, or contains neither an
    inline script nor a head tag.
    """
    content_type = response.headers.get("content-type")
    if not content_type:
        return
    mime_type, mime_subtype, mime_params = parse_mime_type(content_type)
    if (
        response.status_code != 200
        or mime_type != "text"
        or mime_subtype != "html"
    ):
        return

    encoding = response.headers.get("content-encoding", "identity")
    charset = mime_params.get("charset", "iso-8859-1")
    try:
        decoded_body = decoder.decode(response.body, encoding).decode(charset)
        parsed_html = html.fromstring(decoded_body)
    except (ParserError, UnicodeDecodeError, LookupError):
        # Unparseable markup, bytes that do not fit the declared charset, or
        # a charset name Python does not know: pass the response through.
        return

    # Preserve nonce attribute to allow inline script.
    # https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/nonce
    attrs = {}
    if (nonce_script := parsed_html.find(".//script[@nonce]")) is not None:
        attrs["nonce"] = nonce_script.get("nonce")

    # Some sites inject scripts before the DOCTYPE, which isn't valid markup
    # but still runs, so pick whichever insertion point appears first.
    # default=None avoids a ValueError when neither pattern matches.
    searches = (
        pattern.search(decoded_body)
        for pattern in (INLINE_SCRIPT_TAG_RE, HEAD_TAG_RE)
    )
    first_match = min(
        (found for found in searches if found is not None),
        key=lambda found: found.start(),
        default=None,
    )
    if first_match is None:
        return

    injected_script_text = html.tostring(
        builder.SCRIPT(SCRIPT_BODY_TO_INJECT, **attrs), encoding="unicode"
    )
    # Group 1 is an empty group that marks the insertion offset.
    insert_at = first_match.start(1)
    modified_body = (
        decoded_body[:insert_at]
        + injected_script_text
        + decoded_body[insert_at:]
    )

    response.body = decoder.encode(modified_body.encode(charset), encoding)
    del response.headers["Content-Length"]  # Avoid duplicate header.
    response.headers["Content-Length"] = str(len(response.body))
def main():
    """Open Firefox through selenium-wire and load a page with injection on.

    Registers ``response_interceptor`` on the driver before requesting
    https://example.com; the driver is closed when the ``with`` block exits.
    """
    with webdriver.Firefox() as driver:
        driver.response_interceptor = response_interceptor
        driver.get("https://example.com")


if __name__ == "__main__":
    main()
`onLoad` trigger to execute the script on page load. As I see it, this is one of the most straightforward ways to do it. – Chun