# This Section imports the necessary classes from the PyPDF2 library
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import ContentStream, NameObject, TextStringObject
from PyPDF2.utils import b_
# Strip a text watermark from every page of "input.pdf" and write the result
# to "output.pdf" (legacy PyPDF2 API: PdfFileReader / PdfFileWriter).

# The watermark says SAMPLE on it, so different capitalization cases may
# need to be tried here; the match below is case-sensitive.
wm_text = "Sample"
# Replacing the watermark with an empty string effectively removes it.
replace_with = ""

# Load the PDF into PyPDF2.
reader = PdfFileReader("input.pdf")
writer = PdfFileWriter()

for page in reader.pages:
    # "/Contents" is an optional page entry; pages without it are copied
    # unchanged instead of raising KeyError.
    if "/Contents" in page:
        # Parse the page's content stream into (operands, operator) pairs.
        content_object = page["/Contents"].getObject()
        content = ContentStream(content_object, reader)

        for operands, operator in content.operations:
            # NOTE: which text operator carries the watermark depends on the
            # PDF; this script only inspects "TJ" (show text from an array).
            if operator != b_("TJ") or not operands or not operands[0]:
                continue
            text = operands[0][0]
            if isinstance(text, TextStringObject) and text.startswith(wm_text):
                # TJ takes an array operand, so rewrite the array in place
                # rather than swapping it for a bare string, which would
                # leave an invalid operand in the content stream.
                operands[0][:] = [TextStringObject(replace_with)]

        # Set the modified content as the content object on the page.
        page[NameObject("/Contents")] = content

    # Add the page to the output.
    writer.addPage(page)

# Write the resulting document.
with open("output.pdf", "wb") as fh:
    writer.write(fh)
Watermark Removal on PDF with PyPDF2
Using the code from the question, here is a function that works in Python 3.
def remove_watermark(wm_text, inputFile, outputFile):
    """Blank out text that starts with *wm_text* on every page of a PDF.

    Each page's content stream is scanned for "Tj" (show text) operators.
    Any string operand that starts with ``wm_text`` is replaced with an
    empty string, and the modified pages are written to ``outputFile``.

    Args:
        wm_text: Prefix of the watermark text to remove (case-sensitive).
        inputFile: Path of the PDF to read.
        outputFile: Path the cleaned PDF is written to.
    """
    from PyPDF4 import PdfFileReader, PdfFileWriter
    from PyPDF4.pdf import ContentStream
    from PyPDF4.generic import TextStringObject, NameObject
    from PyPDF4.utils import b_

    with open(inputFile, "rb") as f:
        # The reader's second positional argument is not a file mode, so the
        # stray "rb" that used to be passed here is dropped.
        source = PdfFileReader(f)
        output = PdfFileWriter()

        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)

            # "/Contents" is optional on a page; copy such pages unchanged
            # instead of raising KeyError.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, source)

                for operands, operator in content.operations:
                    if operator != b_("Tj") or not operands:
                        continue
                    text = operands[0]
                    if isinstance(text, str) and text.startswith(wm_text):
                        operands[0] = TextStringObject('')

                page[NameObject('/Contents')] = content

            output.addPage(page)

        # Write while the input file is still open: the output is produced
        # from page objects that belong to the reader on `f`.
        with open(outputFile, "wb") as outputStream:
            output.write(outputStream)
# Example invocation: set the watermark text and the input/output paths,
# then run the removal.
wm_text = "wm_text"
inputFile = "input.pdf"
outputFile = "output.pdf"
remove_watermark(
    wm_text=wm_text,
    inputFile=inputFile,
    outputFile=outputFile,
)
This doesn't support CJK characters. –
Raffo
I tried to use the code to remove a logo image by setting
operands[0] = NullObject()
, but the output PDF gets corrupted. –
Horehound The code above worked for me for removing a particular text on each page; I haven't considered other cases. –
Ulna
I'd recommend using PyPDF2 instead of PyPDF4. Despite the name, PyPDF2 is the more up-to-date project (in 2022). I'm the new maintainer of PyPDF2. –
Gupta
Interesting, thank you. I'm going to leave the answer as is as I don't work on this at the moment, but others can see your comment. –
Ulna
As of March 2023
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import ContentStream, NameObject, TextStringObject
# Strip a text watermark from every page of the input PDF and write the
# result to the output file (PyPDF2 3.x API: PdfReader / PdfWriter).

# The watermark text to look for; the match below is case-sensitive, so
# different capitalization cases may need to be tried.
wm_text = "WATERMARK TEXT"
# Replacing the watermark with an empty string effectively removes it.
replace_with = ""

# Load the PDF into PyPDF2.
reader = PdfReader("INPUT FILE")
writer = PdfWriter()

for page in reader.pages:
    # "/Contents" is an optional page entry; pages without it are copied
    # unchanged instead of raising KeyError.
    if "/Contents" in page:
        # Parse the page's content stream into (operands, operator) pairs.
        content_object = page["/Contents"]
        content = ContentStream(content_object, reader)

        for operands, operator in content.operations:
            # NOTE: which text operator carries the watermark depends on the
            # PDF; this script only inspects "TJ" (show text from an array).
            if operator != b"TJ" or not operands or not operands[0]:
                continue
            text = operands[0][0]
            if isinstance(text, TextStringObject) and text.startswith(wm_text):
                # TJ takes an array operand, so rewrite the array in place
                # rather than swapping it for a bare string, which would
                # leave an invalid operand in the content stream.
                operands[0][:] = [TextStringObject(replace_with)]

        # Set the modified content as the content object on the page.
        page[NameObject("/Contents")] = content

    # Add the page to the output.
    writer.add_page(page)

# Write the resulting document.
with open("OUTPUT FILE", "wb") as fh:
    writer.write(fh)
PyPDF2 is deprecated. Use pypdf. –
Gupta
This solution does not work for me, unlike the solution using PyPDF4. –
Pleasantry
@Pleasantry for PyPDF2? –
Sacculate
© 2022 - 2025 — McMap. All rights reserved.