Extracting text from highlighted annotations in a PDF file

import popplerqt4
import sys
import PyQt4


def main():
    """Print the text found under each highlight annotation of a PDF.

    The PDF path is read from sys.argv[1]. Every annotation on every page is
    counted; for highlight annotations the page text inside the annotation's
    boundary rectangle is printed. A summary line with the annotation count
    (or a "none found" message) is printed at the end.
    """
    document = popplerqt4.Poppler.Document.load(sys.argv[1])
    total_annotations = 0

    for page_index in range(document.numPages()):
        page = document.page(page_index)
        for annotation in page.annotations():
            if not isinstance(annotation, popplerqt4.Poppler.Annotation):
                continue
            total_annotations += 1
            if isinstance(annotation, popplerqt4.Poppler.HighlightAnnotation):
                # NOTE(review): boundary() is documented as returning
                # normalized coordinates; the rectangle is handed to
                # page.text() unscaled here.
                print(str(page.text(annotation.boundary())))

    if total_annotations > 0:
        print(str(total_annotations) + " annotation(s) found")
    else:
        print("no annotations found")


if __name__ == "__main__":
    main()

Looking at the documentation for Annotations, it seems that the boundary property "returns this annotation's boundary rectangle in normalized coordinates". Although this seems a strange decision, we can simply scale the coordinates by the page.pageSize().width() and page.pageSize().height() values.

import popplerqt4
import sys
import PyQt4


def _quad_to_page_rect(quad, page_width, page_height):
    """Return a highlight quad's area as a QRectF scaled by the page size.

    points[0] and points[2] of the quad are used as opposite corners; their
    x coordinates are multiplied by page_width and their y coordinates by
    page_height.
    """
    # Import the submodule explicitly rather than relying on it having been
    # loaded as a side effect of another import.
    from PyQt4.QtCore import QRectF

    rect = QRectF()
    rect.setCoords(quad.points[0].x() * page_width,
                   quad.points[0].y() * page_height,
                   quad.points[2].x() * page_width,
                   quad.points[2].y() * page_height)
    # normalized() swaps edges if the corner order produced a negative width
    # or height; an already well-formed rectangle comes back unchanged.
    return rect.normalized()


def _highlight_text(page, annotation, page_width, page_height):
    """Return the page text under every quad of a highlight annotation.

    The text of each quad is followed by a single space, so the pieces of a
    multi-line highlight do not run together.
    """
    return u"".join(
        unicode(page.text(_quad_to_page_rect(quad, page_width, page_height)))
        + u" "
        for quad in annotation.highlightQuads()
    )


def main():
    """Print the highlighted text of the PDF named on the command line.

    Reads the PDF path from sys.argv[1], counts every annotation on every
    page, prints the text covered by each highlight annotation (one line per
    annotation) and finishes with a summary line.

    Exits with an error message (via sys.exit) when no path is given or the
    document cannot be loaded. Written for Python 2: it relies on unicode().
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE.pdf" % sys.argv[0])

    doc = popplerqt4.Poppler.Document.load(sys.argv[1])
    if doc is None:
        sys.exit("could not open PDF: %s" % sys.argv[1])

    # sys.stdout.encoding is None when output is piped or redirected; printing
    # a unicode object would then fall back to ASCII and raise
    # UnicodeEncodeError on the first non-ASCII character.
    out_encoding = sys.stdout.encoding or "utf-8"

    total_annotations = 0
    for i in range(doc.numPages()):
        page = doc.page(i)
        size = page.pageSize()
        pwidth, pheight = size.width(), size.height()
        for annotation in page.annotations():
            if not isinstance(annotation, popplerqt4.Poppler.Annotation):
                continue
            total_annotations += 1
            if isinstance(annotation, popplerqt4.Poppler.HighlightAnnotation):
                txt = _highlight_text(page, annotation, pwidth, pheight)
                print(txt.encode(out_encoding, "replace"))

    if total_annotations > 0:
        print(str(total_annotations) + " annotation(s) found")
    else:
        print("no annotations found")


if __name__ == "__main__":
    main()

Additionally, I decided to concatenate the text of each of the .highlightQuads() to get a better representation of what was actually highlighted.

Please be aware of the explicit <space> I have appended to each quad region of text.

In the example document, the returned QString could not be passed directly to print() or str(); the solution was to use unicode() instead.

I hope this helps someone as it helped me.

Note: Page rotation may affect the scaling values; I have not been able to test this.

Recommended topics

Hot tags