Here is a small example using PyPDF2, requests and BeautifulSoup. Please check the comments in the code. It only covers the first block of results; if you need more, you have to change the value of the `url` variable.
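# You need to install:
# pip install PyPDF2 -> read and parse the content of the PDF
# pip install requests -> send the HTTP requests that fetch the page and the PDFs
# pip install beautifulsoup4 -> parse the HTML and find every href URL ending in ".pdf"
# pip install lxml -> parser backend selected by BeautifulSoup(..., "lxml")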
import io
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileReader
# Scrape a publication page, find every link ending in ".pdf", download each
# PDF into memory and print its metadata plus the text of one page.

# Page listing the releases; change this value to scrape a different page.
url = 'https://usda.library.cornell.edu/concern/publications/3t945q76s?locale=en#release-items'
# Zero-based index of the page to print (PyPDF2's getPage counts from 0, so
# 20 selects the 21st page of the document).
numpage = 20
# Seconds to wait for the server before giving up instead of hanging forever.
timeout = 30

response = requests.get(url, timeout=timeout)
# Fail early with a clear HTTP error rather than parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "lxml")

for a in soup.find_all('a', href=True):
    href = a['href']
    # Case-insensitive so links such as "REPORT.PDF" are not skipped.
    if not href.lower().endswith('.pdf'):
        continue
    print("url with pdf final:", href)

    # Resolve relative links against the page that was actually served.
    urlpdf = urljoin(response.url, href)
    pdf_response = requests.get(urlpdf, timeout=timeout)
    # Do not hand an HTML error body to the PDF parser.
    pdf_response.raise_for_status()

    with io.BytesIO(pdf_response.content) as f:
        pdf = PdfFileReader(f)
        # getDocumentInfo() can return None when the PDF has no info
        # dictionary, hence the getattr fallbacks below.
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        txt = (
            "\n"
            f"Author: {getattr(information, 'author', None)}\n"
            f"Creator: {getattr(information, 'creator', None)}\n"
            f"Producer: {getattr(information, 'producer', None)}\n"
            f"Subject: {getattr(information, 'subject', None)}\n"
            f"Title: {getattr(information, 'title', None)}\n"
            f"Number of pages: {number_of_pages}\n"
        )
        # Here is the metadata of the PDF.
        print(txt)

        # Only extract the requested page when the document is long enough;
        # shorter documents would otherwise abort the whole loop.
        if 0 <= numpage < number_of_pages:
            page = pdf.getPage(numpage)
            page_content = page.extractText()
            # Print the text content of the selected page.
            print(page_content)
        else:
            print(
                f"page index {numpage} is out of range: "
                f"document has {number_of_pages} pages"
            )
edit or extract. – Mulford
Using `requests` or `urllib` you can get the HTML from the server; using `BeautifulSoup` you can find the links to PDFs in the HTML; using these links with `requests` or `urllib` you can download the PDFs. Later you would have to use other tools to work with the PDFs. There are modules such as `PDFMiner` and `PyPDF2` for working with PDFs in Python, but I don't have experience with them. – Mulford