I figured out how to bypass the redirect with the following steps:
1- Check in parse() whether the request was redirected.
2- If it was redirected, simulate the action that gets you past the redirect and back to the URL you want to scrape. You may need to inspect the network traffic in Google Chrome's developer tools and replicate the POST request that takes you back to your page.
3- Hand off to another method via a callback, and do all the scraping work there by having that method schedule itself again as the callback for each following page, with a condition at the end that breaks the loop.
The example below is what I used to bypass a disclaimer page, return to my main URL, and start scraping.
import requests
import scrapy
from scrapy.http import FormRequest
class ScrapeClass(scrapy.Spider):
    """Spider that gets past a disclaimer redirect and then scrapes a list of pages.

    ``parse`` handles the response to each start URL: when that request was
    redirected (to the disclaimer page), it submits the "I Agree" form and
    hands the result to ``parse_After_disclaimer``.  ``parse_After_disclaimer``
    scrapes one page and schedules itself for the next URL in
    ``start_urls_AfterDisclaimer`` until the list is exhausted.
    """

    name = 'terrascan'
    # Index into ``start_urls_AfterDisclaimer`` of the page currently being scraped.
    page_number = 0
    start_urls = [
        # TODO: your main URL, a list of your URLs, or URLs read from a file.
    ]
    # Pages to scrape once the disclaimer has been accepted, in visiting order.
    # TODO: fill in, e.g. result pages such as
    # 'http://terrascan.whitmancounty.net/Taxsifter/Search/results.aspx?q=...&page=1&1=1#rslts'
    start_urls_AfterDisclaimer = []

    def parse(self, response):
        """Accept the disclaimer when the start request was redirected to it.

        Returns either the scraping results for ``response`` (no redirect
        happened) or a single form-submission request whose response is
        handled by ``parse_After_disclaimer``.
        """
        current_url = response.request.url
        # Scrapy's redirect middleware follows 3xx responses before the callback
        # runs and records the URLs it went through under the 'redirect_urls'
        # meta key.  The key is absent when no redirect happened, so default to
        # an empty list instead of indexing a possible None.
        redirect_urls = response.request.meta.get('redirect_urls') or []

        if not redirect_urls:
            print(current_url, '<========= am not redirected @@@@@@@@@@')
            # Already on a real page: scrape it directly.
            return self.parse_After_disclaimer(response)

        print(current_url, '<====== kill that please %%%%%%%%%%%%%')
        print(response.url, '<========= check this please')
        # Build the POST from the Scrapy response itself so it goes through
        # Scrapy's downloader and cookie handling (no blocking side request).
        # The field below is what the browser sends when 'I Agree' is clicked;
        # it is form data, not an HTTP header.  ``dont_click=True`` keeps
        # from_response from also adding whichever submit button comes first.
        # NOTE(review): assumes the redirect target is the disclaimer page and
        # that it contains the agreement form - confirm against the site.
        return FormRequest.from_response(
            response,
            formdata={'ctl00$cphContent$btnAgree': 'I Agree'},
            dont_click=True,
            headers={
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
            },
            callback=self.parse_After_disclaimer,
        )

    def parse_After_disclaimer(self, response):
        """Scrape one target page, then queue the next URL in the list.

        Yields the scraped item followed by at most one follow-up request.
        When ``response`` is not one of the wanted pages, only a request for
        the wanted page is yielded.
        """
        print(response.status)
        print(response.url)
        target_urls = self.start_urls_AfterDisclaimer

        # Make sure the current URL is one we actually want; otherwise ask for
        # the wanted page again until we are no longer bounced elsewhere.
        if response.url not in target_urls:
            print('I am here brother')
            if self.page_number < len(target_urls):
                yield scrapy.Request(
                    target_urls[self.page_number],
                    callback=self.parse_After_disclaimer,
                )
            return

        # From here on we are on a wanted page and can scrape it.
        # NOTE(review): TerrascanItem is presumably declared in the project's
        # items module; it is not imported in the visible part of this file.
        items = TerrascanItem()
        all_td_tags = response.css('td')
        print(len(all_td_tags), 'all_td_results', response.url)
        parcel_No = all_td_tags.css('#ctl00_cphContent_ParcelOwnerInfo1_lbParcelNumber::text').extract()
        # Extracted for reference only; it is not stored on the item below.
        Owner_Name = all_td_tags.css('#ctl00_cphContent_ParcelOwnerInfo1_lbOwnerName::text').extract()
        items['parcel_No'] = parcel_No or ''
        yield items

        # Move on to the next page.  The bounds check comes before indexing so
        # a short list cannot raise IndexError, and it uses len() rather than
        # len() - 1 so the last URL in the list is visited as well.
        self.page_number += 1
        if self.page_number < len(target_urls):
            next_page = target_urls[self.page_number]
            print('am in page #', self.page_number, '===', next_page)
            yield response.follow(next_page, callback=self.parse_After_disclaimer)