I am trying to use Scrapy to log in to a website during the spider's init phase, and then, after confirming the login, start the standard crawl through start_urls. I'm not sure what is going wrong: I get through the login and everything confirms, but parse_item never runs. Any help would be appreciated.
I can get as far as "=========Successfully logged in.=========", but I can never reach "==============PARSE ITEM==========================".
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule
from selenium import webdriver
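# Selenium is used only to perform the login in a real browser and harvest
# the session cookies; Scrapy does the actual crawling.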
class ProductDetailsSpider(InitSpider):
    name = 'product_details_spider'
    allowed_domains = ['my_domain.com']
    login_page = 'http://www.my_domain.com/'
    start_urls = ['http://www.my_domain.com/nextpage1/',
                  'http://www.my_domain.com/nextpage2/',
                  'http://www.my_domain.com/nextpage3/']

    rules = (
        Rule(SgmlLinkExtractor(allow=()),
             callback='parse_item',
             follow=True),
    )
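    # NOTE: these rules follow the CrawlSpider examples; I'm assuming
    # InitSpider honors them the same way.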
    def get_cookies(self):
        """Log in through a real browser and return the session cookies
        as a {name: value} dict that Scrapy's Request accepts."""
        driver = webdriver.Firefox()
        driver.implicitly_wait(30)
        base_url = "http://www.my_domain.com"
        driver.get(base_url + "/")
        driver.find_element_by_name("USR").clear()
        driver.find_element_by_name("USR").send_keys("my_user")
        driver.find_element_by_name("PASSWRD").clear()
        driver.find_element_by_name("PASSWRD").send_keys("my_pass")
        driver.find_element_by_name("submit").click()
        cookies = driver.get_cookies()
        # quit() rather than close() so the browser process actually exits
        driver.quit()
        # Selenium returns a list of cookie dicts; flatten to {name: value}
        cookie_dic = {}
        for c in cookies:
            cookie_dic[c['name']] = c['value']
        return cookie_dic
    def init_request(self):
        """This function is called before crawling starts."""
        print '=======================INIT======================='
        return Request(url=self.login_page, callback=self.login)
    def login(self, response):
        """Generate a login request."""
        print '=======================LOGIN======================='
        return [FormRequest.from_response(response, formname='login_form',
                                          formdata={'USR': 'my_user',
                                                    'PASSWRD': 'my_pass'},
                                          callback=self.login_cookies)]
    def login_cookies(self, response):
        """Re-request a logged-in page, attaching the cookies that the
        Selenium session collected."""
        print '=======================COOKIES======================='
        return Request(url='http://www.my_domain.com/home',
                       cookies=self.get_cookies(),
                       callback=self.check_login_response)
    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        print '=======================CHECK LOGIN======================='
        if "Logoff" in response.body:
            print "=========Successfully logged in.========="
            self.initialized()
            # Now the crawling can begin..
        else:
            print "==============Bad times :(==============="
            # Something went wrong, we couldn't log in, so nothing happens.
    def parse_item(self, response):
        """Callback named in the rules above; never seems to be reached."""
        print "==============PARSE ITEM=========================="
        # Scrape data from page
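For reference, this is the bare-bones InitSpider login pattern I was working from (simplified here; example.com, the form fields, and the "Logoff" check are placeholders, not my real site):

    from scrapy.contrib.spiders.init import InitSpider
    from scrapy.http import Request, FormRequest

    class ExampleLoginSpider(InitSpider):
        # placeholder names/URLs
        name = 'example_login'
        allowed_domains = ['example.com']
        login_page = 'http://www.example.com/login'
        start_urls = ['http://www.example.com/somepage/']

        def init_request(self):
            """Called before crawling starts; kicks off the login."""
            return Request(url=self.login_page, callback=self.login)

        def login(self, response):
            """Generate a login request from the login form."""
            return FormRequest.from_response(
                response,
                formdata={'USR': 'user', 'PASSWRD': 'pass'},
                callback=self.check_login_response)

        def check_login_response(self, response):
            """Verify the login before releasing the real crawl."""
            if "Logoff" in response.body:
                # The examples I've seen *return* initialized() here, so the
                # queued start_urls requests get handed back to the engine.
                return self.initialized()

        def parse(self, response):
            # With a plain InitSpider, start_urls responses arrive at parse().
            print "parse called for %s" % response.url

The only structural differences from my spider that I can spot are that this version returns self.initialized() and uses a plain parse callback instead of rules.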