@Lori
I implemented the code, but it still tries to open the browser through PuTTY, so it takes a lot of time to get the work done. The code is as follows: 'code'
import sys
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from scrapy.http import Request
from selenium import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
class DmozSpider(BaseSpider):
    """Log in to LinkedIn via a remote Selenium session and save search results.

    At class-definition time the company name is read from the first line of
    ``companyFilename`` and the search URL from the first line of that
    company's ``query.txt``; both are published as the module-level globals
    ``company`` and ``query``.

    ``parse`` submits the login form in the Selenium-driven browser, and
    ``page_parse`` then walks the result pages, appending one
    ``name<next>url`` line per result anchor to ``nameFilename``.
    """

    name = "linkedin_crawler"
    #defence news

    # Upper bound on the number of result pages visited by page_parse.
    max_pages = 10

    # ``global`` inside the class body makes the assignments below bind
    # module-level names instead of class attributes, so the methods can read
    # ``company`` and ``query`` as plain globals.
    global company
    global query

    companyFilename = '<filename>'
    with open(companyFilename, "r") as f:
        company = f.readline().strip()

    queryFilename = '/var/www/Symantec/recon/recon/' + company + '/Spider/LinkedIn/query.txt'
    with open(queryFilename) as f:
        query = f.readline().strip()

    start_urls = ['https://www.linkedin.com/uas/login']

    def __init__(self):
        """Initialise the spider and connect to the Selenium hub on localhost:5000."""
        BaseSpider.__init__(self)
        capabilities = webdriver.DesiredCapabilities()
        self.selenium = webdriver.Remote(
            command_executor='http://localhost:5000/wd/hub',
            desired_capabilities=capabilities.FIREFOX)

    def __del__(self):
        """Quit the remote browser session, if one was created."""
        # __init__ may have failed before self.selenium was assigned; avoid
        # raising AttributeError from the finaliser in that case.
        driver = getattr(self, 'selenium', None)
        if driver is not None:
            driver.quit()

    def parse(self, response):
        """Fill in and submit the login form, then request the search URL.

        Loads ``response.url`` in the Selenium browser, types the credentials
        into the ``session_key`` / ``session_password`` fields, presses
        RETURN, and returns a ``Request`` for ``query`` whose callback is
        ``page_parse``.
        """
        sel = self.selenium
        sel.get(response.url)
        user_field = sel.find_element_by_name("session_key")
        password_field = sel.find_element_by_name("session_password")
        user_field.send_keys("myemailid")
        password_field.send_keys("mypassword")
        password_field.send_keys(Keys.RETURN)
        return Request(query, callback=self.page_parse)

    def page_parse(self, response):
        """Append the name and URL of every search result to ``nameFilename``.

        The Scrapy ``response`` is not used; the page is loaded again in the
        Selenium browser so that it is read from the logged-in session.
        Visits at most ``max_pages`` pages and stops early when a page has no
        ``page-link`` anchor to click.
        """
        sel = self.selenium
        sel.get(query)
        # NOTE(review): the output path is empty in this copy of the script,
        # so open() below will fail until a real path is supplied.
        nameFilename = ''
        for _ in xrange(self.max_pages):
            #print hxs
            results = sel.find_elements_by_xpath(
                '//ol[@class="search-results"]/li/div/h3/a')
            with open(nameFilename, "a") as fh:
                for anchor in results:
                    url = anchor.get_attribute("href").encode('utf-8')
                    name = anchor.text.encode('utf-8')
                    fh.write(name)
                    fh.write("<next>")
                    fh.write(url)
                    fh.write('\n')
            # Presumably the first "page-link" anchor leads to the next page
            # of results -- confirm against the live markup.
            page_links = sel.find_elements_by_xpath('//a[@class="page-link"]')
            if not page_links:
                # No pagination link on this page: nothing further to visit.
                break
            page_links[0].click()
            # Fixed pause to give the next page time to load before scraping.
            time.sleep(5)
To run this script on the server, I am using PuTTY to issue the command. But it then uses Xming to open the browser again, which makes the process slow. So, how can I run the script without opening the browser on my local machine via Xming, so that this does not become the bottleneck? Thanks.