Slow scrolling down the page using Selenium
Asked Answered
C

10

12

I'm trying to scrape some data from flight search page.

This page works this way:

You fill in a form and then you click on the button search - this is ok. When you click the button you are redirected to the page with results and here is the problem. This page is adding continuously results for example for one minute which is not a big deal - problem is to get all of these results. When you are in real browser, you have to scroll down the page and these results are appearing. So I've tried to scroll down using Selenium. It scrolls to the bottom of the page so fast — probably a jump rather than a scroll — that the page doesn't load any new results.

When you are scrolling down slowly, it reloads results but if you do it very quickly it stops loading.

I'm not sure if my code helps to understand that so I'm attaching it.

# Search-URL template; the placeholders are filled in by spider.prepare_get().
SEARCH_STRING = """URL"""

class spider():
    """Scrape flight search results from a page that keeps lazy-loading
    more results as the user scrolls down.

    Note: written for Python 2 (print statements) and an old Selenium API.
    """

    def __init__(self):
        # One Firefox instance is reused for every page fetch.
        self.driver = webdriver.Firefox()

    @staticmethod
    def prepare_get(dep_airport,arr_airport,dep_date,arr_date):
        # Build the round-trip search URL: airports appear twice because the
        # return leg swaps departure and arrival.
        string = SEARCH_STRING%(dep_airport,arr_airport,arr_airport,dep_airport,dep_date,arr_date)
        return string


    def find_flights_html(self,dep_airport, arr_airport, dep_date, arr_date):
        """Open the results page, wait for the loading indicators to vanish,
        scroll to the bottom, and return the raw page HTML."""
        # Several departure airports may be given as a list; the site expects
        # them joined with a URL-encoded space (%20).
        if isinstance(dep_airport, list):
            airports_string = str(r'%20').join(dep_airport)
            dep_airport = airports_string

        wait = WebDriverWait(self.driver, 60) # wait for results
        self.driver.get(spider.prepare_get(dep_airport, arr_airport, dep_date, arr_date))
        # Wait until both loading spinners are gone; the second one sits next
        # to a Slovak "please be patient, we are finding more flights" label.
        wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[contains(@src, "loading")]')))
        wait.until(EC.invisibility_of_element_located((By.XPATH, u'//div[. = "Poprosíme o trpezlivosť, hľadáme pre Vás ešte viac letov"]/preceding-sibling::img')))
        # NOTE(review): this jumps straight to the bottom in one step — the
        # suspected problem, since the jump is too fast to trigger lazy loading.
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

        self.driver.find_element_by_xpath('//body').send_keys(Keys.CONTROL+Keys.END)
        return self.driver.page_source

    @staticmethod 
    def get_info_from_borderbox(div):
        # Extract and print departure/arrival details from one result <div>
        # (a BeautifulSoup tag). The content offsets are tied to the page markup.
        arrival = div.find('div',class_='departure').text
        price = div.find('div',class_='pricebox').find('div',class_=re.compile('price'))
        departure = div.find_all('div',class_='departure')[1].contents
        date_departure = departure[1].text 
        airport_departure = departure[5].text
        # NOTE(review): `arrival` assigned above is overwritten here, so the
        # first lookup is effectively dead.
        arrival = div.find_all('div', class_= 'arrival')[0].contents
        date_arrival = arrival[1].text
        airport_arrival = arrival[3].text[1:]
        print 'DEPARTURE: ' 
        print date_departure,airport_departure
        print 'ARRIVAL: '
        print date_arrival,airport_arrival

    @staticmethod
    def get_flights_from_result_page(html):
        """Parse the result HTML and print the info of every flight box found."""

        def match_tag(tag, classes):
            # True for a <div> that carries ALL of the given CSS classes.
            return (tag.name == 'div'
                    and 'class' in tag.attrs
                    and all([c in tag['class'] for c in classes]))

        soup = mLib.getSoup_html(html)
        divs = soup.find_all(lambda t: match_tag(t, ['borderbox', 'flightbox', 'p2']))

        for div in divs:
            spider.get_info_from_borderbox(div)

        print len(divs)


# Driver script: search flights from three departure airports to Madrid.
spider_inst = spider() 
# get_flights_from_result_page prints its results and returns None,
# so this outer print finally emits "None".
print spider.get_flights_from_result_page(spider_inst.find_flights_html(['BTS','BRU','PAR'], 'MAD', '2015-07-15', '2015-08-15'))

So the main problem is in my opinion that it scrolls too fast to trigger new loading of the results.

Have you any idea how to make it work?

Castello answered 19/6, 2015 at 15:34 Comment(1)
scrolling and scrolling in soft is shared here: https://mcmap.net/q/891851/-slow-scrolling-down-the-page-using-seleniumErythro
R
2

Here is a different approach that worked for me involving scrolling into view of the last search result and waiting for additional elements to load before scrolling again:

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements(object):
    """Custom Expected Condition: truthy once `locator` matches at least
    `count` elements on the page."""

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        # A stale reference during the lookup just means "not ready yet".
        try:
            found = EC._find_elements(driver, self.locator)
        except StaleElementReferenceException:
            return False
        return len(found) >= self.count


driver = webdriver.Firefox()

# Search parameters: three possible departure airports, Madrid as destination.
dep_airport = ['BTS', 'BRU', 'PAR']
arr_airport = 'MAD'
dep_date = '2015-07-15'
arr_date = '2015-08-15'

# The site expects multiple airports joined with a URL-encoded space (%20).
airports_string = str(r'%20').join(dep_airport)
dep_airport = airports_string

url = "https://www.pelikan.sk/sk/flights/list?dfc=C%s&dtc=C%s&rfc=C%s&rtc=C%s&dd=%s&rd=%s&px=1000&ns=0&prc=&rng=1&rbd=0&ct=0" % (dep_airport, arr_airport, arr_airport, dep_airport, dep_date, arr_date)
driver.maximize_window()
driver.get(url)

# Wait for both loading spinners to disappear before touching the results.
wait = WebDriverWait(driver, 60)
wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[contains(@src, "loading")]')))
wait.until(EC.invisibility_of_element_located((By.XPATH,
                                               u'//div[. = "Poprosíme o trpezlivosť, hľadáme pre Vás ešte viac letov"]/preceding-sibling::img')))

# Scroll the last known result into view, then block until the result count
# grows — repeating until the caller decides to stop (Python 2 print below).
while True:  # TODO: make the endless loop end
    results = driver.find_elements_by_css_selector("div.flightbox")
    print "Results count: %d" % len(results)

    # scroll to the last element
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])

    # wait for more results to load
    wait.until(wait_for_more_than_n_elements((By.CSS_SELECTOR, 'div.flightbox'), len(results)))

Notes:

  • you would need to figure out when to stop the loop - for example, at a particular len(results) value
  • wait_for_more_than_n_elements is a custom Expected Condition which helps to identify when the next portion is loaded and we can scroll again
Rozanna answered 19/6, 2015 at 15:50 Comment(7)
I'm afraid it does not work. It returns 10 in loop and when I tried to put this: for result in results: print result.text I found out that it returns the same values.Castello
@Milan well, I see the results count increasing with each iteration of the loop which means additional results are loading. Extract the results once you end the loop.Rozanna
To check whether it is finding new results, I'm adding results into the set and in each loop printing length of the set. It stays on 15. Here you can find the code and the results printed: pastebin.com/fkUrCvAmCastello
@Milan interesting, thanks. Do you have the same results if Chrome would be used instead? Also, which selenium and firefox versions are you using? Thanks.Rozanna
It raises error, I've installed this chromedriver chromedriver.storage.googleapis.com/index.html?path=2.15. I also tried PhantomJS. Here you can find outputs from 3 drivers: pastebin.com/9gRYbEh4Castello
@Milan let's skip trying in other browsers and focus on firefox. What firefox and selenium versions are you using? Thanks.Rozanna
Selenium is probably 2.46.0 version and I can't find out which version of Firefox but it should be the newest.Castello
F
12

I needed it for the same issue; I needed to scrape a social media website.

# Step down the page 1000 pixels at a time, pausing after each step so the
# lazily loaded content has a chance to appear.
y = 1000
for _step in range(0, 50):
    driver.execute_script(f"window.scrollTo(0, {y})")
    y += 1000
    time.sleep(1)  # give the newly revealed content time to load

The sleep after every 1000-pixel step is to allow the new content to load.

Fortuitous answered 13/9, 2019 at 15:26 Comment(0)
F
11

After some experiments, finally I found a good solution:

def __scroll_down_page(self, speed=8):
    """Scroll the page down gradually, `speed` pixels per step, until the
    bottom is reached — re-reading the page height on every step so content
    that lazy-loads (and grows the page) is still scrolled through."""
    position, page_height = 0, 1
    while position <= page_height:
        position += speed
        self.__driver.execute_script("window.scrollTo(0, {});".format(position))
        page_height = self.__driver.execute_script("return document.body.scrollHeight")
Forwards answered 9/4, 2019 at 13:16 Comment(4)
This solution work for me. I am scraping one website. Some element are present when I inspect element in dev tools but missing inside page source. I tried with selenium and scroll down slowly with above method and now I can see them in page source. Thanks@ForwardsAdvertent
I guess that this is a function, how do you use it anyway?Southpaw
Just call the function after instantiate the self.__driver objectForwards
I ran it not as a definition/function and it works great! -------------------------------------- speed=8 current_scroll_position, new_height= 0, 1 while current_scroll_position <= new_height: current_scroll_position += speed driver.execute_script("window.scrollTo(0, {});".format(current_scroll_position)) new_height = driver.execute_script("return document.body.scrollHeight")Bryce
C
8

You can do smooth scrolling using Selenium like below:

# Measure the full page height once, then glide down in 5-pixel increments
# so the scroll is smooth rather than a single jump.
total_height = int(driver.execute_script("return document.body.scrollHeight"))

offset = 1
while offset < total_height:
    driver.execute_script("window.scrollTo(0, {});".format(offset))
    offset += 5
Chary answered 22/7, 2020 at 6:41 Comment(3)
HI, I am new to python and I was wondering how does the 3rd parameter changes the speed of scrolling?Flashcube
@SasanAngel, 3rd parameter tells by how much i would change. In other languages its like i=i+5. So here instead of directly scrolling from 0 to n. It will scroll by every 5 to reach n.Chary
This is a great solution. Thanks, Astik. Note that as it is written this will work well for pages that load as you scroll down but that do not delete any previously loaded content that scroll out of view (like some websites). If that is the case for you, you'll need to do the scraping as you scroll, so in the for same for loop. Just a small note for others.Kort
B
3

time.sleep() makes the program slower, which is not good for production.

This is a more efficient and controlled way to scroll down to the bottom.
Use the below function written by me.

If you increase the increment value in a += 5, the scroll speed becomes higher (warning: not more than new_height), and vice versa.

def pageBottom(driver):
    """Scroll smoothly to the bottom of the page, 5 pixels at a time.

    The page height is re-read on every step, so content that lazy-loads
    (growing the page while we scroll) is still scrolled through.
    """
    offset = 0
    while True:
        new_height = driver.execute_script("return document.body.scrollHeight")
        driver.execute_script(f"window.scrollTo(0, {offset});")
        reached_bottom = offset > new_height
        offset += 5
        if reached_bottom:
            break

Example Usage:

# Example usage: open a page with Chrome and scroll all the way down.
service = Service("chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.get("https://docs.scrapy.org/en/latest/intro/tutorial.html")
pageBottom(driver) #<---Go to Bottom
Beal answered 9/10, 2022 at 15:4 Comment(1)
This worked well for me today. ThanksThirtieth
R
2

Here is a different approach that worked for me involving scrolling into view of the last search result and waiting for additional elements to load before scrolling again:

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements(object):
    """Expected condition that is truthy when the number of elements matched
    by `locator` reaches `count` or more."""

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            return len(EC._find_elements(driver, self.locator)) >= self.count
        except StaleElementReferenceException:
            # The DOM changed mid-lookup; report "condition not met yet".
            return False


driver = webdriver.Firefox()

# Search parameters: three possible departure airports, Madrid as destination.
dep_airport = ['BTS', 'BRU', 'PAR']
arr_airport = 'MAD'
dep_date = '2015-07-15'
arr_date = '2015-08-15'

# The site expects multiple airports joined with a URL-encoded space (%20).
airports_string = str(r'%20').join(dep_airport)
dep_airport = airports_string

url = "https://www.pelikan.sk/sk/flights/list?dfc=C%s&dtc=C%s&rfc=C%s&rtc=C%s&dd=%s&rd=%s&px=1000&ns=0&prc=&rng=1&rbd=0&ct=0" % (dep_airport, arr_airport, arr_airport, dep_airport, dep_date, arr_date)
driver.maximize_window()
driver.get(url)

# Wait for both loading spinners to disappear before touching the results.
wait = WebDriverWait(driver, 60)
wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[contains(@src, "loading")]')))
wait.until(EC.invisibility_of_element_located((By.XPATH,
                                               u'//div[. = "Poprosíme o trpezlivosť, hľadáme pre Vás ešte viac letov"]/preceding-sibling::img')))

# Scroll the last known result into view, then block until the result count
# grows — repeating until the caller decides to stop (Python 2 print below).
while True:  # TODO: make the endless loop end
    results = driver.find_elements_by_css_selector("div.flightbox")
    print "Results count: %d" % len(results)

    # scroll to the last element
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])

    # wait for more results to load
    wait.until(wait_for_more_than_n_elements((By.CSS_SELECTOR, 'div.flightbox'), len(results)))

Notes:

  • you would need to figure out when to stop the loop - for example, at a particular len(results) value
  • wait_for_more_than_n_elements is a custom Expected Condition which helps to identify when the next portion is loaded and we can scroll again
Rozanna answered 19/6, 2015 at 15:50 Comment(7)
I'm afraid it does not work. It returns 10 in loop and when I tried to put this: for result in results: print result.text I found out that it returns the same values.Castello
@Milan well, I see the results count increasing with each iteration of the loop which means additional results are loading. Extract the results once you end the loop.Rozanna
To check whether it is finding new results, I'm adding results into the set and in each loop printing length of the set. It stays on 15. Here you can find the code and the results printed: pastebin.com/fkUrCvAmCastello
@Milan interesting, thanks. Do you have the same results if Chrome would be used instead? Also, which selenium and firefox versions are you using? Thanks.Rozanna
It raises error, I've installed this chromedriver chromedriver.storage.googleapis.com/index.html?path=2.15. I also tried PhantomJS. Here you can find outputs from 3 drivers: pastebin.com/9gRYbEh4Castello
@Milan let's skip trying in other browsers and focus on firefox. What firefox and selenium versions are you using? Thanks.Rozanna
Selenium is probably 2.46.0 version and I can't find out which version of Firefox but it should be the newest.Castello
E
1
@Test
public void clickMeButton() {
   // Locate the button before scrolling so we fail fast if it is absent.
   WebElement clickMeButton = driver.findElement(By.name("et_builder_submit_button"));

   // softly scroll to the element group
   JavascriptExecutor js = (JavascriptExecutor) driver;

   // Scroll down in small 7-pixel steps so the movement is smooth
   // rather than a single jump to position 800.
   for (int i = 0; i < 800; i += 7) {
      js.executeScript("window.scrollTo(0, " + i + ")");
   }

   System.out.println("selected button");
   clickMeButton.click();
   System.out.println("clicked");
}
Erythro answered 25/9, 2021 at 9:17 Comment(0)
A
1

Assuming the page is being lazy loaded, jumping to points iteratively with a window.scrollTo() could possibly leave out elements that fall out of the range. Also, assuming the height of the page was dynamic pertaining to the content loaded, items that haven't been loaded yet could be left out. So I opted for window.scrollBy() instead:

# Scroll down in 20-pixel increments, re-reading the page height after each
# step so scroll-triggered (lazy-loaded) content extends the loop.
height = driver.execute_script("return document.body.scrollHeight")
i = 0
# BUG FIX: the original `for i in range(height)` materialized the range once,
# so re-assigning `height` inside the loop never extended the iteration.
# A while-loop re-checks the (possibly grown) height on every step.
while i < height:
    driver.execute_script('window.scrollBy(0,20)') # scroll by 20 on each iteration
    height = driver.execute_script("return document.body.scrollHeight") # reset height to the new height after scroll-triggered elements have been loaded.
    i += 1

In my case, a new batch of elements was loaded after scrolling to the 30th, giving the page a new height I needed to account for.

Abiding answered 27/9, 2021 at 9:4 Comment(0)
J
0
from selenium import webdriver
import time

# Open a page and scroll down 100 pixels at a time with a short pause,
# so all lazily rendered elements become visible.
driver = webdriver.Chrome()
driver.get("https://en.wikipedia.org")
# BUG FIX: the original called `browser.execute_script(...)`, but the
# WebDriver variable is named `driver` — `browser` raised a NameError.
height = driver.execute_script("return document.body.scrollHeight")
for scrol in range(100, height, 100):
    driver.execute_script(f"window.scrollTo(0,{scrol})")
    time.sleep(0.1)  # brief pause so each scroll step can render/load

It worked for me. If you want to scroll the page to the end so that all the page elements become visible, this may be valuable for you. If you want to increase the scroll speed, just change the step from 100 to 200.

Jana answered 1/3, 2021 at 3:31 Comment(0)
I
0

In Python Selenium, get Y position of your element, and then slowly scroll down.

# Ask the browser for the element's Y coordinate, then scroll down to it
# in 100-pixel steps instead of one jump.
y = driver.execute_script("return document.querySelector('YOUR-CSS-SELECTOR').getBoundingClientRect()['y']")
for position in range(0, int(y), 100):
    driver.execute_script(f"window.scrollTo(0, {position});")
Intumesce answered 24/6, 2021 at 8:53 Comment(0)
L
0

Found a very easy (maybe too easy) solution for my project:

# Scrolling each link into view while iterating keeps the viewport moving
# down the page, which also triggers lazy loading along the way.
links = driver.find_elements("xpath", "//a[@href]")
for link in links:
    # Scroll to the target div element
    driver.execute_script("arguments[0].scrollIntoView();", link)

Insert this into the loop you use to get through the page (link in links in my code), and it will scroll the current div into view as you loop on down.

Lozada answered 8/7, 2023 at 21:27 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.