python - Why doesn't pagination work in this case using Selenium? - Stack Overflow

Most websites display data across multiple pages. This is done to improve user experience and reduce loading times. But when I wanted to automate the data extraction process using Selenium, I noticed that my script only retrieves information from page one and then stops. What am I doing wrong?

from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import undetected_chromedriver as uc

url = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"

# Handle elements that may not be currently visible
def etext(e):
    """Extracts text from an element, handling visibility issues."""
    if e:
        if t := e.text.strip():
            return t
        if (p := e.get_property("textContent")) and isinstance(p, str):
            return p.strip()
    return ""

driver = uc.Chrome()

# Initialize result list to store data
result = []

with Chrome() as driver:
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    
    while True:
        # Wait for the main content to load
        sel = By.CSS_SELECTOR, "div[data-testid=result-item]"
        houses = wait.until(EC.presence_of_all_elements_located(sel))
        
        # Extract and store data from the current page
        for house in houses:
            try:
                item = {
                    "address": etext(house.find_element(By.CSS_SELECTOR, "h2")),
                    "DateLast_sold": etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7"))
                }
                result.append(item)
            except Exception as e:
                print(f"Error extracting address or date: {e}")
        
        # Check for "Next" button and move to the next page
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, '#main-content div._12n2exy2 nav div._14xj7k72')
            next_button.click()
            wait.until(EC.staleness_of(houses[0]))  # Wait for the new page to load
        except Exception as e:
            print("No more pages to scrape or error:", e)
            break  # Stop if no more pages

# Convert results to a DataFrame and display
df = pd.DataFrame(result)
print(df)
asked Feb 2 at 11:11 by Chioma Okoroafor; edited Feb 3 at 13:03
  • Looking at the web page, there are multiple buttons with the "_14xj7k74" class (namely the page number buttons: "1", "2", "3", etc.). Maybe try with XPath or another class? – David Commented Feb 2 at 16:13 (see the XPath sketch after these comments)
  • This is a follow-up to your previous question. For the page you're trying to scrape, your best option (in principle) would be to identify the "Next" button and then click that. However, if you do that too often you'll get blocked by Cloudflare even if you choose to install and use undetected_chromedriver – Adon Bilivit Commented Feb 3 at 7:47
  • Running your code, I got blocked by Cloudflare. Make sure you are not getting blocked by Cloudflare. As @AdonBilivit mentioned, you can use undetected_chromedriver. – user29264006 Commented Feb 3 at 12:37
  • Thank you @AdonBilivit for the help on the previous question. Well, I have modified my code using undetected_chromedriver, which has been installed and imported as uc. I also inspected the CSS selector of the "Next" button, all of which has been implemented in the code above, but it still scrapes only the first page. Can you help me look at the code? I don't know what I am doing wrong. – Chioma Okoroafor Commented Feb 3 at 12:47
  • @ChiomaOkoroafor After next_button.click() do you see the page change in the browser? Have you considered that you might need to "click through" the cookie accept/deny popup? Also, I should reiterate that even if you get the paging strategy implemented correctly, you will eventually get blocked by Cloudflare. In my testing, I get a variable number of pages until Cloudflare intervenes - usually in the range 20-25 whereas there are actually around 40 pages – Adon Bilivit Commented Feb 3 at 12:58
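As David's comment suggests, the generated class names ("_14xj7k72", "_14xj7k74") are shared by several pagination buttons, so matching on the visible "Next" text is usually more robust than matching on a class. The sketch below is illustrative only: the XPath is an assumption about the markup, not a selector verified against the live page.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Hypothetical text-based alternative to the generated class names:
# find an anchor inside the pagination <nav> whose visible text is "Next".
def click_next_by_text(driver, timeout=5):
    wait = WebDriverWait(driver, timeout)
    # normalize-space() guards against stray whitespace around the label
    sel = By.XPATH, "//nav//a[.//text()[normalize-space()='Next']]"
    wait.until(EC.element_to_be_clickable(sel)).click()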

1 Answer


Different websites often require bespoke strategies to scrape them with any level of success.

This site is protected by Cloudflare. When Cloudflare detects too many automated invocations it will intervene and present a page that requires you to prove that you're not a robot. In this case, the number of pages you can scrape before this happens is variable, although it seems to be anywhere between 20 and 30, which is unfortunate because there are ~40 pages available.

The code below handles the cookie prompt (if it appears) and then tries to get as many addresses as possible. You should be able to adapt this to your specific needs.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from undetected_chromedriver import Chrome
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from typing import cast
from collections.abc import Iterator

URL = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"
TIMEOUT = 5

# get text from webelement that may not be visible
def etext(e: WebElement) -> str:
    if e:
        if t := e.text.strip():
            return t
        if (p := e.get_property("textContent")) and isinstance(p, str):
            return p.strip()
    return ""

# click the WebElement
def click(driver: WebDriver, e: WebElement) -> None:
    ActionChains(driver).click(e).perform()

# get all WebElements that match the given css
def get_all(driver: WebDriver, css: str) -> Iterator[WebElement]:
    wait = WebDriverWait(driver, TIMEOUT)
    ec = EC.presence_of_all_elements_located
    sel = By.CSS_SELECTOR, css
    try:
        yield from wait.until(ec(sel))
    except TimeoutException:
        pass

# look for the Next button and click it
def click_next(driver: WebDriver) -> None:
    for a in get_all(driver, "a[aria-live=polite] > div > div:nth-child(2)"):
        if etext(a) == "Next":
            click(driver, a)
            break

# locate the shadow root hosting the Usercentrics cookie-consent banner
def get_shadow_root(driver: WebDriver) -> WebDriver:
    wait = WebDriverWait(driver, TIMEOUT)
    ec = EC.presence_of_element_located
    sel = By.ID, "usercentrics-root"
    sre = wait.until(ec(sel))
    return cast(WebDriver, sre.shadow_root)

# you may be required to accept or decline cookies
# ignore any exceptions that may arise
def click_through(driver: WebDriver) -> None:
    try:
        wait = WebDriverWait(get_shadow_root(driver), TIMEOUT)
        ec = EC.element_to_be_clickable
        sel = By.CSS_SELECTOR, "button[data-testid=uc-deny-all-button]"
        button = wait.until(ec(sel))
        click(driver, button)
    except Exception:
        pass

if __name__ == "__main__":
    with Chrome() as driver:
        driver.get(URL)
        click_through(driver)
        prev_url = ""
        npages = 0
        # if and when Cloudflare intervenes, the current URL does not change
        while prev_url != driver.current_url:
            prev_url = driver.current_url
            for h2 in get_all(driver, "div[data-testid=result-item] h2"):
                print(etext(h2))
            click_next(driver)
            npages += 1
        print(f"Processed {npages=}")