1. Anuncie Aqui ! Entre em contato fdantas@4each.com.br

[Python] Getting Index out of range error while trying to scrape product code from a website...

Discussão em 'Python' iniciado por Stack, Outubro 4, 2024 às 04:02.

  1. Stack

    Stack Membro Participativo

    Issue Description:

    I am trying to automate a process where I can visit a website and scrape product details of top 100 products on that page and put it in an excel file.

    Code Explanation:

    I have a class WebScraper whose scrape method calls two functions. First it calls the scroll_and_click_view_more function, which simply scrolls down the web page being visited. Then it calls the prod_vitals function, which extracts product codes and product names from that page.

    Error Description:

    Whenever I run the code below with a given maximum number of products, it gets stuck at some point and throws an "Index out of range" error. If I set max_count_of_products=50, the code got stuck partway through the list; if I set max_count_of_products=100, it got stuck at item 93. There is no fixed index at which it fails — if I change the value of max_count_of_products, the point at which the code gets stuck also changes.

    I am attaching screenshots of the error below.

    max_count_of_products=50

    [​IMG]

    max_count_of_products=100

    [​IMG]

    Please find my code below:

# Collected product summaries (not referenced in this snippet -- presumably
# populated elsewhere in the full script; TODO confirm against the caller).
products_summary = []
# Upper bound on how many product tiles to scrape from the listing page.
max_count_of_products=100

    def scroll_and_click_view_more(driver,href):
    flag=False
    last_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
    while True:
    try:
    driver.execute_script("window.scrollBy(0, 800);")
    time.sleep(4)
    new_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
    try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.product-tile')))
    except Exception as e:
    if new_height == last_height and flag==False:
    print("Reached the end of the page and no product tiles were found: ",href)
    return "No product tiles found"
    else:
    last_height = new_height
    continue
    div_count = 0
    flag=True
    response = driver.page_source
    soup = BeautifulSoup(response, 'html.parser')
    div_elements = soup.find_all('div', class_ = 'product-tile')
    div_count = len(div_elements)
    if(div_count > max_count_of_products):
    return(driver.page_source)
    else:
    driver.execute_script("window.scrollBy(0, 300);")
    time.sleep(3)
    new_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
    #print(new_height)
    if new_height == last_height:
    return(driver.page_source)
    else:
    last_height = new_height
    except Exception as e:
    print(e)
    break

    def prod_vitals(soup,title,url):
    count_of_items=1
    products_data = [] # Array to store all product data for our excel sheet
    for div in soup.find_all('div', class_ = 'product-tile'): # Iterate over each individual product-tile div tag
    if count_of_items<=max_count_of_products:
    count_of_items = count_of_items+1;
    pro_code = div.select('div.css-1fg6eq7 img')[0]['id']
    pro_name = div.select('div.product-name a.css-avqw6d p.css-1d5mpur')[0].get_text()
    products_data.append({'Product Code': pro_code, 'Product Name': pro_name}) # Append the extracted data to the list
    print("Count: ", count_of_items)
    print("Product Code: ",pro_code)
    print("Product Name: ",pro_name)
    print("\n")
    else:
    break
    time.sleep(5)

    class WebScraper:
    def __init__(self):
    self.url = "https://staging1-japan.coach.com/shop/new/women/?auto=true"
    options = Options()
    options.add_argument("--remote-debugging-port=9222")
    self.driver = webdriver.Chrome(service=Service(r"c:\Users\DELL\Documents\Self_Project\chromedriver.exe"), options=options)

    def scrape(self):
    self.driver.get(self.url)
    time.sleep(5)
    soup = BeautifulSoup(self.driver.page_source, 'html.parser') # Refresh the page source and parse it
    response = scroll_and_click_view_more(self.driver, 'Link')
    time.sleep(3)
    if response != "No product tiles found" and response != "Reached the end of the page.":
    soup = BeautifulSoup(response, 'html.parser')
    prod_vitals(soup,'TITLE', self.url)
    time.sleep(2)
    else:
    self.driver.execute_script("window.scrollTo(0,0);")
    time.sleep(3)
    self.driver.close()
    scraper = WebScraper()
    scraper.scrape()
    time.sleep(5)
    scraper.driver.quit()


    > Attaching the product div structure below:

    <div class="css-0">
    <div class="css-1fg6eq7">
    <div tabindex="-1" style="padding-top: 125%;"></div>
    <img width="237.01" height="296" class="chakra-image css-14ql1gk" src="https://coach.scene7.com/is/image/Coach/cn731_b4ous_a0?$desktopProductTile$" fetchpriority="high" id="CN731 B4OUS" name="タビー 12" data-qa="cm_tile_link_pt_img" contain="none" alt="COACH®,タビー 12,ボディバッグ&amp;斜めがけバッグ,トゥルー ピンク">
    </div>
    </div>

    Continue reading...

Compartilhe esta Página