This code produces a CSV file containing the product name, price, rating, review count, and URL for each search term entered. Multiple search terms can be supplied, separated by commas.
# import libraries
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
# define URL using template
def get_url(search_text):
    """Build the Amazon search URL template for a search phrase.

    Args:
        search_text: Free-text search phrase, e.g. ``'gaming monitor'``.

    Returns:
        The search URL with a single ``{}`` placeholder left in place of the
        page number; callers fill it in with ``url.format(page)``.
    """
    # Imported locally so the function does not depend on the file header.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    # quote_plus still maps spaces to '+', and additionally escapes characters
    # such as '&', '#', '+' and braces that would otherwise corrupt the query
    # string or collide with the '{}' page placeholder below.
    search_term = quote_plus(search_text)
    url = template.format(search_term)
    # The parameter needs '=': the previous '&page{}' expanded to '&page2',
    # which is not a page-number parameter.
    url += '&page={}'
    return url
# Extract and return data from a single record
def extract_record(item):
    """Extract the fields of one search-result entry.

    Args:
        item: One parsed search-result element. It is accessed through
            ``item.h2.a``, ``item.i`` and ``item.find(...)``.

    Returns:
        A ``(description, price, rating, review_count, url)`` tuple of
        strings, or ``None`` when the entry has no price. Description/URL and
        rating/review count fall back to empty strings when missing.
    """
    # Description and link both come from the anchor inside the <h2> heading.
    try:
        atag = item.h2.a
        description = atag.text.strip()
        href = atag.get('href')
    except AttributeError:
        description = ''
        href = None
    # An anchor without an href yields None; concatenating it would raise a
    # TypeError that the AttributeError handler above cannot catch.
    url = 'https://www.amazon.com' + href if href is not None else ''

    # Entries without a price are skipped entirely.
    try:
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # Rating and review count are treated as a pair: if either lookup fails,
    # both are reported as empty.
    try:
        rating = item.i.text
        review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
    except AttributeError:
        rating = ''
        review_count = ''

    return (description, price, rating, review_count, url)
# Scrape the result pages for one search term and save the records to a CSV file
def main(search_term):
    """Scrape result pages 1-10 for a search term and write them to a CSV.

    Args:
        search_term: Search phrase. It is also used as the base name of the
            output file, ``<search_term>.csv``, written relative to the
            current working directory.
    """
    # Build the URL template before starting the browser so that a failure
    # here cannot leave a browser process behind.
    url = get_url(search_term)
    options = Options()
    # open chrome with incognito option
    options.add_argument("--incognito")
    # by turning this on, code will run with chrome browser hidden
    #options.add_argument("--headless")
    # change chrome driver path accordingly
    # NOTE(review): passing the driver path positionally is the older Selenium
    # calling convention; newer releases expect a Service object - confirm
    # against the installed Selenium version.
    driver = webdriver.Chrome("C:\\webdrivers\\chromedriver.exe", options=options)
    records = []
    try:
        for page in range(1, 11):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            for item in results:
                record = extract_record(item)
                # extract_record returns None for entries without a price.
                if record:
                    records.append(record)
    finally:
        # Always shut the browser down, even when a page load or parse fails;
        # quit() ends the whole driver session rather than just the window.
        driver.quit()
    # save data to csv file
    with open(search_term + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
        writer.writerows(records)
# run program
# Search phrases to scrape; each one produces its own CSV file via main().
search_terms = {
    'ultrawide curved monitor',
    'gaming monitor curved',
}
for search_term in search_terms:
    main(search_term)
This was originally from the YouTube channel Izzy Analytics. I have another post on product view data that uses the same concept. Please visit the Izzy Analytics channel for the original posting: https://www.youtube.com/channel/UCWHJGQc7Vqo37dlUmpiK6hQ
