Chrome driver is required to run this code. There are a couple of places with hard-coded file paths; change the paths appropriately for your machine. The code reads a CSV file of keywords and crawls the first 2 pages for each keyword. You can increase the number of pages.
Download link – https://chromedriver.chromium.org/downloads
# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob
# %%
def get_url(search_text):
    """Build an Amazon search URL template from free-text input.

    Returns a URL containing a ``{}`` placeholder for the page number
    (e.g. ``...&page={}``) — call ``url.format(page)`` to get a concrete
    page URL.
    """
    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    # Amazon expects '+' between words in the k= query parameter.
    search_term = search_text.replace(' ', '+')
    # add term query to url
    url = template.format(search_term)
    # Fix: the page parameter needs '=' ('&page=2'); the original emitted
    # '&page2', which Amazon ignores, so every page fetched page 1.
    url += '&page={}'
    return url
# %%
def extract_records(item):
    """Extract one product record from a parsed product-page soup.

    Relies on the module-level globals ``search_term`` and ``asin`` that
    ``main`` sets before calling this — NOTE(review): consider passing
    them as parameters instead of using globals.

    Returns a tuple ``(search_term, asin, rating, review_count, price,
    title)``; all scraped fields are empty strings when any element is
    missing from the page.
    """
    try:
        title = item.find('span', {'id': 'productTitle'}).text.strip()
        # e.g. "4.5 out of 5 stars" -> keep just "4.5"
        rating = item.find('span', {'class': 'a-icon-alt'}).text.strip()[:3]
        # e.g. "1,234 ratings" -> drop the trailing " ratings" (8 chars)
        review_count = item.find('span', {'id': 'acrCustomerReviewText'}).text.strip()[:-8]
        price_whole = item.find('span', {'class': 'a-price-whole'}).text.strip()
        price_fraction = item.find('span', {'class': 'a-price-fraction'}).text.strip()
    except AttributeError:
        # Any missing element blanks the whole record.
        title = ''
        rating = ''
        review_count = ''
        price_whole = ''
        price_fraction = ''
    # Fix: join whole and fractional parts with exactly one decimal point.
    # The whole-part span sometimes already ends with '.'; the original
    # plain concatenation produced e.g. "2999" when it did not.
    if price_whole and price_fraction:
        price = price_whole.rstrip('.') + '.' + price_fraction
    else:
        price = price_whole + price_fraction
    result = (search_term, asin, rating, review_count, price, title)
    return result
# %%
def main(search_term, pages=2,
         to_path='C:\\code\\PythonScripts\\web_scraping\\results\\'):
    """Scrape Amazon search results for *search_term* and save them to CSV.

    Crawls the first *pages* result pages (default 2, as documented at the
    top of the file; the original ``range(1, 2)`` only ever crawled page 1),
    visits every product link, extracts one record per product, and writes
    ``<search_term>.csv`` into *to_path*.
    """
    global url_product
    global asin
    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    # NOTE(review): the positional executable path is Selenium-3 style;
    # requires chromedriver.exe in the working directory.
    driver = webdriver.Chrome("chromedriver.exe", options=options)
    url = get_url(search_term)
    records = []
    try:
        for page in range(1, pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            for item in results:
                atag = item.h2.a
                url_product = 'https://www.amazon.com' + atag.get('href')
                driver.get(url_product)
                soup_products = BeautifulSoup(driver.page_source, 'html.parser')
                # The 10-char ASIN appears in the product URL either
                # URL-encoded ("dp%2F<ASIN>") or as a plain "dp/<ASIN>".
                if url_product.find('dp%') >= 2:
                    x = url_product.find('dp%')
                    asin = url_product[x + 4:x + 14]
                else:
                    x = url_product.find('dp/')
                    asin = url_product[x + 3:x + 13]
                # Extract one record from the whole product page. (The
                # original iterated the soup's top-level children, which
                # calls extract_records on NavigableStrings and could
                # append duplicate records.)
                record = extract_records(soup_products)
                if record:
                    records.append(record)
    finally:
        # Fix: the original `driver.close` lacked parentheses, so the
        # browser was never closed; quit() also ends the driver process.
        driver.quit()
    with open(to_path + search_term + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Search', 'ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)
# %%
def combine(from_path):
    """Merge every per-keyword CSV in *from_path* into one ``result.csv``.

    Reads all ``*.csv`` files in the directory (skipping any ``result.csv``
    left over from a previous run), concatenates them, writes the merged
    ``result.csv`` into the same directory, and only then deletes the
    source files — so a failed write cannot lose the scraped data (the
    original deleted the sources before writing the result).
    """
    result_path = os.path.join(from_path, 'result.csv')
    # Fix: the original pattern '*csv' also matched files merely ending in
    # "csv"; and string concatenation relied on a trailing path separator.
    pattern = os.path.join(from_path, '*.csv')
    source_files = [f for f in glob.glob(pattern)
                    if os.path.abspath(f) != os.path.abspath(result_path)]
    df = pd.concat(map(pd.read_csv, source_files), ignore_index=True)
    # index=False: the per-keyword files carry no index column, so the
    # merged file should not gain a spurious unnamed one.
    df.to_csv(result_path, index=False)
    for f in source_files:
        os.remove(f)
# %%
def read_asin(file):
    """Read a one-column CSV of search keywords and return them as a list.

    Fix: the original stored the list in a local variable and returned
    nothing (its ``global`` workaround was commented out), so callers
    could never obtain the keywords.
    """
    df = pd.read_csv(file, names=['search'])
    return df['search'].tolist()
# %% [markdown]
# file='search.csv'
# df = pd.read_csv(file, names=['search'])
# search_terms = df['search'].tolist()
# search_terms
# %%
# Entry point: read the keyword list, scrape each term, then merge the
# per-term CSVs into a single result file.
file = 'search.csv'
# Fix: `search_terms` was never defined at module scope (read_asin was
# never called and kept its result local), so the loop below raised
# NameError. Read the keywords inline here.
search_terms = pd.read_csv(file, names=['search'])['search'].tolist()
for search_term in search_terms:
    main(search_term)
from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'
combine(from_path)