# Chrome driver is required to run this script. There are a couple of
# hard-coded file paths below — change them to match your machine. The
# script reads a CSV file of ASINs and crawls one product page per ASIN.
# Download link: https://chromedriver.chromium.org/downloads
# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob
# %%
def get_url(search_ASIN):
    """Build the Amazon product-detail URL for the given ASIN.

    Any spaces in the input are stripped before substitution.
    """
    cleaned = search_ASIN.replace(' ', '')
    return f'https://www.amazon.com/dp/{cleaned}'
# %%
def main(search_ASIN):
    """Scrape one Amazon product page by ASIN and write its details to a CSV.

    Opens a headless incognito Chrome, loads the product page, extracts the
    record, and writes <results-dir>/<ASIN>.csv with a header row.
    """
    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    # NOTE(review): the positional executable path is deprecated in Selenium 4 —
    # prefer Service("chromedriver.exe"); kept for the installed driver version.
    driver = webdriver.Chrome("chromedriver.exe", options=options)
    try:
        driver.get(get_url(search_ASIN))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # quit() also shuts down the chromedriver process; close() only
        # closes the window, leaking the driver if an exception occurred.
        driver.quit()
    records = []
    # Extract once from the whole parsed document. The old loop iterated
    # soup's top-level children, emitting one mostly-empty row per node.
    record = extract_records(soup)
    if record:
        records.append(record)
    to_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'
    with open(to_path + search_ASIN + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # header typo fixed: 'Prince' -> 'Price'
        writer.writerow(['ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)
# %%
def extract_records(item, asin=None):
    """Pull title, rating, review count and price out of a parsed product page.

    item: a BeautifulSoup node (anything exposing ``.find``).
    asin: ASIN for the output row; defaults to the module-level
          ``search_ASIN`` global set by the driver loop (original behavior).
    Returns a tuple ``(asin, rating, review_count, price, title)``; all
    fields are '' when any expected element is missing.
    """
    if asin is None:
        # backward-compatible fallback: old callers relied on the global
        asin = search_ASIN
    try:
        title = item.find('span', {'id': 'productTitle'}).text.strip()
        # rating text looks like '4.5 out of 5 stars' -> keep '4.5'
        rating = item.find('span', {'class': 'a-icon-alt'}).text.strip()[:3]
        # drop the trailing ' ratings' suffix (8 characters)
        review_count = item.find('span', {'id': 'acrCustomerReviewText'}).text.strip()[:-8]
        price_whole = item.find('span', {'class': 'a-price-whole'}).text.strip()
        price_fraction = item.find('span', {'class': 'a-price-fraction'}).text.strip()
    except AttributeError:
        # a missing element means .find returned None -> emit an empty record
        title = rating = review_count = price_whole = price_fraction = ''
    price = price_whole + price_fraction
    return (asin, rating, review_count, price, title)
# %%
def combine(from_path):
    """Concatenate every CSV in *from_path* into result.csv, deleting the inputs.

    from_path: directory containing the per-ASIN CSVs (callers pass it with a
    trailing separator). Does nothing if no CSV files are present.
    """
    # was '*csv', which also matched names like 'xcsv'
    joined_files = os.path.join(from_path, '*.csv')
    joined_list = glob.glob(joined_files)
    if not joined_list:
        # pd.concat raises ValueError on an empty sequence
        return
    df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
    for f in joined_list:
        os.remove(f)
    # index=False: the row index is meaningless here and would otherwise be
    # written as an unnamed extra column
    df.to_csv(os.path.join(from_path, 'result.csv'), index=False)
# %%
def read_asin(file):
    """Read a headerless one-column CSV of ASINs and return them as a list.

    The original stored the result in a local variable (the ``global`` line
    was commented out), so callers hit a NameError; returning the list fixes
    that while staying backward-compatible.
    """
    df = pd.read_csv(file, names=['asin'])
    return df['asin'].tolist()
# %%
file = 'asin.csv'
# read_asin returns the ASIN list; the original relied on a commented-out
# `global`, leaving search_ASINs undefined here (NameError).
search_ASINs = read_asin(file)
for search_ASIN in search_ASINs:
    main(search_ASIN)
from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'
combine(from_path)