Web scraping – Amazon keyword search

A Chrome driver is required to run this code, and there are a couple of hard-coded file paths — change them to suit your machine. The script reads a CSV file of keywords and crawls the first two result pages for each keyword; you can increase the number of pages.

Download link – https://chromedriver.chromium.org/downloads

# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob

# %%
def get_url(search_text):
    """Build an Amazon search URL template from free-text input.

    Parameters
    ----------
    search_text : str
        The raw search phrase; spaces are converted to ``+``.

    Returns
    -------
    str
        A URL ending in ``&page={}`` — call ``.format(page_number)`` on it.
    """
    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    search_term = search_text.replace(' ', '+')

    # add term query to url
    url = template.format(search_term)

    # Bug fix: the page parameter needs '=' — the original produced '&page2',
    # which Amazon ignores, so every "page" fetched identical results.
    url += '&page={}'

    return url

# %%
def extract_records(item, search_term=None, asin=None):
    """Extract one product record from a parsed product page.

    Parameters
    ----------
    item : bs4 element (anything with a compatible ``find(tag, attrs)``)
        The parsed product-page soup.
    search_term, asin : str, optional
        Explicit values; when omitted, fall back to the module-level globals
        set by ``main`` (keeps the original call sites working).

    Returns
    -------
    tuple
        (search_term, asin, rating, review_count, price, title); any field
        that cannot be found comes back as ''.
    """
    # Backward-compatible fallback to the globals the original code relied on.
    if search_term is None:
        search_term = globals().get('search_term', '')
    if asin is None:
        asin = globals().get('asin', '')

    def _text(tag, attrs):
        # Per-field lookup. The original wrapped all five lookups in one
        # try/except AttributeError, so a single missing field blanked every
        # field — including ones that had already been found.
        node = item.find(tag, attrs)
        return node.text.strip() if node is not None else ''

    title = _text('span', {'id': 'productTitle'})
    rating = _text('span', {'class': 'a-icon-alt'})[:3]          # e.g. '4.5' from '4.5 out of 5 stars'
    review_count = _text('span', {'id': 'acrCustomerReviewText'})[:-8]  # drop trailing ' ratings'
    price_whole = _text('span', {'class': 'a-price-whole'})
    price_fraction = _text('span', {'class': 'a-price-fraction'})

    # NOTE(review): assumes 'a-price-whole' text includes the trailing '.'
    # (e.g. '19.'), so plain concatenation yields '19.99' — TODO confirm.
    price = price_whole + price_fraction

    return (search_term, asin, rating, review_count, price, title)

# %%
def main(search_term):
    """Crawl the first two Amazon result pages for `search_term`, visit each
    product page, and write one CSV of records per search term.

    Side effects: launches a headless Chrome via Selenium, sets the module
    globals ``url_product`` / ``asin`` (read by ``extract_records``), and
    writes ``<search_term>.csv`` into the hard-coded results folder.
    """
    global url_product
    global asin

    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    # Selenium 3 style: first positional argument is the driver executable path.
    driver = webdriver.Chrome("chromedriver.exe", options=options)
    # (Removed the dead `options = Options()` that followed driver creation.)

    url = get_url(search_term)
    records = []

    try:
        # Bug fix: range(1, 2) visited only page 1; the header text promises
        # the first two pages per keyword.
        for page in range(1, 3):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                atag = item.h2.a
                url_product = 'https://www.amazon.com' + atag.get('href')
                driver.get(url_product)
                soup_products = BeautifulSoup(driver.page_source, 'html.parser')

                # ASIN is the 10 characters after 'dp%..' (URL-encoded slash)
                # or 'dp/'. Bug fix: the original tested `find(...) >= 2`,
                # which misclassifies a match at index 0 or 1.
                x = url_product.find('dp%')
                if x != -1:
                    # NOTE(review): original offset x+4 kept — assumes the
                    # encoding makes the ASIN start 4 chars in; TODO confirm
                    # (for 'dp%2F' the ASIN would start at x+5).
                    asin = url_product[x + 4:x + 14]
                else:
                    x = url_product.find('dp/')
                    asin = url_product[x + 3:x + 13] if x != -1 else ''

                # Bug fix: the original iterated `for item in soup_products:`
                # (the soup's raw top-level children), producing one mostly
                # empty record per child. One product page yields one record.
                record = extract_records(soup_products)
                if record:
                    records.append(record)
    finally:
        # Bug fix: `driver.close` without () was a no-op attribute access, so
        # the browser leaked; and it sat inside the loop, which would have
        # broken subsequent page loads had it actually run.
        driver.quit()

    to_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

    with open(to_path + search_term + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Search', 'ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)
# %%
def combine(from_path):
    """Merge every CSV in `from_path` into a single ``result.csv``.

    The per-keyword CSVs are deleted after they are read; does nothing when
    the folder contains no CSVs (``pd.concat`` would raise on an empty list).

    Parameters
    ----------
    from_path : str
        Directory containing the per-keyword CSV files.
    """
    # '*.csv' (with the dot) rather than '*csv', and os.path.join so
    # from_path need not end with a path separator.
    part_files = glob.glob(os.path.join(from_path, '*.csv'))
    if not part_files:
        return

    df = pd.concat(map(pd.read_csv, part_files), ignore_index=True)

    for f in part_files:
        os.remove(f)

    # index=False: the row index is meaningless here and the original wrote
    # it out as a spurious unnamed first column.
    df.to_csv(os.path.join(from_path, 'result.csv'), index=False)

# %%
def read_asin(file):
    """Read search terms from a headerless one-column CSV.

    Bug fix: the original assigned to a *local* ``search_terms`` and returned
    nothing (the commented-out ``global`` shows the author hit this), so the
    caller never received the list. Returning it is backward compatible.

    Parameters
    ----------
    file : str
        Path to the CSV of search keywords, one per line.

    Returns
    -------
    list[str]
        The search terms in file order.
    """
    df = pd.read_csv(file, names=['search'])
    return df['search'].tolist()

# %% [markdown]
# file='search.csv'
# df = pd.read_csv(file, names=['search'])
# search_terms = df['search'].tolist()
# search_terms

# %%
# Driver cell: load the keyword list, crawl each term, then merge the CSVs.
file = 'search.csv'

# Bug fix: `search_terms` was never defined at module level (read_asin was
# never called, and it only assigned a local), so this loop raised NameError.
# Read the keyword list inline.
search_terms = pd.read_csv(file, names=['search'])['search'].tolist()

for search_term in search_terms:
    main(search_term)

from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

combine(from_path)