Web scraping – Amazon ASIN search


Chrome driver is required to run this code, and there are a couple of places with file paths. Change the paths appropriately for your machine. The code reads a CSV file of ASINs and crawls the product page for each ASIN.

Download link – https://chromedriver.chromium.org/downloads

# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob

# %%
def get_url(search_ASIN):
    """Build the Amazon product-detail URL for the given ASIN.

    Any spaces in the input are stripped before substitution.
    """
    template = 'https://www.amazon.com/dp/{}'
    cleaned = search_ASIN.replace(' ', '')
    return template.format(cleaned)

# %%
def main(search_ASIN):
    """Crawl one Amazon product page and write its data to <ASIN>.csv.

    Opens a headless incognito Chrome session, loads the detail page for
    `search_ASIN`, extracts a single record, and writes it under `to_path`.
    """
    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    # NOTE(review): chromedriver.exe must be in the working directory / PATH.
    driver = webdriver.Chrome("chromedriver.exe", options=options)

    url = get_url(search_ASIN)

    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # BUG FIX: quit() (not close()) also terminates the chromedriver
        # process instead of just closing the window.
        driver.quit()

    # BUG FIX: the original iterated `for item in soup:`, which walks the
    # top-level document nodes; plain-text nodes fed into extract_records
    # produced garbage/empty rows. One product page holds one record, so
    # parse the soup exactly once.
    records = []
    record = extract_records(soup)
    if record:
        records.append(record)

    # Change this path appropriately on your machine.
    to_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

    with open(to_path + search_ASIN + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # BUG FIX: header typo 'Prince' -> 'Price'.
        writer.writerow(['ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)


# %%
def extract_records(item):
    """Pull title, rating, review count and price out of a parsed page.

    Relies on the module-level `search_ASIN` set by the calling loop.
    Returns (search_ASIN, rating, review_count, price, title); every field
    is the empty string when any expected element is missing.
    """
    try:
        title = item.find('span', {'id': 'productTitle'}).text.strip()
        rating = item.find('span', {'class': 'a-icon-alt'}).text.strip()[:3]
        review_count = item.find('span', {'id': 'acrCustomerReviewText'}).text.strip()[:-8]
        price_whole = item.find('span', {'class': 'a-price-whole'}).text.strip()
        price_praction = item.find('span', {'class': 'a-price-fraction'}).text.strip()
    except AttributeError:
        # Any missing element blanks every field (matches the page-not-found
        # behaviour of the original).
        title = rating = review_count = price_whole = price_praction = ''

    price = price_whole + price_praction
    return (search_ASIN, rating, review_count, price, title)

# %%
def combine(from_path):
    """Merge every CSV in `from_path` into a single result.csv.

    The per-search CSVs are deleted only after a successful concat; the
    combined frame is written to result.csv in the same folder.
    """
    # BUG FIX: the original pattern '*csv' (no dot) also matches any file
    # whose name merely ends in "csv"; match real .csv files only.
    joined_files = os.path.join(from_path, '*.csv')

    joined_list = glob.glob(joined_files)

    df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
    for f in joined_list:
        os.remove(f)

    # index=False: the row index is meaningless here and the original wrote
    # it out as an unnamed extra column.
    df.to_csv(os.path.join(from_path, 'result.csv'), index=False)


    

# %%
def read_asin(file):
    """Read a one-column CSV of ASINs and return them as a list.

    BUG FIX: the original stored the list in a *local* variable, so the
    module-level `search_ASINs` the driver loop iterates was never defined
    (NameError at runtime). Publish it as a global and also return it.
    """
    global search_ASINs
    df = pd.read_csv(file, names=['asin'])
    search_ASINs = df['asin'].tolist()
    return search_ASINs



# %%
file = 'asin.csv'

# BUG FIX: the original called read_asin(file), but that function stored the
# ASIN list in a local variable, so `search_ASINs` was never defined at module
# level and the loop below raised NameError. Read the list inline instead.
search_ASINs = pd.read_csv(file, names=['asin'])['asin'].tolist()

# Crawl one product page per ASIN; each writes its own <ASIN>.csv.
for search_ASIN in search_ASINs:
    main(search_ASIN)

# Change this path appropriately on your machine.
from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

combine(from_path)






Web scraping – Amazon keyword search

Chrome driver is required to run this code, and there are a couple of places with file paths. Change the paths appropriately for your machine. The code reads a CSV file of keywords and crawls the first 2 result pages for each keyword. You can increase the number of pages.

Download link – https://chromedriver.chromium.org/downloads

# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob

# %%
def get_url(search_text):
    """Generate an Amazon search URL (with a page placeholder) from text.

    Returns a URL ending in '&page={}' so callers can .format(page_number).
    """
    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    search_term = search_text.replace(' ', '+')

    # add term query to url
    url = template.format(search_term)

    # add page query placeholder
    # BUG FIX: the original appended '&page{}' (missing '='), producing e.g.
    # '&page2', which Amazon ignores -- every "page" re-fetched page 1.
    url += '&page={}'

    return url

# %%
def extract_records(item):
    """Pull title, rating, review count and price from a parsed product page.

    Relies on the module-level `search_term` and `asin` set by main().
    Returns (search_term, asin, rating, review_count, price, title); all
    scraped fields are the empty string when any expected element is missing.
    """
    try:
        title = item.find('span', {'id': 'productTitle'}).text.strip()
        rating = item.find('span', {'class': 'a-icon-alt'}).text.strip()[:3]
        review_count = item.find('span', {'id': 'acrCustomerReviewText'}).text.strip()[:-8]
        price_whole = item.find('span', {'class': 'a-price-whole'}).text.strip()
        price_praction = item.find('span', {'class': 'a-price-fraction'}).text.strip()
    except AttributeError:
        # a single missing element blanks every field
        title = rating = review_count = price_whole = price_praction = ''

    price = price_whole + price_praction

    return (search_term, asin, rating, review_count, price, title)

# %%
def main(search_term):
    """Search Amazon for `search_term`, visit each result's product page,
    and write the extracted records to <search_term>.csv.

    Sets the module-level `url_product` and `asin` consumed by
    extract_records().
    """
    global url_product
    global asin

    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    # NOTE(review): chromedriver.exe must be in the working directory / PATH.
    driver = webdriver.Chrome("chromedriver.exe", options=options)

    url = get_url(search_term)

    records = []

    try:
        # Crawl the first 2 result pages (upper bound is exclusive).
        # BUG FIX: the original used range(1, 2) and only ever fetched page 1,
        # although the write-up promises 2 pages per keyword.
        for page in range(1, 3):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                atag = item.h2.a
                url_product = 'https://www.amazon.com' + atag.get('href')
                driver.get(url_product)
                soup_products = BeautifulSoup(driver.page_source, 'html.parser')

                # Pull the ASIN out of the product url ('dp%' for
                # percent-encoded urls, 'dp/' otherwise).
                if url_product.find('dp%') >= 2:
                    x = url_product.find('dp%')
                    asin = url_product[x + 4:x + 14]
                else:
                    x = url_product.find('dp/')
                    asin = url_product[x + 3:x + 13]

                # BUG FIX: the original looped `for item in soup_products:`,
                # shadowing `item` and feeding raw document nodes into the
                # extractor; a product page holds exactly one record.
                record = extract_records(soup_products)
                if record:
                    records.append(record)
    finally:
        # BUG FIX: the original had `driver.close` (no parentheses -- a no-op)
        # inside the loop; quit once here so the chromedriver process ends.
        driver.quit()

    # Change this path appropriately on your machine.
    to_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

    with open(to_path + search_term + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Search', 'ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)



# %%
def combine(from_path):
    """Merge every CSV in `from_path` into a single result.csv.

    The per-search CSVs are deleted only after a successful concat; the
    combined frame is written to result.csv in the same folder.
    """
    # BUG FIX: the original pattern '*csv' (no dot) also matches any file
    # whose name merely ends in "csv"; match real .csv files only.
    joined_files = os.path.join(from_path, '*.csv')

    joined_list = glob.glob(joined_files)

    df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
    for f in joined_list:
        os.remove(f)

    # index=False: the row index is meaningless here and the original wrote
    # it out as an unnamed extra column.
    df.to_csv(os.path.join(from_path, 'result.csv'), index=False)

# %%
def read_asin(file):
    """Read a one-column CSV of search terms and return them as a list.

    BUG FIX: the original stored the list in a *local* variable, so the
    module-level `search_terms` the driver loop iterates was never defined.
    Publish it as a global and also return it.
    """
    global search_terms
    df = pd.read_csv(file, names=['search'])
    search_terms = df['search'].tolist()
    return search_terms

# %% [markdown]
# file='search.csv'
# df = pd.read_csv(file, names=['search'])
# search_terms = df['search'].tolist()
# search_terms

# %%
file = 'search.csv'

# BUG FIX: the original iterated `search_terms` without ever defining it
# (read_asin was never called here, and it only set a local anyway), which
# raised NameError. Read the search terms inline instead.
search_terms = pd.read_csv(file, names=['search'])['search'].tolist()

# Crawl each search term; each writes its own <term>.csv.
for search_term in search_terms:
    main(search_term)

# Change this path appropriately on your machine.
from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

combine(from_path)


Executable stand-alone Amazon Price Tracking tool using Python with tkinter library

Creates a stand-alone executable tool to scrape Amazon prices.

Please send me your email address and public IP if you are interested in the tool. The customizations are also available (a different store, save the search result to a database, schedule to send a report as an email attachment, etc)

Requirement:

  1. Create ‘webscraptool.py’, and copy & paste ‘Main Code’ below (Assuming you have python installed in your machine)
  2. Download and save chromedriver under the ./driver/ folder. Create the ‘driver’ folder in the same folder where you saved ‘webscraptool.py’. (chrome driver download: https://chromedriver.chromium.org/downloads)
  3. To create stand-alone ‘webscraptool.exe’, install ‘pyinstaller’ (pip install pyinstaller)
  4. Run the code below in command line. This will create the stand-alone ‘.exe’ file.
 pyinstaller -F -w --add-binary "./driver/chromedriver.exe;./driver" webscraptool.py

Main Code:

from tkinter import *
import tkinter.ttk as ttk
from tkinter import filedialog
import tkinter.messagebox as msgbox
import tkinter.font as font
import os
import time

from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

# web scrap main code block
def resource_path(relative_path):
    """Resolve a resource path both when frozen by PyInstaller and when run
    as a plain script.

    PyInstaller unpacks bundled files under sys._MEIPASS; when not frozen,
    fall back to the current working directory.
    """
    # BUG FIX: `sys` was never imported in this script -- the resulting
    # NameError was silently swallowed by `except Exception` below, which
    # happened to work but masked the real frozen-vs-script check.
    import sys

    try:
        base_path = sys._MEIPASS
    except Exception:
        base_path = os.path.abspath(".")

    return os.path.join(base_path, relative_path)

def get_url(search_text):
    """Generate an Amazon search URL (with a page placeholder) from text.

    Returns a URL ending in '&page={}' so callers can .format(page_number).
    """
    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    search_term = search_text.replace(' ', '+')

    # add term query to url
    url = template.format(search_term)

    # add page query placeholder
    # BUG FIX: the original appended '&page{}' (missing '='), producing e.g.
    # '&page2', which Amazon ignores -- every "page" re-fetched page 1.
    url += '&page={}'

    return url

def extract_record(item):
    """Extract (description, price, rating, review_count, url) from one
    search-result tile; returns None when the tile has no price."""

    # description and url both come from the result's title link
    try:
        link = item.h2.a
        description = link.text.strip()
        url = 'https://www.amazon.com' + link.get('href')
    except AttributeError:
        description = ''
        url = ''

    # product price -- a result without a price is skipped entirely
    try:
        price = item.find('span', 'a-price').find('span', 'a-offscreen').text
    except AttributeError:
        return

    # rating and review count are optional
    try:
        rating = item.i.text
        review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
    except AttributeError:
        rating = ''
        review_count = ''

    return (description, price, rating, review_count, url)

def main(search_term):
    """Run main program routine: crawl the first 10 result pages for
    `search_term` and save the extracted records to <search_term>.csv."""
    # start a headless incognito Chrome session via the bundled driver
    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    driver = webdriver.Chrome(resource_path("./driver/chromedriver.exe"), options=options)

    url = get_url(search_term)
    records = []

    # walk result pages 1..10, collecting one record per search-result tile
    for page in range(1, 11):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for item in soup.find_all('div', {'data-component-type': 's-search-result'}):
            record = extract_record(item)
            if record:
                records.append(record)

    driver.close()

    # save data to csv file
    with open(search_term + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
        writer.writerows(records)

# GUI tkinter codes begin

# --- GUI construction (tkinter) ---

# Main window with a bold title label.
root = Tk()
root.title("Web Scrapper")
#root.geometry("640x480")
myFont=font.Font(size=12, weight="bold")
label=Label(root, text="Amazon Price Tracker")
label['font']=myFont
label.pack(side="top", padx=5, pady=5, ipady=3)

# search text frame

frm_search = LabelFrame(root, text="Search")
frm_search.pack(fill="x", padx=5, pady=5, ipady=3)

# Entry box where the user types a product search term; pre-filled with a
# placeholder prompt.
e = Entry(frm_search, width=30)
e.pack(fill="x", padx=5, pady=5, ipady=5)
e.insert(0, "Enter products to search")

def add_txt():
    """Append the entry box's current text to the product list."""
    term = e.get()
    search_list.insert(END, term)

def remove_txt():
    """Delete every selected row from the product list (back to front, so
    earlier indices stay valid while deleting)."""
    selected = search_list.curselection()
    for index in reversed(selected):
        search_list.delete(index)

# BUG FIX: `datetime` was never imported in this script, so these two
# assignments raised NameError at startup. `x` and `y` are also never used
# afterwards -- the import is added so the script can start; consider simply
# deleting these lines.
import datetime

x = datetime.datetime(2021, 1, 15)
y = datetime.datetime.now()

def start():
    """Run the scraper for every listed product, updating the progress bar."""
    if search_list.size() == 0:
        msgbox.showwarning("Warning", "Add products to search")
        return

    search_terms = search_list.get(0, END)
    total = len(search_terms)

    for idx, search_term in enumerate(search_terms):
        main(search_term)

        # reflect the completed fraction in the progress bar
        p_var.set((idx + 1) / total * 100)
        progressbar.update()

# add/remove search text frame
frm_search1 = LabelFrame(root, text="Add/Remove Products")
frm_search1.pack(fill="x", padx=5, pady=5, ipady=3)

# Buttons wired to add_txt/remove_txt defined above.
btn_remove = Button(frm_search1, text="Remove", padx=5, pady=5, width=12, command=remove_txt)
btn_add = Button(frm_search1, text="Add", padx=5, pady=5, width=12, command=add_txt)

btn_remove.pack(side="right", padx=5, pady=5)
btn_add.pack(side="right", padx=5, pady=5)

#list frame
frm_list = Frame(root)
frm_list.pack(fill="both", padx=5, pady=5)


# Scrollable listbox holding the queued search terms; start() reads it.
scrollbar = Scrollbar(frm_list)
scrollbar.pack(side="right", fill="y")

search_list = Listbox(frm_list, selectmode="extended", height=10, yscrollcommand=scrollbar.set)
search_list.pack(side="left", fill="both", expand=True)
scrollbar.config(command=search_list.yview)

# progress bar frame
frm_progress = LabelFrame(root, text="Progress")
frm_progress.pack(fill="x", padx=5, pady=5, ipady=3)

# p_var (0..100) is driven by start() as terms complete.
p_var = DoubleVar()
progressbar = ttk.Progressbar(frm_progress, maximum=100, length=150, variable=p_var)
progressbar.pack(fill="x", padx=5, pady=5)



# run/exit button frame
frm_run = LabelFrame(root)
frm_run.pack(fill="x", padx=5, pady=5)

btn_exit = Button(frm_run, text="Exit", padx=5, pady=5, width=12, command=root.quit)
btn_start = Button(frm_run, text="Start", padx=5, pady=5, width=12, command=start)

btn_exit.pack(side="right", padx=5, pady=5)
btn_start.pack(side="right", padx=5, pady=5)

# Enter the tkinter event loop; blocks until the window is closed.
root.mainloop()

Python – Web scraping AMAZON Price Tracking

This code returns a csv file with product name, price, rating, review counts, and URL based on search terms entered. Multiple search terms can be entered with comma separator.

# import libraries
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
# define URL using template
def get_url(search_text):
    """Generate an Amazon search URL (with a page placeholder) from text.

    Returns a URL ending in '&page={}' so callers can .format(page_number).
    """
    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    search_term = search_text.replace(' ', '+')
    url = template.format(search_term)
    # BUG FIX: the original appended '&page{}' (missing '='), producing e.g.
    # '&page2', which Amazon ignores -- every "page" re-fetched page 1.
    url += '&page={}'

    return url
# Extract and return data from a single record
def extract_record(item):
    """Extract (description, price, rating, review_count, url) from one
    search-result tile; returns None when the tile carries no price."""
    # description and url both come from the result's title link
    try:
        link = item.h2.a
        description = link.text.strip()
        url = 'https://www.amazon.com' + link.get('href')
    except AttributeError:
        description = ''
        url = ''

    # a result without a price is skipped entirely
    try:
        price = item.find('span', 'a-price').find('span', 'a-offscreen').text
    except AttributeError:
        return

    # rating and review count are optional
    try:
        rating = item.i.text
        review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
    except AttributeError:
        rating = ''
        review_count = ''

    return (description, price, rating, review_count, url)
# Extract and return data from a single record
def main(search_term):
    """Crawl the first 10 result pages for `search_term` and write the
    extracted records to <search_term>.csv."""
    options = Options()
    # open chrome with incognito option
    options.add_argument("--incognito")
    # by turning this on, code will run with chrome browser hidden
    #options.add_argument("--headless")

    # change chrome driver path accordingly
    driver = webdriver.Chrome("C:\\webdrivers\\chromedriver.exe", options=options)

    url = get_url(search_term)
    records = []

    # walk result pages 1..10, collecting one record per search-result tile
    for page in range(1, 11):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for item in soup.find_all('div', {'data-component-type': 's-search-result'}):
            record = extract_record(item)
            if record:
                records.append(record)

    driver.close()

    # save data to csv file
    with open(search_term + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
        writer.writerows(records)
# run program
# Set literal de-duplicates terms; each term produces its own "<term>.csv"
# in the working directory.
search_terms = {'ultrawide curved monitor', 'gaming monitor curved'}
for search_term in search_terms:
    main(search_term)

This was originally from the YouTube channel Izzy Analytics. I have another post for product view data, which uses the same concept. Please visit the Izzy Analytics channel for the original posting: https://www.youtube.com/channel/UCWHJGQc7Vqo37dlUmpiK6hQ.