Web scraping – Amazon ASIN search


ChromeDriver is required to run this code. The script also contains a couple of hard-coded file paths; change them to match your machine. The code reads a CSV file of ASINs and crawls the Amazon product page for each ASIN.

Download link – https://chromedriver.chromium.org/downloads

# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob

# %%
def get_url(search_ASIN):
    """Build the Amazon product-page URL for an ASIN.

    Space characters inside ``search_ASIN`` are dropped before the value is
    substituted into the ``/dp/`` URL template.
    """
    compact_asin = ''.join(search_ASIN.split(' '))
    return 'https://www.amazon.com/dp/{}'.format(compact_asin)

# %%
def main(search_ASIN, to_path='C:\\code\\PythonScripts\\web_scraping\\results\\'):
    """Scrape one Amazon product page and save the result as ``<ASIN>.csv``.

    A headless, incognito Chrome session loads the product page for
    ``search_ASIN``, the page is parsed with BeautifulSoup, and the extracted
    record is written (with a header row) to ``to_path + search_ASIN + '.csv'``.

    Args:
        search_ASIN: The ASIN to look up; also used as the output file name.
        to_path: Output directory. It is concatenated directly with the file
            name, so it must end with a path separator.
    """
    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    # NOTE(review): newer Selenium releases expect the driver path via a
    # Service object rather than a positional argument -- confirm against the
    # installed Selenium version.
    driver = webdriver.Chrome("chromedriver.exe", options=options)

    try:
        driver.get(get_url(search_ASIN))
        page_source = driver.page_source
    finally:
        # Always release the browser, even when the page load fails;
        # otherwise a Chrome instance is left behind for every failed ASIN.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # Search the whole document once. Iterating over ``soup`` would hand the
    # top-level nodes (doctype, stray strings, <html>) to extract_records one
    # by one instead of the page as a whole.
    record = extract_records(soup)
    records = [record] if record else []

    with open(to_path + search_ASIN + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)


# %%
def _span_text(item, attrs):
    """Return the stripped text of the first matching <span>, or '' if absent."""
    node = item.find('span', attrs)
    return node.text.strip() if node is not None else ''


def extract_records(item, asin=None):
    """Extract ``(ASIN, rating, review count, price, title)`` from a parsed page.

    Each field is looked up independently, so one missing element (for
    example a product without reviews) leaves only that field empty instead
    of blanking the whole record.

    Args:
        item: Parsed page or tag exposing ``find(name, attrs)``.
        asin: ASIN to store in the record. When omitted, the module-level
            ``search_ASIN`` variable is used, as callers that pass only
            ``item`` rely on it.

    Returns:
        A 5-tuple of strings ``(asin, rating, review_count, price, title)``;
        fields that are not found on the page are empty strings.
    """
    if asin is None:
        # Backward-compatible fallback to the loop variable set by the script.
        asin = search_ASIN

    title = _span_text(item, {'id': 'productTitle'})

    # The first three characters of the rating text are kept
    # (presumably e.g. "4.5" from "4.5 out of 5 stars" -- verify on a live page).
    rating = _span_text(item, {'class': 'a-icon-alt'})[:3]

    # Keep only the leading token of the review text ("1,234 ratings" ->
    # "1,234"). Cutting a fixed eight characters would turn "1 rating" into ''.
    review_count = _span_text(item, {'id': 'acrCustomerReviewText'}).partition(' ')[0]

    price_whole = _span_text(item, {'class': 'a-price-whole'})
    price_fraction = _span_text(item, {'class': 'a-price-fraction'})
    # A fraction without its whole part would be misleading, so report no price.
    price = price_whole + price_fraction if price_whole else ''

    return (asin, rating, review_count, price, title)

# %%
def combine(from_path):
    """Merge the per-ASIN CSV files in ``from_path`` into one ``result.csv``.

    Every ``*.csv`` file in the directory except an existing ``result.csv``
    is read with pandas and concatenated. The merged table is written to
    ``result.csv`` in the same directory, and only after that write has
    succeeded are the input files deleted. If there are no input files the
    function does nothing.

    Args:
        from_path: Directory holding the CSV files, with or without a
            trailing path separator.
    """
    result_file = os.path.join(from_path, 'result.csv')
    result_abs = os.path.abspath(result_file)

    # Skip a result.csv left by an earlier run so it is neither merged back
    # into itself nor deleted after being rewritten. Sorting makes the row
    # order independent of the file-system listing order.
    csv_files = sorted(
        path
        for path in glob.glob(os.path.join(from_path, '*.csv'))
        if os.path.abspath(path) != result_abs
    )
    if not csv_files:
        # pd.concat raises on an empty sequence; nothing to merge.
        return

    df = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
    df.to_csv(result_file)

    # Remove the inputs last, so a failed read or write never loses data.
    for path in csv_files:
        os.remove(path)


    

# %%
def read_asin(file):
    """Read the ASINs to crawl from a header-less CSV file.

    The file is read with a single column named ``asin``. Values are read as
    strings so that all-digit ASINs keep their leading zeros and remain usable
    with string methods downstream.

    Args:
        file: Path to the CSV file, one ASIN per line.

    Returns:
        A list of ASIN strings with surrounding whitespace removed.
    """
    df = pd.read_csv(file, names=['asin'], dtype=str)
    return df['asin'].dropna().str.strip().tolist()



# %%
if __name__ == "__main__":
    file = 'asin.csv'
    search_ASINs = read_asin(file)

    # search_ASIN is deliberately a module-level loop variable:
    # extract_records reads it as a global when building each record.
    for search_ASIN in search_ASINs:
        main(search_ASIN)

    from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

    combine(from_path)






Leave a Reply

Discover more from this and that @ work

Subscribe now to keep reading and get access to the full archive.

Continue reading