Web scraping – Keyword search – Walmart

Product search using keywords. Returns title, price, shipping, and sold-by for each result.

# %%
import pandas as pd
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup as soup
# import csv

# %%
def get_url(search_text):
    """Generate a url from search text"""
    template = 'https://www.walmart.com/search?q={}'
    search_term = search_text.replace(' ', '+')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'
        
    return url
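
# quick check of the generated template (illustrative, not executed here):
# get_url('LG OLED TV') -> 'https://www.walmart.com/search?q=LG+OLED+TV&page={}'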


# %%
def extract_record(item):
    """Extract title, price, shipping, and seller from a product page."""
    try:
        title = item.find('h1', {'class':'f3 b lh-copy dark-gray mt1 mb2'}).text.strip()
        price = item.find('span', {'class':'b lh-copy dark-gray mr2 f1'}).text.strip()
        shipping = item.find('div', {'class':'f6 bw0-xl b--near-white ml3 mr3 mid-gray ml0-m mr0-m'}).text.strip()
        sold_by = item.find('div', {'class':'f6 ml0-xl mid-gray mt3 bw0-xl b--near-white mb3'}).text.strip()
    except AttributeError:
        # any missing tag blanks the whole record
        title = ''
        price = ''
        shipping = ''
        sold_by = ''

    return (title, price, shipping, sold_by)
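
# %%
# Hedged alternative (not in the original): Walmart's CSS class names change
# often, so a per-field helper that returns '' for a missing tag degrades more
# gracefully than one try/except around the whole record.
def find_text(obj, tag, cls):
    """Return the stripped text of the first matching tag, or '' if absent."""
    node = obj.find(tag, {'class': cls})
    return node.text.strip() if node else ''

# e.g. title = find_text(prd_bsobj, 'h1', 'f3 b lh-copy dark-gray mt1 mb2')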

# %%
def main(search_terms):

    records = []

    for search_term in search_terms:
        url = get_url(search_term)

        for page in range(1, 2):  # only page 1; widen the range for more pages
            page_url = url.format(page)
            headers = {
                'authority': 'www.walmart.com',
                'accept': 'application/json',
                'wm_client_ip': '',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
                'content-type': 'application/json',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': page_url,
                'accept-language': 'en-US,en;q=0.9',
                # Walmart tends to reject anonymous requests; paste a fresh
                # cookie string copied from your own browser session here
                'cookie': '',
            }
            
            r = requests.get(page_url, headers=headers)
            bsobj = soup(r.content, 'html.parser')
            atags = bsobj.findAll('a', {'class':'absolute w-100 h-100 z-1'})

            for atag in atags:
                # product links may be relative or absolute
                if atag.get('href')[:1] == '/':
                    p_link = 'https://www.walmart.com' + atag.get('href')
                else:
                    p_link = atag.get('href')

                # reuse the same headers, pointing the referer at the product page
                headers['referer'] = p_link
    
                session = HTMLSession()
                r = session.get(p_link, headers=headers)
                prd_bsobj = soup(r.content, 'html.parser')

                # one record per product page
                record = extract_record(prd_bsobj)
                if record:
                    records.append(record)

    df = pd.DataFrame(records, columns=['Title', 'Price', 'Shipping', 'Sold_by'])

    # alternative: write one CSV per search term instead
    # with open(search_term + '.csv', 'w', newline='', encoding='utf-8') as f:
    #     writer = csv.writer(f)
    #     writer.writerow(['Search', 'Title', 'Price', 'Shipping', 'Sold_by'])
    #     writer.writerows(records)
    df.to_csv('result.csv', index=False)


# %%
search_terms = {'LG OLED TV', 'Samsung QLED TV'}

# using CSV file
# file = 'search.csv'
# search_terms = pd.read_csv(file)

main(search_terms)





Web scraping – Amazon ASIN search


Chrome driver is required to run this code, and there are a couple of hard-coded file paths; change them to match your machine. The code reads a CSV file of ASINs and crawls each one.

Download link – https://chromedriver.chromium.org/downloads
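
If you are on Selenium 4+, the chromedriver path goes through a Service object rather than the first positional argument. A minimal sketch, assuming chromedriver.exe sits next to the script:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service("chromedriver.exe"), options=options)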

# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob

# %%
def get_url(search_ASIN):
    """Generate a url from search text"""
    template = 'https://www.amazon.com/dp/{}'
    ASIN = search_ASIN.replace(' ', '')
    
    # add term query to url
    url = template.format(ASIN)  
    # print(url) 
       
    return url
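
# e.g. get_url('B07YGZ7C1K') -> 'https://www.amazon.com/dp/B07YGZ7C1K'
# (illustrative; the replace() call also strips stray spaces from the input)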

# %%
def main(search_ASIN):

    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    driver = webdriver.Chrome("chromedriver.exe", options=options)

    url = get_url(search_ASIN)

    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # results = soup.find_all('div', {'class':'centerColAlign'})

    # the whole product page yields one record
    records = []
    record = extract_records(soup, search_ASIN)
    if record:
        records.append(record)
    
    driver.quit()
    
    to_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'
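    # create the results folder on first run (assumption: it may not exist yet)
    os.makedirs(to_path, exist_ok=True)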


    with open(to_path + search_ASIN + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)


# %%
def extract_records(item, search_ASIN):
    try:
        title = item.find('span', {'id':'productTitle'}).text.strip()
        rating = item.find('span', {'class':'a-icon-alt'}).text.strip()[:3]
        review_count = item.find('span', {'id':'acrCustomerReviewText'}).text.strip()[:-8]
        price_whole = item.find('span', {'class':'a-price-whole'}).text.strip()
        price_fraction = item.find('span', {'class':'a-price-fraction'}).text.strip()
    except AttributeError:
        title = ''
        rating = ''
        review_count = ''
        price_whole = ''
        price_fraction = ''

    # 'a-price-whole' usually ends with the decimal point, so simple
    # concatenation yields e.g. '249.' + '99' -> '249.99'
    price = price_whole + price_fraction
    
    result = (search_ASIN, rating, review_count, price, title)

    return result

# %%
def combine(from_path):
    joined_files = os.path.join(from_path, '*.csv')

    joined_list = glob.glob(joined_files)

    df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
    for f in joined_list:
        os.remove(f)

    df.to_csv(from_path + 'result.csv', index=False)


    

# %%
def read_asin(file):
    """Read ASINs from a one-column CSV and return them as a list."""
    df = pd.read_csv(file, names=['asin'])
    return df['asin'].tolist()



# %%
file = 'asin.csv'
search_ASINs = read_asin(file)

for search_ASIN in search_ASINs:
    main(search_ASIN)

from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

combine(from_path)






Web scraping – Amazon keyword search

Chrome driver is required to run this code, and there are a couple of hard-coded file paths; change them to match your machine. The code reads a CSV file of keywords and crawls the first two pages for each keyword. You can increase the number of pages.

Download link – https://chromedriver.chromium.org/downloads

# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob

# %%
def get_url(search_text):
    """Generate a url from search text"""
    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    search_term = search_text.replace(' ', '+')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'
        
    return url
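
# e.g. get_url('ultrawide monitor').format(2) ->
# 'https://www.amazon.com/s?k=ultrawide+monitor&ref=nb_sb_noss_1&page=2'
# (illustrative, not executed in the notebook)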

# %%
def extract_records(item, search_term, asin):
    try:
        title = item.find('span', {'id':'productTitle'}).text.strip()
        rating = item.find('span', {'class':'a-icon-alt'}).text.strip()[:3]
        review_count = item.find('span', {'id':'acrCustomerReviewText'}).text.strip()[:-8]
        price_whole = item.find('span', {'class':'a-price-whole'}).text.strip()
        price_fraction = item.find('span', {'class':'a-price-fraction'}).text.strip()
    except AttributeError:
        title = ''
        rating = ''
        review_count = ''
        price_whole = ''
        price_fraction = ''

    price = price_whole + price_fraction

    result = (search_term, asin, rating, review_count, price, title)

    return result

# %%
def main(search_term):

    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    driver = webdriver.Chrome("chromedriver.exe", options=options)

    url = get_url(search_term)
   
    records = []

    for page in range(1, 3):  # first two pages; widen the range for more

        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
    
        for item in results:
            atag = item.h2.a
            url_product = 'https://www.amazon.com' + atag.get('href')
            driver.get(url_product)
            soup_products = BeautifulSoup(driver.page_source, 'html.parser')
            
            # the ASIN follows 'dp/' (or the URL-encoded 'dp%2F') in the product URL
            if 'dp%' in url_product:
                x = url_product.find('dp%')
                asin = url_product[x+4:x+14]
            else:
                x = url_product.find('dp/')
                asin = url_product[x+3:x+13]
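            # hedged alternative (needs `import re`; not in the original script):
            # a single regex covers both the plain and URL-encoded forms
            # m = re.search(r'dp(?:/|%2F)([A-Z0-9]{10})', url_product)
            # asin = m.group(1) if m else ''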
    
            # one record per product page
            record = extract_records(soup_products, search_term, asin)
            if record:
                records.append(record)

    driver.quit()

    to_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

    with open(to_path + search_term +'.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Search', 'ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)



# %%
def combine(from_path):
    joined_files = os.path.join(from_path, '*.csv')

    joined_list = glob.glob(joined_files)

    df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
    for f in joined_list:
        os.remove(f)

    df.to_csv(from_path + 'result.csv', index=False)

# %%
def read_search_terms(file):
    """Read search terms from a one-column CSV and return them as a list."""
    df = pd.read_csv(file, names=['search'])
    return df['search'].tolist()


# %%
file = 'search.csv'
search_terms = read_search_terms(file)
for search_term in search_terms:
    main(search_term)

from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

combine(from_path)


Python – Web scraping Amazon Product Review

This code collects product review data from Amazon.com.

It writes a CSV file with review title, reviewer, review date, rating, product size, verified-purchase flag, and review text. You will need to find the Amazon Standard Identification Numbers (ASINs) by searching for your products on the Amazon web site.

Finding ASINs in Amazon.com

Type a product name in the search bar and click the link of the product you are looking for to open its product page. The ASIN can be found in the URL address bar; it usually comes after ‘/dp/’.
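
For example, you can sanity-check an ASIN by splitting a pasted product URL in Python (illustrative snippet; the URL below is made up):

url = 'https://www.amazon.com/Some-Product-Name/dp/B07YGZ7C1K/ref=sr_1_3'
asin = url.split('/dp/')[1].split('/')[0]
print(asin)  # B07YGZ7C1K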

I used ‘ultrawide curved monitor’ in this example and added two ASINs to the code. You can add as many ASINs as you need, separated by commas.

Python code for product review

# import libraries
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
# define url
def get_url(ASIN):
    
    template = 'https://www.amazon.com/product-reviews/{}/ref=cm_cr_getr_d_paging_btm_next_3?sortBy=recent'
    url = template.format(ASIN)
    url += '&pageNumber={}'
        
    return url
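# e.g. get_url('B07YGZ7C1K').format(2) points at page 2 of the most-recent
# reviews for that ASIN (illustrative)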
# Extract and return data from a single record
def extract_record(item):
    # reviewer name comes first in the review row; the 3-character star
    # rating (e.g. '5.0') follows it
    profile = item.find('span', 'a-profile-name').text.strip()
    rating = item.find('div', 'a-row').text.strip()[len(profile):len(profile)+3]
    title = item.find('a','a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold').text.strip()
    
    try:
        product_size = item.find('a','a-size-mini a-link-normal a-color-secondary').text.strip()
        verified_purchase = item.find('span','a-size-mini a-color-state a-text-bold').text.strip()
    except AttributeError:
        product_size = ''
        verified_purchase = ''
    review = item.find('span', 'a-size-base review-text review-text-content').text.strip()
    review_date = item.find('span', 'a-size-base a-color-secondary review-date').text.strip()
    # e.g. 'Reviewed in the United States on June 1, 2020' -> 'June 1, 2020'
    date = review_date[review_date.index('on')+3:]

        
    result = (title, profile, date, rating, product_size, verified_purchase, review)
    
    return result
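
# Hedged alternative (assumption: Amazon's review markup still exposes
# data-hook attributes, which tend to outlive the long class chains):
# rating = item.find('i', {'data-hook': 'review-star-rating'}).text.strip()[:3]
# review = item.find('span', {'data-hook': 'review-body'}).text.strip()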
# Run main program routine
def main(ASIN):
    
    # startup the webdriver
    options = Options()
    # open chrome in incognito mode
    options.add_argument("--incognito")

    # this option will run a code without opening chrome
    #options.add_argument("--headless")
    
    # my chrome driver location. change it accordingly
    driver = webdriver.Chrome("C:\\webdrivers\\chromedriver.exe", options=options)
    
    
    records = []
    url = get_url(ASIN)
    
    for page in range(1, 11):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'class': 'a-section review aok-relative'})
        for item in results:
            record = extract_record(item)
            if record:
                records.append(record)
   
   
    driver.quit()
    
    # save data to csv file
    with open(ASIN +'.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Profile', 'Date', 'Rating', 'ProductSize', 'Verified_Purchase', 'Review'])
        writer.writerows(records)
# Run program for entered ASINs. Multiple ASINs can be run with comma separator

ASINs = {'B07YGZ7C1K','B0812DKDD9'}
for ASIN in ASINs:
    main(ASIN)

The idea came from an Amazon price-tracking tutorial on the YouTube channel Izzy Analytics. Please visit the Izzy Analytics channel for the original posting: https://www.youtube.com/channel/UCWHJGQc7Vqo37dlUmpiK6hQ.