Web scraping – this and that @ work

Web scraping – Keyword search – Walmart

Searches Walmart for products by keyword. For each product it returns the title, price, shipping information, and seller ("Sold by").

# %%
import pandas as pd
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup as soup
# import csv

# %%
def get_url(search_text):
    """Build the Walmart search URL template for a keyword query.

    The search text is URL-encoded (spaces become ``+``; characters such as
    ``&`` or ``#`` are percent-escaped) so it cannot corrupt the query string.

    Args:
        search_text: Free-text keywords, e.g. ``'LG OLED TV'``.

    Returns:
        A URL string ending in ``&page={}``; call ``.format(page_number)`` on
        it to obtain the URL of a specific results page.
    """
    from urllib.parse import quote_plus  # local import keeps this cell self-contained

    # quote_plus also escapes '{' and '}', so the later ``.format(page)`` call
    # can only ever fill the page placeholder appended below.
    search_term = quote_plus(search_text)

    return 'https://www.walmart.com/search?q={}'.format(search_term) + '&page={}'


# %%
def extract_record(item):
    """Extract (title, price, shipping, sold_by) from a parsed product page.

    Args:
        item: A parsed product page (or element) exposing a
            BeautifulSoup-style ``find(name, attrs)`` method. Text nodes are
            ``str`` instances and carry no product data.

    Returns:
        A 4-tuple of strings ``(title, price, shipping, sold_by)`` in which a
        field that is absent from the page is ``''``, or ``None`` when
        ``item`` is a text node.
    """
    # Iterating a parsed document also yields doctype/whitespace nodes; they
    # are strings, so skip them instead of producing a duplicate record.
    if isinstance(item, str):
        return None

    def text_of(tag_name, css_class):
        # Look each field up on its own so one missing element does not
        # blank out the fields that are present.
        node = item.find(tag_name, {'class': css_class})
        return node.text.strip() if node is not None else ''

    title = text_of('h1', 'f3 b lh-copy dark-gray mt1 mb2')
    price = text_of('span', 'b lh-copy dark-gray mr2 f1')
    shipping = text_of('div', 'f6 bw0-xl b--near-white ml3 mr3 mid-gray ml0-m mr0-m')
    sold_by = text_of('div', 'f6 ml0-xl mid-gray mt3 bw0-xl b--near-white mb3')

    return (title, price, shipping, sold_by)

# %%
def main(search_terms, pages=1, output_path='result.csv'):
    """Scrape Walmart product pages for each search term and save them as CSV.

    For every term, the first ``pages`` search-result pages are fetched, each
    product link found on them is followed, and one record per product page
    is collected through ``extract_record``. All records from all terms are
    written to a single CSV file with the columns Title, Price, Shipping and
    Sold_by.

    The request cookie is read from the ``WALMART_COOKIE`` environment
    variable instead of being hard-coded: a browser session cookie expires
    and contains a session token that should not live in source code. When
    the variable is unset, no cookie header is sent.

    Args:
        search_terms: Iterable of keyword strings.
        pages: Number of search-result pages to crawl per term (default 1).
        output_path: Destination CSV file (default ``'result.csv'``).
    """
    import os

    # Kept so that an extract_record() implementation that reads the current
    # product page from this global keeps working.
    global prd_bsobj

    cookie = os.environ.get('WALMART_COOKIE', '')

    def build_headers(referer):
        # One place for the request headers used by both the search-page and
        # the product-page requests; only the referer differs.
        headers = {
            'authority': 'www.walmart.com',
            'accept': 'application/json',
            'wm_client_ip': '',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'content-type': 'application/json',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': referer,
            'accept-language': 'en-US,en;q=0.9,ko-KR;q=0.8,ko;q=0.7',
        }
        if cookie:
            headers['cookie'] = cookie
        return headers

    records = []

    # A single session is reused for every request and closed on exit.
    with requests.Session() as session:
        for search_term in search_terms:
            url_template = get_url(search_term)

            for page in range(1, pages + 1):
                # Format into a new name: overwriting the template would make
                # every later page request page 1 again.
                page_url = url_template.format(page)
                r = session.get(page_url, headers=build_headers(page_url), timeout=30)
                bsobj = soup(r.content, 'html.parser')
                atags = bsobj.find_all('a', {'class': 'absolute w-100 h-100 z-1'})

                for atag in atags:
                    href = atag.get('href')
                    if not href:
                        continue  # anchor without a target: nothing to follow

                    if href.startswith('/'):
                        p_link = 'https://www.walmart.com' + href
                    else:
                        p_link = href

                    r = session.get(p_link, headers=build_headers(p_link), timeout=30)
                    prd_bsobj = soup(r.content, 'html.parser')

                    # One product page yields exactly one record; looping over
                    # the document's top-level nodes would repeat it.
                    record = extract_record(prd_bsobj)
                    if record:
                        records.append(record)

    df = pd.DataFrame(records, columns=['Title', 'Price', 'Shipping', 'Sold_by'])
    df.to_csv(output_path)


# %%
# A list (rather than a set) keeps the search order, and therefore the row
# order of the output CSV, identical on every run.
search_terms = ['LG OLED TV', 'Samsung QLED TV']

# Alternative: read the keywords from a one-column, header-less CSV file.
# file = 'search.csv'
# search_terms = pd.read_csv(file, names=['search'])['search'].tolist()

main(search_terms)


# %%

Web scraping – Amazon ASIN search

ChromeDriver is required to run this code. File paths are hard-coded in a couple of places; change them to match your machine. The code reads a CSV file of ASINs and crawls the product page for each ASIN.

Download link – https://chromedriver.chromium.org/downloads

# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob

# %%
def get_url(search_ASIN):
    """Return the Amazon product-detail URL for an ASIN.

    Every space character in ``search_ASIN`` is dropped before the value is
    appended to ``https://www.amazon.com/dp/``.
    """
    compact_asin = ''.join(search_ASIN.split(' '))
    return 'https://www.amazon.com/dp/' + compact_asin

# %%
def main(search_ASIN, to_path='C:\\code\\PythonScripts\\web_scraping\\results\\'):
    """Crawl the Amazon product page of one ASIN and save the result as CSV.

    A headless, incognito Chrome session loads the product page, the page is
    parsed once with ``extract_records`` and the record is written to
    ``<to_path><search_ASIN>.csv``.

    Args:
        search_ASIN: The ASIN to look up.
        to_path: Output directory prefix, including its trailing separator.
            Defaults to the originally hard-coded results folder.
    """
    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    # NOTE(review): passing the driver path positionally depends on the
    # installed Selenium version; newer releases expect a Service object.
    driver = webdriver.Chrome("chromedriver.exe", options=options)

    try:
        driver.get(get_url(search_ASIN))
        page_source = driver.page_source
    finally:
        # quit() ends the browser session even when loading the page raises;
        # close() alone only closes the current window.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # The whole document is one product page, so extract exactly one record
    # instead of one per top-level node of the parse tree.
    record = extract_records(soup)
    records = [record] if record else []

    with open(to_path + search_ASIN + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)


# %%
def extract_records(item, asin=None):
    """Extract (asin, rating, review_count, price, title) from a product page.

    Args:
        item: A parsed Amazon product page (or element) exposing a
            BeautifulSoup-style ``find(name, attrs)`` method.
        asin: ASIN to store in the record. When omitted, the module-level
            ``search_ASIN`` set by the driver loop is used.

    Returns:
        A 5-tuple of strings in which a field that is absent from the page is
        ``''``, or ``None`` when ``item`` is a text node.
    """
    # Text nodes (doctype, whitespace) are str instances: calling find() on
    # them would invoke str.find and fail, so skip them.
    if isinstance(item, str):
        return None

    if asin is None:
        asin = search_ASIN

    def span_text(attrs):
        # Each field is looked up independently so that one missing element
        # does not blank out the others.
        node = item.find('span', attrs)
        return node.text.strip() if node is not None else ''

    title = span_text({'id': 'productTitle'})
    rating = span_text({'class': 'a-icon-alt'})[:3]  # keep the leading three characters of the rating text
    review_count = span_text({'id': 'acrCustomerReviewText'})[:-8]  # drop the trailing 8-character suffix
    price = span_text({'class': 'a-price-whole'}) + span_text({'class': 'a-price-fraction'})

    return (asin, rating, review_count, price, title)

# %%
def combine(from_path):
    """Merge every CSV file in ``from_path`` into ``result.csv``.

    All ``*.csv`` files in the directory (including a ``result.csv`` left by
    a previous run) are concatenated and written to ``result.csv`` in the
    same directory; the other input files are then deleted.

    Args:
        from_path: Directory holding the per-search CSV files, with or
            without a trailing path separator.
    """
    out_path = os.path.join(from_path, 'result.csv')
    csv_paths = sorted(glob.glob(os.path.join(from_path, '*.csv')))

    # pd.concat raises on an empty sequence; nothing to merge is not an error.
    if not csv_paths:
        return

    df = pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)

    # Write the merged file before deleting anything so a failed write
    # cannot lose the scraped data.
    df.to_csv(out_path)

    for path in csv_paths:
        if os.path.abspath(path) != os.path.abspath(out_path):
            os.remove(path)


    

# %%
def read_asin(file):
    """Read the ASINs to crawl from a one-column, header-less CSV file.

    Args:
        file: Path of the CSV file; each row holds one ASIN.

    Returns:
        list[str]: The ASINs in file order with surrounding whitespace
        removed. Values are read as text so that all-digit ASINs keep their
        leading zeros.
    """
    df = pd.read_csv(file, names=['asin'], dtype=str)
    return df['asin'].dropna().str.strip().tolist()



# %%
if __name__ == '__main__':
    file = 'asin.csv'
    search_ASINs = read_asin(file)

    # The loop variable must stay a module-level name called ``search_ASIN``:
    # extract_records() reads it as a global.
    for search_ASIN in search_ASINs:
        main(search_ASIN)

    from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

    combine(from_path)

Web scraping – Amazon keyword search

ChromeDriver is required to run this code. File paths are hard-coded in a couple of places; change them to match your machine. The code reads a CSV file of keywords and crawls the first results page for each keyword; adjust the page range in `main` to crawl more pages.

Download link – https://chromedriver.chromium.org/downloads

# %%
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
import glob

# %%
def get_url(search_text):
    """Build the Amazon keyword-search URL template for a query.

    Args:
        search_text: Free-text keywords, e.g. ``'gaming mouse'``.

    Returns:
        A URL string ending in ``&page={}``; call ``.format(page_number)`` on
        it to obtain the URL of a specific results page.
    """
    from urllib.parse import quote_plus  # local import keeps this cell self-contained

    # URL-encode the term (spaces become '+'); this also escapes braces so
    # the later ``.format(page)`` only fills the page placeholder.
    search_term = quote_plus(search_text)

    url = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'.format(search_term)

    # The '=' is required: '&page2' is not a page parameter, so every request
    # would return the first page.
    return url + '&page={}'

# %%
def extract_records(item, term=None, product_asin=None):
    """Extract one (search, asin, rating, review_count, price, title) record.

    Args:
        item: A parsed Amazon product page (or element) exposing a
            BeautifulSoup-style ``find(name, attrs)`` method.
        term: Search keyword to store in the record. When omitted, the
            module-level ``search_term`` is used.
        product_asin: ASIN to store in the record. When omitted, the
            module-level ``asin`` set by ``main`` is used.

    Returns:
        A 6-tuple of strings in which a field that is absent from the page is
        ``''``, or ``None`` when ``item`` is a text node.
    """
    # Text nodes (doctype, whitespace) are str instances: calling find() on
    # them would invoke str.find and fail, so skip them.
    if isinstance(item, str):
        return None

    if term is None:
        term = search_term
    if product_asin is None:
        product_asin = asin

    def span_text(attrs):
        # Each field is looked up independently so that one missing element
        # does not blank out the others.
        node = item.find('span', attrs)
        return node.text.strip() if node is not None else ''

    title = span_text({'id': 'productTitle'})
    rating = span_text({'class': 'a-icon-alt'})[:3]  # keep the leading three characters of the rating text
    review_count = span_text({'id': 'acrCustomerReviewText'})[:-8]  # drop the trailing 8-character suffix
    price = span_text({'class': 'a-price-whole'}) + span_text({'class': 'a-price-fraction'})

    return (term, product_asin, rating, review_count, price, title)

# %%
def main(search_term, pages=1, to_path='C:\\code\\PythonScripts\\web_scraping\\results\\'):
    """Crawl Amazon search results for one keyword and save them as CSV.

    Each product listed on the first ``pages`` result pages is opened, parsed
    with ``extract_records`` and written to ``<to_path><search_term>.csv``.

    Args:
        search_term: Keyword(s) to search for.
        pages: Number of result pages to crawl (default 1).
        to_path: Output directory prefix, including its trailing separator.
            Defaults to the originally hard-coded results folder.
    """
    import re

    # extract_records() reads the current product's ASIN from the global.
    global url_product
    global asin

    # The ASIN is the 10-character code after 'dp/'; in redirect links the
    # slash is percent-encoded as '%2F'.
    asin_pattern = re.compile(r'dp(?:/|%2F)([A-Za-z0-9]{10})', re.IGNORECASE)

    options = Options()
    options.add_argument("--incognito")
    options.add_argument("--headless")
    # NOTE(review): passing the driver path positionally depends on the
    # installed Selenium version; newer releases expect a Service object.
    driver = webdriver.Chrome("chromedriver.exe", options=options)

    url = get_url(search_term)

    records = []

    try:
        for page in range(1, pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for result in results:
                atag = result.h2.a if result.h2 is not None else None
                href = atag.get('href') if atag is not None else None
                if not href:
                    continue  # result without a product link

                url_product = 'https://www.amazon.com' + href
                driver.get(url_product)
                soup_products = BeautifulSoup(driver.page_source, 'html.parser')

                match = asin_pattern.search(url_product)
                asin = match.group(1) if match else ''

                # One product page yields one record.
                record = extract_records(soup_products)
                if record:
                    records.append(record)
    finally:
        # End the browser session exactly once, even if a page load fails.
        driver.quit()

    with open(to_path + search_term + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Search', 'ASIN', 'Rating', 'ReviewCount', 'Price', 'Title'])
        writer.writerows(records)



# %%
def combine(from_path):
    """Merge every CSV file in ``from_path`` into ``result.csv``.

    All ``*.csv`` files in the directory (including a ``result.csv`` left by
    a previous run) are concatenated and written to ``result.csv`` in the
    same directory; the other input files are then deleted.

    Args:
        from_path: Directory holding the per-keyword CSV files, with or
            without a trailing path separator.
    """
    out_path = os.path.join(from_path, 'result.csv')
    csv_paths = sorted(glob.glob(os.path.join(from_path, '*.csv')))

    # pd.concat raises on an empty sequence; nothing to merge is not an error.
    if not csv_paths:
        return

    df = pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)

    # Write the merged file before deleting anything so a failed write
    # cannot lose the scraped data.
    df.to_csv(out_path)

    for path in csv_paths:
        if os.path.abspath(path) != os.path.abspath(out_path):
            os.remove(path)

# %%
def read_asin(file):
    """Read the search keywords from a one-column, header-less CSV file.

    The function keeps its original name even though it reads keywords here.

    Args:
        file: Path of the CSV file; each row holds one search phrase.

    Returns:
        list[str]: The keywords in file order with surrounding whitespace
        removed.
    """
    df = pd.read_csv(file, names=['search'], dtype=str)
    return df['search'].dropna().str.strip().tolist()

# %% [markdown]
# file='search.csv'
# df = pd.read_csv(file, names=['search'])
# search_terms = df['search'].tolist()
# search_terms

# %%
if __name__ == '__main__':
    file = 'search.csv'
    search_terms = read_asin(file)

    # The loop variable must stay a module-level name called ``search_term``:
    # extract_records() reads it as a global.
    for search_term in search_terms:
        main(search_term)

    from_path = 'C:\\code\\PythonScripts\\web_scraping\\results\\'

    combine(from_path)

Executable stand-alone Amazon Price Tracking tool using Python with tkinter library

This section creates a stand-alone executable tool that scrapes Amazon prices.

Please send me your email address and public IP if you are interested in the tool. Customizations are also available (a different store, saving the search results to a database, scheduling a report to be sent as an email attachment, etc.).

Requirement:

Create ‘webscraptool.py’, then copy and paste the ‘Main Code’ below into it (assuming you have Python installed on your machine).
Download chromedriver and save it under the ./driver/ folder. Create the ‘driver’ folder in the same folder where you saved ‘webscraptool.py’. (ChromeDriver download: https://chromedriver.chromium.org/downloads)
To create the stand-alone ‘webscraptool.exe’, install ‘pyinstaller’ (pip install pyinstaller).
Run the command below on the command line. This creates the stand-alone ‘.exe’ file.

 pyinstaller -F -w --add-binary "./driver/chromedriver.exe;./driver" webscraptool.py

Main Code:

from tkinter import *
import tkinter.ttk as ttk
from tkinter import filedialog
import tkinter.messagebox as msgbox
import tkinter.font as font
import os
import time

from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

# web scrap main code block
def resource_path(relative_path):
    """Return the absolute path of a bundled resource.

    When the program runs from a PyInstaller bundle, files added with
    ``--add-binary`` are unpacked under ``sys._MEIPASS``; otherwise the
    current working directory is used as the base.

    Args:
        relative_path: Path of the resource relative to the base directory.

    Returns:
        The base directory joined with ``relative_path``.
    """
    # ``sys`` was never imported, so the attribute lookup used to raise a
    # NameError that the broad ``except`` swallowed, and the bundle directory
    # was never used.
    import sys

    base_path = getattr(sys, '_MEIPASS', os.path.abspath("."))

    return os.path.join(base_path, relative_path)

def get_url(search_text):
    """Build the Amazon keyword-search URL template for a query.

    Args:
        search_text: Free-text keywords, e.g. ``'gaming mouse'``.

    Returns:
        A URL string ending in ``&page={}``; call ``.format(page_number)`` on
        it to obtain the URL of a specific results page.
    """
    from urllib.parse import quote_plus  # local import keeps the function self-contained

    # URL-encode the term (spaces become '+'); this also escapes braces so
    # the later ``.format(page)`` only fills the page placeholder.
    search_term = quote_plus(search_text)

    url = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'.format(search_term)

    # The '=' is required: '&page2' is not a page parameter, so every request
    # would return the first page.
    return url + '&page={}'

def extract_record(item):
    """Pull the listing fields out of one Amazon search-result element.

    Returns ``(description, price, rating, review_count, url)``, or ``None``
    when the element has no price.
    """
    # Title text and product link; both fall back to '' when the <h2><a>
    # element is absent.
    try:
        link = item.h2.a
        description, url = link.text.strip(), 'https://www.amazon.com' + link.get('href')
    except AttributeError:
        description, url = '', ''

    # A result without a price is skipped entirely.
    try:
        price = item.find('span', 'a-price').find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # Rating and review count are optional.
    try:
        rating = item.i.text
        review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
    except AttributeError:
        rating = review_count = ''

    return description, price, rating, review_count, url

def main(search_term, pages=10):
    """Scrape Amazon search results for one keyword into ``<search_term>.csv``.

    Args:
        search_term: Keyword(s) to search for; also used as the CSV file name.
        pages: Number of result pages to crawl (default 10).
    """
    options = Options()
    # startup the webdriver
    options.add_argument("--incognito")
    options.add_argument("--headless")
    driver = webdriver.Chrome(resource_path("./driver/chromedriver.exe"), options=options)

    records = []
    url = get_url(search_term)

    try:
        for page in range(1, pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)
    finally:
        # End the browser session even when a page load fails, so repeated
        # searches from the GUI do not leave browser processes behind.
        driver.quit()

    # save data to csv file
    with open(search_term + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
        writer.writerows(records)

# GUI tkinter codes begin

# Main application window.
root = Tk()
root.title("Web Scrapper")
#root.geometry("640x480")
# Bold 12-point font used for the heading label.
myFont=font.Font(size=12, weight="bold")
label=Label(root, text="Amazon Price Tracker")
label['font']=myFont
label.pack(side="top", padx=5, pady=5, ipady=3)

# search text frame: holds the entry box where a product name is typed

frm_search = LabelFrame(root, text="Search")
frm_search.pack(fill="x", padx=5, pady=5, ipady=3)

# Entry widget read by add_txt(); pre-filled with a prompt text.
e = Entry(frm_search, width=30)
e.pack(fill="x", padx=5, pady=5, ipady=5)
e.insert(0, "Enter products to search")

def add_txt():
    """Append the text typed in the search entry to the product list.

    Blank input and the untouched prompt text are ignored, so they are never
    queued as search terms.
    """
    text = e.get().strip()
    if not text or text == "Enter products to search":
        return
    search_list.insert(END, text)

def remove_txt():
    """Delete every selected row from the product list box."""
    # Walk the selection from the highest index down so that a deletion never
    # shifts the position of a row that is still to be removed.
    selected = search_list.curselection()
    for position in selected[::-1]:
        search_list.delete(position)

# ``datetime`` is missing from the import block above; without this import
# the two assignments below raise NameError when the script starts.
import datetime

# NOTE(review): neither value is read anywhere in the visible code; they look
# like the remains of a date check. Confirm before removing.
x = datetime.datetime(2021, 1, 15)
y = datetime.datetime.now()

def start():
    """Run the scraper for every product in the list and update the progress bar.

    Shows a warning and returns when the list is empty. A search that raises
    is reported in an error dialog and the remaining searches still run.
    """
    if search_list.size() == 0:
        msgbox.showwarning("Warning", "Add products to search")
        return

    search_terms = search_list.get(0, END)

    # Start every run from an empty progress bar.
    p_var.set(0)
    progressbar.update()

    for idx, search_term in enumerate(search_terms):
        try:
            main(search_term)
        except Exception as err:
            # GUI callback boundary: report the failure and carry on with the
            # next product instead of aborting the whole run.
            msgbox.showerror("Error", f"Search for '{search_term}' failed:\n{err}")

        progress = (idx + 1) / len(search_terms) * 100
        p_var.set(progress)
        progressbar.update()

# add/remove search text frame
frm_search1 = LabelFrame(root, text="Add/Remove Products")
frm_search1.pack(fill="x", padx=5, pady=5, ipady=3)

btn_remove = Button(frm_search1, text="Remove", padx=5, pady=5, width=12, command=remove_txt)
btn_add = Button(frm_search1, text="Add", padx=5, pady=5, width=12, command=add_txt)

btn_remove.pack(side="right", padx=5, pady=5)
btn_add.pack(side="right", padx=5, pady=5)

#list frame
frm_list = Frame(root)
frm_list.pack(fill="both", padx=5, pady=5)


scrollbar = Scrollbar(frm_list)
scrollbar.pack(side="right", fill="y")

search_list = Listbox(frm_list, selectmode="extended", height=10, yscrollcommand=scrollbar.set)
search_list.pack(side="left", fill="both", expand=True)
scrollbar.config(command=search_list.yview)

# progress bar frame
frm_progress = LabelFrame(root, text="Progress")
frm_progress.pack(fill="x", padx=5, pady=5, ipady=3)

p_var = DoubleVar()
progressbar = ttk.Progressbar(frm_progress, maximum=100, length=150, variable=p_var)
progressbar.pack(fill="x", padx=5, pady=5)



# run/exit button frame
frm_run = LabelFrame(root)
frm_run.pack(fill="x", padx=5, pady=5)

btn_exit = Button(frm_run, text="Exit", padx=5, pady=5, width=12, command=root.quit)
btn_start = Button(frm_run, text="Start", padx=5, pady=5, width=12, command=start)

btn_exit.pack(side="right", padx=5, pady=5)
btn_start.pack(side="right", padx=5, pady=5)

root.mainloop()

Python – Web scraping Walmart Price Tracking: Avoid Google CAPTCHA issue

In this example, I will use Postman (https://www.postman.com/) to generate the ‘requests’ code. This method avoids the Google CAPTCHA issue that occurs with the ‘Selenium’ web driver method.

Visit Postman website and install desktop app. (You’ll need to create a free account).

Step 1

Enter a desired product in the walmart.com search bar and open the Inspect (developer tools) window in your browser. Select the ‘Network’ tab and the ‘XHR’ filter, then refresh the page.

Scroll down to the page numbers and click the 2nd page. Under the ‘Name’ column, find and select the item that starts with ‘preso?prg…’. Notice that this is a JSON data set with item attributes.

Copy as cURL (bash) by right clicking the name of this item.

Step 2

Open postman app, and paste copied URL under ‘IMPORT’ > ‘Raw text’, and ‘Continue’ & ‘Import’

Once it is imported, click ‘Send’ to retrieve the data.

Change page value to ‘1’ and click ‘Send’ again to see it works, then click ‘Code’ under ‘Send’ button.

Select ‘Python – Requests’ and copy it to clipboard.

Step 3

Paste the code into your Python IDE. Using the copied code as a base, edit it as shown below.

# import libraries

import json
import pandas as pd
import requests
import time

# use your own search items

import os
from urllib.parse import quote

# A list keeps the search order the same on every run.
search_items = ['ultrawide monitor', 'gaming monitor']

# Number of result pages fetched per search item; change as you desire.
PAGES_PER_ITEM = 2

# The session cookie is read from the WALMART_COOKIE environment variable
# instead of being hard-coded: browser cookies expire and contain a session
# token that should not be stored in source code. If the site blocks the
# requests, copy a fresh cookie from the code generated by the Postman app
# and export it in that variable. When it is unset, no cookie is sent.
WALMART_COOKIE = os.environ.get('WALMART_COOKIE', '')

for search_item in search_items:
    # Percent-encode the whole term (spaces become '%20').
    search_term = quote(search_item, safe='')

    # The referer is the first-page URL built from the search term.
    referer = f"https://www.walmart.com/search/api/preso?prg=desktop&page=1&ps=40&query={search_term}"

    headers = {
        'authority': 'www.walmart.com',
        'accept': 'application/json',
        'wm_client_ip': '',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        'content-type': 'application/json',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': referer,
        'accept-language': 'en-US,en;q=0.9,ko-KR;q=0.8,ko;q=0.7',
    }
    if WALMART_COOKIE:
        headers['cookie'] = WALMART_COOKIE

    # One DataFrame per page; they are concatenated once at the end because
    # DataFrame.append no longer exists in pandas 2.x.
    frames = []

    for page in range(1, PAGES_PER_ITEM + 1):
        print(f'Getting page {page}', 'waiting..')
        url = f"https://www.walmart.com/search/api/preso?prg=desktop&page={page}&ps=40&query={search_term}"
        r = requests.request("GET", url, headers=headers, timeout=30)
        # Fail with a clear HTTP error instead of a JSON parsing error when
        # the site rejects the request.
        r.raise_for_status()
        data = json.loads(r.text)
        frames.append(pd.json_normalize(data.get('items', [])))
        time.sleep(3)  # pause between requests

    prods = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame([])
    prods['date'] = pd.to_datetime('today')
    prods.to_csv(f"{search_item}-Prices@Walmart.csv")

    print(f'Completed for {search_item}')

The idea of using Postman to grab the request code originally came from John Watson Rooney's YouTube channel. This code includes a small tweak to loop over multiple search items.