태지쌤

로봇 & 코딩교육 No.1 크리에이터

Data Science

웹 크롤링을 통한 데이터 수집

태지쌤 2023. 1. 31. 20:01
반응형

CH 03_01. 웹 크롤링을 통한 데이터 수집.ipynb
0.00MB

# 라이브러리 정의
import time
import warnings

import bs4
import pandas as pd
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from tqdm.notebook import tqdm
warnings.simplefilter('ignore')

# Search keyword; it is interpolated into the Naver Shopping search URL below.
query = "삼성"

# The result columns are created once, outside the page loop, so that every
# page appends to the same lists.
titles, prices = [], []                # product name / price
review_counts, buy_counts = [], []     # review count / purchase count
published_dates, favorites = [], []    # product registration date / favorites

# A single browser session is started here, outside the loop, and reused for
# every page. NOTE(review): './chromedriver' is presumably the path to the
# chromedriver binary next to this script -- confirm for your environment.
driver = Chrome('./chromedriver')

def _to_int(text):
    """Convert a count string such as ``'1,234'`` to an ``int``."""
    return int(text.replace(',', ''))


def _parse_footer(footer):
    """Read the counters and registration date from one item's footer element.

    Returns a ``(review_count, buy_count, favorite_count, published_date)``
    tuple. Nothing is appended to the result lists here, so a layout that
    raises ``IndexError`` half-way through cannot leave partial values behind.
    """
    counts = footer.find_elements(By.CLASS_NAME, 'basicList_num__sfz3h')
    try:
        if "구매건수" in footer.text:
            # Footer shows a purchase count: the numbers are read as
            # reviews, purchases, favorites (in that order).
            return (
                _to_int(counts[0].text),
                _to_int(counts[1].text),
                _to_int(counts[2].text),
                # [4:] drops a 4-character prefix -- presumably the date label.
                footer.find_elements(By.TAG_NAME, 'span')[0].text[4:],
            )
        # No purchase count: the numbers are read as reviews, favorites.
        return (
            _to_int(counts[0].text),
            0,
            _to_int(counts[1].text),
            footer.find_elements(By.CLASS_NAME, 'basicList_etc__LSkN_')[1].text[4:],
        )
    except IndexError:
        # Not enough reviews yet, so some of the numbers are not displayed.
        # Fall back to: no reviews, first number (if any) is the favorites count.
        favorite_count = _to_int(counts[0].text) if counts else 0
        spans = footer.find_elements(By.TAG_NAME, 'span')
        published_date = spans[0].text[4:] if spans else ''
        return 0, 0, favorite_count, published_date


try:
    for page_no in tqdm(range(1, 6)):  # fetch pages 1 through 5

        page_url = f"https://search.shopping.naver.com/search/all?frm=NVSHATC&origQuery={query}&pagingIndex={page_no}&pagingSize=40&productSet=total&query={query}&sort=rel&timestamp=&viewType=list"
        driver.get(page_url)
        time.sleep(3)

        # Scroll to the bottom repeatedly so the page loads more of the list.
        for _ in range(7):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(1)

        list_basis = driver.find_element(By.CLASS_NAME, "list_basis")
        item_list = list_basis.find_elements(By.CLASS_NAME, 'basicList_inner__xCM3J')

        for item in tqdm(item_list):
            title = item.find_element(By.CLASS_NAME, 'basicList_title__VfX3c').text
            # Drop the last character (presumably the currency suffix) and the
            # thousands separators; the price is kept as a string.
            price = item.find_element(By.CLASS_NAME, "price_num__S2p_v").text[:-1].replace(',', '')

            footer = item.find_element(By.CLASS_NAME, 'basicList_etc_box__5lkgg')
            review_count, buy_count, favorite_count, published_date = _parse_footer(footer)

            # Append exactly one value per column, and only after the whole
            # item has been parsed, so all result lists stay the same length.
            titles.append(title)
            prices.append(price)
            review_counts.append(review_count)
            buy_counts.append(buy_count)  # 0 when no purchase count is shown
            favorites.append(favorite_count)
            published_dates.append(published_date)
finally:
    # Always close the browser session, even if scraping fails part-way.
    driver.quit()
            
# Print the length of each collected column; they should all match before the
# DataFrame is built.
print(*(len(column) for column in (titles, prices, review_counts, published_dates, favorites)))

# Assemble the table: one column per collected list, in this order.
result = pd.DataFrame(dict(zip(
    ("제품명", "가격", "리뷰수", "등록일", "찜하기"),
    (titles, prices, review_counts, published_dates, favorites),
)))
result

# Write the table to an Excel file named after the search keyword, without
# the index column.
result.to_excel(f"naver_shopping({query}).xlsx", index=False)
반응형