반응형
# 라이브러리 정의
import time
import warnings

import bs4
import pandas as pd
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from tqdm.notebook import tqdm

warnings.simplefilter('ignore')
# Search keyword; it is interpolated into the Naver Shopping search URL below.
query = "삼성"

# Result columns. They are created outside the page loop so every page appends
# to the same lists; index i in each list describes the same product.
titles = []  # product names
prices = []  # price text with its last character and the commas removed
review_counts = []
buy_counts = []  # 0 when the listing shows no purchase count
published_dates = []  # product registration dates
favorites = []


def _count(element):
    """Return the integer in *element*'s text, ignoring thousands separators."""
    return int(element.text.replace(',', ''))


# Started once, outside the loop, and reused for every page.
# NOTE(review): passing the driver path positionally is rejected by recent
# Selenium 4 releases (they expect a Service object) - confirm the installed
# version before upgrading.
driver = Chrome('./chromedriver')
try:
    for page_no in tqdm(range(1, 6)):  # pages 1 through 5
        # The original URL contained "rel×tamp=": the "&times" at the start of
        # "&timestamp=" had been decoded as an HTML entity. Restored here.
        page_url = (
            f"https://search.shopping.naver.com/search/all"
            f"?frm=NVSHATC&origQuery={query}&pagingIndex={page_no}"
            f"&pagingSize=40&productSet=total&query={query}"
            "&sort=rel&timestamp=&viewType=list"
        )
        driver.get(page_url)
        time.sleep(3)

        # Scroll to the bottom repeatedly so the page loads the remaining items.
        for _ in range(7):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(1)

        list_basis = driver.find_element(By.CLASS_NAME, "list_basis")
        item_list = list_basis.find_elements(By.CLASS_NAME, 'basicList_inner__xCM3J')

        for item in tqdm(item_list):
            title = item.find_element(By.CLASS_NAME, 'basicList_title__VfX3c').text
            # [:-1] drops the last character - presumably the currency unit;
            # verify against the live page.
            price = (
                item.find_element(By.CLASS_NAME, "price_num__S2p_v")
                .text[:-1]
                .replace(',', '')
            )
            footer = item.find_element(By.CLASS_NAME, 'basicList_etc_box__5lkgg')
            reviews = footer.find_elements(By.CLASS_NAME, 'basicList_num__sfz3h')
            footer_text = footer.text

            # Work out every value for this product first and append only once
            # all of them are known. Appending inside the try block (as before)
            # could add a value, hit IndexError, and then add a second value to
            # the same list in the fallback, leaving the lists misaligned.
            try:
                if "구매건수" in footer_text:
                    # Footer lists review count, purchase count, favorites.
                    review_count = _count(reviews[0])
                    buy_count = _count(reviews[1])
                    favorite = _count(reviews[2])
                    # [4:] skips a 4-character prefix - presumably the
                    # registration-date label; verify against the live page.
                    date = footer.find_elements(By.TAG_NAME, 'span')[0].text[4:]
                else:
                    # No purchase count: review count, then favorites.
                    review_count = _count(reviews[0])
                    favorite = _count(reviews[1])
                    buy_count = 0
                    date = footer.find_elements(
                        By.CLASS_NAME, 'basicList_etc__LSkN_'
                    )[1].text[4:]
            except IndexError:
                # Fewer figures than expected (original note: the product does
                # not have enough reviews yet, so the information is missing).
                # Treat the first figure, if any, as the favorites count.
                review_count = 0
                buy_count = 0
                favorite = _count(reviews[0]) if reviews else 0
                spans = footer.find_elements(By.TAG_NAME, 'span')
                date = spans[0].text[4:] if spans else ''

            titles.append(title)
            prices.append(price)
            review_counts.append(review_count)
            buy_counts.append(buy_count)
            favorites.append(favorite)
            published_dates.append(date)
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()
# Sanity check: print the length of each column that goes into the table.
print(*map(len, (titles, prices, review_counts, published_dates, favorites)))

# Assemble the collected columns into a DataFrame, one row per product.
column_names = ("제품명", "가격", "리뷰수", "등록일", "찜하기")
column_values = (titles, prices, review_counts, published_dates, favorites)
result = pd.DataFrame(dict(zip(column_names, column_values)))

# Bare expression kept from the original; it has no effect when run as a script.
result

# Save the table to an Excel file named after the search keyword.
output_path = f"naver_shopping({query}).xlsx"
result.to_excel(output_path, index=False)
반응형
'Data Science' 카테고리의 다른 글
[AI와빅데이터경영입문] 데이터변환 (0) | 2023.03.09 |
---|---|
혼자공부하는 머신러닝 + 딥러닝(ch1) k-최근접 이웃 알고리즘 (0) | 2023.02.16 |
[판다스] 여러 엑셀 파일 합치기 (0) | 2023.01.31 |
seaborn 그래프_경향성_크기_분포 파악 (0) | 2023.01.31 |
살아움직이는 그래프 plotly (0) | 2023.01.31 |