Data Science
[파이썬] 웹크롤링 주식 데이터 가져오기
태지쌤
2023. 1. 30. 20:00
반응형
# -*- coding: utf-8 -*-
"""CH 02_02.주식 데이터 가져오기.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/12NAlW9QiyWko31SpWwBoYRMxJue-Drt0
"""
# Fetch the raw page source of one daily-quotes listing page. The same HTML
# can be inspected in a browser via right-click -> "View page source".
page_no = 1
page_url = f'https://finance.naver.com/sise/sise_index_day.naver?code=KPI200&page={page_no}'
# A timeout keeps the script from hanging forever on a stalled connection.
source = requests.get(page_url, timeout=10).text

# Parse the HTML. Naming the parser explicitly ("html.parser" ships with the
# standard library) silences bs4's GuessedAtParserWarning and makes parsing
# reproducible across environments.
source = bs4.BeautifulSoup(source, 'html.parser')

# Trading dates live in <td class="date"> cells.
dates = source.find_all('td', class_='date')
date_list = [date.text for date in dates]

# The closing price is the first of every four <td class="number_1"> cells;
# the other three hold the change / volume / value columns.
prices = source.find_all('td', class_='number_1')
price_list = [price.text for price in prices[::4]]

# The "last page" navigation cell (<td class="pgRR">) links to the final
# page; the page number is the value after "&page=" in its href.
last_url = source.find_all('td', class_='pgRR')[0].find_all('a')[0]['href']
last_page = int(last_url.split('&page=')[-1])
# Crawl every listing page (1..last_page) and accumulate the date and
# closing-price columns.
date_list = []
price_list = []
for page_no in range(1, last_page + 1):
    page_url = f'https://finance.naver.com/sise/sise_index_day.naver?code=KPI200&page={page_no}'
    # Explicit timeout so one dead request cannot stall the whole crawl;
    # explicit parser avoids bs4's GuessedAtParserWarning.
    source = requests.get(page_url, timeout=10).text
    source = bs4.BeautifulSoup(source, 'html.parser')
    for date in source.find_all('td', class_='date'):
        date_list.append(date.text)
    # Every 4th number_1 cell is the closing price (see single-page scrape).
    for price in source.find_all('td', class_='number_1')[::4]:
        price_list.append(price.text)

import pandas as pd

# Assemble the two parallel lists into a DataFrame and export to Excel
# (pandas delegates .xlsx writing to openpyxl).
df = pd.DataFrame({"date": date_list, "price": price_list})
df.to_excel("kpi200.xlsx", index=False)
ch_02_02_주식_데이터_가져오기.py
0.00MB
CH_02_02_주식_데이터_가져오기.ipynb
0.01MB
반응형