IPO Stock 종합 크롤링

ianman99 ㅣ 2024. 3. 12. 17:06

from selenium import webdriver
import chromedriver_autoinstaller
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import csv
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from pykrx import stock
import pandas as pd
from pandas_datareader import data
from tabulate import tabulate
from datetime import datetime
from datetime import timedelta

# Run the Chrome browser quietly: the options object built here passes the
# "--headless" flag and is handed to every webdriver.Chrome(...) call below.
chrome_options = Options()
chrome_options.add_argument("--headless")

# Odd numbers 1, 3, ..., 39: the tr:nth-child positions probed on each list page.
odd_list = list(range(1, 40, 2))
# Accumulates one list of scraped values per IPO.
data_list = []

def _text_at(elements, index=0):
    """Return the text of ``elements[index]``, or '' when that element is absent."""
    return elements[index].text if len(elements) > index else ''


# Crawl the IPO list for years 2020-2024, list pages 1-7, and every odd table
# row on each page. Each row that is scraped without error appends one entry
# to data_list; a failure is printed and the crawl moves on to the next row.
for year in range(2020, 2025):
    for page_num in range(1, 8):
        for num1 in odd_list:
            # Bind the name before the try so the finally clause is safe even
            # when Chrome itself fails to start.
            driver = None
            try:
                # A fresh browser is started for every row and closed in finally.
                driver = webdriver.Chrome(options=chrome_options)
                driver.get(f'http://www.ipostock.co.kr/sub03/ipo04.asp?str1={year}&str2=all&str3=&str4=&page={page_num}')
                time.sleep(0.5)

                # No link at this row position on the list page: skip the row.
                page_in = driver.find_elements(By.CSS_SELECTOR, f'#print > table:nth-child(1) > tbody > tr:nth-child(4) > td > table > tbody > tr:nth-child(4) > td > table > tbody > tr:nth-child({num1}) > td:nth-child(3) > a > font')
                if not page_in:
                    continue
                page_in[0].click()

                time.sleep(0.5)

                # Stock code (empty string when the element is not on the page)
                ticker = _text_at(driver.find_elements(By.CSS_SELECTOR, '.view_txt01'))

                # Company name
                name = _text_at(driver.find_elements(By.CSS_SELECTOR, '.view_tit'))

                # Listing date
                listing_date = _text_at(driver.find_elements(By.CSS_SELECTOR, '#print > table > tbody > tr:nth-child(3) > td > table > tbody > tr > td:nth-child(1) > table > tbody > tr:nth-child(2) > td > table > tbody > tr > td:nth-child(2) > table > tbody > tr:nth-child(7) > td:nth-child(4)'))

                # Offering price
                offer_price = _text_at(driver.find_elements(By.CSS_SELECTOR, 'td[height="25"] font[color="B83D11"] strong'))

                # Offering amount
                offer_amount = _text_at(driver.find_elements(By.CSS_SELECTOR, '#print > table > tbody > tr:nth-child(5) > td > table:nth-child(3) > tbody > tr:nth-child(1) > td > table > tbody > tr:nth-child(2) > td:nth-child(2) > table > tbody > tr:nth-child(4) > td:nth-child(2) > strong > font'))

                # Subscription competition ratio (second match of the selector)
                comp_norm = _text_at(driver.find_elements(By.CSS_SELECTOR, 'td[height="25"] strong font[color="B83D11"]'), 1)

                # Employee stock ownership allotment ratio (third match)
                we_ratio = _text_at(driver.find_elements(By.CSS_SELECTOR, 'table[width="350"] tr[align="center"] td'), 2)

                # Open the demand-forecast page; without that link the row is
                # abandoned via the outer except handler.
                page_in = driver.find_elements(By.CSS_SELECTOR, 'img[alt="수요예측"]')
                if not page_in:
                    raise NoSuchElementException
                page_in[0].click()

                time.sleep(0.5)

                # Institutional competition ratio
                comp_inst = _text_at(driver.find_elements(By.CSS_SELECTOR, 'tr[height="26"] td[align="left"] font[color="cc3300"]'))

                # Lock-up commitment ratio (third match)
                prom_ratio = _text_at(driver.find_elements(By.CSS_SELECTOR, 'tr[height="26"][align="left"] td[align="left"][bgcolor="#FFFFFF"]'), 2)

                # Prices on the listing day; the date has its dots removed
                # before being passed to pykrx. The lookup is skipped when the
                # date or the ticker was not found, so the scraped row is kept
                # with blank prices instead of depending on how pykrx reacts to
                # empty arguments.
                open_price = ''
                close_price = ''
                start_date = listing_date.replace('.', '')
                if start_date and ticker:
                    df_stock = stock.get_market_ohlcv_by_date(start_date, start_date, ticker, adjusted=False)
                    # Only read values when pykrx returned at least one row.
                    if not df_stock.empty:
                        # Column 0 is taken as the opening price and column 3 as
                        # the closing price -- TODO confirm against pykrx's column order.
                        open_price = df_stock.iloc[0, 0]
                        close_price = df_stock.iloc[0, 3]

                data_list.append([ticker, name, listing_date, offer_price, offer_amount, comp_norm, we_ratio, comp_inst, prom_ratio, open_price, close_price])

            except Exception as e:
                # Best-effort crawl: report the failure and continue with the next row.
                print(f"An error occurred: {e}")
            finally:
                if driver is not None:
                    driver.quit()

# Column headers, in the same order as the values appended to data_list.
column_names = ['종목코드', '종목명', '상장일', '공모가', '공모금액', '청약경쟁률', '우리사주배정비율', '기관경쟁률', '의무보유확약비율', '시초가', '종가']

# Regex patterns removed from the table: the '대 1' and ' :1' substrings, and spaces.
cleanup_patterns = {'대 1': '', ' :1': '', ' ': ''}

# Turn the collected rows into a DataFrame and apply the cleanup in one step.
df = pd.DataFrame(data_list, columns=column_names).replace(cleanup_patterns, regex=True)

# Print the table to stdout in the 'rst' table format.
print(tabulate(df, headers='keys', tablefmt='rst'))

# Write the same table to an Excel file, without the index column.
df.to_excel('output.xlsx', index=False)