러닝스푼즈 수업 정리

 

 

< 이전 글 >

https://silvercoding.tistory.com/55

 

[시각화 분석 프로젝트] 1-2. 야구선수가 강해지는 계절이 있을까?

러닝스푼즈 수업 정리 < 이전 글 > https://silvercoding.tistory.com/54 https://silvercoding.tistory.com/53 https://silvercoding.tistory.com/52 [python 시각화] 1. seaborn 라이브러리 (distplot, relplot,..

silvercoding.tistory.com

 

 


특정 팀에 강한 선수가 있을까?

* idea : '상대' 컬럼으로 겨루었던 상대 팀을 추출하여 각각의 상대 팀과의 경기에서의 출루율을 계산한다.

 

- 데이터 불러오기

import pandas as pd

# Location of the per-game player statistics CSV, relative to the working directory.
file  = './data/KBO_2019_player_gamestats.csv'
# Decode the file as cp949 when loading it.
raw = pd.read_csv(file, encoding = 'cp949')
# Display the first rows to check the load.
raw.head()

 

 

- 상대 팀별 기록 정리

# List the distinct values present in this column.
raw['μƒλŒ€'].unique()

우선 unique() 를 이용하여 '상대' 컬럼에 있는 값들을 확인해 준다. 앞에 @가 붙은 경우는 원정경기, 그렇지 않으면 홈 경기를 의미한다. 이를 '홈어웨이' 라는 컬럼을 추가하여 구분하도록 하고 , '상대팀' 컬럼에는 팀명만을 넣어준다.

home_away_list = []
opp_list = []

# Walk the opponent column once: an "@" in the value marks an away game,
# and the stored opponent name has the "@" stripped.
for opp in raw['μƒλŒ€']:
    is_away = "@" in opp
    home_away_list.append('원정' if is_away else 'ν™ˆ')
    opp_list.append(opp.replace('@', ''))

# Attach both lists to the frame as new columns and preview the result.
raw['ν™ˆμ–΄μ›¨μ΄'] = home_away_list
raw['μƒλŒ€νŒ€'] = opp_list
raw.head()

for문을 이용하여 홈어웨이와 상대팀을 구분하고 , 두개의 컬럼을 추가해 주었다.

# Counting columns to total for each group.
factors = ['νƒ€μˆ˜', 'μ•ˆνƒ€', 'ν™ˆλŸ°', '루타', '타점', 'λ³Όλ„·', '사ꡬ', '희비']
# Grouping keys; the opponent column added earlier is the last one.
group_keys = ['νŒ€', '이름', '생일', 'μƒλŒ€νŒ€']
data = pd.pivot_table(raw, index=group_keys, values=factors, aggfunc='sum')
data.head()

선수들의 상대팀 별 실적을 집계하기 위해 피벗 테이블을 생성한다.

# Keep only rows whose at-bat total is positive, so the later rate
# calculations never divide by zero at-bats.
cond = data['νƒ€μˆ˜'].gt(0)
data = data.loc[cond]
data.head()

타수가 없는 선수들은 데이터프레임에서 제외시킨다.

# Turn every index level of the pivot result back into an ordinary column.
data = data.reset_index()
data.head()

reset_index를 사용하여 index를 모두 컬럼으로 변경해 준다.

 

 

 

 

- 타자 vs 상대팀 별 실적 계산

def cal_hit(df):
    '''
    Add batting-rate columns to ``df`` and return it.

    The frame is modified in place and the same object is returned.
    Four columns are added:

    - batting average: hits / at-bats
    - on-base percentage:
      (hits + walks + hit-by-pitch)
      / (at-bats + walks + hit-by-pitch + sacrifice flies)
    - slugging percentage: total bases / at-bats
    - OPS: on-base percentage + slugging percentage

    Rows with zero at-bats are not handled specially; the division result
    for them is whatever pandas produces.
    '''
    hits = df['μ•ˆνƒ€']
    at_bats = df['νƒ€μˆ˜']
    walks = df['λ³Όλ„·']
    hit_by_pitch = df['사ꡬ']
    sac_flies = df['희비']
    total_bases = df['루타']

    df['νƒ€μœ¨'] = hits / at_bats
    # Walks belong in the denominator as well as the numerator; leaving
    # them out inflates the rate for every batter who drew a walk.
    df['좜루율'] = (hits + walks + hit_by_pitch) / (at_bats + walks + hit_by_pitch + sac_flies)
    df['μž₯νƒ€μœ¨'] = total_bases / at_bats
    df['OPS'] = df['좜루율'] + df['μž₯νƒ€μœ¨']
    return df

이전 글에서 사용했던 함수로 , 실적 계산을 하고 컬럼을 추가해 준다.

# cal_hit adds the rate columns and returns the frame it was given.
player_stats_opp = cal_hit(data)
player_stats_opp

 

 

 

 

- 결과 보기 : DataFrame

(1) 두산에 강한 선수 ? - 상위 10명

# Rows against the chosen opponent with more than 20 at-bats, ranked by
# on-base percentage; show the top 10. The 20 at-bat cutoff matches the
# other opponent queries in this file.
team = '두산'
cond = (player_stats_opp['μƒλŒ€νŒ€'] == team) & (player_stats_opp['νƒ€μˆ˜'] > 20)
player_stats_opp[cond].sort_values(by = '좜루율', ascending = False).head(10)

상대팀이 두산이면서 , 타수가 20보다 큰 선수들을 뽑아 내고, 출루율을 기준으로 정렬하여 10명의 선수를 출력해 본다.

 

따라서 상대팀이 '두산' 일 때 출루율 상위 10명의 이름을 뽑아보면 다음과 같다.

# Same ranking as the previous query (reuses cond), showing only the name column.
player_stats_opp[cond].sort_values(by = '좜루율', ascending = False)['이름'].head(10)

 

(2) 롯데에 강한 선수 ? - 상위 10명

team = '둯데'
# Opponent match and at-bat threshold, combined into one row mask.
is_opponent = player_stats_opp['μƒλŒ€νŒ€'] == team
enough_at_bats = player_stats_opp['νƒ€μˆ˜'] > 20
cond = is_opponent & enough_at_bats
# Rank the matching rows by on-base percentage and show the top 10.
ranked = player_stats_opp[cond].sort_values(by='좜루율', ascending=False)
ranked.head(10)

동일하게 시행해 준다.

 

상대팀이 '롯데' 일 때 출루율 상위 10명의 이름을 뽑아보면 다음과 같다.

# Same ranking as the previous query (reuses cond), showing only the name column.
player_stats_opp[cond].sort_values(by = '좜루율', ascending = False)['이름'].head(10)

 

 

(3) KBO 전체 팀을 상대로 팀 별 출루율 상위 5인 타자들 확인해 보기

# Collect the top-5 rows for each opponent, then concatenate once.
# DataFrame.append was removed in pandas 2.0, so the frames are gathered
# in a list and joined with pd.concat instead.
top_frames = []

for team in player_stats_opp['μƒλŒ€νŒ€'].unique():
    print(team)
    cond = (player_stats_opp['μƒλŒ€νŒ€'] == team) & (player_stats_opp['νƒ€μˆ˜'] > 20)
    df = player_stats_opp[cond].sort_values(by = '좜루율', ascending = False).head(5)
    top_frames.append(df)

# pd.concat raises on an empty list; fall back to an empty frame as before.
hitter_df = pd.concat(top_frames) if top_frames else pd.DataFrame()

hitter_df

특정 팀 상대 출루율 Top5 안에 들어 있는 타자 리스트 (중복 제거)

# Distinct names among the collected top-5 rows.
hitter_df['이름'].unique()

 

 

 

- 결과보기 : Heatmap (시각화)

# Select every row whose name appears among the top-5 names, then spread
# the on-base percentage into one column per opponent.
# NOTE(review): selection is by name only and the pivot index omits the
# birthday key used when the rows were aggregated, so aggfunc='sum' would
# add the rates of two same-named players on one team - confirm this
# cannot occur in the data.
cond = player_stats_opp['이름'].isin(hitter_df['이름'].unique())
top_df = player_stats_opp[cond]
top_pivot = top_df.pivot_table(index = ['νŒ€','이름'], values = '좜루율', columns = 'μƒλŒ€νŒ€', aggfunc = 'sum')
top_pivot

위에서 만들어 놓았던 특정 팀을 상대로 출루율 top5 안에 들었던 이름들만 player_stats_opp에서 뽑아온 후 , 해당 선수들의 상대팀 별 출루율 pivot_table을 생성한다.

import matplotlib
from matplotlib import font_manager, rc
import platform
import matplotlib.pyplot as plt
import seaborn as sns

# Font setup so Korean text renders in figures
if platform.system() == 'Windows':  # Windows: Malgun Gothic
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
else:    # every non-Windows platform is treated as macOS: AppleGothic
    rc('font', family='AppleGothic')

# Setting so the minus sign is displayed in plots.
matplotlib.rcParams['axes.unicode_minus'] = False
fig, ax = plt.subplots( figsize=(15,15) )

sns.heatmap(data = top_pivot, 
            annot = True, fmt = '.3f', 
            cmap = 'Reds',
            center= 0.4   # set the colormap midpoint
           )

색이 진할수록 출루율이 높음을 의미한다. 예를 들어 , NC의 양의지 선수는 해당 시즌에서 KIA를 상대로 한 경기에서 출루율이 높았으며 , 한화의 정근우 선수는 LG와의 경기에서 강했다는 것을 알 수 있다.

sns.heatmap(data = top_pivot, 
            annot = True, fmt = '.3f', 
            cmap = 'Reds',
            center= 0.6   # set the colormap midpoint
           )

상대적인 크기를 살펴보고자 할 때는 center를 변경해 가며 확인해볼 수 있다.

 

 

 

 

 

+ Recent posts