American sportswriter and baseball sabermetrics pioneer [Bill James](https://ja.wikipedia.org/wiki/%E3%83%93%E3%83%AB%E3%83%BB%E3%82%B8%E3%82%A7%E3%83%BC%E3%83%A0%E3%82%BA) devised a formula for predicting the number of runs a team scores.
$$\text{Runs} = \frac{(\text{Hits} + \text{Walks}) \times \text{Total bases}}{\text{At bats} + \text{Walks}}$$
The run total estimated by this formula was named **RC (Runs Created)**. James substituted the past season records of various MLB teams into the right-hand side of the equation to see whether the result matched the actual runs scored. The formula turned out to work no matter which team it was applied to, and it predicted runs with extremely high accuracy. But one question remains.
--It was MLB data that James used to verify the accuracy of this formula. Is it possible to accurately predict the score for the NPB team using this formula?
Therefore, this time, we verified the accuracy of this score formula RC using the actual NPB team data.
2016C_bat.csv
Carp, .272, 143, 5582, 4914, 684, 1338, 203, 35, 153, 2070, 649, 118, 52, 91, 29, 500, 13, 47, 1063, 85, .421, .343
Yakult, .256, 143, 5509, 4828, 594, 1234, 210, 20, 113, 1823, 565, 82, 24, 85, 33, 524, 10, 39, 907, 117, .378, .331
Giants, .251, 143, 5356, 4797, 519, 1203, 217, 19, 128, 1842, 497, 62, 26, 112, 23, 389, 11, 35, 961, 100, .384, .310
DeNA, .249, 143, 5364, 4838, 572, 1205, 194, 21, 140, 1861, 548, 67, 34, 81, 18, 373, 7, 54, 1049, 92, .385, .309
Dragons, .245, 143, 5405, 4813, 500, 1180, 209, 21, 89, 1698, 473, 60, 28, 108, 28, 410, 7, 46, 1001, 103, .353, .309
Tigers, .245, 143, 5401, 4789, 506, 1171, 204, 17, 90, 1679, 475, 59, 25, 88, 38, 435, 17, 51, 1149, 99, .351, .312
(The processed data files are available at https://github.com/AnchorBlues/python/tree/master/baseballdata.)
2.2. Program
The Python programming language was used for data reading, analysis, and visualization.
NPB.py
# coding: utf-8
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# Seasons covered by the analysis (both ends inclusive).
fy = 2005           # first year
ly = 2016           # last year
Yn = 1 + (ly - fy)  # number of seasons

# Names of the numeric batting columns. Bat_Data turns each name into an
# attribute, so the spellings below (including 'Steel' and 'MissSteal') are
# part of the module's interface and are kept as they are.
Bat_Column = [
    'Average',
    'Game',
    'PA',               # plate appearances
    'AB',               # at bats
    'Score',
    'Hit',
    'TwoBase',
    'ThreeBase',
    'HR',
    'TB',               # total bases
    'RBI',              # runs batted in
    'Steel',            # presumably stolen bases (sic) -- TODO confirm
    'MissSteal',        # presumably caught stealing -- TODO confirm
    'Bunt',
    'SF',               # sacrifice flies
    'BB',
    'IntentionalWalk',  # intentional walks
    'DeadBall',         # presumably hit by pitch -- TODO confirm
    'StrikeOut',
    'DoublePlay',
    'SLG',
    'OBP',
]

# Number of numeric columns.
N = len(Bat_Column)
class Bat_Data(object):
    """Batting records for a set of team-seasons plus derived indicators.

    Every name in ``Bat_Column`` becomes an attribute holding the matching
    column of ``Data`` (one entry per row). The constructor then derives:

    * ``OPS``   = SLG + OBP
    * ``NOI``   = (SLG / 3 + OBP) * 1000
    * ``BABIP`` = (Hit - HR) / (AB + SF - HR - StrikeOut)
    * ``RC``    = (Hit + BB) * TB / (AB + BB)
    * ``IsoP``  = SLG - Average
    * ``IsoD``  = OBP - Average

    Parameters
    ----------
    Data : array-like
        Numeric table whose columns follow the order of ``Bat_Column``.
        A 1-D input is treated as a single row.
    Year : array-like
        Stored unchanged as ``self.Year``.
    Team : array-like
        Stored unchanged as ``self.Team``.
    """

    def __init__(self, Data, Year, Team):
        self.Year = Year
        self.Team = Team

        # A one-row table can arrive as a 1-D array; promote it to 2-D so
        # the column slicing below works for that case too.
        table = np.atleast_2d(Data)
        for col, name in enumerate(Bat_Column):
            setattr(self, name, table[:, col])

        # Derived indicators, computed element-wise per row.
        self.OPS = self.SLG + self.OBP
        self.NOI = (self.SLG / 3.0 + self.OBP) * 1000
        self.BABIP = (self.Hit - self.HR) / (self.AB + self.SF - self.HR - self.StrikeOut)
        self.RC = (self.Hit + self.BB) * self.TB / (self.AB + self.BB)
        self.IsoP = self.SLG - self.Average
        self.IsoD = self.OBP - self.Average
class TEAM(object):
    """Plotting metadata for one team.

    Attributes
    ----------
    ID : numeric identifier supplied by the caller.
    Name : team name; draw_scatter compares it with the ``Team`` column of
        the data to select that team's rows.
    maker : marker code that draw_scatter passes to ``ax.scatter`` as
        ``marker``. The attribute name is kept as-is because other code
        reads ``.maker``.
    """

    def __init__(self, ID, Name, maker):
        self.ID = ID
        self.Name = Name
        self.maker = maker

    def __repr__(self):
        return 'TEAM(%r, %r, %r)' % (self.ID, self.Name, self.maker)
# The twelve teams, each with its own scatter-plot marker.
# The list index of every entry equals its ID.
team = [
    TEAM(0, 'Carp', '>'),
    TEAM(1, 'Tigers', '<'),
    TEAM(2, 'Giants', '^'),
    TEAM(3, 'Dragons', 'v'),
    TEAM(4, 'DeNA', 'd'),
    TEAM(5, 'Yakult', 'D'),
    TEAM(6, 'Fighters', '8'),
    TEAM(7, 'Lotte', 'H'),
    TEAM(8, 'Lions', 'h'),
    TEAM(9, 'Eagles', '*'),
    TEAM(10, 'Orix', 'p'),
    TEAM(11, 'Hawks', 's'),
]
# Merge two Bat_Data instances into a single instance.
def Docking(Data1, Data2):
    """Concatenate two Bat_Data objects row-wise.

    The rows of ``Data1`` come first, followed by those of ``Data2``.
    Only the raw ``Bat_Column`` columns, ``Year`` and ``Team`` are copied;
    the derived indicators are recomputed by the ``Bat_Data`` constructor.

    Parameters
    ----------
    Data1, Data2 : Bat_Data
        Objects to combine.

    Returns
    -------
    Bat_Data
        A new instance; neither input is modified.
    """
    n_rows = Data1.Average.shape[0] + Data2.Average.shape[0]
    merged = np.zeros((n_rows, len(Bat_Column)))
    for col, name in enumerate(Bat_Column):
        merged[:, col] = np.r_[getattr(Data1, name), getattr(Data2, name)]

    # Local names deliberately avoid `team`, which is a module-level list.
    years = np.r_[Data1.Year, Data2.Year]
    teams = np.r_[Data1.Team, Data2.Team]
    return Bat_Data(merged, years, teams)
def get_data(League, year):
    """Load one league-season of team batting data from CSV.

    Reads ``./baseballdata/<year><League>_bat.csv``. Column 0 of the file
    is read as the team name and the following ``N`` columns as the
    numeric values named by ``Bat_Column``.

    Parameters
    ----------
    League : str
        League code used in the file name (the article uses 'C' and 'P').
    year : int
        Season; used in the file name and as the value of every ``Year``
        entry.

    Returns
    -------
    Bat_Data
        One row per line of the file.
    """
    fname = './baseballdata/' + str(year) + League + '_bat.csv'

    # ndmin keeps the shapes stable even when the file has a single row.
    table = np.loadtxt(fname, delimiter=',', usecols=range(1, N + 1), ndmin=2)
    teams = np.loadtxt(fname, delimiter=',', usecols=range(0, 1), dtype=str, ndmin=1)

    # One year entry per row actually loaded, instead of assuming six teams.
    years = np.ones(table.shape[0]) * year
    return Bat_Data(table, years, teams)
def get_all_data(League):
    """Load and merge every season from ``fy`` through ``fy + Yn - 1``.

    Parameters
    ----------
    League : str
        League code forwarded to ``get_data``.

    Returns
    -------
    Bat_Data
        All seasons concatenated in chronological order.

    Raises
    ------
    ValueError
        If the configured year range is empty (``Yn <= 0``).
    """
    if Yn <= 0:
        raise ValueError('empty year range: fy=%s, ly=%s' % (fy, ly))

    merged = None
    for year in range(fy, fy + Yn):
        season = get_data(League, year)
        # The first season seeds the result; later ones are appended.
        merged = season if merged is None else Docking(merged, season)
    return merged
# From Data.<Column_name>, extract only the entries whose team is Team_name.
def PickUp_Data_of_a_team(Data, Column_name, Team_name):
    """Return the values of one column that belong to one team.

    Parameters
    ----------
    Data : object
        Must expose ``Team`` and the attribute named by ``Column_name`` as
        equal-length sequences (numpy arrays or lists).
    Column_name : str
        Name of the attribute to read from ``Data``.
    Team_name : str
        Team whose entries are selected.

    Returns
    -------
    numpy.ndarray
        The selected entries in their original order; empty when no entry
        of ``Data.Team`` equals ``Team_name``.
    """
    values = np.asarray(getattr(Data, Column_name))
    is_team = np.asarray(Data.Team) == Team_name
    return values[is_team]
def draw_scatter(plt, Data, X_name, Y_name, regression_flg = 0, Y_eq_X_line_flg = 0,
                 title = 'Scatter plot', fsizex = 10, fsizey = 8,
                 line_range = (450, 750)):
    """Scatter-plot two columns of ``Data``, one marker style per team.

    Points are coloured by year (colour scale spans ``fy``..``ly``). The
    correlation coefficient between the two columns is printed.

    Parameters
    ----------
    plt : module
        ``matplotlib.pyplot`` (or a compatible object) supplied by the
        caller.
    Data : Bat_Data
        Source of the ``X_name``, ``Y_name``, ``Year`` and ``Team`` values.
    X_name, Y_name : str
        Attribute names plotted on the horizontal / vertical axis.
    regression_flg : int
        When equal to 1, draw the least-squares regression line.
    Y_eq_X_line_flg : int
        When equal to 1, draw the line y = x in black.
    title : str
        Axes title.
    fsizex, fsizey : number
        Figure size passed to ``plt.subplots``.
    line_range : (start, stop)
        x-range over which the optional lines are drawn, in steps of 1.
        The default reproduces the previously hard-coded 450..750.

    Returns
    -------
    The ``plt`` object that was passed in.

    Raises
    ------
    ValueError
        If none of the teams in ``team`` has any row in ``Data``.
    """
    _, ax = plt.subplots(figsize=(fsizex, fsizey))
    plt.rcParams['font.size'] = 16

    xs = []
    ys = []
    last_points = None
    for club in team:
        x = PickUp_Data_of_a_team(Data, X_name, club.Name)
        y = PickUp_Data_of_a_team(Data, Y_name, club.Name)
        year = PickUp_Data_of_a_team(Data, 'Year', club.Name)
        # Test emptiness via .size: comparing against np.array([]) silently
        # dropped single-row teams and fails to broadcast on recent numpy.
        if x.size == 0:
            continue
        last_points = ax.scatter(x, y, c=year, s=50, marker=club.maker,
                                 label=club.Name, vmin=fy, vmax=ly)
        xs.append(x)
        ys.append(y)

    if last_points is None:
        raise ValueError('Data contains no rows for any known team')

    # All plotted points, in team order, for the statistics below.
    X = np.concatenate(xs)
    Y = np.concatenate(ys)

    plt.colorbar(last_points, ticks=list(np.arange(fy, ly + 1)), label='year')
    plt.legend(bbox_to_anchor=(1.35, 1), loc=2, borderaxespad=0., scatterpoints=1)
    ax.set_title(title)
    ax.set_xlabel(X_name)
    ax.set_ylabel(Y_name)

    xx = np.arange(line_range[0], line_range[1], 1)

    # Draw the regression line.
    if regression_flg == 1:
        slope, intercept, _, _, _ = stats.linregress(X, Y)
        ax.plot(xx, slope * xx + intercept, linewidth=2)

    # Draw the straight line y = x.
    if Y_eq_X_line_flg == 1:
        ax.plot(xx, xx, color='k')

    # Single-argument print() form runs under both Python 2 and Python 3.
    print('Correlation= %s' % np.corrcoef(X, Y)[0, 1])
    return plt
For example, if you want to retrieve the 2016 Central League data, do the following:
In [1]:import NPB
In [2]:Data_2016C=NPB.get_data('C',2016) # For the Pacific League, change 'C' to 'P'.
In [3]:Data_2016C.Average #Output the batting average of each of the 6 teams in the 2016 Central League
Out[3]: array([ 0.272, 0.256, 0.251, 0.249, 0.245, 0.245])
In [1]:import matplotlib.pyplot as plt
In [2]:Data_C=NPB.get_all_data('C') #Extract all data from the Central League
In [3]:Data_P=NPB.get_all_data('P') #Get all the Pacific League data
In [4]:Data=NPB.Docking(Data_C,Data_P) #Integrate data from both leagues
In [5]:plt=NPB.draw_scatter(plt,Data,'Average','Score') #Draw a scatter plot of batting average and score
In [6]:plt.show()
The output figure is as follows.
The resulting correlation coefficient was:
Correlation= 0.825987845723
3.2. RC vs Score
Using the same data as in 3.1, we now draw a scatter plot with "RC" on the horizontal axis and "Score" on the vertical axis.
In [7]:plt=NPB.draw_scatter(plt,Data,'RC','Score',regression_flg=1,Y_eq_X_line_flg=1) # Draw a scatter plot of RC vs. score, together with the regression line and the straight line y = x.
In [8]:plt.show()
The output figure is as follows.
The resulting correlation coefficient was:
Correlation= 0.953524104544
From the value of the correlation coefficient, it can be seen that there is an extremely strong correlation between RC and the score.
Furthermore, the regression line (blue line in the above figure) is very close to the straight line of "y = x" (black line in the above figure).
(The regression line was y = 0.95x - 6.3.)
References
- [Moneyball (Complete Edition), by Michael Lewis](https://www.amazon.co.jp/s/ref=nb_sb_noss?__mk_ja_JP=%E3%82%AB%E3%82%BF%E3%82%AB%E3%83%8A&url=search-alias%3Daps&field-keywords=%E3%83%9E%E3%83%8D%E3%83%BC%E3%83%9C%E3%83%BC%E3%83%AB)
- Nippon Professional Baseball
Recommended Posts