Statcast Pitching Data

This post explores some of the Statcast pitching dimensions and highlights some interesting ways to visualize the data.

Pitch Type

To get started, we will import a data set we collected in a previous post, which contains Aroldis Chapman’s pitch data from 2015-2021.

# Load the pitch-level CSV collected in the earlier post into a tibble.
df_chapman <- readr::read_csv(file = "./chapman_20150301-20211001.csv")

Let’s start by identifying Chapman’s favorite pitch.

from bokeh.io import output_file, show, save
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
import pandas as pd

# Raw pitch-level data, plus a lookup table keyed by pitch_type so it can be
# joined onto the per-type counts below.
df_chapman = pd.read_csv("./chapman_20150301-20211001.csv")
df_pitch_lookup = pd.read_csv("./pitch_dict.csv").set_index("pitch_type")

# Number of rows per pitch_type, sorted from least to most frequent, with the
# lookup columns attached.
df_pitch_count = (
    df_chapman
    .groupby("pitch_type")
    .size()
    .sort_values(ascending=True)
    .to_frame("count")
    .reset_index()
    .join(df_pitch_lookup, on="pitch_type")
)
# Share of all counted pitches, expressed in percent.
df_pitch_count = df_pitch_count.assign(
    percent=100 * df_pitch_count["count"] / df_pitch_count["count"].sum()
)

print(df_pitch_count)

source = ColumnDataSource(df_pitch_count)

# Hover text: percentage formatted to two decimals.
TOOLTIPS = [("Percent", "@percent{0.2f} %")]

# Horizontal bar chart: one categorical row per pitch name, bar length = count.
p = figure(
    title="Aroldis Chapman Pitch Types: 2015-2021",
    y_range=df_pitch_count.pitch_name,
    tooltips=TOOLTIPS,
)
p.hbar(y="pitch_name", right="count", height=0.7, source=source)

# Anchor the bars at zero and strip the frame, grid lines and spare ticks.
p.x_range.start = 0
p.grid.grid_line_color = None
p.outline_line_color = None
p.yaxis.axis_line_color = None
p.yaxis.major_tick_line_color = None
p.xaxis.minor_tick_line_color = None

# Write the chart to a standalone HTML file.
output_file("pitch_count.html")
save(p)

Bokeh Plot

Next, we can analyze if his pitch preference has changed over time.


from bokeh.palettes import all_palettes

# Per-season pitch mix.
#   1. Count rows per (game_date, pitch_type, pitch_name).
#   2. Parse game_date and derive the calendar year.
#   3. Sum the per-date counts up to (year, pitch_type, pitch_name).
#   4. Add each year's total and the pitch type's percentage of that total.
df_pt_gd = (
    df_chapman[['game_date', 'pitch_type', 'pitch_name']]
    .groupby(['game_date', 'pitch_type', 'pitch_name'])
    .size()
    .to_frame('count')
    .reset_index()
    .assign(game_date=lambda df: pd.to_datetime(df['game_date']))
    .assign(year=lambda df: df.game_date.dt.year)
    .groupby(['year', 'pitch_type', 'pitch_name'])
    .agg({'count': 'sum'})
    .reset_index()
    .assign(total=lambda df: df.groupby('year')['count'].transform('sum'))
    .assign(percent=lambda df: (df['count'] / df['total']) * 100)
)

# Distinct pitch codes, in order of first appearance in df_pt_gd.
pitch_types = df_pt_gd['pitch_type'].drop_duplicates().to_list()
num_pitch_types = len(pitch_types)

# Explicit colour per pitch code. A fixed mapping is used instead of indexing
# a Bokeh palette by num_pitch_types, because that lookup raises KeyError for
# any count the palette has no entry for.
# Pitch codes not listed here have no assigned colour.
# NOTE(review): 'FT' and 'SI' share 'orange' - confirm this is intentional.
pal = {
    'CH': 'green',
    'FF': 'red',
    'FT': 'orange',
    'SL': 'yellow',
    'FS': 'blue',
    'SI': 'orange',
}

print(pal)

['CH', 'FF', 'FT', 'IN', 'SL', 'SI', 'FS']

# Hover text for the per-season chart: pitch name, season and percentage
# (two decimals).
TOOLTIPS = [
    ("Pitch Type", "@pitch_name"),
    ("Year", "@year"),
    ("Percent", "@percent{0.2f} %"),
]

p = figure(
    width=1000,
    tooltips=TOOLTIPS,
    title="Aroldis Chapman Pitch % by Season",
)

# Colour used for any pitch code that has no entry in `pal`. Without a
# fallback, pal.get() returns None and the series is drawn with no colour
# (invisible) while still appearing in the legend.
FALLBACK_COLOR = 'grey'

# One line plus markers per pitch type, each backed by its own data source so
# the hover tool reports that series' rows.
for pitch_type in pitch_types:
    df_pitch = df_pt_gd[df_pt_gd['pitch_type'] == pitch_type]
    pitch_name = df_pitch['pitch_name'].iloc[0]
    color = pal.get(pitch_type, FALLBACK_COLOR)
    source = ColumnDataSource(df_pitch)
    # line_width is the documented Line glyph property for stroke thickness.
    p.line('year', 'percent', color=color, source=source, line_width=2,
           legend_label=pitch_name)
    # scatter() draws screen-sized markers (circles by default).
    p.scatter('year', 'percent', color=color, source=source)

# x axis: keep only grey major ticks.
p.xgrid.grid_line_color = None
p.xaxis.axis_line_color = None
p.xaxis.minor_tick_line_color = None
p.xaxis.major_tick_line_color = 'grey'

# y axis: no axis line or tick marks, start at zero, label 25/50/75 only.
p.yaxis.axis_line_color = None
p.yaxis.minor_tick_line_color = None
p.yaxis.major_tick_line_color = None
p.y_range.start = 0
p.yaxis.ticker = [25, 50, 75]

# Drop the frame and move the borderless legend to the right of the plot area.
p.outline_line_color = None
p.add_layout(p.legend[0], 'right')
p.legend.border_line_color = None

# Write the chart to a standalone HTML file.
output_file("pitch_count_by_date.html")
save(p)

Bokeh Plot

Pitch Location

Let’s now look at his location by pitch type over time and see if anything has changed.

# One-row summary: mean top and bottom of the strike zone over all pitches,
# ignoring missing values. Used to draw the zone outline on the location plot.
avg_zone <- summarise(
  df_chapman,
  sz_top = mean(sz_top, na.rm = TRUE),
  sz_bot = mean(sz_bot, na.rm = TRUE)
)



# Hex-binned pitch locations (plate_x vs plate_z), faceted with one row per
# season and one column per pitch type.
df_chapman %>%
  # Tag every pitch with the size of its pitch-type/season group and keep only
  # groups with more than 15 pitches.
  group_by(pitch_type, game_year) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  filter(count > 15) %>%
  filter(pitch_type %in% c('CH', 'FF', 'FS', 'SI', 'SL')) %>%
  ggplot(aes(x = plate_x, y = plate_z)) +
  geom_hex(bins = 10) +
  # Strike-zone outline. annotate() draws a single rectangle per panel, whereas
  # geom_rect() with constant aesthetics draws one per data row.
  # NOTE(review): +/- .83 is presumably the zone half-width in feet - confirm.
  annotate("rect",
           xmin = -.83, xmax = .83,
           ymin = avg_zone$sz_bot, ymax = avg_zone$sz_top,
           color = 'black', alpha = 0) +
  scale_fill_continuous(type = "viridis") +
  # Restrict the vertical axis to 1-4; points outside this range are dropped.
  ylim(c(1, 4)) +
  facet_grid(game_year ~ pitch_type) +
  theme_bw()

Results

We will now build a few charts to analyze the results of his pitches.

# Horizontal bar chart of how many rows carry each non-missing `events` value,
# with bars ordered by frequency.
df_chapman %>%
  filter(!is.na(events)) %>%
  count(events, sort = TRUE) %>%
  # Order the factor levels by count so the bars are sorted.
  mutate(events = fct_reorder(events, n)) %>%
  ggplot(aes(x = events, y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Aroldis Chapman Event Types")

From the chart above, we can see Chapman is a strikeout pitcher.

Let’s see if Chapman’s events have changed over time.

# Number of distinct game dates per calendar year (columns: year, games).
df_games <- df_chapman %>%
  distinct(game_date) %>%
  mutate(year = lubridate::year(game_date)) %>%
  count(year, name = "games")
  

# Events per game by season, one free-scaled panel per event type.
df_chapman %>%
  filter(!is.na(events)) %>%
  # Attach the number of games for each season.
  left_join(df_games, by = c("game_year" = "year")) %>%
  count(events, game_year, games) %>%
  # Keep only event types that occur more than 20 times across all seasons.
  add_count(events, wt = n, name = "total_events") %>%
  filter(total_events > 20) %>%
  arrange(desc(n)) %>%
  mutate(
    events = fct_reorder(events, n),
    per_game = n / games
  ) %>%
  ggplot(aes(x = game_year, y = per_game, color = events)) +
  geom_line() +
  labs(title = "Aroldis Chapman Event Types") +
  facet_wrap(~events, scales = "free")

From this chart, we can see Chapman is trending up in home runs, walks, and doubles, and down in strikeouts and force outs, which is not what you want from your closer.

Intro to Statcast Pitching Data

TOC

Statcast Pitching Data

Pitch Type

Pitch Location

Results

FEATURED TAGS