Posted by Nick Paul on Tuesday, September 21, 2021


  1. Pitching Data
  2. Pitch Type
  3. Pitch Location
  4. Result

This post explores some of the statcast pitching dimensions and highlights some interesting ways to visualize the data.

Pitch Type

To get started, we will import a data set we collected in a previous post, which contains Aroldis Chapman’s pitch data from 2015-2021.

# Load the Aroldis Chapman pitch data (2015-2021) collected in a previous post.
df_chapman <- readr::read_csv("./chapman_20150301-20211001.csv")

Let’s start by identifying Chapman’s favorite pitch.

from bokeh.io import output_file, show, save
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
import pandas as pd

# Pitch-level data, plus a lookup table indexed by pitch_type. The lookup is
# the only source of the `pitch_name` column used for the bar labels below.
df_chapman = pd.read_csv("./chapman_20150301-20211001.csv")
df_pitch_lookup = pd.read_csv("./pitch_dict.csv").set_index("pitch_type")

# One row per pitch type: number of pitches thrown ("count"), the readable
# name joined in from the lookup, and the share of all pitches ("percent").
# Sorted ascending so the most-used pitch ends up at the top of the chart.
df_pitch_count = (df_chapman
                    .groupby("pitch_type")
                    .size()
                    .reset_index(name = "count")
                    .sort_values("count", ascending = True)
                    .join(df_pitch_lookup, on = "pitch_type")
                    .assign(percent = lambda df: 100*df['count']/df['count'].sum()))

source = ColumnDataSource(df_pitch_count)

# Hover shows the percentage with two decimals.
TOOLTIPS = [("Percent","@percent{0.2f} %")]

# Categorical y axis: the factors are passed as a plain list, in sorted order.
p = figure(y_range = df_pitch_count["pitch_name"].tolist(),
            tooltips = TOOLTIPS,
            title = "Aroldis Chapman Pitch Types: 2015-2021")
p.hbar(y="pitch_name", right = "count", source = source, height = .7)

# Start the bars at zero and strip the non-essential chart decoration.
p.x_range.start = 0
p.outline_line_color = None
p.grid.grid_line_color = None
p.xaxis.minor_tick_line_color = None
p.yaxis.major_tick_line_color = None
p.yaxis.axis_line_color = None

# NOTE(review): nothing here renders the figure; call show(p) or save(p)
# (already imported above) to display or write it.


Bokeh Plot

Next, we can analyze if his pitch preference has changed over time.

from bokeh.palettes import all_palettes

# Pitches per (game_date, pitch_type, pitch_name), rolled up to one row per
# (year, pitch_type, pitch_name). `total` is the number of pitches in that
# year and `percent` is the pitch type's share of it.
df_pt_gd = (df_chapman[['game_date', 'pitch_type', 'pitch_name']]
        .groupby(['game_date', 'pitch_type', 'pitch_name'])
        .size()
        .reset_index(name = 'count')
        .assign(game_date = lambda df: pd.to_datetime(df['game_date']))
        .assign(year = lambda df: df.game_date.dt.year)
        .groupby(['year', 'pitch_type', 'pitch_name'])['count']
        .sum()
        .reset_index()
        .assign(total = lambda df: df.groupby('year')['count'].transform('sum'))
        .assign(percent = lambda df: (df['count']/df['total'])*100))

pitch_types = df_pt_gd['pitch_type'].drop_duplicates().to_list()

# Colour per pitch type. Indexing the 10-colour palette (and cycling) avoids
# the KeyError that all_palettes['Category10'][n] raises for n outside 3..10.
# NOTE(review): the original hand-written colour dict was truncated after its
# first entry; only 'CH': 'green' survived, so it is kept as an override and
# the remaining pitch types fall back to Category10.
category10 = all_palettes['Category10'][10]
pal = {pitch_type: category10[i % len(category10)]
       for i, pitch_type in enumerate(pitch_types)}
pal['CH'] = 'green'

TOOLTIPS = [
    ("Pitch Type","@pitch_name"),
    ("Year", "@year"),
    ("Percent","@percent{0.2f} %")]

p = figure(width=1000,
           tooltips = TOOLTIPS,
           title = "Aroldis Chapman Pitch % by Season")

# One line (with a legend entry) per pitch type.
for value in pitch_types:
    df_one_pitch = df_pt_gd[df_pt_gd['pitch_type'] == value]
    pitch_name = df_one_pitch['pitch_name'].iloc[0]
    source = ColumnDataSource(df_one_pitch)
    p.line('year', 'percent', color = pal.get(value), source = source,
           line_width = 2, legend_label = pitch_name)
    # NOTE(review): the name of this second glyph call was lost in the
    # original; point markers on each season are assumed.
    p.scatter('year', 'percent', color = pal.get(value), source = source)

# Minimal axes: keep only the x major ticks and three y reference ticks.
p.xgrid.grid_line_color = None
p.xaxis.axis_line_color = None
p.xaxis.minor_tick_line_color = None
p.xaxis.major_tick_line_color = 'grey'

p.yaxis.axis_line_color = None
p.yaxis.minor_tick_line_color = None
p.yaxis.major_tick_line_color = None
p.y_range.start = 0

p.yaxis.ticker = [25, 50, 75]
p.outline_line_color = None

# Move the legend outside the plot area, to the right.
p.add_layout(p.legend[0], 'right')
p.legend.border_line_color = None


Bokeh Plot

Pitch Location

Let’s now look at his location by pitch type over time and see if anything has changed.

# Mean top and bottom of the strike zone over all pitches in df_chapman; used
# to draw one reference rectangle on every panel.
avg_zone <- df_chapman %>%
  summarise(
    sz_top = mean(sz_top, na.rm = TRUE),
    sz_bot = mean(sz_bot, na.rm = TRUE)
  )

# Hex-binned pitch locations, one panel per season (rows) and pitch type
# (columns). Pitch type / season combinations with 15 or fewer pitches are
# dropped so sparse panels do not dominate the grid.
df_chapman %>%
  filter(pitch_type %in% c("CH", "FF", "FS", "SI", "SL")) %>%
  group_by(pitch_type, game_year) %>%
  filter(n() > 15) %>%
  ungroup() %>%
  ggplot(aes(x = plate_x, y = plate_z)) +
  geom_hex(bins = 10) +
  # annotate() draws the zone once per panel; geom_rect() with constant
  # aesthetics would draw one identical rectangle per data row.
  annotate(
    "rect",
    xmin = -0.83, xmax = 0.83,
    ymin = avg_zone$sz_bot, ymax = avg_zone$sz_top,
    colour = "black", alpha = 0
  ) +
  scale_fill_continuous(type = "viridis") +
  ylim(c(1, 4)) +
  facet_grid(game_year ~ pitch_type)


We will now build a few charts to analyze the results of his pitches.

# Horizontal bar chart of how often each event occurs, most frequent on top.
# NOTE(review): the filter condition was lost in the original; dropping rows
# with a missing `events` value is assumed - confirm.
df_chapman %>%
  filter(!is.na(events)) %>%
  count(events) %>%
  # Order the bars by frequency; fct_reorder() alone fixes the plot order.
  mutate(events = fct_reorder(events, n)) %>%
  ggplot(aes(x = events, y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Aroldis Chapman Event Types")

From the chart above, we can see Chapman is a strikeout pitcher.

Let’s see if Chapman’s events have changed over time.

# Number of distinct game dates per calendar year, as columns `year`, `games`.
df_games <- df_chapman %>%
  distinct(game_date) %>%
  count(year = lubridate::year(game_date), name = "games")

# Events per game by season, one free-scaled panel per event type. Only event
# types with more than 20 occurrences across all seasons are shown.
# NOTE(review): the filter condition was lost in the original; dropping rows
# with a missing `events` value is assumed - confirm.
df_chapman %>%
  filter(!is.na(events)) %>%
  count(events, game_year) %>%
  # Join after aggregating so the games-per-year lookup is attached to the
  # small summary table rather than to every pitch row.
  left_join(df_games, by = c("game_year" = "year")) %>%
  group_by(events) %>%
  filter(sum(n) > 20) %>%
  ungroup() %>%
  mutate(
    events = fct_reorder(events, n),
    per_game = n / games
  ) %>%
  ggplot(aes(x = game_year, y = per_game, color = events)) +
  geom_line() +
  labs(title = "Aroldis Chapman Event Types") +
  facet_wrap(~events, scales = "free")

From this chart, we can see Chapman is trending up in home runs, walks, and doubles, and down in strikeouts and force outs, which is not what you want from your closer.