#initial testing for strike zone gif for 2025 season
#note: did not take into account different batter stances and included small tests before all of 2025
#the final version is 25strikezone-nostance.R and 24vs25strikezone.R

#load packages
library(tidyr)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggridges) #theme package
library(mgcv) #run bam model
library(lubridate)

#load full 2025 season data
full <- read.csv('data/statcast2025.csv')


#using data from may 2025 for easier testing
may <- full |> filter(
  game_date >= '2025-05-01', game_date < '2025-06-01',
  description %in% c("ball", "called_strike")) |> 
  select(description, plate_x, plate_z, sz_top, sz_bot, stand) |> 
  mutate(is_strike = ifelse(description == "called_strike", 1, 0)) |> 
  drop_na()

may$stand <- factor(may$stand, levels=c("L", "R"))
may$description <- factor(may$description, levels = c("ball", "called_strike"))
  

#find and store the average top/bot of the strike zone for may for drawing
#the strike zone rectangle
mayavg_sztop <- mean(may$sz_top)
mayavg_szbot <- mean(may$sz_bot)


#function that draws the plate to use in visuals
geom_plate <- function(){
  df <- data.frame(x = c(-.7083, .7083, .7083 ,0, -.7083), y = c(0, 0, -.25, -.5, -.25))
  g <- geom_polygon(data = df, aes(x = x, y = y), fill = "white", color = "gray60", linewidth = 1.25)
  g
}


#############################################
#drawing the "actual strike zone"
#############################################
#using a generative additive model (GAM) to identify non-linear relationships
#can be used to make heatmaps or drawing the boundary where balls/strikes are called
#bam = big additive model, fits GAMs for very large datasets. reduced memory usage and faster computation

#create and fit a bam model
#use to get the probability that a pitch will be called a strike
may_fit <- bam(is_strike ~ te(plate_x, plate_z, by = stand), #stand will create a different strike zone for L vs R batters
               data=may, 
               family = binomial, #0=ball, 1=strike
               discrete = TRUE) #helps for faster computation with large dataset 

#create dataset "grid" that has combinations of plate_x and plate_y coordinates
#50x50 map around the plate
grid <- expand.grid(
  plate_x = seq(-1.5, 1.5, length.out = 50),
  plate_z = seq(1, 4, length.out = 50),
  stand = levels(may$stand)
)


#applies gam to the grid of coordinates to assign a probability of being called a strike
grid$prob <- predict(may_fit, grid, type="response")


#plotting predictions from the grid dataset
#NOTE: to add geom_plate(), expand grid since plate goes down to plate_z.= -0.5
ggplot(grid, aes(plate_x, plate_z, z = prob)) +
  geom_contour(breaks = 0.5, linewidth = 1.2) +  #drawing a line where the probability of a strike is 50%
  geom_rect(xmin = -0.7083, xmax = 0.7083, ymin = mayavg_szbot, ymax = mayavg_sztop, 
            fill = NA, color = "black", alpha = 0.3) +  #"set" strike zone 
  labs(title = "'Actual' May 2025 Strike Zone",
       subtitle = "Compared to 'Set' Strike Zone for L vs R batters") +
  facet_wrap(~stand) + #separate visuals for left and right handed batters
  coord_fixed() +
  theme_minimal()


#######################################

#what if i try to do based on matchup???
#create a new column that holds matchup: ex: LR (left handed batter, right handed pither)

may_matchup <- full |> filter(
    game_date >= '2025-05-01', game_date < '2025-06-01',
    description %in% c("ball", "called_strike")) |> 
  select(description, plate_x, plate_z, sz_top, sz_bot, stand, p_throws) |> 
  mutate(is_strike = ifelse(description == "called_strike", 1, 0)) |> 
  drop_na()

may_matchup$stand <- factor(may_matchup$stand, levels=c("L", "R"))
may_matchup$p_throws <- factor(may_matchup$p_throws, levels=c("L", "R"))
may_matchup$description <- factor(may_matchup$description, levels = c("ball", "called_strike"))


#find and store the average top/bot of the strike zone for may for drawing
#the strike zone rectangle
mayavg_sztop <- mean(may$sz_top)
mayavg_szbot <- mean(may$sz_bot)


#####LEAVING OFF HERE
#create variable that holds batter and pitcher 
#string together contents of stand and p_throws
may_matchup$matchup <- paste0(may_matchup$stand, may_matchup$p_throws, sep = "") #need sep="" for the factor to work

may_matchup$matchup <- factor(may_matchup$matchup, levels = c("LL", "RR", "LR", "RL"))


maymatchup_fit <- bam(is_strike ~ te(plate_x, plate_z, by = matchup), #stand will create a different strike zone for L vs R batters
               data=may_matchup, 
               family = binomial, #0=ball, 1=strike
               discrete = TRUE) #helps for faster computation with large dataset 

#create dataset "grid" that has combinations of plate_x and plate_y coordinates
#50x50 map around the plate
grid_matchup <- expand.grid(
  plate_x = seq(-1.5, 1.5, length.out = 50),
  plate_z = seq(1, 4, length.out = 50),
  matchup = levels(may_matchup$matchup)
)


#applies gam to the grid of coordinates to assign a probability of being called a strike
grid_matchup$prob <- predict(maymatchup_fit, grid_matchup, type="response")


#plotting predictions from the grid dataset
#NOTE: to add geom_plate(), expand grid since plate goes down to plate_z.= -0.5
ggplot(grid_matchup, aes(plate_x, plate_z, z = prob)) +
  geom_contour(breaks = 0.5, linewidth = 1.2) +  #drawing a line where the probability of a strike is 50%
  geom_rect(xmin = -0.7083, xmax = 0.7083, ymin = mayavg_szbot, ymax = mayavg_sztop, 
            fill = NA, color = "black", alpha = 0.3) +  #"set" strike zone 
  labs(title = "'Actual' May 2025 Strike Zone",
       subtitle = "Compared to 'Set' Strike Zone for L vs R batters") +
  facet_wrap(~matchup) + #separate visuals for left and right handed batters
  coord_fixed() +
  theme_minimal()


####################################################################
#------------------------------------------------------------------#
####################################################################


#now that the basis is done....
#move onto making a method to create this for every month of the year


#create a clean, simple dataset with only the information i need
year_25 <- full |> 
  filter(
  description %in% c("ball", "called_strike")) |> 
  mutate(is_strike = ifelse(description == "called_strike", 1, 0),
         matchup = paste0(stand, p_throws, sep = " "),
         game_month = month(game_date)) |>
  select(game_month, plate_x, plate_z, stand, is_strike) |> 
  drop_na()


#creating a function for running the BAM model for each month in 2025
#maybe create a small subset to make sure this works correctly first...
#rerun year_25 because matchup doest work since i factored it - ended up not using matchup at all

#take a random sample to run first as a test
test_year_25 <- sample_n(year_25, size = 100000, replace = FALSE)

test_year_25 <- test_year_25 |> 
  filter(game_month != 3, game_month != 11, game_month != 10)

#view number of observations in test_year_25
test_year_25 |> 
  group_by(game_month, stand) |> 
  summarise(
    pitch_count = n(),
    strike_rate = mean(is_strike),
    .groups = "drop"
  )


#nest the data by month
test_monthlymodel <- test_year_25 |> 
  group_by(game_month) |> 
  nest() |> 
  mutate(model = map(data, ~bam(
    is_strike ~ te(plate_x, plate_z, by = as.factor(stand)),
    data = .x,
    family=binomial,
    discrete = TRUE
  )))


#grid
test_yeargrid <- expand.grid(
  plate_x = seq(-1.5, 1.5, length.out = 50),
  plate_z = seq(1, 4, length.out = 50),
  stand = c("L", "R")
)


final_grid <- test_yeargrid


# loop through each row of the data
for(i in 1:nrow(test_monthlymodel)) {
  month_val <- test_monthlymodel$game_month[i]
  current_model <- test_monthlymodel$model[[i]]
  
  #column name ex:month_4
  col_name <- paste0("month_", month_val)
  
  #create predictions and add to the grid
  final_grid[[col_name]] <- predict(
    current_model, 
    newdata = final_grid, 
    type = "response"
  )
}


#turn grid into long form so there is one month column and one probability column
grid_long <- final_grid |>
  pivot_longer(
    cols = starts_with("month_"), 
    names_to = "month", 
    names_prefix = "month_",
    values_to = "probability"
  ) |>
  mutate(month = as.numeric(month))


#get the top and bot of the strike zone
year_metrics <- full |> 
  select(sz_top, sz_bot)

top2025 <- mean(year_metrics$sz_top)
bot2025 <- mean(year_metrics$sz_bot)


grid_long <- grid_long |> 
  mutate(month = as.factor(month))


#create the same visual of the actual strike zone separated by left vs right handed batters
#months are drawn on top of each other, with beginning of the season being lighter than end
ggplot(grid_long, aes(plate_x, plate_z, z = probability)) +
  geom_contour(aes(color = as.factor(month)), breaks = 0.5, linewidth = 1.2) +  #drawing a line where the probability of a strike is 50%
  geom_rect(xmin = -0.7083, xmax = 0.7083, ymin = bot2025, ymax = top2025, 
            fill = NA, color = "black", alpha = 0.3, inherit.aes = FALSE) +  #"set" strike zone 
  scale_color_brewer(palette = "Blues") +
  labs(title = "'Actual' Strike Zone",
       subtitle = "Compared to 'Set' Strike Zone for L vs R batters") +
  facet_wrap(~stand) + #separate visuals for left and right handed batters
  coord_fixed() +
  theme_minimal()


#can this be animated? to draw a month on top of the next
library(gganimate)

gif <- 
  ggplot(grid_long, aes(plate_x, plate_z, z = probability, color = month)) +
  geom_rect(xmin = -0.7083, xmax = 0.7083, ymin = bot2025, ymax = top2025, 
            fill = NA, color = "black", alpha = 0.3, inherit.aes = FALSE) +  #"set" strike zone 
  geom_contour(breaks = 0.5, linewidth = 1.0) +
  scale_color_brewer(palette = "Blues") +
  facet_wrap(~stand) +
  coord_fixed() +
  theme_minimal() +
  labs(title = "Changing Strike Zone",
       subtitle = "Month: {closest_state}",
       color = "Month") +
  transition_states(month, transition_length = 2, state_length = 1) +
  shadow_mark(past = TRUE, future = FALSE) #keeps the other months on the plot

#no way.
animate(gif, nframes = 80, fps = 10, width = 800, height = 600, renderer = gifski_renderer())
  

####################################################################
#------------------------------------------------------------------#
####################################################################

#NOW THAT IT WORKS
#run on the entire 2025 set instead of subset

#load full 2025 season data
full <- read.csv('data/statcast2025.csv')


#create a clean, simple dataset with only the information i need
#same code as above
year_25 <- full |> 
  filter(
    description %in% c("ball", "called_strike")) |> 
  mutate(is_strike = ifelse(description == "called_strike", 1, 0),
         # matchup = paste0(stand, p_throws, sep = " "),
         game_month = month(game_date)) |>
  select(game_month, plate_x, plate_z, stand, is_strike) |> 
  drop_na()


#functions and values to add to the visualizations
#get the top and bot of the strike zone for visualizing the 'set' strike zone
year_metrics <- full |> 
  select(sz_top, sz_bot) |> 
  drop_na()

top2025 <- mean(year_metrics$sz_top)
bot2025 <- mean(year_metrics$sz_bot)

#function that draws the plate to use in visuals
geom_plate <- function(){
  df <- data.frame(x = c(-.7083, .7083, .7083 ,0, -.7083), y = c(0, 0, -.25, -.5, -.25))
  g <- geom_polygon(data = df, aes(x = x, y = y), fill = "white", color = "gray60", linewidth = 1.25)
  g
}


#view number of observations in year_25 to see if months need to be dropped
year_25 |> 
  group_by(game_month, stand) |> 
  summarise(
    pitch_count = n(),
    strike_rate = mean(is_strike),
    .groups = "drop"
  )

#oct: 6k observations; nov: 170 observations
#drop both months so that way every month has at least 10k observations for each "stand"
year_25 <- year_25 |> 
  filter(game_month != 10, game_month != 11)


#nest the data by month
#run a bam() model for each
monthly_modeling <- year_25 |> 
  group_by(game_month) |> 
  nest() |> 
  mutate(model = map(data, ~bam(
    is_strike ~ te(plate_x, plate_z, by = as.factor(stand)),
    data = .x,
    family=binomial,
    discrete = TRUE
  )))


#create a grid that the predictions will be used against
grid25 <- expand.grid(
  plate_x = seq(-1.5, 1.5, length.out = 50),
  plate_z = seq(1, 4, length.out = 50),
  stand = c("L", "R")
)


# loop through each row of the data
for(i in 1:nrow(monthly_modeling)) {
  month_val <- monthly_modeling$game_month[i]
  current_model <- monthly_modeling$model[[i]]
  
  #column name ex:month_4
  col_name <- paste0("month_", month_val)
  
  #create predictions and add to the grid
  grid25[[col_name]] <- predict(
    current_model, 
    newdata = grid25, 
    type = "response"
  )
}


#turn grid into long form so there is one month column and one probability column
grid25_long <- grid25 |>
  pivot_longer(
    cols = starts_with("month_"), 
    names_to = "month", 
    names_prefix = "month_",
    values_to = "probability"
  ) |>
  mutate(
    month = month.name[as.numeric(month)],
    month = factor(month, levels = month.name)
  )


grid25_long <- grid25_long |> 
  mutate(month = as.factor(month))


#create the same visual of the actual strike zone separated by left vs right handed batters
#months are drawn on top of each other, with beginning of the season being lighter than end
p <- ggplot(grid25_long, aes(plate_x, plate_z, z = probability, color = month)) +
  geom_contour(breaks = 0.5, linewidth = 1.1) +  #drawing a line where the probability of a strike is 50%
  geom_rect(xmin = -0.7083, xmax = 0.7083, ymin = bot2025, ymax = top2025, 
            fill = NA, color = "black", alpha = 0.3, inherit.aes = FALSE) +  #"set" strike zone 
  scale_color_brewer(palette = "Blues") +
  labs(title = "'Actual' Strike Zone each Month in 2025", 
       subtitle = "Compared to 'Set' Strike Zone for Left vs Right-Handed Batters",
       x = "Horizontal Location of Pitch",
       y = "Vertical Location of Pitch",
       color = "Month"
  ) +
  facet_wrap(~stand) + #separate visuals for left and right handed batters
  coord_fixed() +
  theme_minimal()


#further edits from here
p <- p + theme(plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 14), 
               plot.subtitle = element_text(hjust = 0.5, vjust = 0.5, size = 12,
                                            margin = margin(b=20, unit="pt")),
               axis.title.x = element_text(margin = margin(t=15, unit="pt")),
               axis.title.y = element_text(margin = margin(r=15, unit="pt")),
               strip.text = element_text(size = 10, color = 'black'), 
               panel.spacing = unit(2, "lines"))  #add space between facet wraps

#view static visual
p


#create a gif of the strike zone each month with the actual zone being placed on top of the others
library(av)
library(gifski)
library(gganimate)

grid25_long$month <- factor(grid25_long$month, levels = month.name)

month_order <- c("March", "April", "May", "June", "July", "August", "September")

grid_cleaned <- grid25_long %>%
  group_by(stand, month, plate_x, plate_z) %>%
  summarise(probability = mean(probability, na.rm = TRUE), .groups = "drop") %>%
  mutate(month = factor(month, levels = month_order))


gifs25 <- 
  ggplot(grid_cleaned, aes(plate_x, plate_z, z = probability, color = month, group = month)) +
  geom_rect(xmin = -0.7083, xmax = 0.7083, ymin = bot2025, ymax = top2025, 
            fill = NA, color = "black", alpha = 0.3, inherit.aes = FALSE) +  #"set" strike zone 
  geom_contour(aes(group = month), breaks = 0.5, linewidth = 1.1) +
  scale_color_brewer(palette = "Blues") +
  facet_wrap(~stand) +
  coord_fixed() +
  theme_minimal() +
  labs(title = "'Actual' Strike Zone each Month in 2025",
       subtitle = "Month: {closest_state}",
       x = "Horizontal Location of Pitch",
       y = "Vertical Location of Pitch",
       color = "Month") +
  transition_states(month, transition_length = 2, state_length = 1) +
  shadow_mark(past = TRUE, future = FALSE) #keeps the other months on the plot


gifs25 <- gifs25 + theme(plot.title = element_text(hjust = 0.5, vjust = 0.5, size = 14), 
                       plot.subtitle = element_text(hjust = 0.5, vjust = 0.5, size = 12,
                                                    margin = margin(b=20, unit="pt")),
                       axis.title.x = element_text(margin = margin(t=15, unit="pt")),
                       axis.title.y = element_text(margin = margin(r=15, unit="pt")),
                       strip.text = element_text(size = 10, color = 'black'), 
                       panel.spacing = unit(2, "lines"))  #add space between facet wraps


#save visual as gif
final_gifs25 <- animate(gifs25, nframes = 80, fps = 10, width = 800, height = 600, renderer = gifski_renderer())

#save strike zone gif to folder
anim_save("changingstrikezone25.gif", final_gifs25)