#save pitcher data for simulation as csv
#NOTE: the only csvs you need to download are clean_pitcherdata.csv and pitchnames_and_estsd.csv
#(other spreadsheets were other versions no longer being used to run python/flask)

library(tidyr)
library(tidyverse)
library(dplyr)

# Load the raw exports for the 2024 and 2025 seasons.
s24 <- read.csv(file = "statcast2024.csv")
s25 <- read.csv(file = "statcast2025.csv")

# Stack both seasons into one data frame (rbind() needs matching columns).
all_data <- rbind(s24, s25)



# Find all pitchers that have enough data.
# Step 1: number of rows (pitches) per pitcher and batter side (stand).
pitches_per_side <- all_data |>
  group_by(pitcher, stand) |>
  summarize(observations = n(), .groups = "drop")

# Step 2: keep a pitcher only when every one of his stand rows has more than
# 1000 observations, then reduce to one row per qualifying pitcher id.
pitcher_options <- pitches_per_side |>
  group_by(pitcher) |>
  filter(all(observations > 1000)) |>
  ungroup() |>
  distinct(pitcher)

view(pitcher_options)


# Create the base data set for the simulation.
# Only pitchers that users can select (those in pitcher_options) are kept.
selectable_ids <- pitcher_options$pitcher

pitcher_data <- all_data |>
  filter(pitcher %in% selectable_ids) |>
  # keep only the columns the simulation needs
  select(
    pitcher, player_name,
    plate_x, plate_z, sz_top, sz_bot,
    stand, p_throws,
    pitch_type, description,
    balls, strikes,
    pitch_name
  )


# Lookup vectors used to collapse raw values into simpler categories.

# Descriptions treated as a swinging strike.
# NOTE(review): "automatic_strike" and "missed_bunt" are grouped here, so they
# are counted as swings wherever simple_result is used - confirm intended.
swinging_strike <- c(
  "swinging_strike_blocked",
  "swinging_strike",
  "missed_bunt",
  "automatic_strike"
)

# Descriptions treated as a ball.
ball <- c(
  "blocked_ball",
  "ball",
  "hit_by_pitch",
  "automatic_ball",
  "pitchout"
)

# Descriptions treated as a foul.
foul <- c(
  "foul",
  "foul_bunt",
  "foul_tip",
  "bunt_foul_tip",
  "foul_pitchout"
)

# Pitch-type codes per pitch_type_group (groupings as listed on statcast).
fastball <- c("FF", "SI", "FC")
offspeed <- c("CH", "FS", "FO", "SC")
curveball <- c("CU", "KC", "CS")
slider <- c("SL", "ST", "SV")
knuckleball <- c("KN")

# Build the full clean data set (pitcher_data already holds only valid
# pitchers) and save it to csv.
pitcher_data <- pitcher_data |>
  mutate(
    # Assign each pitch_type code to its pitch_type_group. Codes that are in
    # none of the group vectors become NA and are removed by drop_na() below.
    pitch_type_groups = case_when(
      pitch_type %in% fastball ~ "fastball",
      pitch_type %in% offspeed ~ "offspeed",
      pitch_type %in% curveball ~ "curveball",
      pitch_type %in% slider ~ "slider",
      pitch_type %in% knuckleball ~ "knuckleball",
      TRUE ~ NA_character_
    ),
    # Collapse the raw description into the possible outcomes; anything not
    # covered by the outcome vectors keeps its original description.
    simple_result = case_when(
      description %in% swinging_strike ~ "swinging_strike",
      description %in% ball ~ "ball",
      description %in% foul ~ "foul",
      description == "called_strike" ~ "called_strike",
      TRUE ~ description
    ),
    # 1/0 indicator columns derived from simple_result
    is_swing = as.numeric(
      simple_result %in% c("swinging_strike", "foul", "hit_into_play")
    ), # batter swung
    is_contact = as.numeric(
      simple_result %in% c("foul", "hit_into_play")
    ), # swing made contact
    is_foul = as.numeric(simple_result == "foul"), # contact was a foul
    is_swingstrike = as.numeric(simple_result == "swinging_strike"), # swing and miss
    is_hit = as.numeric(simple_result == "hit_into_play"), # ball put in play
    # plate_x = ifelse(stand == "L", -plate_x, plate_x), # mirror lefty batters onto righty coordinates (disabled)
    dist_x = plate_x, # horizontal distance from center
    dist_z = plate_z - 2.5 # vertical distance from center (2.5 used as the middle height)
  ) |>
  # keep only the necessary columns, in export order
  select(
    pitcher, player_name,
    plate_x, plate_z, sz_top, sz_bot,
    stand, p_throws,
    pitch_type, description,
    balls, strikes,
    pitch_type_groups, simple_result,
    is_swing, is_contact, is_foul, is_swingstrike, is_hit,
    pitch_name,
    dist_x, dist_z
  ) |>
  # remove every row that has a missing value in any remaining column
  drop_na()


# Save the data set to csv.
write.csv(x = pitcher_data, file = "clean_pitcherdatatest.csv", row.names = FALSE)




##############################################

#use pitcher_options from above to view the valid pitchers (more than 1000 pitches against each batter side faced)


# Create and save a csv of pitch groups by pitcher id and batter side.
# Count the pitches for every pitcher / stand / pitch group combination.
p_options <- pitcher_data |>
  group_by(pitcher, stand, pitch_type_groups) |>
  summarise(observations = n(), .groups = "drop")

# Keep combinations with more than 50 pitches; the count is not exported.
final <- p_options |>
  filter(observations > 50) |>
  select(-observations)

# Arsenal expressed with pitch groupings (fastball, offspeed, curveball, ...).
write.csv(x = final, file = "basicpitch_arsenal.csv", row.names = FALSE)



# A possibly more appealing option: give the user the specific pitch types.
# Count the pitches for every pitcher / stand / pitch group / pitch type.
adv_p_options <- pitcher_data |>
  group_by(pitcher, stand, pitch_type_groups, pitch_type) |>
  summarise(observations = n(), .groups = "drop")

# Only keep pitch types thrown more than 50 times for that pitcher and
# batter-side combination; the count is not exported.
adv_final <- adv_p_options |>
  filter(observations > 50) |>
  select(-observations)

# Abbreviation of the specific pitch thrown within each group.
write.csv(x = adv_final, file = "advpitch_arsenal.csv", row.names = FALSE)


###########################
#FIXING PROBLEMS!
#need to account for the count as well as type of pitch

#need 50 pitches for each pitch type to run
#need to make sure there is data for every type of count within these min 50 pitches

# Code that made the current dataset: pitch counts per pitcher / stand /
# pitch group, kept when there are more than 50 pitches.
p_options <- pitcher_data |>
  group_by(pitcher, stand, pitch_type_groups) |>
  summarise(observations = n(), .groups = "drop")

final <- p_options |>
  filter(observations > 50) |>
  select(-observations)


# Include the count: pitch totals per pitcher / stand / pitch group /
# balls / strikes.
new_p_options <- pitcher_data |>
  group_by(pitcher, stand, pitch_type_groups, balls, strikes) |>
  summarise(observations = n(), .groups = "drop")

# Spot check one pitcher / batter side / pitch group.
test <- new_p_options |>
  filter(
    pitcher == 542881,
    stand == "L",
    pitch_type_groups == "slider"
  )



# Not about the count apparently: inspect the raw rows for one pitcher /
# batter side / pitch group at a 2-0 count, without the location columns.
problem <- pitcher_data |>
  filter(
    pitcher == 542881,
    stand == "L",
    pitch_type_groups == "slider",
    balls == 2,
    strikes == 0
  ) |>
  select(
    -c(
      plate_x, plate_z, sz_top, sz_bot, p_throws, pitch_type_groups,
      pitch_type, dist_x, dist_z
    )
  )



# Try this again knowing that all counts and some variability are needed.
# Pitch totals per pitcher / stand / pitch group / balls / strikes, limited to
# pitchers that appear in the final dataset.
test_data <- pitcher_data |>
  filter(pitcher %in% final$pitcher) |>
  group_by(pitcher, stand, pitch_type_groups, balls, strikes) |>
  summarise(ob = n(), .groups = "drop")

# 12 combinations of count (presumably balls 0-3 x strikes 0-2).
# Only keep pitcher / stand / pitch group combinations that have all 12 count
# rows with at least 50 pitches each.
# The earlier version grouped by balls and strikes as well, so every group
# held a single row and `row_number() == 12` could never be TRUE; its result
# was always empty and was not stored. Rows are now filtered first and the
# remaining rows are counted per pitcher / stand / pitch group.
test_complete <- test_data |>
  filter(ob >= 50) |>
  group_by(pitcher, stand, pitch_type_groups) |>
  filter(n() == 12) |>
  ungroup()

test_complete

# NOTE(review): test_final is still built from the unfiltered test_data, as
# before; switch it to test_complete if it should reflect the 12-count rule.
test_final <- test_data |>
  distinct(pitcher, stand, pitch_type_groups)



# Try this again: outcome totals per pitcher / stand / pitch group / count.
clean_test <- pitcher_data |>
  filter(pitcher %in% final$pitcher) |>
  group_by(pitcher, stand, pitch_type_groups, balls, strikes) |>
  summarize(
    total_pitches = n(),
    swings = sum(is_swing, na.rm = TRUE),
    no_swing = total_pitches - swings,
    foul = sum(is_foul, na.rm = TRUE),
    miss = sum(is_swingstrike, na.rm = TRUE),
    hit = sum(is_hit, na.rm = TRUE),
    .groups = "drop"
  ) |>
  # a count row is usable when it has at least 10 pitches and contains both
  # swings and takes
  filter(
    total_pitches >= 10,
    swings > 0,
    no_swing > 0
  ) |>
  # number of usable count rows left for each pitcher / stand / pitch group
  add_count(pitcher, stand, pitch_type_groups, name = "count_options") |>
  # keep only combinations where all 12 counts are usable
  filter(count_options == 12)

# The pitcher / stand / pitch group combinations that survived.
pitchers_clean_test <- clean_test |>
  distinct(pitcher, stand, pitch_type_groups)

#only gives me fastballs...


#####################
# Getting the sd for each pitcher, batter side, pitch name combination.

variety_test_sd <- pitcher_data |>
  group_by(pitcher, stand, pitch_name) |>
  summarise(
    total_pitches = n(),
    swings = sum(is_swing),
    takes = total_pitches - swings,
    foul = sum(is_foul, na.rm = TRUE),
    miss = sum(is_swingstrike, na.rm = TRUE),
    hit = sum(is_hit, na.rm = TRUE),
    sd_x = sd(plate_x, na.rm = TRUE),
    sd_z = sd(plate_z, na.rm = TRUE),
    .groups = "drop"
  ) |>
  # enough observations of every outcome for the combination to be usable
  filter(
    total_pitches >= 50,
    swings >= 25,
    takes >= 10,
    foul >= 10,
    miss >= 10,
    hit >= 10
  ) |>
  # enough spread in pitch location in both directions
  filter(
    sd_x > 0.2,
    sd_z > 0.2
  )

# Possible pitcher, batter side, pitch matchups for the user to choose from.
final_variety <- variety_test_sd |>
  distinct(pitcher, stand, pitch_name)


# Using the cleaned pitch data, find the location sd for every pitcher /
# stand / pitch name combination.
pitch_sd <- pitcher_data |>
  group_by(pitcher, stand, pitch_name) |>
  summarize(
    sd_x = sd(plate_x),
    sd_z = sd(plate_z),
    # same grouping the summarize default leaves behind (pitcher, stand)
    .groups = "drop_last"
  )

# Attach the sds to the possible selections and round them to 3 decimals.
pitch_join <- final_variety |>
  left_join(pitch_sd, by = c("pitcher", "stand", "pitch_name")) |>
  mutate(across(c(sd_x, sd_z), \(v) round(v, 3)))

# Write the csv of possible combinations with their associated sds.
write.csv(x = pitch_join, file = "pitchnames_and_sd.csv", row.names = FALSE)



############################################
#when taking the sd for all pitches, we don't know where the pitcher was actually trying to aim
#instead, try to estimate how much the pitcher is missing his "targets" by

#we can use pitches near the edges of the strike zone and assume that they intended to throw outside of the zone
#with the edge of the ball touching the edge of the zone (enough to be called a strike in ABS challenge systems)

#LOGIC
#assume that pitchers intended to throw so the ball caught the edge of the strike zone just outside of the zone
#baseball radius ~ 0.12 ft (0.11 to ensure ball is on the line)
#use data from one ball width (~0.22) inside the zone - 1 ft outside of the strike zone for all edges
#target locations are 0.11ft outside of the strike zone for all edges
#find the difference between the actual pitch location and the target location



# Run this for all pitchers - a general miss_x and miss_z to fall back on if
# there is not enough data for an individual pitcher.

# Zone geometry used to infer targets (values in the same units as plate_x /
# plate_z, described as feet in the notes above).
zone_half_width <- 0.7083 # horizontal edge of the zone; presumably half the plate width
zone_bot <- 1.5 # fixed bottom edge used for every batter
zone_top <- 3.5 # fixed top edge used for every batter
ball_radius <- 0.11
ball_width <- 2 * ball_radius

# Only keep pitches for which a target location can be inferred: from one ball
# width inside an edge of the zone out to roughly a foot outside it.
# The earlier edge test was always TRUE (`plate_z >= 1.28 | plate_z <= 3.28`
# holds for every value) and the negative-x threshold sat outside the zone, so
# no pitch in the middle of the zone was excluded. Each edge now uses
# "edge moved one ball width toward the center".
miss <- pitcher_data |>
  filter(
    plate_x >= zone_half_width - ball_width | # near or beyond the +x edge
      plate_x <= -zone_half_width + ball_width | # near or beyond the -x edge
      plate_z >= zone_top - ball_width | # near or above the top edge
      plate_z <= zone_bot + ball_width # near or below the bottom edge
  ) |>
  # cap how far a pitch may miss if that target was truly intended
  filter(abs(plate_x) < 1.5, plate_z > 0.5, plate_z < 4.5)

# Add the inferred target and the miss relative to it.
miss <- miss |>
  mutate(
    # target sits one ball radius outside the nearer side edge
    target_x = ifelse(
      plate_x > 0,
      zone_half_width + ball_radius,
      -zone_half_width - ball_radius
    ),
    # target sits one ball radius outside the top or bottom edge, chosen by
    # which half of the zone (split at 2.5) the pitch ended up in
    target_z = ifelse(
      plate_z > 2.5,
      zone_top + ball_radius,
      zone_bot - ball_radius
    ),
    miss_x = plate_x - target_x, # horizontal miss
    miss_z = plate_z - target_z, # vertical miss
    # whether the pitch landed closer to a side target or a top/bottom target
    closer = ifelse(abs(miss_x) < abs(miss_z), "L/R", "T/B")
  )

# General sd of the miss.
# NOTE: the values noted previously (0.389 horizontal, 0.331 vertical) were
# produced with the old, effectively unfiltered selection; re-run to refresh.

# Pitches closer to the left and right of the zone -> horizontal miss.
general_miss <- miss |>
  filter(closer == "L/R")
sd(general_miss$miss_x)

# Pitches closer to the top and bottom of the zone -> vertical miss.
general_miss <- miss |>
  filter(closer == "T/B")
sd(general_miss$miss_z)


# Now, for each pitcher, batter side, pitch name combination, find the
# estimated sd_x and sd_z from the inferred misses.
matchup_keys <- c("pitcher", "stand", "pitch_name")

# Horizontal miss sd, from pitches that landed closer to a side target.
test_x <- miss |>
  filter(closer == "L/R") |>
  group_by(pitcher, stand, pitch_name) |>
  summarize(est_sdx = sd(miss_x), .groups = "drop_last")

# Vertical miss sd, from pitches that landed closer to a top/bottom target.
test_z <- miss |>
  filter(closer == "T/B") |>
  group_by(pitcher, stand, pitch_name) |>
  summarize(est_sdz = sd(miss_z), .groups = "drop_last")

# pitch_join holds the valid pitcher, stand, pitch combinations that work in
# the simulation. The left joins add the estimated miss sds as columns; a
# combination with no match, or with a single qualifying pitch, gets NA.
# The old location sds are then removed and the estimates rounded.
pitch_join_est <- pitch_join |>
  left_join(test_x, by = matchup_keys) |>
  left_join(test_z, by = matchup_keys) |>
  select(-c(sd_x, sd_z)) |>
  mutate(
    est_sdx = round(est_sdx, 3),
    est_sdz = round(est_sdz, 3)
  )

view(pitch_join_est)

# Write the new csv to use instead of the old sd file (pitchnames_and_sd.csv).
write.csv(x = pitch_join_est, file = "pitchnames_and_estsd.csv", row.names = FALSE)

