Apologies for whatever is unclear. I've been told using data.table is one possible fix.
I'm trying to run some baseball savant at bat data through the run_expectancy_code function that is found in the baseballr package in a modified manner. Normally the function will take the dataframe and compute a run expectancy dataframe into the global environment with 2 columns the count_base_out_state and avg_re (average run expectancy). Like such:
count_base_out_state avg_re
<chr> <dbl>
1 3 - 1 , 0 outs, 1b 2b 3b 2.22
2 3 - 0 , 0 outs, 1b 2b 3b 2.14
3 3 - 2 , 0 outs, 1b 2b 3b 2.10
4 2 - 0 , 0 outs, 1b 2b 3b 1.87
5 2 - 0 , 0 outs, _ 2b 3b 1.76
6 3 - 0 , 0 outs, _ 2b 3b 1.74
What I want to do is to calculate the run expectancy table of my data but to do so in a for loop, where I the run expectancy is calculated after the data is re-weighted each time to increase the value of the primary pitch in these calculations and its 30 most similar pitches which I have stored in another dataframe (pitch_similarity). But I want in the end not just a dataframe with the run expectancy of one given pitch after the data is re-weighted but to be run in a loop reweighting the data each time for the appropriate pitch and its top 30 similar pitches and then in the run_expectancy table including a 3rd column which will let me know the primary pitch in a given situation (the one everything was corrected by.
I want the weighting to be the primary pitch (aka pitch_1) at 20%, 1-5 (pitch_2) closest matches (by rank) to 20% of the data, 6-10 to 5%, 11-30 to 5%, and all other pitches as the remaining 50%.
I at first thought maybe to write a for loop like this which I think would duplicate the approriate rows to the right weighting, but I'm worried about memory exhaustion issues and would need something much more efficient.
for (pitch_1 in pitch_similarity) {
# temporary weighted dataframe
# primary correction
df2 = lefty_abs %>% filter(pitch_1 = pitch_key) %>% slice(rep(1:n(),
each = nrow(lefty_abs)*0.2)) %>%
rbind(lefty_abs)
df2 = df2 %>% mutate(pitch_key_2 = pitch_key)
# 1-5 correction
df2 = df2 %>% filter(pitch_1 = pitch_key & pitch_2 = pitch_key_2 & rank <= 5) %>%
slice(rep(1:n(), each = (nrow(lefty_abs)*0.2)/5)) %>%
rbind(df2)
# 6-10 correction
df2 = df2 %>% filter(pitch_1 = pitch_key & pitch_2 = pitch_key_2 & rank <= 10 & rank >= 6) %>%
slice(rep(1:n(), each = (nrow(lefty_abs)*0.05)/5)) %>%
rbind(df2)
# 11-30 correction
df2 = df2 %>% filter(pitch_1 = pitch_key & pitch_2 = pitch_key_2 & rank <= 30 & rank >= 11) %>%
slice(rep(1:n(), each = (nrow(lefty_abs)*0.05)/20)) %>%
rbind(df2)
# runs through run expectancy calculator, but need to figure out how to create 4th column
# indicatin that primary pitch and also to bind all these dataframes together in the end
run_expectancy(df2, level = "pitch")
}
This is the run_expectancy_code function itself:
#' Generate run expectancy and related measures and variables from Baseball Savant data
#'
#' These functions allow a user to generate run expectancy and related measures and variables from Baseball Savant data. Measures and variables will be added to the data frame and a run expectancy table will be assigned to the Global Environment.
#' @param df A data frame generated from Baseball Savant.
#' @param level Whether you want run expectancy calculated at the plate appearance or pitch level. Defaults to plate appearance.
#' @keywords MLB, sabermetrics
#' @importFrom dplyr filter group_by summarise arrange lead mutate left_join
#' @importFrom stringr str_count
#' @export
#' @examples
#' \dontrun{run_expectancy_code(df, level = "plate appearances")}
run_expectancy_code <- function(df, level = "plate appearance") {
single_outs <- c("strikeout", "caught_stealing_2b",
"pickoff_caught_stealing_2b", "other_out",
"caught_stealing_3b", "caught_stealing_home",
"field_out", "force_out", "pickoff_1b",
"batter_interference", "fielders_choice",
"pickoff_2b", "pickoff_caught_stealing_3b",
"pickoff_caught_stealing_home")
df <- df %>%
dplyr::arrange(game_pk, at_bat_number, pitch_number) %>%
dplyr::group_by(game_pk) %>%
dplyr::mutate(final_pitch_game =
ifelse(pitch_number == max(pitch_number), 1, 0)) %>%
dplyr::ungroup() %>%
dplyr::group_by(game_pk, at_bat_number, inning_topbot) %>%
dplyr::mutate(final_pitch_at_bat = ifelse(pitch_number == max(pitch_number), 1, 0)) %>%
dplyr::ungroup()
df <- df %>%
dplyr::arrange(game_pk, inning_topbot, at_bat_number, pitch_number) %>%
dplyr::mutate(runs_scored_on_pitch = stringr::str_count(des, "scores"),
runs_scored_on_pitch =
ifelse(events == "home_run", runs_scored_on_pitch + 1,
runs_scored_on_pitch),
bat_score_after = bat_score + runs_scored_on_pitch) %>%
dplyr::arrange(game_pk, at_bat_number, pitch_number) %>%
dplyr::mutate(final_pitch_inning =
ifelse(final_pitch_at_bat == 1 &
inning_topbot != lead(inning_topbot), 1, 0),
final_pitch_inning = ifelse(is.na(final_pitch_inning),
1, final_pitch_inning))
if (level == "plate appearance") {
df <- df %>%
dplyr::group_by(game_pk, inning, inning_topbot) %>%
dplyr::mutate(bat_score_start_inning = min(bat_score),
bat_score_end_inning = max(bat_score),
cum_runs_in_inning = cumsum(runs_scored_on_pitch),
runs_to_end_inning = bat_score_end_inning - bat_score) %>%
dplyr::ungroup() %>%
dplyr::mutate(base_out_state = paste(outs_when_up, " outs, ",
ifelse(!is.na(.$on_1b), "1b", "_"),
ifelse(!is.na(.$on_2b), "2b", "_"),
ifelse(!is.na(.$on_3b), "3b", "_")))
re_table <- run_expectancy_table(df)
df <- df %>%
left_join(re_table, by = "base_out_state")
df <- df %>%
dplyr::filter(final_pitch_at_bat == 1) %>%
dplyr::arrange(game_pk, inning, inning_topbot) %>%
dplyr::group_by(game_pk, inning, inning_topbot) %>%
dplyr::mutate(next_base_out_state = dplyr::lead(base_out_state)) %>%
dplyr::ungroup() %>%
dplyr::left_join(re_table,
by = c("next_base_out_state" = "base_out_state")) %>%
dplyr::rename(next_avg_re = avg_re.y,
avg_re = avg_re.x) %>%
dplyr::mutate(next_avg_re = ifelse(is.na(next_avg_re), 0, next_avg_re),
change_re = next_avg_re - avg_re,
re24 = change_re + runs_scored_on_pitch) %>%
dplyr::arrange(game_pk, inning, inning_topbot)
} else {
df <- df %>%
dplyr::group_by(game_pk, inning, inning_topbot) %>%
dplyr::mutate(bat_score_start_inning = min(bat_score),
bat_score_end_inning = max(bat_score),
cum_runs_in_inning = cumsum(runs_scored_on_pitch),
runs_to_end_inning = bat_score_end_inning - bat_score) %>%
dplyr::ungroup() %>%
dplyr::mutate(count_base_out_state =
paste(balls, "-", strikes, ", ",
outs_when_up, " outs, ",
ifelse(!is.na(.$on_1b), "1b", "_"),
ifelse(!is.na(.$on_2b), "2b", "_"),
ifelse(!is.na(.$on_3b), "3b", "_")))
re_table <- run_expectancy_table(df, level = "pitch")
df <- df %>%
left_join(re_table, by = "count_base_out_state")
df <- df %>%
#dplyr::filter(final_pitch_at_bat == 1) %>%
dplyr::arrange(game_pk, inning, inning_topbot) %>%
dplyr::group_by(game_pk, inning, inning_topbot) %>%
dplyr::mutate(next_count_base_out_state =
dplyr::lead(count_base_out_state)) %>%
dplyr::ungroup() %>%
dplyr::left_join(re_table,
by = c("next_count_base_out_state" =
"count_base_out_state")) %>%
dplyr::rename(next_avg_re = avg_re.y,
avg_re = avg_re.x) %>%
dplyr::mutate(next_avg_re = ifelse(is.na(next_avg_re), 0, next_avg_re),
change_re = next_avg_re - avg_re,
re24 = change_re + runs_scored_on_pitch) %>%
dplyr::arrange(game_pk, inning, inning_topbot)
}
assign("run_expectancy_state_table", re_table, envir = .GlobalEnv)
df
}
Here are links to pastebins of dputs of the first 100 lines of every given dataframe I've mentioned but lefty_abs aka my baseball savant at bat data is a public dropbox link because its 223 mb:
baseball savant data (lefty_abs) - https://www.dropbox.com/s/tgoc2b8vefybi0b/test.txt?dl=0
pitch_similarity - https://pastebin.com/sink1bmD
run_expectancy_state_table - https://pastebin.com/uHnHGDc6