library(gt)
library(gtExtras)
library(hrbrthemes)
library(patchwork)
library(pracma)
library(plotly)
library(rvest)
library(tidyverse)
library(worldfootballR)
#https://beatthebookie.blog/2021/06/07/the-predictive-power-of-xg/
#set seed for reproducibility
set.seed(20231228)
#use this to for the season end year
#makes it easier to update stuff later
season_end <- 2024
ema_duration <- 12
#how many seasons we want to simulate
number_of_sims <- 20000
#rather than use a case statement over and over, just make this a function
calculate_points <- function(a, b){
case_when(a > b ~ 3,
a == b ~ 1,
a > b ~ 0,
TRUE ~ 0)
}
#fetch current table using the worldfootballr package
prem_2024_table <- fb_season_team_stats(country = "ENG",
gender = "M",
season_end_year = "2024",
tier = "1st",
stat_type = "league_table")
pl_table <- prem_2024_table %>%
select(Squad, MP, Pts, W, D, L, GF, GA, GD, xG, xGA, Rk) %>%
rename(Points = Pts,
Goals = GF,
Goals_Allowed = GA,
Rank = Rk,
Goal_Dif = GD,
xG_Allowed = xGA)
#get match results for current season and prior season
#also grab champions league so we have something for clubs that got promoted
match_pl_cs <- fb_match_results(country = "ENG",
gender = "M",
season_end_year = season_end,
tier = "1st")
match_pl_ps <- fb_match_results(country = "ENG",
gender = "M",
season_end_year = season_end - 1,
tier = "1st")
match_efl_ps <- fb_match_results(country = "ENG",
gender = "M",
season_end_year = season_end - 1,
tier = "2nd")
#set aside unplayed matches for later
pl_future_matches <- match_pl_cs %>%
filter(is.na(HomeGoals) | Notes == 'Match Suspended') %>%
select(-MatchURL) %>%
as_tibble()
#get max date with a game
Max_Date <- match_pl_cs %>%
filter(!is.na(HomeGoals)) %>%
summarise(Max_Date = max(Date)) %>%
pull()
#get home/away goals for the league using moving average
leaguge_exp <- match_pl_ps %>%
rbind(match_pl_cs) %>%
rbind(match_efl_ps) %>%
mutate(Wk = as.numeric(Wk)) %>%
group_by(Competition_Name, Season_End_Year, Wk) %>%
summarise(mean_Home_xG = mean(Home_xG, na.rm = TRUE),
mean_Away_xG = mean(Away_xG, na.rm = TRUE),
d = min(Date)) %>%
arrange(d) %>%
group_by(Competition_Name) %>%
mutate(lg_ema_Home_xG = movavg(mean_Home_xG, n = ema_duration, type = 'e'),
lg_ema_Away_xG = movavg(mean_Away_xG, n = ema_duration, type = 'e')
) %>%
select(-mean_Home_xG, -mean_Away_xG, -d)
#get club expected goals relative to league average for both home and away
#using a expoential moving average for each, i.e. compared to league over
#last n matches, how good/bad are they at scoring and preventing goals
club_xg_ema <- match_pl_ps %>%
rbind(match_pl_cs) %>%
rbind(match_efl_ps) %>%
mutate(Wk = as.numeric(Wk)) %>%
select(Competition_Name, Season_End_Year, Wk, Home, Away, Home_xG, Away_xG) %>%
gather(key = Location, value = Squad, Home, Away) %>%
filter(!is.na(Home_xG)) %>%
filter(!(Season_End_Year == 2024 & Wk == 20)) %>%
mutate(xG = case_when(Location == 'Home' ~ Home_xG, TRUE ~ Away_xG),
xGA = case_when(Location == 'Home' ~ Away_xG , TRUE ~ Home_xG)) %>%
select(-Home_xG, -Away_xG) %>%
arrange(Squad, Season_End_Year, Wk) %>%
group_by(Squad, Location) %>%
mutate(ema_xG = movavg(xG, n = ema_duration, type = 'e'),
ema_xGA = movavg(xGA, n = ema_duration, type = 'e')) %>%
inner_join(leaguge_exp) %>%
mutate(rel_xG = case_when(Location == 'Home' ~ ema_xG / lg_ema_Home_xG,
TRUE ~ ema_xG / lg_ema_Away_xG),
rel_xGa = case_when(Location == 'Home' ~ ema_xGA / lg_ema_Away_xG,
TRUE ~ ema_xGA / lg_ema_Home_xG)) %>%
group_by(Squad, Location) %>%
filter(Season_End_Year == season_end) %>%
filter(Wk == max(Wk)) %>%
select(Wk, Squad, lg_ema_Home_xG, lg_ema_Away_xG, rel_xG, rel_xGa)
sim_games3 <- function(home, away){
n_sims <- number_of_sims
home_table <- club_xg_ema %>%
filter(Squad == home & Location == 'Home')
away_table <- club_xg_ema %>%
filter(Squad == away & Location == 'Away')
home_exp <- (home_table$rel_xG * away_table$rel_xGa) * home_table$lg_ema_Home_xG
away_exp <- (away_table$rel_xG * home_table$rel_xGa) * away_table$lg_ema_Home_xG
home_goals <- rpois(n_sims, home_exp)
away_goals<- rpois(n_sims, away_exp)
results <- cbind(home_goals, away_goals) %>%
as_tibble %>%
mutate(goal_dif = home_goals - away_goals,
home_points = calculate_points(home_goals, away_goals),
away_points = calculate_points(away_goals, home_goals),
sim_num = row_number()) %>%
relocate(sim_num)
results
}
#old version of simmer just in case
sim_games <- function(home_goals, away_goals){
n_sims <- number_of_sims
home_goals <- rpois(n_sims, home_goals)
away_goals<- rpois(n_sims, away_goals)
results <- cbind(home_goals, away_goals) %>%
as_tibble %>%
mutate(goal_dif = home_goals - away_goals,
home_points = calculate_points(home_goals, away_goals),
away_points = calculate_points(away_goals, home_goals),
sim_num = row_number()) %>%
relocate(sim_num)
results
}
sim_new <- match_pl_cs %>%
filter(Wk %in% c(18, 19) & !is.na(HomeGoals)) %>%
select(Wk, Date, Home, Away, HomeGoals, AwayGoals) %>%
inner_join(pl_table %>% select(Squad, Goals, xG, MP) %>% rename(Home_Goals = Goals, Home_xG = xG, Home_MP = MP), join_by(Home == Squad)) %>%
inner_join(pl_table %>% select(Squad, Goals, xG, MP) %>% rename(Away_Goals = Goals, Away_xG = xG, Away_MP = MP), join_by(Away == Squad)) %>%
mutate(Home_G_per_Match = Home_Goals / Home_MP,
Away_G_per_Match = Away_Goals / Away_MP) %>%
mutate(simmed_games = map2(Home, Away, ~sim_games3(.x, .y))) %>%
unnest(simmed_games)
sim_agg_with_brier <- sim_new %>%
mutate(W = case_when(HomeGoals > AwayGoals ~ 1, TRUE ~ 0),
D = case_when(HomeGoals == AwayGoals ~ 1, TRUE ~ 0),
L = case_when(HomeGoals < AwayGoals ~ 1, TRUE ~ 0)) %>%
mutate(W_sim = case_when(home_goals > away_goals ~ 1, TRUE ~ 0),
D_sim = case_when(home_goals == away_goals ~ 1, TRUE ~ 0),
L_sim = case_when(home_goals < away_goals ~ 1, TRUE ~ 0)) %>%
group_by(Wk, Home, Away) %>%
summarise(W = mean(W),
D = mean(D),
L = mean(L),
HomeGoals = mean(HomeGoals),
AwayGoals = mean(AwayGoals),
W_sim = mean(W_sim),
D_sim = mean(D_sim),
L_sim = mean(L_sim)) %>%
mutate(Brier_W = case_when(W == 1 ~ (W_sim - 1)^2, TRUE ~ W_sim ^ 2),
Brier_D = case_when(D == 1 ~ (D_sim - 1)^2, TRUE ~ D_sim ^ 2),
Brier_L = case_when(L == 1 ~ (L_sim - 1)^2, TRUE ~ L_sim ^ 2),
Brier_T = (Brier_W + Brier_D + Brier_L) / 3
)
bw <- sim_agg_with_brier %>%
ggplot(aes(x = Brier_W))+
geom_density()+
geom_rug()+
theme_ipsum()+
labs(title = 'Brier Win')+
xlim(0, 0.85)
bd <- sim_agg_with_brier %>%
ggplot(aes(x = Brier_D))+
geom_density()+
geom_rug()+
theme_ipsum()+
labs(title = 'Brier Draw')+
xlim(0, 0.85)
bl <- sim_agg_with_brier %>%
ggplot(aes(x = Brier_L))+
geom_density()+
geom_rug()+
theme_ipsum()+
labs(title = 'Brier Loss')+
xlim(0, 0.85)
bt <- sim_agg_with_brier %>%
ggplot(aes(x = Brier_T))+
geom_density()+
geom_rug()+
theme_ipsum()+
labs(title = 'Brier Total')+
xlim(0, 0.85)
(bw | bd) / (bl | bt)