MKT 326 · Assignment 2
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(sqldf)
library(ggplot2)
library(dplyr)
setwd("~/Desktop/MKTAnalytics/Data")
df <- read.csv("OnlineBookClub.csv")
str(df)
head(df)
# Means via SQL — sqldf runs SQLite which lacks STDEV(),
# so we use R's sd() for standard deviations
sqldf("SELECT AVG(total_) AS AvgTotalSpend FROM df")
cat("SD total spend: ", sd(df$total_), "\n")
sqldf("SELECT AVG(purch) AS AvgBooksPurchased FROM df")
cat("SD books purchased: ", sd(df$purch), "\n")
sqldf("SELECT AVG(last) AS AvgRecency FROM df")
cat("SD recency: ", sd(df$last), "\n")
# 4) Subscriptions by gender
sqldf("
SELECT gender,
COUNT(*) AS TotalCustomers,
SUM(CASE WHEN subscribe = 'yes' THEN 1 ELSE 0 END) AS Subscribers,
CAST(SUM(CASE WHEN subscribe = 'yes' THEN 1 ELSE 0 END) AS FLOAT)
/ COUNT(*) AS SubRate
FROM df
GROUP BY gender
")
Dependent variable: total_ (total offline dollars spent). Independent variables: IsFemale, months since first purchase, and all book category counts.
# Create binary gender indicator
df$IsFemale <- ifelse(df$gender == "F", 1, 0)
# Fit linear model
lin_model <- lm(total_ ~ IsFemale + first +
child + youth + cook + do_it +
refernce + art + geog,
data = df)
summary(lin_model)
Dependent variable: subscribe (yes/no → binary 0/1). Independent variables: recency, monetary, IsFemale, and all book category counts.
# Create binary outcome
df$subscribe_bin <- ifelse(df$subscribe == "yes", 1, 0)
# Fit logistic model (family = binomial → logistic regression)
log_model <- glm(subscribe_bin ~ last + total_ + IsFemale +
child + youth + cook + do_it +
refernce + art + geog,
data = df,
family = binomial())
summary(log_model)
# Exponentiate coefficients to get odds ratios
exp(coef(log_model))
# Predict probability of subscribing for each customer
df$pred_prob <- predict(log_model, type = "response")
# Assign each customer to a decile (1 = lowest prob, 10 = highest prob)
df$decile <- ntile(df$pred_prob, 10) # dplyr::ntile
# Profile each decile
decile_profile <- sqldf("
SELECT decile,
COUNT(*) AS Customers,
AVG(pred_prob) AS AvgPredProb,
SUM(subscribe_bin) AS ActualSubscribers,
CAST(SUM(subscribe_bin) AS FLOAT) / COUNT(*) AS ActualSubRate,
AVG(total_) AS AvgSpend,
AVG(last) AS AvgRecency,
AVG(purch) AS AvgBooks,
AVG(IsFemale) AS PctFemale
FROM df
GROUP BY decile
ORDER BY decile
")
print(decile_profile)
# Highlight top and bottom decile
top_decile <- df[df$decile == 10, ]
bot_decile <- df[df$decile == 1, ]
cat("Top decile actual sub rate:", mean(top_decile$subscribe_bin), "\n")
cat("Bot decile actual sub rate:", mean(bot_decile$subscribe_bin), "\n")
cat("Lift (top vs overall):", mean(top_decile$subscribe_bin) / mean(df$subscribe_bin), "\n")
# Lift chart
ggplot(decile_profile, aes(x = decile, y = ActualSubRate * 100)) +
geom_col(fill = "#1a1a2e") +
geom_hline(aes(yintercept = mean(df$subscribe_bin) * 100),
color = "#e94560", linetype = "dashed") +
scale_x_continuous(breaks = 1:10) +
labs(title = "Actual Subscription Rate by Predicted Decile",
x = "Decile (1 = lowest, 10 = highest predicted)",
y = "Actual Sub Rate (%)")