MKT 326 · Assignment 1
All analyses use three CSV files exported from the Sephora dataset. The reviews, products, and authors tables are joined via SQL using the sqldf package, which lets you run SQLite queries directly against R dataframes.
# Run once — comment out before knitting
install.packages("sqldf")
install.packages("vader")
install.packages("tm")
install.packages("wordcloud")
install.packages("RColorBrewer")
install.packages("ggplot2")
install.packages("gridExtra")
install.packages("udpipe")
install.packages("dplyr")
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(sqldf); library(vader)
library(tm); library(wordcloud); library(RColorBrewer)
library(ggplot2); library(gridExtra); library(grid)
library(udpipe); library(dplyr)
# Set your working directory to wherever your CSVs live
setwd("~/Desktop/MKTAnalytics/Data/Sephora")
products <- read.csv("products.csv")
reviews <- read.csv("reviews.csv")
authors <- read.csv("authors.csv")
# Parse submission date
reviews$submission_date <- as.Date(reviews$submission_date, format = "%m/%d/%y")
# Join reviews to products to get brand_name
reviews_products <- sqldf("
SELECT r.*, p.brand_name, p.product_name
FROM reviews r INNER JOIN products p ON r.product_id = p.product_id
")
# Filter to the two target brands
rp <- reviews_products[reviews_products$brand_name %in% c("L'Occitane", "La Mer"), ]
# Mean ratings
mean(rp$rating)
mean(rp[rp$brand_name == "L'Occitane", ]$rating)
mean(rp[rp$brand_name == "La Mer", ]$rating)
# Rating distributions
table(rp[rp$brand_name == "L'Occitane", ]$rating)
table(rp[rp$brand_name == "La Mer", ]$rating)
# Histogram layout: overall + per brand
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
hist(rp$rating, breaks = seq(0, 5, by = 1),
main = "Distribution of Ratings (Both Brands)",
xlab = "Rating (Stars)")
hist(rp[rp$brand_name == "L'Occitane", ]$rating,
breaks = seq(0, 5, by = 1),
main = "L'Occitane Ratings", xlab = "Rating")
hist(rp[rp$brand_name == "La Mer", ]$rating,
breaks = seq(0, 5, by = 1),
main = "La Mer Ratings", xlab = "Rating")
# Calculate character length of each review
rp$ReviewTextChars <- nchar(rp$review_text)
# Summary stats
mean(rp$ReviewTextChars) # 404.1
median(rp$ReviewTextChars) # 327.0
# Histogram of review lengths
ggplot(rp, aes(x = ReviewTextChars)) +
geom_histogram(binwidth = 50, fill = "#1a1a2e", color = "white", linewidth = 0.3) +
geom_vline(aes(xintercept = mean(ReviewTextChars)),
color = "#e94560", linetype = "dashed", linewidth = 1) +
labs(title = "Review Length Distribution",
x = "Review Length (characters)", y = "# of Reviews")
# Segment reviews as long (above mean) or short (at or below mean)
threshold <- mean(rp$ReviewTextChars)
rp$IsLong <- rp$ReviewTextChars > threshold
write.csv(), then load the pre-computed CSV on subsequent knit runs by commenting out the loop block.# Create placeholder table with one row per review
d <- sqldf("
SELECT review_id AS ReviewId, review_text AS ReviewText,
0 AS Pos, 0 AS Neu, 0 AS Neg, 0 AS But, 0 AS Compound
FROM rp
")
# Loop through every review and score it with VADER
for (i in 1:nrow(d)) {
vout <- get_vader(d[i, 2])
d[i, 3] <- as.numeric(vout["pos"])
d[i, 4] <- as.numeric(vout["neu"])
d[i, 5] <- as.numeric(vout["neg"])
d[i, 6] <- as.numeric(vout["but_count"])
d[i, 7] <- as.numeric(vout["compound"])
}
# Merge VADER scores back to reviews
reviews_vader <- sqldf("
SELECT r.*, d.Pos, d.Neu, d.Neg, d.But, d.Compound
FROM rp r INNER JOIN d ON r.review_id = d.ReviewId
")
reviews_vader$ReviewTextChars <- nchar(reviews_vader$review_text)
# Export so you don't have to re-run the loop
write.csv(reviews_vader, "reviews_vader.csv")
reviews_vader <- read.csv("reviews_vader.csv")
threshold <- mean(reviews_vader$ReviewTextChars)
reviews_vader$IsLong <- reviews_vader$ReviewTextChars > threshold
sqldf("
SELECT rating,
AVG(Compound) AS AvgCompound,
AVG(Pos) AS AvgPos,
AVG(Neg) AS AvgNeg,
COUNT(review_id) AS n
FROM reviews_vader
GROUP BY rating
ORDER BY rating
")
one_star <- reviews_vader[reviews_vader$rating == 1, ]
# Average VADER by length group
sqldf("
SELECT IsLong, AVG(Compound) AS AvgCompound, COUNT(*) AS n
FROM one_star
GROUP BY IsLong
")
# Two-sample t-test (Welch, two-tailed)
t.test(one_star[one_star$IsLong == TRUE, ]$Compound,
one_star[one_star$IsLong == FALSE, ]$Compound)
five_star <- reviews_vader[reviews_vader$rating == 5, ]
sqldf("
SELECT IsLong, AVG(Compound) AS AvgCompound, COUNT(*) AS n
FROM five_star
GROUP BY IsLong
")
t.test(five_star[five_star$IsLong == TRUE, ]$Compound,
five_star[five_star$IsLong == FALSE, ]$Compound)
# ── 5-Star Reviews Word Cloud ──────────────────────────────
docsS <- Corpus(VectorSource(rp[rp$rating == 5, ]$review_text))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docsS <- tm_map(docsS, toSpace, "\u201c")
docsS <- tm_map(docsS, toSpace, "\u201d")
docsS <- tm_map(docsS, content_transformer(tolower))
docsS <- tm_map(docsS, removeNumbers)
docsS <- tm_map(docsS, removePunctuation)
docsS <- tm_map(docsS, removeWords, stopwords("english"))
# Remove brand names and generic terms that add no insight
docsS <- tm_map(docsS, removeWords, c("mer", "la", "loccitane",
"cream", "product", "skin",
"lamer", "use", "using", "face"))
docsS <- tm_map(docsS, stripWhitespace)
mS <- sort(rowSums(as.matrix(TermDocumentMatrix(docsS))), decreasing = TRUE)
dS <- data.frame(word = names(mS), freq = mS)
set.seed(1234)
wordcloud(words = dS$word, freq = dS$freq, min.freq = 2,
max.words = 200, random.order = FALSE, rot.per = 0.4,
scale = c(2.2, 0.44), colors = brewer.pal(8, "Dark2"))
# ── 1-Star Reviews Word Cloud ─────────────────────────────
# Same tm_map pipeline, just filtered to 1-star reviews
make_cloud <- function(texts, remove_extra = c(), pal = "Dark2") {
docs <- Corpus(VectorSource(texts))
toSp <- content_transformer(function(x, p) gsub(p, " ", x))
docs <- tm_map(docs, toSp, "“"); docs <- tm_map(docs, toSp, "”")
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeWords,
c("mer", "la", "loccitane", "cream", "product",
"skin", "lamer", "use", "using", "face", remove_extra))
docs <- tm_map(docs, stripWhitespace)
m <- sort(rowSums(as.matrix(TermDocumentMatrix(docs))), decreasing = TRUE)
d <- data.frame(word = names(m), freq = m)
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 2,
max.words = 200, random.order = FALSE, rot.per = 0.35,
scale = c(2.2, 0.44),
colors = brewer.pal(8, pal))
invisible(d)
}
# Both brands — 5-star high-rated
make_cloud(rp[rp$rating == 5, ]$review_text)
# Both brands — 1-star low-rated
make_cloud(rp[rp$rating == 1, ]$review_text, pal = "Reds")
# Per-brand comparison
make_cloud(rp[rp$brand_name == "L'Occitane" & rp$rating == 5, ]$review_text)
make_cloud(rp[rp$brand_name == "La Mer" & rp$rating == 5, ]$review_text)
make_cloud(rp[rp$brand_name == "L'Occitane" & rp$rating == 1, ]$review_text, pal = "Reds")
make_cloud(rp[rp$brand_name == "La Mer" & rp$rating == 1, ]$review_text, pal = "Reds")
# Download English model (only needed once)
pos_en_model <- udpipe_download_model(language = "english")
pos_model <- udpipe_load_model(pos_en_model$file_model)
# Tag 5-star reviews
reviews_5 <- rp[rp$rating == 5, ]
pos_df <- data.frame(review_id = integer(),
token = character(),
upos = character(),
stringsAsFactors = FALSE)
for (i in 1:nrow(reviews_5)) {
rid <- reviews_5[i, "review_id"]
anno <- udpipe_annotate(pos_model, x = reviews_5[i, "review_text"])
anno_df <- as.data.frame(anno)
pos_df <- rbind(pos_df, cbind(review_id = rid,
anno_df[, c("token", "upos")]))
}
# Extract adjectives only
adj_5star <- pos_df[pos_df$upos == "ADJ", ]
# Word cloud of 5-star adjectives
docsAdj <- Corpus(VectorSource(adj_5star$token))
# apply tm_map pipeline (lowercase, remove stopwords, strip whitespace)
mAdj <- sort(rowSums(as.matrix(TermDocumentMatrix(docsAdj))), decreasing = TRUE)
dAdj <- data.frame(word = names(mAdj), freq = mAdj)
head(dAdj, 20) # top adjectives
wordcloud(words = dAdj$word, freq = dAdj$freq, min.freq = 2,
max.words = 200, random.order = FALSE,
colors = brewer.pal(8, "Dark2"))
# ── Repeat for 1-star reviews (dissatisfied adjectives) ──
reviews_1 <- rp[rp$rating == 1, ]
pos_df_neg <- data.frame(review_id = integer(), token = character(),
upos = character(), stringsAsFactors = FALSE)
for (i in 1:nrow(reviews_1)) {
rid <- reviews_1[i, "review_id"]
anno <- udpipe_annotate(pos_model, x = reviews_1[i, "review_text"])
anno_df <- as.data.frame(anno)
pos_df_neg <- rbind(pos_df_neg, cbind(review_id = rid,
anno_df[, c("token", "upos")]))
}
adj_1star <- pos_df_neg[pos_df_neg$upos == "ADJ", ]
head(sort(table(adj_1star$token), decreasing = TRUE), 20) # top dissatisfied adjectives
# VADER compound by brand
sqldf("
SELECT brand_name,
AVG(Compound) AS AvgCompound,
COUNT(*) AS n
FROM reviews_vader
GROUP BY brand_name
")
# Positive/negative ratio (4+5 star vs 1+2 star)
sqldf("
SELECT brand_name,
SUM(CASE WHEN rating >= 4 THEN 1 ELSE 0 END) AS PositiveReviews,
SUM(CASE WHEN rating <= 2 THEN 1 ELSE 0 END) AS NegativeReviews,
CAST(SUM(CASE WHEN rating >= 4 THEN 1 ELSE 0 END) AS FLOAT) /
SUM(CASE WHEN rating <= 2 THEN 1 ELSE 0 END) AS PosNegRatio
FROM reviews_vader
GROUP BY brand_name
")
# Join authors for skin type and eye color
rv_auth <- sqldf("
SELECT rv.*, a.skin_type, a.eye_color, a.skin_tone
FROM reviews_vader rv
LEFT JOIN authors a ON rv.author_id = a.author_id
")
# Average rating by skin type
sqldf("
SELECT skin_type, AVG(rating) AS AvgRating, COUNT(*) AS n
FROM rv_auth
WHERE skin_type IS NOT NULL AND skin_type != ''
GROUP BY skin_type ORDER BY AvgRating DESC
")
# Average rating by eye color
sqldf("
SELECT eye_color, AVG(rating) AS AvgRating, COUNT(*) AS n
FROM rv_auth
WHERE eye_color IS NOT NULL AND eye_color != ''
GROUP BY eye_color ORDER BY AvgRating DESC
")
# Review volume by month
reviews$ReviewMonthInt <- as.integer(format(reviews$submission_date, "%m"))
reviews$ReviewMonth <- format(reviews$submission_date, "%b")
reviews$ReviewYear <- format(reviews$submission_date, "%y")
reviewsByMonth <- sqldf("
SELECT ReviewMonth, ReviewMonthInt, COUNT(review_id) AS Reviews
FROM reviews
WHERE ReviewYear BETWEEN 15 AND 22
GROUP BY ReviewMonth, ReviewMonthInt
ORDER BY ReviewMonthInt
")
ggplot(reviewsByMonth,
aes(x = reorder(ReviewMonth, ReviewMonthInt), y = Reviews)) +
geom_bar(stat = "identity", fill = "#1a1a2e") +
labs(title = "Luxury Brands Review Volume by Month",
x = "Month", y = "# of Reviews")
# Average rating by skin type — bar chart
skin_avg <- rv_auth[!is.na(rv_auth$skin_type) & rv_auth$skin_type != "", ]
ggplot(skin_avg, aes(x = reorder(skin_type, -rating), y = rating)) +
geom_bar(stat = "summary", fun = mean, fill = "#c8b89a") +
coord_cartesian(ylim = c(3.5, 4.3)) +
labs(title = "Average Rating by Skin Type",
x = "Skin Type", y = "Avg Rating")