MKT 326 · Assignment 1

Social Listening — R Code

Rsqldfvader tmwordcloududpipe ggplot2gridExtra

Setup & Data Import

All analyses use three CSV files exported from the Sephora dataset. The reviews, products, and authors tables are joined via SQL using the sqldf package, which lets you run SQLite queries directly against R dataframes.

Install packages (run once)

Rinstall.R

# Run once — comment out before knitting
install.packages("sqldf")
install.packages("vader")
install.packages("tm")
install.packages("wordcloud")
install.packages("RColorBrewer")
install.packages("ggplot2")
install.packages("gridExtra")
install.packages("udpipe")
install.packages("dplyr")

Load libraries & import data

Rsetup.R

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

library(sqldf); library(vader)
library(tm); library(wordcloud); library(RColorBrewer)
library(ggplot2); library(gridExtra); library(grid)
library(udpipe); library(dplyr)

# Set your working directory to wherever your CSVs live
setwd("~/Desktop/MKTAnalytics/Data/Sephora")

products <- read.csv("products.csv")
reviews  <- read.csv("reviews.csv")
authors  <- read.csv("authors.csv")

# Parse submission date
reviews$submission_date <- as.Date(reviews$submission_date, format = "%m/%d/%y")

# Join reviews to products to get brand_name
reviews_products <- sqldf("
  SELECT r.*, p.brand_name, p.product_name
  FROM reviews r INNER JOIN products p ON r.product_id = p.product_id
")

# Filter to the two target brands
rp <- reviews_products[reviews_products$brand_name %in% c("L'Occitane", "La Mer"), ]

Rating Distributions

Rratings.R

# Mean ratings
mean(rp$rating)
mean(rp[rp$brand_name == "L'Occitane", ]$rating)
mean(rp[rp$brand_name == "La Mer", ]$rating)

# Rating distributions
table(rp[rp$brand_name == "L'Occitane", ]$rating)
table(rp[rp$brand_name == "La Mer", ]$rating)

# Histogram layout: overall + per brand
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))

hist(rp$rating, breaks = seq(0, 5, by = 1),
     main = "Distribution of Ratings (Both Brands)",
     xlab = "Rating (Stars)")

hist(rp[rp$brand_name == "L'Occitane", ]$rating,
     breaks = seq(0, 5, by = 1),
     main = "L'Occitane Ratings", xlab = "Rating")

hist(rp[rp$brand_name == "La Mer", ]$rating,
     breaks = seq(0, 5, by = 1),
     main = "La Mer Ratings", xlab = "Rating")

Output — Rating Means

Overall mean: 4.038 L'Occitane mean: 3.881 La Mer mean: 4.057

Review Length Analysis

Rreview_length.R

# Calculate character length of each review
rp$ReviewTextChars <- nchar(rp$review_text)

# Summary stats
mean(rp$ReviewTextChars)    # 404.1
median(rp$ReviewTextChars)  # 327.0

# Histogram of review lengths
ggplot(rp, aes(x = ReviewTextChars)) +
  geom_histogram(binwidth = 50, fill = "#1a1a2e", color = "white", linewidth = 0.3) +
  geom_vline(aes(xintercept = mean(ReviewTextChars)),
             color = "#e94560", linetype = "dashed", linewidth = 1) +
  labs(title = "Review Length Distribution",
       x = "Review Length (characters)", y = "# of Reviews")

# Segment reviews as long (above mean) or short (at or below mean)
threshold <- mean(rp$ReviewTextChars)
rp$IsLong <- rp$ReviewTextChars > threshold

VADER Sentiment Analysis

Runtime note: The VADER loop over all 1,536 reviews takes approximately 2–10 minutes. Run it once, export the result to CSV with write.csv(), then load the pre-computed CSV on subsequent knit runs by commenting out the loop block.

Calculate VADER scores (run once)

Rvader_calculate.R — comment out when knitting

# Create placeholder table with one row per review
d <- sqldf("
  SELECT review_id AS ReviewId, review_text AS ReviewText,
         0 AS Pos, 0 AS Neu, 0 AS Neg, 0 AS But, 0 AS Compound
  FROM rp
")

# Loop through every review and score it with VADER
for (i in 1:nrow(d)) {
  vout       <- get_vader(d[i, 2])
  d[i, 3]  <- as.numeric(vout["pos"])
  d[i, 4]  <- as.numeric(vout["neu"])
  d[i, 5]  <- as.numeric(vout["neg"])
  d[i, 6]  <- as.numeric(vout["but_count"])
  d[i, 7]  <- as.numeric(vout["compound"])
}

# Merge VADER scores back to reviews
reviews_vader <- sqldf("
  SELECT r.*, d.Pos, d.Neu, d.Neg, d.But, d.Compound
  FROM rp r INNER JOIN d ON r.review_id = d.ReviewId
")

reviews_vader$ReviewTextChars <- nchar(reviews_vader$review_text)

# Export so you don't have to re-run the loop
write.csv(reviews_vader, "reviews_vader.csv")

Load pre-computed VADER scores

Rvader_load.R

reviews_vader <- read.csv("reviews_vader.csv")
threshold     <- mean(reviews_vader$ReviewTextChars)
reviews_vader$IsLong <- reviews_vader$ReviewTextChars > threshold

Average VADER by star rating

Rvader_by_star.R

sqldf("
  SELECT rating,
         AVG(Compound)  AS AvgCompound,
         AVG(Pos)       AS AvgPos,
         AVG(Neg)       AS AvgNeg,
         COUNT(review_id) AS n
  FROM reviews_vader
  GROUP BY rating
  ORDER BY rating
")

Output

rating AvgCompound AvgPos AvgNeg n 1 1 0.1766 0.0812 0.1944 167 2 2 0.4591 0.1423 0.0891 118 3 3 0.5388 0.1687 0.0612 122 4 4 0.7533 0.2284 0.0251 211 5 5 0.7588 0.2401 0.0198 918

1-Star: Long vs. Short (t-test)

Rttest_1star.R

one_star <- reviews_vader[reviews_vader$rating == 1, ]

# Average VADER by length group
sqldf("
  SELECT IsLong, AVG(Compound) AS AvgCompound, COUNT(*) AS n
  FROM one_star
  GROUP BY IsLong
")

# Two-sample t-test (Welch, two-tailed)
t.test(one_star[one_star$IsLong == TRUE,  ]$Compound,
       one_star[one_star$IsLong == FALSE, ]$Compound)

Output

IsLong AvgCompound n 1 FALSE 0.0902 121 2 TRUE 0.4039 46 Welch Two Sample t-test t = 3.3815, df = 165, p-value = 0.0009 alternative hypothesis: true difference in means is not equal to 0 95% confidence interval: 0.1321 0.4952 sample means: Long = 0.4039, Short = 0.0902 ✅ SIGNIFICANT

5-Star: Long vs. Short (t-test)

Rttest_5star.R

five_star <- reviews_vader[reviews_vader$rating == 5, ]

sqldf("
  SELECT IsLong, AVG(Compound) AS AvgCompound, COUNT(*) AS n
  FROM five_star
  GROUP BY IsLong
")

t.test(five_star[five_star$IsLong == TRUE,  ]$Compound,
       five_star[five_star$IsLong == FALSE, ]$Compound)

Output

IsLong AvgCompound n 1 FALSE 0.7023 573 2 TRUE 0.8526 345 Welch Two Sample t-test t = 6.8323, df = 916, p-value = 1.42e-11 alternative hypothesis: true difference in means is not equal to 0 95% confidence interval: 0.1066 0.1940 sample means: Long = 0.8526, Short = 0.7023 ✅ SIGNIFICANT

Word Cloud Generation

Rwordcloud.R

# ── 5-Star Reviews Word Cloud ──────────────────────────────
docsS <- Corpus(VectorSource(rp[rp$rating == 5, ]$review_text))

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docsS <- tm_map(docsS, toSpace, "\u201c")
docsS <- tm_map(docsS, toSpace, "\u201d")
docsS <- tm_map(docsS, content_transformer(tolower))
docsS <- tm_map(docsS, removeNumbers)
docsS <- tm_map(docsS, removePunctuation)
docsS <- tm_map(docsS, removeWords, stopwords("english"))
# Remove brand names and generic terms that add no insight
docsS <- tm_map(docsS, removeWords, c("mer", "la", "loccitane",
                                       "cream", "product", "skin",
                                       "lamer", "use", "using", "face"))
docsS <- tm_map(docsS, stripWhitespace)

mS <- sort(rowSums(as.matrix(TermDocumentMatrix(docsS))), decreasing = TRUE)
dS <- data.frame(word = names(mS), freq = mS)

set.seed(1234)
wordcloud(words = dS$word, freq = dS$freq, min.freq = 2,
          max.words = 200, random.order = FALSE, rot.per = 0.4,
          scale = c(2.2, 0.44), colors = brewer.pal(8, "Dark2"))

# ── 1-Star Reviews Word Cloud ─────────────────────────────
# Same tm_map pipeline, just filtered to 1-star reviews
make_cloud <- function(texts, remove_extra = c(), pal = "Dark2") {
  docs <- Corpus(VectorSource(texts))
  toSp <- content_transformer(function(x, p) gsub(p, " ", x))
  docs <- tm_map(docs, toSp, "“"); docs <- tm_map(docs, toSp, "”")
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, removeWords, stopwords("english"))
  docs <- tm_map(docs, removeWords,
                  c("mer", "la", "loccitane", "cream", "product",
                    "skin", "lamer", "use", "using", "face", remove_extra))
  docs <- tm_map(docs, stripWhitespace)
  m <- sort(rowSums(as.matrix(TermDocumentMatrix(docs))), decreasing = TRUE)
  d <- data.frame(word = names(m), freq = m)
  set.seed(1234)
  wordcloud(words = d$word, freq = d$freq, min.freq = 2,
            max.words = 200, random.order = FALSE, rot.per = 0.35,
            scale = c(2.2, 0.44),
            colors = brewer.pal(8, pal))
  invisible(d)
}

# Both brands — 5-star high-rated
make_cloud(rp[rp$rating == 5, ]$review_text)
# Both brands — 1-star low-rated
make_cloud(rp[rp$rating == 1, ]$review_text, pal = "Reds")

# Per-brand comparison
make_cloud(rp[rp$brand_name == "L'Occitane" & rp$rating == 5, ]$review_text)
make_cloud(rp[rp$brand_name == "La Mer"     & rp$rating == 5, ]$review_text)
make_cloud(rp[rp$brand_name == "L'Occitane" & rp$rating == 1, ]$review_text, pal = "Reds")
make_cloud(rp[rp$brand_name == "La Mer"     & rp$rating == 1, ]$review_text, pal = "Reds")

Part-of-Speech Tagging

Runtime note: The udpipe annotation loop is slow — expect 5–15 minutes for 900+ reviews. Download the model once; it saves locally and reloads on subsequent runs.

Rpos_tagging.R

# Download English model (only needed once)
pos_en_model <- udpipe_download_model(language = "english")
pos_model    <- udpipe_load_model(pos_en_model$file_model)

# Tag 5-star reviews
reviews_5 <- rp[rp$rating == 5, ]

pos_df <- data.frame(review_id = integer(),
                     token     = character(),
                     upos      = character(),
                     stringsAsFactors = FALSE)

for (i in 1:nrow(reviews_5)) {
  rid   <- reviews_5[i, "review_id"]
  anno  <- udpipe_annotate(pos_model, x = reviews_5[i, "review_text"])
  anno_df <- as.data.frame(anno)
  pos_df  <- rbind(pos_df, cbind(review_id = rid,
                                    anno_df[, c("token", "upos")]))
}

# Extract adjectives only
adj_5star <- pos_df[pos_df$upos == "ADJ", ]

# Word cloud of 5-star adjectives
docsAdj <- Corpus(VectorSource(adj_5star$token))
# apply tm_map pipeline (lowercase, remove stopwords, strip whitespace)
mAdj <- sort(rowSums(as.matrix(TermDocumentMatrix(docsAdj))), decreasing = TRUE)
dAdj <- data.frame(word = names(mAdj), freq = mAdj)
head(dAdj, 20)  # top adjectives

wordcloud(words = dAdj$word, freq = dAdj$freq, min.freq = 2,
          max.words = 200, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))

# ── Repeat for 1-star reviews (dissatisfied adjectives) ──
reviews_1 <- rp[rp$rating == 1, ]
pos_df_neg <- data.frame(review_id = integer(), token = character(),
                          upos = character(), stringsAsFactors = FALSE)

for (i in 1:nrow(reviews_1)) {
  rid     <- reviews_1[i, "review_id"]
  anno    <- udpipe_annotate(pos_model, x = reviews_1[i, "review_text"])
  anno_df <- as.data.frame(anno)
  pos_df_neg <- rbind(pos_df_neg, cbind(review_id = rid,
                                              anno_df[, c("token", "upos")]))
}

adj_1star <- pos_df_neg[pos_df_neg$upos == "ADJ", ]
head(sort(table(adj_1star$token), decreasing = TRUE), 20)  # top dissatisfied adjectives

Brand Comparison & Customer Attributes

Rbrand_analysis.R

# VADER compound by brand
sqldf("
  SELECT brand_name,
         AVG(Compound) AS AvgCompound,
         COUNT(*) AS n
  FROM reviews_vader
  GROUP BY brand_name
")

# Positive/negative ratio (4+5 star vs 1+2 star)
sqldf("
  SELECT brand_name,
         SUM(CASE WHEN rating >= 4 THEN 1 ELSE 0 END) AS PositiveReviews,
         SUM(CASE WHEN rating <= 2 THEN 1 ELSE 0 END) AS NegativeReviews,
         CAST(SUM(CASE WHEN rating >= 4 THEN 1 ELSE 0 END) AS FLOAT) /
           SUM(CASE WHEN rating <= 2 THEN 1 ELSE 0 END) AS PosNegRatio
  FROM reviews_vader
  GROUP BY brand_name
")

# Join authors for skin type and eye color
rv_auth <- sqldf("
  SELECT rv.*, a.skin_type, a.eye_color, a.skin_tone
  FROM reviews_vader rv
  LEFT JOIN authors a ON rv.author_id = a.author_id
")

# Average rating by skin type
sqldf("
  SELECT skin_type, AVG(rating) AS AvgRating, COUNT(*) AS n
  FROM rv_auth
  WHERE skin_type IS NOT NULL AND skin_type != ''
  GROUP BY skin_type ORDER BY AvgRating DESC
")

# Average rating by eye color
sqldf("
  SELECT eye_color, AVG(rating) AS AvgRating, COUNT(*) AS n
  FROM rv_auth
  WHERE eye_color IS NOT NULL AND eye_color != ''
  GROUP BY eye_color ORDER BY AvgRating DESC
")

Output — VADER by Brand

brand_name AvgCompound n 1 L'Occitane 0.6208 159 2 La Mer 0.6581 1377 Pos/Neg Ratio: brand_name PositiveReviews NegativeReviews PosNegRatio 1 L'Occitane 103 34 3.03 2 La Mer 1026 251 4.09

Data Visualizations

Rvisualizations.R

# Review volume by month
reviews$ReviewMonthInt <- as.integer(format(reviews$submission_date, "%m"))
reviews$ReviewMonth    <- format(reviews$submission_date, "%b")
reviews$ReviewYear     <- format(reviews$submission_date, "%y")

reviewsByMonth <- sqldf("
  SELECT ReviewMonth, ReviewMonthInt, COUNT(review_id) AS Reviews
  FROM reviews
  WHERE ReviewYear BETWEEN 15 AND 22
  GROUP BY ReviewMonth, ReviewMonthInt
  ORDER BY ReviewMonthInt
")

ggplot(reviewsByMonth,
       aes(x = reorder(ReviewMonth, ReviewMonthInt), y = Reviews)) +
  geom_bar(stat = "identity", fill = "#1a1a2e") +
  labs(title = "Luxury Brands Review Volume by Month",
       x = "Month", y = "# of Reviews")

# Average rating by skin type — bar chart
skin_avg <- rv_auth[!is.na(rv_auth$skin_type) & rv_auth$skin_type != "", ]
ggplot(skin_avg, aes(x = reorder(skin_type, -rating), y = rating)) +
  geom_bar(stat = "summary", fun = mean, fill = "#c8b89a") +
  coord_cartesian(ylim = c(3.5, 4.3)) +
  labs(title = "Average Rating by Skin Type",
       x = "Skin Type", y = "Avg Rating")

← back to analysis