K9 Resorts · End-to-End Pipeline
The raw Gingr database backup required preprocessing before it could be loaded into RDS. The primary issues were DEFINER clauses (which RDS rejects) and archive tables that caused the import to stall. The script below handles both.
import re

# Archive tables whose data stalls the RDS import — their entire sections
# are dropped from the cleaned dump.
SKIP_TABLES = ['archive_reservations', 'archive_payments']


def clean_sql_dump(content, skip_tables=SKIP_TABLES):
    """Return *content* cleaned for an RDS import.

    Two fixes are applied:
    - ``DEFINER=`user`@`host``` clauses are stripped (RDS rejects them).
    - Every line belonging to a table in *skip_tables* is removed.
    """
    # Remove DEFINER=`user`@`host` patterns
    content = re.sub(r'DEFINER=`[^`]+`@`[^`]+`', '', content)

    kept, skip = [], False
    for line in content.splitlines():
        if line.startswith('-- Table'):
            # A new table section begins: skip it iff its header names one
            # of the archive tables. Deciding on the header line itself
            # fixes the original ordering bug, where the archive table's
            # own header first set skip=True and then immediately reset it
            # to False, leaking the header into the output.
            skip = any(t in line for t in skip_tables)
        elif not skip and any(t in line for t in skip_tables):
            # Statement referencing a skipped table (CREATE/INSERT/...).
            skip = True
        if not skip:
            kept.append(line)
    return '\n'.join(kept)


def main():
    """Download the raw Gingr dump from S3, clean it, and re-upload.

    Run on EC2 after the raw backup has landed in S3.
    """
    import boto3  # local import: only the S3 round-trip needs it

    s3 = boto3.client('s3')
    s3.download_file('your-bucket', 'gingr_backup.sql', '/tmp/gingr_raw.sql')
    # errors='replace' tolerates stray non-UTF-8 bytes in the raw dump.
    with open('/tmp/gingr_raw.sql', 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
    # Explicit encoding on write so the output does not depend on the
    # machine's locale (the original relied on the platform default).
    with open('/tmp/gingr_clean.sql', 'w', encoding='utf-8') as f:
        f.write(clean_sql_dump(content))
    s3.upload_file('/tmp/gingr_clean.sql', 'your-bucket', 'gingr_clean.sql')
    print("Clean dump uploaded to S3")


if __name__ == "__main__":
    main()
# Run on EC2 in the same VPC as your RDS instance
# mysql client must be installed: sudo apt install mysql-client
# Stream the cleaned dump into the `gingr` database. --password with no
# value prompts for it interactively (keeps it out of shell history);
# --max_allowed_packet=512M raises the packet limit for the dump's
# large statements.
mysql \
--host=gingr-clv-db.c9ga6o64i2ho.us-east-2.rds.amazonaws.com \
--user=admin \
--password \
--database=gingr \
--max_allowed_packet=512M \
< /tmp/gingr_clean.sql
# Verify row counts
# NOTE(review): "gingr-clv-db..." is a placeholder — substitute the full
# RDS endpoint from the import command above before running.
mysql -h gingr-clv-db... -u admin -p gingr -e "
SELECT 'owners' AS tbl, COUNT(*) FROM owners
UNION ALL
SELECT 'reservations', COUNT(*) FROM reservations
UNION ALL
SELECT 'payments', COUNT(*) FROM payments;
"
The core CLV model computes historical CLV per customer as avg_transaction_value × purchase_frequency × tenure_years (where purchase_frequency is transactions per year of tenure), then scores each customer on Recency, Frequency, and Monetary quintiles to assign RFM segments.
import pandas as pd
import numpy as np

# Column names match the SQL export query output
COLS = [
    "owner_id", "first_name", "last_name", "zip", "acquisition_source",
    "customer_created_date", "last_transaction_date", "days_since_last_purchase",
    "total_transactions", "unique_visit_days", "total_revenue",
    "avg_transaction_value", "max_transaction_value", "first_transaction_date",
    "tenure_days", "num_pets", "total_reservations",
    "first_reservation_date", "last_reservation_date"
]

DATE_COLS = ["customer_created_date", "last_transaction_date",
             "first_transaction_date", "first_reservation_date",
             "last_reservation_date"]
NUMERIC_COLS = ["total_revenue", "avg_transaction_value",
                "tenure_days", "days_since_last_purchase"]


def rfm_segment(score):
    """Map a combined RFM score (3-15) into a named customer segment."""
    if score >= 13:
        return "Champion"
    elif score >= 10:
        return "Loyal"
    elif score >= 7:
        return "Needs Attention"
    elif score >= 4:
        return "At Risk"
    else:
        return "Lost"


def add_scores(df):
    """Add historical CLV, R/F/M quintile scores, and segment to *df*.

    Expects numeric columns: tenure_days, total_transactions,
    avg_transaction_value, days_since_last_purchase, total_revenue.
    Mutates *df* in place and returns it.
    """
    # ── Historical CLV ───────────────────────────────────────────
    # Floor tenure at ~1 month (0.083 yr) so brand-new customers don't
    # blow up purchase_frequency via a near-zero denominator.
    df["tenure_years"] = (df["tenure_days"] / 365).clip(lower=0.083)
    df["purchase_frequency"] = df["total_transactions"] / df["tenure_years"]
    df["historical_clv"] = (df["avg_transaction_value"]
                            * df["purchase_frequency"]
                            * df["tenure_years"])

    # ── RFM Scoring (quintiles, 1–5) ─────────────────────────────
    # rank(method="first") breaks ties so qcut never fails on duplicate
    # bin edges. The original applied this to F and M but not R, so
    # R_score could raise ValueError on heavily tied recency values.
    # Recency: lower days_since = better → reversed label order.
    df["R_score"] = pd.qcut(df["days_since_last_purchase"].rank(method="first"),
                            q=5, labels=[5, 4, 3, 2, 1]).astype(int)
    df["F_score"] = pd.qcut(df["total_transactions"].rank(method="first"),
                            q=5, labels=[1, 2, 3, 4, 5]).astype(int)
    df["M_score"] = pd.qcut(df["total_revenue"].rank(method="first"),
                            q=5, labels=[1, 2, 3, 4, 5]).astype(int)
    df["RFM_score"] = df["R_score"] + df["F_score"] + df["M_score"]
    df["segment"] = df["RFM_score"].apply(rfm_segment)
    return df


def main():
    """Score the exported customer base and write the result to CSV."""
    df = pd.read_csv("customer_base.csv", header=None, names=COLS)
    # Parse dates and numerics (coerce bad values to NaT/NaN).
    for col in DATE_COLS:
        df[col] = pd.to_datetime(df[col], errors="coerce")
    for col in NUMERIC_COLS:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    add_scores(df)

    # ── Segment Summary ──────────────────────────────────────────
    seg = df.groupby("segment").agg(
        customers=("owner_id", "count"),
        revenue=("total_revenue", "sum"),
        avg_clv=("historical_clv", "mean"),
    ).sort_values("revenue", ascending=False)
    seg["rev_pct"] = seg["revenue"] / seg["revenue"].sum() * 100
    print(seg.round(1))
    df.to_csv("customer_base_scored.csv", index=False)


if __name__ == "__main__":
    main()
import pandas as pd


def build_retention(coh):
    """Compute the cohort retention matrix, in percent.

    Parameters
    ----------
    coh : DataFrame with columns owner_id, cohort_month, months_since_first
        (one row per customer activity record; duplicates are fine —
        customers are counted distinctly).

    Returns
    -------
    DataFrame: rows = cohort_month, columns = months_since_first, values =
    % of each cohort's month-0 customers still active. Every cohort must
    have a month-0 entry (its own size) or its row divides by NaN.
    """
    # Count distinct active customers per cohort × month.
    counts = (coh.groupby(["cohort_month", "months_since_first"])["owner_id"]
                 .nunique()
                 .reset_index(name="active_customers"))
    # Pivot to matrix: rows = cohort month, cols = months since first.
    matrix = counts.pivot(index="cohort_month",
                          columns="months_since_first",
                          values="active_customers")
    # Retention % = active / month-0 cohort size.
    return matrix.div(matrix[0], axis=0) * 100


def main():
    """Render the retention heatmap and print the average retention curve."""
    # Plotting dependencies imported locally so the pure computation above
    # is importable (and testable) without seaborn/matplotlib installed.
    import matplotlib.pyplot as plt
    import seaborn as sns

    coh = pd.read_csv("cohort_monthly.csv", header=None,
                      names=["owner_id", "cohort_month", "revenue",
                             "months_since_first"])
    retention = build_retention(coh)

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.heatmap(retention, annot=True, fmt=".0f",
                cmap="YlOrRd_r", linewidths=0.5, ax=ax,
                cbar_kws={"label": "Retention %"})
    ax.set_title("Cohort Retention Heatmap — % Still Active by Month")
    ax.set_xlabel("Months Since First Purchase")
    ax.set_ylabel("Acquisition Cohort")
    plt.tight_layout()
    plt.savefig("chart8_cohort_retention.png", dpi=150)
    plt.close(fig)  # release the figure (the original leaked it)

    # Average retention curve across all cohorts.
    print(retention.mean(axis=0).round(1))


if __name__ == "__main__":
    main()
# pip install lifetimes
from lifetimes import BetaGeoFitter, GammaGammaFitter
import pandas as pd
# BG/NBD input: frequency, recency, T (age of customer in days)
# Customers must have made at least 2 transactions
# Expected columns: owner_id, frequency (repeat-purchase count), recency,
# T, monetary_value — produced by an upstream export (not shown here).
inp = pd.read_csv("bgnbd_input.csv")
# Fit BG/NBD model — estimates purchase process + dropout process
# Small penalizer regularizes the fit on noisy data.
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(inp["frequency"], inp["recency"], inp["T"])
# Predict probability customer is still alive
inp["prob_alive"] = bgf.conditional_probability_alive(
inp["frequency"], inp["recency"], inp["T"])
# Predict expected purchases in next 365 days
# (365 is in the same unit as recency/T, i.e. days)
inp["predicted_purchases_1yr"] = bgf.conditional_expected_number_of_purchases_up_to_time(
365, inp["frequency"], inp["recency"], inp["T"])
# Fit Gamma-Gamma model for spend prediction
# (only customers with at least 1 repeat purchase, i.e. frequency > 0 —
# the original comment said ">1", which the filter below contradicts)
# NOTE(review): Gamma-Gamma assumes spend is independent of purchase
# frequency — worth checking their correlation before trusting the fit.
gg_df = inp[inp["frequency"] > 0]
ggf = GammaGammaFitter(penalizer_coef=0.001)
ggf.fit(gg_df["frequency"], gg_df["monetary_value"])
# Predicted CLV over 1 year
# `time` is in months per the lifetimes API, so discount_rate=0.01 is a
# monthly rate — TODO confirm this matches the business discount rate.
inp["predicted_clv_1yr"] = ggf.customer_lifetime_value(
bgf, inp["frequency"], inp["recency"], inp["T"],
inp["monetary_value"],
time=12, discount_rate=0.01)
# Show the 10 customers with the highest predicted 1-year CLV.
print(inp.nlargest(10, "predicted_clv_1yr")[[
"owner_id", "frequency", "prob_alive",
"predicted_purchases_1yr", "predicted_clv_1yr"]])
inp.to_csv("bgnbd_scored.csv", index=False)
import pandas as pd

# Gingr room-type labels → short display names.
ROOMS = {
    "Boarding | Executive Room (All Inclusive)": "Executive Room",
    "Boarding | Luxury Suite (All Inclusive)": "Luxury Suite",
    "Boarding | Double Compartment (All Inclusive)": "Double Compartment",
    "Boarding | Single Compartment (All Inclusive)": "Single Compartment",
}


def compute_elasticity(sub):
    """Return (rate_chg_pct, vol_chg_pct, elasticity) around the Feb 2025
    price change, or None when it is undefined.

    Windows: pre = last 3 months before Feb 2025; post = first 3 months
    from Mar 2025 (Feb itself, the transition month, is excluded).
    Returns None when either window is empty, the baseline rate/volume is
    zero, or the rate change is ~0 — the original printed inf/NaN in
    those cases because numpy float division by zero does not raise.
    """
    sub = sub.sort_values("month")
    pre = sub[sub["month"] < "2025-02-01"].tail(3)
    post = sub[(sub["month"] >= "2025-03-01") &
               (sub["month"] < "2026-02-01")].head(3)
    if pre.empty or post.empty:
        return None
    base_rate = pre["avg_nightly_rate"].mean()
    base_vol = pre["reservations"].mean()
    if base_rate == 0 or base_vol == 0:
        return None
    rate_chg = (post["avg_nightly_rate"].mean() - base_rate) / base_rate * 100
    vol_chg = (post["reservations"].mean() - base_vol) / base_vol * 100
    if abs(rate_chg) < 1e-9:
        return None  # no price movement → elasticity undefined
    return rate_chg, vol_chg, vol_chg / rate_chg


def main():
    """Print per-room price elasticity around the Feb 2025 rate change."""
    cols = ["month", "room_type", "reservations",
            "avg_nightly_rate", "avg_nights", "avg_total_per_stay"]
    df = pd.read_csv("price_sensitivity_v2.csv", header=None, names=cols)
    # Export months are "YYYY-MM"; append a day so pandas parses them.
    df["month"] = pd.to_datetime(df["month"] + "-01")

    for room, short in ROOMS.items():
        result = compute_elasticity(df[df["room_type"] == room])
        if result is None:
            print(f"{short}: insufficient data")
            continue
        rate_chg, vol_chg, elasticity = result
        print(f"{short}: rate {rate_chg:+.1f}% vol {vol_chg:+.1f}% elasticity {elasticity:.2f}")


if __name__ == "__main__":
    main()