Synthetic data experiment to show how embeddings might improve transportation rate predictions

# %% [markdown]
# # Why AI Thinks Phoenix and Miami Belong Together — Tutorial Notebook
#
# This notebook-style script walks through:
# 1) Clustering US cities by **geography** (lat/lon) vs **meaning** (embeddings)
# 2) Building a **synthetic lane-rate** dataset where semantic city characteristics
#    (derived from embeddings) actually drive part of the rate variance
# 3) Training three models to predict rate-per-mile (RPM):
#    - Baseline (Distance-only)
#    - Name IDs (one-hot origin/destination) — "memorizer"
#    - Distance + Embeddings (semantic features)
#
# It’s written as a tutorial: short chunks, heavy comments, and conservative dependencies.
# If you have an OPENAI_API_KEY, we’ll use OpenAI embeddings; otherwise we fall back to
# a deterministic hashing embedding that preserves the *workflow* (but not the same semantics).

# %% [markdown]
# ## 0) Imports & Configuration

# %%
import os, hashlib, math, random, pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# Regressors & Pipelines
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Reproducibility
random.seed(42)
np.random.seed(42)

# Embedding model name (only used if you have an API key set)
EMB_MODEL = os.getenv("EMB_MODEL", "text-embedding-3-large")
EMB_CACHE_PATH = pathlib.Path("city_embeddings_cache.npz")

# Matplotlib figure settings (higher DPI for crisper inline/saved plots)
plt.rcParams["figure.dpi"] = 130
# %% [markdown]
# ## 1) City List (with coordinates)
# Feel free to add/remove cities. If you change the list, delete the cache file
# `city_embeddings_cache.npz` to refresh embeddings.

# %%
CITIES = [
    ("New York, NY", 40.7128, -74.0060),
    ("Boston, MA", 42.3601, -71.0589),
    ("Philadelphia, PA", 39.9526, -75.1652),
    ("Washington, DC", 38.9072, -77.0369),
    ("Charlotte, NC", 35.2271, -80.8431),
    ("Atlanta, GA", 33.7490, -84.3880),
    ("Miami, FL", 25.7617, -80.1918),
    ("Orlando, FL", 28.5383, -81.3792),
    ("Chicago, IL", 41.8781, -87.6298),
    ("Detroit, MI", 42.3314, -83.0458),
    ("Cleveland, OH", 41.4993, -81.6944),
    ("Minneapolis, MN", 44.9778, -93.2650),
    ("St. Louis, MO", 38.6270, -90.1994),
    ("Kansas City, MO", 39.0997, -94.5786),
    ("Omaha, NE", 41.2565, -95.9345),
    ("Indianapolis, IN", 39.7684, -86.1581),
    ("Dallas, TX", 32.7767, -96.7970),
    ("Houston, TX", 29.7604, -95.3698),
    ("Austin, TX", 30.2672, -97.7431),
    ("San Antonio, TX", 29.4241, -98.4936),
    ("Oklahoma City, OK", 35.4676, -97.5164),
    ("Denver, CO", 39.7392, -104.9903),
    ("Salt Lake City, UT", 40.7608, -111.8910),
    ("Phoenix, AZ", 33.4484, -112.0740),
    ("Las Vegas, NV", 36.1699, -115.1398),
    ("Boise, ID", 43.6150, -116.2023),
    ("Los Angeles, CA", 34.0522, -118.2437),
    ("San Diego, CA", 32.7157, -117.1611),
    ("San Francisco, CA", 37.7749, -122.4194),
    ("San Jose, CA", 37.3382, -121.8863),
    ("Sacramento, CA", 38.5816, -121.4944),
    ("Portland, OR", 45.5051, -122.6750),
    ("Seattle, WA", 47.6062, -122.3321),
]
# %% [markdown]
# ## 2) Utility Functions

# %%
def l2_normalize(X: np.ndarray) -> np.ndarray:
    """Row-wise L2 normalize matrix X (safe for zero rows)."""
    n = np.linalg.norm(X, axis=1, keepdims=True)
    n[n == 0] = 1.0
    return X / n


def _openai_embed(batch, model):
    """
    Fetch embeddings for a list of strings from the OpenAI embeddings API.
    Requires OPENAI_API_KEY in your environment.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not set")
    import requests
    url = "https://api.openai.com/v1/embeddings"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    r = requests.post(url, headers=headers, json={"model": model, "input": batch}, timeout=60)
    if r.status_code != 200:
        raise RuntimeError(f"Embedding API error {r.status_code}: {r.text}")
    data = r.json()["data"]
    return [row["embedding"] for row in data]

def hash_embed(names, dim=384):
    """
    Deterministic fallback embeddings using trigram hashing.
    This preserves the pipeline (clustering, PCA, modeling) even offline.
    """
    V = np.zeros((len(names), dim), dtype=np.float32)
    for i, s in enumerate(names):
        s = s.lower()
        for j in range(len(s) - 2):
            tri = s[j:j+3]
            h = int(hashlib.md5(tri.encode()).hexdigest(), 16) % dim
            V[i, h] += 1.0
    return l2_normalize(V)
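
# Optional sanity check (a sketch, not part of the pipeline): the fallback is
# deterministic, so repeated calls on the same strings return identical vectors,
# and each row is unit-length after l2_normalize.
# _demo = hash_embed(["Phoenix, AZ", "Miami, FL"])
# assert np.allclose(_demo, hash_embed(["Phoenix, AZ", "Miami, FL"]))
# assert np.allclose(np.linalg.norm(_demo, axis=1), 1.0)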

def load_cache():
    """Load {name -> vector} mapping from local npz cache, if present."""
    if EMB_CACHE_PATH.exists():
        npz = np.load(EMB_CACHE_PATH, allow_pickle=True)
        names = list(npz["names"])
        vecs = npz["vectors"]
        return {n: vecs[i] for i, n in enumerate(names)}
    return {}


def save_cache(name_to_vec):
    """Save {name -> vector} mapping to local npz cache."""
    names = np.array(list(name_to_vec.keys()), dtype=object)
    vectors = np.vstack([name_to_vec[n] for n in names])
    np.savez_compressed(EMB_CACHE_PATH, names=names, vectors=vectors)

def get_embeddings_with_cache(names, model=EMB_MODEL, allow_fallback=True, prefer_cache_only=False):
    """
    Get embeddings for a list of names with caching and graceful fallback.
    - If OPENAI_API_KEY is set, we call OpenAI for any missing names.
    - If not (or it fails), we use hash_embed() for those missing names.
    """
    names = list(names)
    cache = load_cache()
    missing = [n for n in names if n not in cache]
    if missing and not prefer_cache_only:
        try:
            all_vecs = []
            B = 256
            for i in range(0, len(missing), B):
                chunk = missing[i:i+B]
                all_vecs.extend(_openai_embed(chunk, model))
            for n, v in zip(missing, all_vecs):
                cache[n] = np.array(v, dtype=np.float32)
            print(f"Embeddings fetched from OpenAI for {len(missing)} new names ({len(all_vecs[0])} dims).")
            save_cache(cache)
        except Exception as e:
            if not allow_fallback:
                raise
            print(f"[warn] OpenAI fetch failed: {e}\nUsing fallback (hash) for {len(missing)} names.")
            F = hash_embed(missing)
            for i, n in enumerate(missing):
                cache[n] = F[i]
            save_cache(cache)
    V = np.vstack([cache[n] for n in names]).astype(np.float32)
    return l2_normalize(V)
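
# Example usage (sketch): look up two cities and compare them. Because the rows
# are L2-normalized, a plain dot product is the cosine similarity. The hashing
# fallback is used automatically if no OPENAI_API_KEY is set.
# _vecs = get_embeddings_with_cache(["Phoenix, AZ", "Miami, FL"])
# print("cosine(Phoenix, Miami):", float(_vecs[0] @ _vecs[1]))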

def haversine_miles(lat1, lon1, lat2, lon2):
    """Great-circle distance in miles between two lat/lon points."""
    R_km = 6371.0088
    dphi = math.radians(lat2 - lat1)
    dlamb = math.radians(lon2 - lon1)
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    a = math.sin(dphi/2)**2 + math.cos(phi1)*math.cos(phi2)*math.sin(dlamb/2)**2
    return 0.621371 * (2*R_km*math.asin(math.sqrt(a)))
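
# Quick sanity check (sketch): New York to Los Angeles is roughly 2,450 great-circle
# miles, so the value below should land in that neighborhood.
# print(round(haversine_miles(40.7128, -74.0060, 34.0522, -118.2437)))  # ~2446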

def rmse(y_true, y_pred):
    """Root Mean Squared Error (RMSE)."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def safe_one_hot_encoder():
    """
    Return a OneHotEncoder that works across sklearn versions.
    - sklearn >= 1.2 uses 'sparse_output'
    - older versions use 'sparse'
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Fallback for older scikit-learn
        return OneHotEncoder(handle_unknown="ignore", sparse=False)
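
# Example (sketch): with handle_unknown="ignore", a category unseen at fit time
# encodes to an all-zeros row instead of raising. That is what lets the one-hot
# "memorizer" model below accept held-out destination cities at predict time.
# _enc = safe_one_hot_encoder()
# _enc.fit([["Dallas, TX"], ["Miami, FL"]])
# print(_enc.transform([["Dallas, TX"], ["Boise, ID"]]))  # second row is all zeros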

# %% [markdown]
# ## 3) A. Clustering by Geography (lat/lon)
# Simple sanity check: nearby cities should cluster together on coordinates.

# %%
def cluster_by_geography(k=7, save_path="map_geo_clusters.png"):
    names = [c[0] for c in CITIES]
    lat = np.array([c[1] for c in CITIES])
    lon = np.array([c[2] for c in CITIES])
    X = np.c_[lat, lon]

    km = KMeans(n_clusters=k, n_init=20, random_state=42)
    labels = km.fit_predict(X)

    plt.figure(figsize=(10, 6))
    for lab in sorted(set(labels)):
        idx = labels == lab
        plt.scatter(lon[idx], lat[idx], alpha=0.85, label=f"Cluster {lab}")
        for i in np.where(idx)[0]:
            plt.annotate(names[i].split(",")[0], (lon[i], lat[i]), fontsize=8)
    plt.title("US Cities — Clusters by Geography (k-means on lat/lon)")
    plt.xlabel("Longitude"); plt.ylabel("Latitude")
    plt.legend(loc="best", ncol=2, fontsize=9)
    plt.tight_layout(); plt.savefig(save_path, dpi=180); plt.close()
    print(f"Saved {save_path}")

# %% [markdown]
# ## 4) B. Clustering by Meaning (embeddings)
# We’ll embed the **city names** and then cluster in a reduced semantic space (SVD).
# If you’re offline or have no API key, the fallback hashing embeddings keep the demo runnable.

# %%
def cluster_by_embeddings(k=7, save_path="map_embedding_clusters.png"):
    names = [c[0] for c in CITIES]
    lat = np.array([c[1] for c in CITIES])
    lon = np.array([c[2] for c in CITIES])

    # 1) Get embeddings (OpenAI if available; fallback otherwise)
    V = get_embeddings_with_cache(names)

    # 2) Reduce dimensionality for clustering stability & interpretability
    V2 = TruncatedSVD(n_components=12, random_state=42).fit_transform(V)

    # 3) Cluster in semantic space
    km = KMeans(n_clusters=k, n_init=20, random_state=42)
    labels = km.fit_predict(V2)

    # 4) Plot clusters on the geographic map (to SEE semantic groupings)
    plt.figure(figsize=(10, 6))
    for lab in sorted(set(labels)):
        idx = labels == lab
        plt.scatter(lon[idx], lat[idx], alpha=0.85, label=f"Cluster {lab}")
        for i in np.where(idx)[0]:
            plt.annotate(names[i].split(",")[0], (lon[i], lat[i]), fontsize=8)
    plt.title("US Cities — Clusters by Embeddings (k-means on text vectors)")
    plt.xlabel("Longitude"); plt.ylabel("Latitude")
    plt.legend(loc="best", ncol=2, fontsize=9)
    plt.tight_layout(); plt.savefig(save_path, dpi=180); plt.close()
    print(f"Saved {save_path}")

# %% [markdown]
# ## 5) C. Lane-Rate Simulation + Modeling
# We create **synthetic** lane data where the *true* signal includes:
# - A weak, noisy relationship to distance (miles)
# - A semantic component using PCA of embeddings (e.g., “Sunbelt vs Rust Belt” style axes)
# - A similarity effect (origin/destination semantic cosine similarity)
#
# Then we compare three models on **held-out destination cities**:
# - Distance only
# - Distance + one-hot origin/destination (memorizer)
# - Distance + embedding features (semantic generalization)

# %%
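# For reference, the synthetic generative model used inside lane_demo() is:
#
#     rpm = base_rpm(miles) + dest_effect + orig_effect + similarity_effect
#           + cross_effect + noise
#
# where the semantic terms are linear functions of the origin/destination PCA
# components and of the origin-destination cosine similarity. Because we built
# the data this way, we know the embedding features carry real signal; the
# question is which feature set can recover it on unseen destinations.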
def lane_demo(save_metrics="rates_oos_metrics.txt"):
    # ------------- Lookups -------------
    names = [c[0] for c in CITIES]
    lat = {c[0]: c[1] for c in CITIES}
    lon = {c[0]: c[2] for c in CITIES}

    # ------------- Embeddings & PCA -------------
    # Get normalized embeddings for each city
    V = get_embeddings_with_cache(names)

    # PCA = interpretable semantic axes (PC1/PC2 often align with broad traits)
    pca = PCA(n_components=8, random_state=42)
    V_pca = pca.fit_transform(V)
    name2pca = {n: V_pca[i] for i, n in enumerate(names)}
    name2vec = {n: V[i] for i, n in enumerate(names)}
    print(f"PCA top-3 explained variance ratios: {pca.explained_variance_ratio_[:3]}")

    # ------------- Generate synthetic lanes -------------
    N = 4000
    rows = []
    rng = np.random.RandomState(11)
    for _ in range(N):
        # Random origin/destination pair
        o, d = random.sample(names, 2)
        miles = haversine_miles(lat[o], lon[o], lat[d], lon[d])

        # Semantic features
        o_pca = name2pca[o]
        d_pca = name2pca[d]
        eo = name2vec[o]
        ed = name2vec[d]
        sim_od = float(np.dot(eo, ed))  # cosine since vectors are L2-normalized

        # --- Weak distance signal ---
        # (This keeps "miles matters" visible, but not dominant.)
        if miles < 300:
            base_rpm = rng.uniform(1.7, 2.6)
        elif miles < 800:
            base_rpm = rng.uniform(1.3, 2.0)
        elif miles < 1500:
            base_rpm = rng.uniform(1.1, 1.7)
        else:
            base_rpm = rng.uniform(0.8, 1.5)

        # --- Semantic signal (destination “pull,” origin “push,” similarity/backhaul, interactions) ---
        dest_effect = 0.45 * d_pca[0] + 0.35 * d_pca[1]
        orig_effect = -0.25 * o_pca[0] - 0.20 * o_pca[1]
        similarity_effect = -0.40 * max(0, sim_od)  # more similar => more efficient => cheaper
        cross_effect = 0.20 * d_pca[0] * o_pca[1]
        semantic_rpm = dest_effect + orig_effect + similarity_effect + cross_effect

        noise_rpm = rng.normal(0, 0.15)
        rpm = max(0.5, base_rpm + semantic_rpm + noise_rpm)
        total_rate = miles * rpm

        rows.append({
            'origin': o,
            'dest': d,
            'miles': miles,
            'sim_od': sim_od,
            'dest_pc1': d_pca[0], 'dest_pc2': d_pca[1], 'dest_pc3': d_pca[2],
            'orig_pc1': o_pca[0], 'orig_pc2': o_pca[1], 'orig_pc3': o_pca[2],
            'rpm': rpm,
            'total_rate': total_rate
        })
    df = pd.DataFrame(rows)

    # ------------- Out-of-sample split (HOLD OUT destination cities) -------------
    # This tests true generalization to unseen destinations.
    dests = df["dest"].unique()
    held = set(np.random.RandomState(42).choice(dests, size=max(16, int(len(dests)*0.50)), replace=False))
    te = df["dest"].isin(held).values
    tr = ~te
    print(f"Train samples: {tr.sum():,} | Test samples: {te.sum():,}")
    print(f"Held-out destinations: {len(held)} cities")

    y_tr = df.loc[tr, "rpm"].values.astype(np.float32)
    y_te = df.loc[te, "rpm"].values.astype(np.float32)

    # ------------- Baseline Features (distance only) -------------
    # Add simple distance bins to allow some non-linearity without trees.
    df['dist_bin_short'] = (df['miles'] < 300).astype(float)
    df['dist_bin_mid'] = ((df['miles'] >= 300) & (df['miles'] < 800)).astype(float)
    df['dist_bin_long'] = ((df['miles'] >= 800) & (df['miles'] < 1500)).astype(float)

    def build_dist_matrix(mask):
        miles = df.loc[mask, "miles"].values / 1000.0
        return np.c_[
            miles,
            miles ** 2,
            df.loc[mask, "dist_bin_short"].values,
            df.loc[mask, "dist_bin_mid"].values,
            df.loc[mask, "dist_bin_long"].values
        ].astype(np.float32)

    Xb_tr = build_dist_matrix(tr)
    Xb_te = build_dist_matrix(te)

    # ------------- Name ID Features (memorizer) -------------
    enc = safe_one_hot_encoder()
    enc.fit(df.loc[tr, ["origin", "dest"]])
    Xid_tr = enc.transform(df.loc[tr, ["origin", "dest"]]).astype(np.float32)
    Xid_te = enc.transform(df.loc[te, ["origin", "dest"]]).astype(np.float32)
    Xname_tr = np.c_[Xb_tr, Xid_tr]
    Xname_te = np.c_[Xb_te, Xid_te]

    # ------------- Embedding Features (semantic generalizer) -------------
    def build_emb_matrix(mask):
        return np.c_[
            build_dist_matrix(mask),
            df.loc[mask, "sim_od"].values,
            df.loc[mask, "dest_pc1"].values,
            df.loc[mask, "dest_pc2"].values,
            df.loc[mask, "dest_pc3"].values,
            df.loc[mask, "orig_pc1"].values,
            df.loc[mask, "orig_pc2"].values,
            df.loc[mask, "orig_pc3"].values,
            np.maximum(0, df.loc[mask, "sim_od"].values),  # ReLU(similarity)
            df.loc[mask, "dest_pc1"].values * df.loc[mask, "orig_pc2"].values  # simple interaction
        ].astype(np.float32)

    Xe_tr = build_emb_matrix(tr)
    Xe_te = build_emb_matrix(te)

    # ------------- Train Ridge Regressors (with CV over alpha) -------------
    alphas = np.logspace(-2, 3, 13)  # 0.01 ... 1000
    base = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas))
    name = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas))
    emb = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas))

    base.fit(Xb_tr, y_tr)
    name.fit(Xname_tr, y_tr)
    emb.fit(Xe_tr, y_tr)

    yhat_b = base.predict(Xb_te)
    yhat_n = name.predict(Xname_te)
    yhat_e = emb.predict(Xe_te)

    r2_b, rmse_b = r2_score(y_te, yhat_b), rmse(y_te, yhat_b)
    r2_n, rmse_n = r2_score(y_te, yhat_n), rmse(y_te, yhat_n)
    r2_e, rmse_e = r2_score(y_te, yhat_e), rmse(y_te, yhat_e)

    # ------------- Write results -------------
    with open(save_metrics, "w") as f:
        f.write("TASK: Predict rate per mile (RPM)\n")
        f.write("Data generation: PCA components from embeddings drive semantic variance\n")
        f.write("OOS split: held-out destination cities only appear in TEST\n")
        f.write(f"Held-out dests: {sorted(list(held))}\n\n")
        f.write(f"Baseline (distance only) -> R²: {r2_b:.3f} | RMSE: ${rmse_b:.3f}/mile\n")
        f.write(f"Name-IDs (one-hot O/D) -> R²: {r2_n:.3f} | RMSE: ${rmse_n:.3f}/mile\n")
        f.write(f"Embeddings (dist + PCA + sim) -> R²: {r2_e:.3f} | RMSE: ${rmse_e:.3f}/mile\n")
        f.write(f"\nΔ Embeddings vs Baseline: ΔR²=+{(r2_e - r2_b):.3f} | ΔRMSE=-${(rmse_b - rmse_e):.3f}/mile\n")
        f.write(f"Δ Embeddings vs Name-IDs: ΔR²=+{(r2_e - r2_n):.3f} | ΔRMSE=-${(rmse_n - rmse_e):.3f}/mile\n")

    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Baseline -> R²: {r2_b:.3f} | RMSE: ${rmse_b:.3f}/mile")
    print(f"Name-IDs -> R²: {r2_n:.3f} | RMSE: ${rmse_n:.3f}/mile")
    print(f"Embeddings -> R²: {r2_e:.3f} | RMSE: ${rmse_e:.3f}/mile")
    print(f"\nΔ Embeddings vs Baseline: ΔR²=+{(r2_e - r2_b):.3f} | ΔRMSE=-${(rmse_b - rmse_e):.3f}/mile")
    if r2_b < 1.0:
        print(f"Improvement: {((r2_e - r2_b) / (1 - r2_b) * 100):.1f}% of remaining variance explained")
    print("=" * 60)
    print(f"Saved metrics to {save_metrics}")

    # ------------- Plots (Pred vs Actual) -------------
    def scatter(y_true, y_pred, r2, title, path):
        plt.figure(figsize=(6, 6))
        plt.scatter(y_true, y_pred, alpha=0.5, s=20)
        lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
        plt.plot(lims, lims, 'r--', alpha=0.8, linewidth=2)
        plt.xlabel("Actual RPM ($/mile)", fontsize=11)
        plt.ylabel("Predicted RPM ($/mile)", fontsize=11)
        plt.title(f"{title}\nR² = {r2:.3f}", fontsize=12)
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig(path, dpi=180)
        plt.close()

    scatter(y_te, yhat_b, r2_b, "Baseline (distance only)", "pred_vs_actual_baseline.png")
    scatter(y_te, yhat_n, r2_n, "Name-IDs (memorizer)", "pred_vs_actual_names.png")
    scatter(y_te, yhat_e, r2_e, "Embeddings (semantic features)", "pred_vs_actual_embeddings.png")
    print("Saved scatter plots: pred_vs_actual_*.png")

# %% [markdown]
# ## 6) Run the Tutorial
# Execute these cells to generate the maps and the modeling results.
# - `map_geo_clusters.png`
# - `map_embedding_clusters.png`
# - `rates_oos_metrics.txt`
# - `pred_vs_actual_baseline.png`, `pred_vs_actual_names.png`, `pred_vs_actual_embeddings.png`
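#
# A minimal way to run everything from a shell (assuming you saved this file as,
# say, `embeddings_lane_rates.py`; the filename is arbitrary):
#
#     python embeddings_lane_rates.py
#
# Editors that understand `# %%` cell markers (e.g., VS Code's Python extension
# or Spyder) can also run it cell by cell.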

# %%
if __name__ == "__main__":
    cluster_by_geography(k=7, save_path="map_geo_clusters.png")
    cluster_by_embeddings(k=7, save_path="map_embedding_clusters.png")
    lane_demo(save_metrics="rates_oos_metrics.txt")