Synthetic data experiment to show how embeddings might improve transportation rate predictions

# %% [markdown]
# # Why AI Thinks Phoenix and Miami Belong Together — Tutorial Notebook
#
# This notebook-style script walks through:
# 1) Clustering US cities by **geography** (lat/lon) vs **meaning** (embeddings)
# 2) Building a **synthetic lane-rate** dataset where semantic city characteristics
#    (derived from embeddings) actually drive part of the rate variance
# 3) Training three models to predict rate-per-mile (RPM):
#    - Baseline (Distance-only)
#    - Name IDs (one-hot origin/destination) — "memorizer"
#    - Distance + Embeddings (semantic features)
#
# It’s written as a tutorial: short chunks, heavy comments, and conservative dependencies.
# If you have an OPENAI_API_KEY, we’ll use OpenAI embeddings; otherwise we fall back to
# a deterministic hashing embedding that preserves the *workflow* (but not the same semantics).

# %% [markdown]
# ## 0) Imports & Configuration

# %%
import os, hashlib, math, random, pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# Regressors & Pipelines
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Reproducibility
random.seed(42)
np.random.seed(42)

# Embedding model name (only used if you have an API key set)
EMB_MODEL = os.getenv("EMB_MODEL", "text-embedding-3-large")
EMB_CACHE_PATH = pathlib.Path("city_embeddings_cache.npz")

# Matplotlib figure settings (higher DPI for crisper inline/saved plots)
plt.rcParams["figure.dpi"] = 130
# %% [markdown]
# ## 1) City List (with coordinates)
# Feel free to add/remove cities. If you change the list, delete the cache file
# `city_embeddings_cache.npz` to refresh embeddings.

# %%
CITIES = [
    ("New York, NY", 40.7128, -74.0060),
    ("Boston, MA", 42.3601, -71.0589),
    ("Philadelphia, PA", 39.9526, -75.1652),
    ("Washington, DC", 38.9072, -77.0369),
    ("Charlotte, NC", 35.2271, -80.8431),
    ("Atlanta, GA", 33.7490, -84.3880),
    ("Miami, FL", 25.7617, -80.1918),
    ("Orlando, FL", 28.5383, -81.3792),
    ("Chicago, IL", 41.8781, -87.6298),
    ("Detroit, MI", 42.3314, -83.0458),
    ("Cleveland, OH", 41.4993, -81.6944),
    ("Minneapolis, MN", 44.9778, -93.2650),
    ("St. Louis, MO", 38.6270, -90.1994),
    ("Kansas City, MO", 39.0997, -94.5786),
    ("Omaha, NE", 41.2565, -95.9345),
    ("Indianapolis, IN", 39.7684, -86.1581),
    ("Dallas, TX", 32.7767, -96.7970),
    ("Houston, TX", 29.7604, -95.3698),
    ("Austin, TX", 30.2672, -97.7431),
    ("San Antonio, TX", 29.4241, -98.4936),
    ("Oklahoma City, OK", 35.4676, -97.5164),
    ("Denver, CO", 39.7392, -104.9903),
    ("Salt Lake City, UT", 40.7608, -111.8910),
    ("Phoenix, AZ", 33.4484, -112.0740),
    ("Las Vegas, NV", 36.1699, -115.1398),
    ("Boise, ID", 43.6150, -116.2023),
    ("Los Angeles, CA", 34.0522, -118.2437),
    ("San Diego, CA", 32.7157, -117.1611),
    ("San Francisco, CA", 37.7749, -122.4194),
    ("San Jose, CA", 37.3382, -121.8863),
    ("Sacramento, CA", 38.5816, -121.4944),
    ("Portland, OR", 45.5051, -122.6750),
    ("Seattle, WA", 47.6062, -122.3321),
]
# %% [markdown]
# ## 2) Utility Functions

# %%
def l2_normalize(X: np.ndarray) -> np.ndarray:
    """Row-wise L2 normalize matrix X (safe for zero rows)."""
    n = np.linalg.norm(X, axis=1, keepdims=True)
    n[n == 0] = 1.0
    return X / n


def _openai_embed(batch, model):
    """
    Fetch embeddings for a list of strings from the OpenAI embeddings API.
    Requires OPENAI_API_KEY in your environment.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not set")
    import requests
    url = "https://api.openai.com/v1/embeddings"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    r = requests.post(url, headers=headers, json={"model": model, "input": batch}, timeout=60)
    if r.status_code != 200:
        raise RuntimeError(f"Embedding API error {r.status_code}: {r.text}")
    data = r.json()["data"]
    return [row["embedding"] for row in data]

def hash_embed(names, dim=384):
    """
    Deterministic fallback embeddings using trigram hashing.
    This preserves the pipeline (clustering, PCA, modeling) even offline.
    """
    V = np.zeros((len(names), dim), dtype=np.float32)
    for i, s in enumerate(names):
        s = s.lower()
        for j in range(len(s) - 2):
            tri = s[j:j+3]
            h = int(hashlib.md5(tri.encode()).hexdigest(), 16) % dim
            V[i, h] += 1.0
    return l2_normalize(V)
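
# Optional sanity check (a sketch, not part of the pipeline): the fallback is
# deterministic, so repeated calls on the same strings return identical vectors,
# and each row is unit-length after l2_normalize.
# _demo = hash_embed(["Phoenix, AZ", "Miami, FL"])
# assert np.allclose(_demo, hash_embed(["Phoenix, AZ", "Miami, FL"]))
# assert np.allclose(np.linalg.norm(_demo, axis=1), 1.0)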

def load_cache():
    """Load {name -> vector} mapping from local npz cache, if present."""
    if EMB_CACHE_PATH.exists():
        npz = np.load(EMB_CACHE_PATH, allow_pickle=True)
        names = list(npz["names"])
        vecs = npz["vectors"]
        return {n: vecs[i] for i, n in enumerate(names)}
    return {}


def save_cache(name_to_vec):
    """Save {name -> vector} mapping to local npz cache."""
    names = np.array(list(name_to_vec.keys()), dtype=object)
    vectors = np.vstack([name_to_vec[n] for n in names])
    np.savez_compressed(EMB_CACHE_PATH, names=names, vectors=vectors)

def get_embeddings_with_cache(names, model=EMB_MODEL, allow_fallback=True, prefer_cache_only=False):
    """
    Get embeddings for a list of names with caching and graceful fallback.
    - If OPENAI_API_KEY is set, we call OpenAI for any missing names.
    - If not (or it fails), we use hash_embed() for those missing names.
    """
    names = list(names)
    cache = load_cache()
    missing = [n for n in names if n not in cache]
    if missing and not prefer_cache_only:
        try:
            all_vecs = []
            B = 256
            for i in range(0, len(missing), B):
                chunk = missing[i:i+B]
                all_vecs.extend(_openai_embed(chunk, model))
            for n, v in zip(missing, all_vecs):
                cache[n] = np.array(v, dtype=np.float32)
            print(f"Embeddings fetched from OpenAI for {len(missing)} new names ({len(all_vecs[0])} dims).")
            save_cache(cache)
        except Exception as e:
            if not allow_fallback:
                raise
            print(f"[warn] OpenAI fetch failed: {e}\nUsing fallback (hash) for {len(missing)} names.")
            F = hash_embed(missing)
            for i, n in enumerate(missing):
                cache[n] = F[i]
            save_cache(cache)
    V = np.vstack([cache[n] for n in names]).astype(np.float32)
    return l2_normalize(V)
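
# Example usage (sketch): look up two cities and compare them. Because the rows
# are L2-normalized, a plain dot product is the cosine similarity. The hashing
# fallback is used automatically if no OPENAI_API_KEY is set.
# _vecs = get_embeddings_with_cache(["Phoenix, AZ", "Miami, FL"])
# print("cosine(Phoenix, Miami):", float(_vecs[0] @ _vecs[1]))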

def haversine_miles(lat1, lon1, lat2, lon2):
    """Great-circle distance in miles between two lat/lon points."""
    R_km = 6371.0088
    dphi = math.radians(lat2 - lat1)
    dlamb = math.radians(lon2 - lon1)
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    a = math.sin(dphi/2)**2 + math.cos(phi1)*math.cos(phi2)*math.sin(dlamb/2)**2
    return 0.621371 * (2*R_km*math.asin(math.sqrt(a)))
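
# Quick sanity check (sketch): New York to Los Angeles is roughly 2,450 great-circle
# miles, so the value below should land in that neighborhood.
# print(round(haversine_miles(40.7128, -74.0060, 34.0522, -118.2437)))  # ~2446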

def rmse(y_true, y_pred):
    """Root Mean Squared Error (RMSE)."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def safe_one_hot_encoder():
    """
    Return a OneHotEncoder that works across sklearn versions.
    - sklearn >= 1.2 uses 'sparse_output'
    - older versions use 'sparse'
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Fallback for older scikit-learn
        return OneHotEncoder(handle_unknown="ignore", sparse=False)
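
# Example (sketch): with handle_unknown="ignore", a category unseen at fit time
# encodes to an all-zeros row instead of raising. That is what lets the one-hot
# "memorizer" model below accept held-out destination cities at predict time.
# _enc = safe_one_hot_encoder()
# _enc.fit([["Dallas, TX"], ["Miami, FL"]])
# print(_enc.transform([["Dallas, TX"], ["Boise, ID"]]))  # second row is all zeros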

# %% [markdown]
# ## 3) A. Clustering by Geography (lat/lon)
# Simple sanity check: nearby cities should cluster together on coordinates.

# %%
def cluster_by_geography(k=7, save_path="map_geo_clusters.png"):
    names = [c[0] for c in CITIES]
    lat = np.array([c[1] for c in CITIES])
    lon = np.array([c[2] for c in CITIES])
    X = np.c_[lat, lon]

    km = KMeans(n_clusters=k, n_init=20, random_state=42)
    labels = km.fit_predict(X)

    plt.figure(figsize=(10, 6))
    for lab in sorted(set(labels)):
        idx = labels == lab
        plt.scatter(lon[idx], lat[idx], alpha=0.85, label=f"Cluster {lab}")
        for i in np.where(idx)[0]:
            plt.annotate(names[i].split(",")[0], (lon[i], lat[i]), fontsize=8)
    plt.title("US Cities — Clusters by Geography (k-means on lat/lon)")
    plt.xlabel("Longitude"); plt.ylabel("Latitude")
    plt.legend(loc="best", ncol=2, fontsize=9)
    plt.tight_layout(); plt.savefig(save_path, dpi=180); plt.close()
    print(f"Saved {save_path}")

# %% [markdown]
# ## 4) B. Clustering by Meaning (embeddings)
# We’ll embed the **city names** and then cluster in a reduced semantic space (SVD).
# If you’re offline or have no API key, the fallback hashing embeddings keep the demo runnable.

# %%
def cluster_by_embeddings(k=7, save_path="map_embedding_clusters.png"):
    names = [c[0] for c in CITIES]
    lat = np.array([c[1] for c in CITIES])
    lon = np.array([c[2] for c in CITIES])

    # 1) Get embeddings (OpenAI if available; fallback otherwise)
    V = get_embeddings_with_cache(names)

    # 2) Reduce dimensionality for clustering stability & interpretability
    V2 = TruncatedSVD(n_components=12, random_state=42).fit_transform(V)

    # 3) Cluster in semantic space
    km = KMeans(n_clusters=k, n_init=20, random_state=42)
    labels = km.fit_predict(V2)

    # 4) Plot clusters on the geographic map (to SEE semantic groupings)
    plt.figure(figsize=(10, 6))
    for lab in sorted(set(labels)):
        idx = labels == lab
        plt.scatter(lon[idx], lat[idx], alpha=0.85, label=f"Cluster {lab}")
        for i in np.where(idx)[0]:
            plt.annotate(names[i].split(",")[0], (lon[i], lat[i]), fontsize=8)
    plt.title("US Cities — Clusters by Embeddings (k-means on text vectors)")
    plt.xlabel("Longitude"); plt.ylabel("Latitude")
    plt.legend(loc="best", ncol=2, fontsize=9)
    plt.tight_layout(); plt.savefig(save_path, dpi=180); plt.close()
    print(f"Saved {save_path}")

# %% [markdown]
# ## 5) C. Lane-Rate Simulation + Modeling
# We create **synthetic** lane data where the *true* signal includes:
# - A weak, noisy relationship to distance (miles)
# - A semantic component using PCA of embeddings (e.g., “Sunbelt vs Rust Belt” style axes)
# - A similarity effect (origin/destination semantic cosine similarity)
#
# Then we compare three models on **held-out destination cities**:
# - Distance only
# - Distance + one-hot origin/destination (memorizer)
# - Distance + embedding features (semantic generalization)

# %%
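# For reference, the synthetic generative model used inside lane_demo() is:
#
#     rpm = base_rpm(miles) + dest_effect + orig_effect + similarity_effect
#           + cross_effect + noise
#
# where the semantic terms are linear functions of the origin/destination PCA
# components and of the origin-destination cosine similarity. Because we built
# the data this way, we know the embedding features carry real signal; the
# question is which feature set can recover it on unseen destinations.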
def lane_demo(save_metrics="rates_oos_metrics.txt"):
    # ------------- Lookups -------------
    names = [c[0] for c in CITIES]
    lat = {c[0]: c[1] for c in CITIES}
    lon = {c[0]: c[2] for c in CITIES}

    # ------------- Embeddings & PCA -------------
    # Get normalized embeddings for each city
    V = get_embeddings_with_cache(names)

    # PCA = interpretable semantic axes (PC1/PC2 often align with broad traits)
    pca = PCA(n_components=8, random_state=42)
    V_pca = pca.fit_transform(V)
    name2pca = {n: V_pca[i] for i, n in enumerate(names)}
    name2vec = {n: V[i] for i, n in enumerate(names)}
    print(f"PCA top-3 explained variance ratios: {pca.explained_variance_ratio_[:3]}")

    # ------------- Generate synthetic lanes -------------
    N = 4000
    rows = []
    rng = np.random.RandomState(11)
    for _ in range(N):
        # Random origin/destination pair
        o, d = random.sample(names, 2)
        miles = haversine_miles(lat[o], lon[o], lat[d], lon[d])

        # Semantic features
        o_pca = name2pca[o]
        d_pca = name2pca[d]
        eo = name2vec[o]
        ed = name2vec[d]
        sim_od = float(np.dot(eo, ed))  # cosine since vectors are L2-normalized

        # --- Weak distance signal ---
        # (This keeps "miles matters" visible, but not dominant.)
        if miles < 300:
            base_rpm = rng.uniform(1.7, 2.6)
        elif miles < 800:
            base_rpm = rng.uniform(1.3, 2.0)
        elif miles < 1500:
            base_rpm = rng.uniform(1.1, 1.7)
        else:
            base_rpm = rng.uniform(0.8, 1.5)

        # --- Semantic signal (destination “pull,” origin “push,” similarity/backhaul, interactions) ---
        dest_effect = 0.45 * d_pca[0] + 0.35 * d_pca[1]
        orig_effect = -0.25 * o_pca[0] - 0.20 * o_pca[1]
        similarity_effect = -0.40 * max(0, sim_od)  # more similar => more efficient => cheaper
        cross_effect = 0.20 * d_pca[0] * o_pca[1]
        semantic_rpm = dest_effect + orig_effect + similarity_effect + cross_effect

        noise_rpm = rng.normal(0, 0.15)
        rpm = max(0.5, base_rpm + semantic_rpm + noise_rpm)
        total_rate = miles * rpm

        rows.append({
            'origin': o,
            'dest': d,
            'miles': miles,
            'sim_od': sim_od,
            'dest_pc1': d_pca[0], 'dest_pc2': d_pca[1], 'dest_pc3': d_pca[2],
            'orig_pc1': o_pca[0], 'orig_pc2': o_pca[1], 'orig_pc3': o_pca[2],
            'rpm': rpm,
            'total_rate': total_rate
        })
    df = pd.DataFrame(rows)

    # ------------- Out-of-sample split (HOLD OUT destination cities) -------------
    # This tests true generalization to unseen destinations.
    dests = df["dest"].unique()
    held = set(np.random.RandomState(42).choice(dests, size=max(16, int(len(dests)*0.50)), replace=False))
    te = df["dest"].isin(held).values
    tr = ~te
    print(f"Train samples: {tr.sum():,} | Test samples: {te.sum():,}")
    print(f"Held-out destinations: {len(held)} cities")

    y_tr = df.loc[tr, "rpm"].values.astype(np.float32)
    y_te = df.loc[te, "rpm"].values.astype(np.float32)

    # ------------- Baseline Features (distance only) -------------
    # Add simple distance bins to allow some non-linearity without trees.
    df['dist_bin_short'] = (df['miles'] < 300).astype(float)
    df['dist_bin_mid'] = ((df['miles'] >= 300) & (df['miles'] < 800)).astype(float)
    df['dist_bin_long'] = ((df['miles'] >= 800) & (df['miles'] < 1500)).astype(float)

    def build_dist_matrix(mask):
        miles = df.loc[mask, "miles"].values / 1000.0
        return np.c_[
            miles,
            miles ** 2,
            df.loc[mask, "dist_bin_short"].values,
            df.loc[mask, "dist_bin_mid"].values,
            df.loc[mask, "dist_bin_long"].values
        ].astype(np.float32)

    Xb_tr = build_dist_matrix(tr)
    Xb_te = build_dist_matrix(te)

    # ------------- Name ID Features (memorizer) -------------
    enc = safe_one_hot_encoder()
    enc.fit(df.loc[tr, ["origin", "dest"]])
    Xid_tr = enc.transform(df.loc[tr, ["origin", "dest"]]).astype(np.float32)
    Xid_te = enc.transform(df.loc[te, ["origin", "dest"]]).astype(np.float32)
    Xname_tr = np.c_[Xb_tr, Xid_tr]
    Xname_te = np.c_[Xb_te, Xid_te]

    # ------------- Embedding Features (semantic generalizer) -------------
    def build_emb_matrix(mask):
        return np.c_[
            build_dist_matrix(mask),
            df.loc[mask, "sim_od"].values,
            df.loc[mask, "dest_pc1"].values,
            df.loc[mask, "dest_pc2"].values,
            df.loc[mask, "dest_pc3"].values,
            df.loc[mask, "orig_pc1"].values,
            df.loc[mask, "orig_pc2"].values,
            df.loc[mask, "orig_pc3"].values,
            np.maximum(0, df.loc[mask, "sim_od"].values),  # ReLU(similarity)
            df.loc[mask, "dest_pc1"].values * df.loc[mask, "orig_pc2"].values  # simple interaction
        ].astype(np.float32)

    Xe_tr = build_emb_matrix(tr)
    Xe_te = build_emb_matrix(te)

    # ------------- Train Ridge Regressors (with CV over alpha) -------------
    alphas = np.logspace(-2, 3, 13)  # 0.01 ... 1000
    base = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas))
    name = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas))
    emb = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas))

    base.fit(Xb_tr, y_tr)
    name.fit(Xname_tr, y_tr)
    emb.fit(Xe_tr, y_tr)

    yhat_b = base.predict(Xb_te)
    yhat_n = name.predict(Xname_te)
    yhat_e = emb.predict(Xe_te)

    r2_b, rmse_b = r2_score(y_te, yhat_b), rmse(y_te, yhat_b)
    r2_n, rmse_n = r2_score(y_te, yhat_n), rmse(y_te, yhat_n)
    r2_e, rmse_e = r2_score(y_te, yhat_e), rmse(y_te, yhat_e)

    # ------------- Write results -------------
    with open(save_metrics, "w") as f:
        f.write("TASK: Predict rate per mile (RPM)\n")
        f.write("Data generation: PCA components from embeddings drive semantic variance\n")
        f.write("OOS split: held-out destination cities only appear in TEST\n")
        f.write(f"Held-out dests: {sorted(list(held))}\n\n")
        f.write(f"Baseline (distance only) -> R²: {r2_b:.3f} | RMSE: ${rmse_b:.3f}/mile\n")
        f.write(f"Name-IDs (one-hot O/D) -> R²: {r2_n:.3f} | RMSE: ${rmse_n:.3f}/mile\n")
        f.write(f"Embeddings (dist + PCA + sim) -> R²: {r2_e:.3f} | RMSE: ${rmse_e:.3f}/mile\n")
        f.write(f"\nΔ Embeddings vs Baseline: ΔR²=+{(r2_e - r2_b):.3f} | ΔRMSE=-${(rmse_b - rmse_e):.3f}/mile\n")
        f.write(f"Δ Embeddings vs Name-IDs: ΔR²=+{(r2_e - r2_n):.3f} | ΔRMSE=-${(rmse_n - rmse_e):.3f}/mile\n")

    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Baseline -> R²: {r2_b:.3f} | RMSE: ${rmse_b:.3f}/mile")
    print(f"Name-IDs -> R²: {r2_n:.3f} | RMSE: ${rmse_n:.3f}/mile")
    print(f"Embeddings -> R²: {r2_e:.3f} | RMSE: ${rmse_e:.3f}/mile")
    print(f"\nΔ Embeddings vs Baseline: ΔR²=+{(r2_e - r2_b):.3f} | ΔRMSE=-${(rmse_b - rmse_e):.3f}/mile")
    if r2_b < 1.0:
        print(f"Improvement: {((r2_e - r2_b) / (1 - r2_b) * 100):.1f}% of remaining variance explained")
    print("=" * 60)
    print(f"Saved metrics to {save_metrics}")

    # ------------- Plots (Pred vs Actual) -------------
    def scatter(y_true, y_pred, r2, title, path):
        plt.figure(figsize=(6, 6))
        plt.scatter(y_true, y_pred, alpha=0.5, s=20)
        lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
        plt.plot(lims, lims, 'r--', alpha=0.8, linewidth=2)
        plt.xlabel("Actual RPM ($/mile)", fontsize=11)
        plt.ylabel("Predicted RPM ($/mile)", fontsize=11)
        plt.title(f"{title}\nR² = {r2:.3f}", fontsize=12)
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig(path, dpi=180)
        plt.close()

    scatter(y_te, yhat_b, r2_b, "Baseline (distance only)", "pred_vs_actual_baseline.png")
    scatter(y_te, yhat_n, r2_n, "Name-IDs (memorizer)", "pred_vs_actual_names.png")
    scatter(y_te, yhat_e, r2_e, "Embeddings (semantic features)", "pred_vs_actual_embeddings.png")
    print("Saved scatter plots: pred_vs_actual_*.png")

# %% [markdown]
# ## 6) Run the Tutorial
# Execute these cells to generate the maps and the modeling results.
# - `map_geo_clusters.png`
# - `map_embedding_clusters.png`
# - `rates_oos_metrics.txt`
# - `pred_vs_actual_baseline.png`, `pred_vs_actual_names.png`, `pred_vs_actual_embeddings.png`
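#
# A minimal way to run everything from a shell (assuming you saved this file as,
# say, `embeddings_lane_rates.py`; the filename is arbitrary):
#
#     python embeddings_lane_rates.py
#
# Editors that understand `# %%` cell markers (e.g., VS Code's Python extension
# or Spyder) can also run it cell by cell.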

# %%
if __name__ == "__main__":
    cluster_by_geography(k=7, save_path="map_geo_clusters.png")
    cluster_by_embeddings(k=7, save_path="map_embedding_clusters.png")
    lane_demo(save_metrics="rates_oos_metrics.txt")