Applies a camera perspective rotation to input.mp4.
ffmpeg -i input.mp4 -vf "pad=1200:2200:(ow-iw)/2:(oh-ih)/2:black,perspective=x0=150:y0=150:x1=1050:y1=100:x2=100:y2=2050:x3=1100:y3=2100:interpolation=linear,scale=886:1920:flags=lanczos" -c:a copy -y output.mp4

Great for making screen recordings look more cinematic.
You already have the static “cinematic tilt” look. What you want next is virtual camera choreography:
- Detect where motion is happening on the screen over time (centroid of pixel-change energy).
- Build a smooth camera path (pan + optional push-in/zoom) that leans toward that motion.
- Feed that path into ffmpeg as time-coded commands so the crop window (your “camera”) moves smoothly, then apply your perspective tilt + final scale.
Below is a working pipeline that does exactly that (and it supports “preemptive” lead/anticipation).
A. Compute a moving crop window (pan tracking): crop=w:h:x:y
B. Add your cinematic perspective tilt
C. Scale to your final vertical output
So the camera “moves” by changing crop.x / crop.y over time.
- Read frames
- Compute absdiff(prev_gray, gray) → motion mask
- Weighted centroid → motion point (cx, cy)
- Smooth it (EMA)
- Lead it (shift the motion target earlier in time)
- Convert (cx, cy) to crop top-left (x, y)
- Emit an ffmpeg sendcmd file that updates the crop's x/y frequently (ex: 10 updates/sec)
#!/usr/bin/env python3
"""
Motion-driven virtual camera for screen recordings.
Outputs:
- sendcmd script to drive ffmpeg crop x/y over time (smooth pan)
- ffmpeg command template you can run
Dependencies:
pip install opencv-python numpy
"""
from __future__ import annotations
import argparse
import math
import os
from dataclasses import dataclass
from typing import List, Tuple
import cv2
import numpy as np
@dataclass
class Sample:
t: float
cx: float
cy: float
def ema_smooth(samples: List[Sample], alpha: float) -> List[Sample]:
if not samples:
return samples
out = [samples[0]]
sx, sy = samples[0].cx, samples[0].cy
for s in samples[1:]:
sx = alpha * s.cx + (1.0 - alpha) * sx
sy = alpha * s.cy + (1.0 - alpha) * sy
out.append(Sample(t=s.t, cx=sx, cy=sy))
return out
def lead_shift(samples: List[Sample], lead_seconds: float) -> List[Sample]:
"""Preemptive camera: use motion target from the future at current time."""
if not samples or lead_seconds <= 0:
return samples
ts = np.array([s.t for s in samples], dtype=np.float64)
xs = np.array([s.cx for s in samples], dtype=np.float64)
ys = np.array([s.cy for s in samples], dtype=np.float64)
out: List[Sample] = []
for s in samples:
t_future = s.t + lead_seconds
if t_future <= ts[0]:
xf, yf = xs[0], ys[0]
elif t_future >= ts[-1]:
xf, yf = xs[-1], ys[-1]
else:
xf = np.interp(t_future, ts, xs)
yf = np.interp(t_future, ts, ys)
out.append(Sample(t=s.t, cx=float(xf), cy=float(yf)))
return out
def detect_motion_centroids(
video_path: str,
sample_hz: float,
diff_threshold: int,
min_motion_pixels: int,
blur_ksize: int,
) -> Tuple[List[Sample], int, int, float]:
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
raise RuntimeError(f"Failed to open video: {video_path}")
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) or 0)
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) or 0)
step_frames = max(1, int(round(fps / sample_hz)))
prev_gray = None
samples: List[Sample] = []
frame_idx = 0
while True:
ok, frame = cap.read()
if not ok:
break
if frame_idx % step_frames != 0:
frame_idx += 1
continue
t = frame_idx / fps
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
if blur_ksize > 0:
k = blur_ksize if blur_ksize % 2 == 1 else blur_ksize + 1
gray = cv2.GaussianBlur(gray, (k, k), 0)
if prev_gray is None:
prev_gray = gray
samples.append(Sample(t=t, cx=w / 2.0, cy=h / 2.0))
frame_idx += 1
continue
diff = cv2.absdiff(prev_gray, gray)
prev_gray = gray
_, mask = cv2.threshold(diff, diff_threshold, 255, cv2.THRESH_BINARY)
# Optional: clean tiny speckles a bit
mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, np.ones((3, 3), np.uint8), iterations=1)
motion_pixels = int(cv2.countNonZero(mask))
if motion_pixels < min_motion_pixels:
# Not enough motion → hold center
samples.append(Sample(t=t, cx=w / 2.0, cy=h / 2.0))
frame_idx += 1
continue
# Weighted centroid by intensity of diff (not just binary)
diff_f = diff.astype(np.float32)
mask_f = (mask.astype(np.float32) / 255.0)
energy = diff_f * mask_f
total = float(np.sum(energy))
if total <= 1e-6:
samples.append(Sample(t=t, cx=w / 2.0, cy=h / 2.0))
frame_idx += 1
continue
ys, xs = np.indices(energy.shape)
cx = float(np.sum(xs * energy) / total)
cy = float(np.sum(ys * energy) / total)
samples.append(Sample(t=t, cx=cx, cy=cy))
frame_idx += 1
cap.release()
return samples, w, h, float(fps)
def clamp(v: float, lo: float, hi: float) -> float:
return max(lo, min(hi, v))
def emit_sendcmd_for_crop(
samples: List[Sample],
out_cmd_path: str,
crop_w: int,
crop_h: int,
src_w: int,
src_h: int,
deadzone_px: int,
max_step_px: int,
) -> None:
"""
Writes a dense command stream that updates crop@cam x/y over time.
deadzone_px: ignore tiny movements to avoid jitter
max_step_px: clamp per-sample movement (limits sudden jumps)
"""
if not samples:
raise RuntimeError("No samples to write.")
x_prev = (src_w - crop_w) / 2.0
y_prev = (src_h - crop_h) / 2.0
lines: List[str] = []
for s in samples:
# Desired crop top-left so that crop center tracks motion centroid
x_des = s.cx - crop_w / 2.0
y_des = s.cy - crop_h / 2.0
x_des = clamp(x_des, 0.0, float(src_w - crop_w))
y_des = clamp(y_des, 0.0, float(src_h - crop_h))
# Deadzone to reduce micro-jitter
if abs(x_des - x_prev) < deadzone_px:
x_des = x_prev
if abs(y_des - y_prev) < deadzone_px:
y_des = y_prev
# Clamp movement per update (also reduces “snap”)
dx = clamp(x_des - x_prev, -max_step_px, max_step_px)
dy = clamp(y_des - y_prev, -max_step_px, max_step_px)
x_now = x_prev + dx
y_now = y_prev + dy
x_prev, y_prev = x_now, y_now
        # sendcmd file syntax: "<time> <target> <command> <arg>", with commands for the
        # same interval separated by commas and each interval terminated by ';'.
        # We address the crop filter instance by name: crop@cam.
        # Use integers to keep ffmpeg happy.
        x_i = int(round(x_now))
        y_i = int(round(y_now))
        lines.append(f"{s.t:.3f} crop@cam x {x_i}, crop@cam y {y_i};")
os.makedirs(os.path.dirname(out_cmd_path) or ".", exist_ok=True)
with open(out_cmd_path, "w", encoding="utf-8") as f:
f.write("\n".join(lines) + "\n")
def main() -> None:
ap = argparse.ArgumentParser()
ap.add_argument("--in", dest="inp", required=True, help="Input video path")
ap.add_argument("--out", dest="outp", required=True, help="Output video path")
ap.add_argument("--target-w", type=int, default=886, help="Final output width")
ap.add_argument("--target-h", type=int, default=1920, help="Final output height")
ap.add_argument("--crop-w", type=int, default=1000, help="Virtual camera crop width (source space)")
ap.add_argument("--crop-h", type=int, default=1800, help="Virtual camera crop height (source space)")
ap.add_argument("--sample-hz", type=float, default=10.0, help="How often to update camera (Hz)")
ap.add_argument("--diff-threshold", type=int, default=18, help="Pixel diff threshold")
ap.add_argument("--min-motion-pixels", type=int, default=900, help="Ignore motion smaller than this")
ap.add_argument("--blur-ksize", type=int, default=5, help="Gaussian blur kernel for stability (odd int)")
ap.add_argument("--ema-alpha", type=float, default=0.25, help="EMA smoothing alpha (0..1)")
ap.add_argument("--lead", type=float, default=0.35, help="Preemptive lead seconds")
ap.add_argument("--deadzone", type=int, default=6, help="Deadzone pixels (jitter filter)")
ap.add_argument("--max-step", type=int, default=60, help="Max pixels per update step")
ap.add_argument("--cmd", default="artifacts/crop_sendcmd.txt", help="Output sendcmd file path")
args = ap.parse_args()
samples, src_w, src_h, fps = detect_motion_centroids(
video_path=args.inp,
sample_hz=args.sample_hz,
diff_threshold=args.diff_threshold,
min_motion_pixels=args.min_motion_pixels,
blur_ksize=args.blur_ksize,
)
samples = ema_smooth(samples, alpha=args.ema_alpha)
samples = lead_shift(samples, lead_seconds=args.lead)
samples = ema_smooth(samples, alpha=args.ema_alpha) # smooth again after lead
# Ensure crop fits
crop_w = min(args.crop_w, src_w)
crop_h = min(args.crop_h, src_h)
emit_sendcmd_for_crop(
samples=samples,
out_cmd_path=args.cmd,
crop_w=crop_w,
crop_h=crop_h,
src_w=src_w,
src_h=src_h,
deadzone_px=args.deadzone,
max_step_px=args.max_step,
)
# Your existing perspective tilt, adapted to run AFTER the moving crop.
# NOTE: pad creates extra space for perspective warping; adjust pad to taste.
ffmpeg_cmd = f"""ffmpeg -y -i "{args.inp}" -vf \\
"sendcmd=f={args.cmd},\\
crop@cam=w={crop_w}:h={crop_h}:x=0:y=0,\\
pad=1200:2200:(ow-iw)/2:(oh-ih)/2:black,\\
perspective=x0=150:y0=150:x1=1050:y1=100:x2=100:y2=2050:x3=1100:y3=2100:interpolation=linear,\\
scale={args.target_w}:{args.target_h}:flags=lanczos" \\
-c:a copy "{args.outp}" """
print("\n--- Generated sendcmd file ---")
print(args.cmd)
print("\n--- Run this ffmpeg command ---")
print(ffmpeg_cmd)
if __name__ == "__main__":
    main()

Run it like this:

python3 /scripts/motion_focus.py \
--in input.mp4 \
--out output_motion_focus.mp4 \
--crop-w 1000 --crop-h 1800 \
--sample-hz 12 \
--ema-alpha 0.22 \
--lead 0.40 \
--diff-threshold 18 \
--min-motion-pixels 900

That will:
- generate artifacts/crop_sendcmd.txt
- print the exact ffmpeg command to run
- output a video where the “camera” pans smoothly toward motion, with a slight lead
- --lead 0.25 → 0.60: bigger = earlier anticipation (too big feels psychic / wrong)
- --ema-alpha 0.15 → 0.35
- --deadzone 4 → 12
- --max-step 40 → 120 (raise if you want faster whips)
- --diff-threshold up = less sensitive (good for noisy recordings)
- --min-motion-pixels up = ignore tiny cursor flicker
Right now this is pan only (crop window moves).
If you want push-in to motion, we’ll add dynamic crop-w/crop-h over time (zoom) based on motion energy (or “importance”), and write those as additional sendcmd updates (crop@cam w ... and h ...) — totally doable, and it looks sick when done right.
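If you go that route, here is a minimal sketch of what the zoom emitter could look like, assuming you also compute a normalized per-sample motion energy in the detector (the ZoomSample type and emit_zoom_commands helper below are hypothetical additions, not part of the script above):

```python
# Hypothetical sketch: emit sendcmd lines that also "push in" (shrink the crop)
# when motion energy is high. Assumes a per-sample energy in [0, 1] computed
# alongside the centroid (e.g. normalized sum of the diff image).
from dataclasses import dataclass
from typing import List


@dataclass
class ZoomSample:
    t: float       # timestamp in seconds
    energy: float  # normalized motion energy, 0.0 (idle) .. 1.0 (busy)


def emit_zoom_commands(
    samples: List[ZoomSample],
    base_w: int,
    base_h: int,            # widest crop (no push-in)
    max_zoom: float = 1.35, # push in up to ~35%
) -> List[str]:
    """Return sendcmd lines that shrink crop@cam w/h as motion energy rises."""
    lines: List[str] = []
    for s in samples:
        # More energy -> higher zoom -> smaller crop window -> tighter framing.
        zoom = 1.0 + (max_zoom - 1.0) * max(0.0, min(1.0, s.energy))
        w = int(round(base_w / zoom)) // 2 * 2  # keep dimensions even for encoders
        h = int(round(base_h / zoom)) // 2 * 2
        # NOTE: downstream filters (pad/perspective/scale) must tolerate the
        # changing crop size; test the full chain before relying on this.
        lines.append(f"{s.t:.3f} crop@cam w {w}, crop@cam h {h};")
    return lines


if __name__ == "__main__":
    demo = [ZoomSample(0.0, 0.0), ZoomSample(1.0, 0.8), ZoomSample(2.0, 0.2)]
    print("\n".join(emit_zoom_commands(demo, base_w=1000, base_h=1800)))
```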
If you tell me which motion should “count”:
- mouse cursor movement?
- typed text region changes?
- window focus changes?
- scroll movement?
- only UI animations?
…then we can bias the centroid toward:
- cursor detection (fast + reliable)
- or optical flow magnitude (more accurate but heavier)
- or saliency map (very cinematic, but more compute)
If you want, paste a short clip description (resolution + typical content), and I’ll give you a tuned default preset (crop sizes + lead + smoothing) that matches your vertical 9:16 output style.
Currently I'm doing it manually with OBS and cutting to different angles, maybe like…
Down the middle (left or right of center, and loop up from bottom and down from top when going up/down)… but I can also set these by ffmpeg command, which I've done… having them all written out with timing between them, and making them deterministically inferred by capturing motion on the screen… for instance, when I'm typing on screen and there is a cursor that's also moving, it bounds those two motions in a box, which will slow push into focus with the camera; when something outside the bounds happens, it will unfocus and snap back to the recent position.
You’re describing a stateful “virtual camera director”:
- infer an ROI from screen motion (typing + caret + UI changes)
- push in slowly while the ROI is stable
- when “new action” happens outside the ROI, release (snap / ease out) back to a recent wide framing (or last anchor)
- all deterministic, reproducible, driven only by the capture
That’s absolutely doable. The trick is: don’t try to “edit” like OBS first. Instead, formalize it as:
- Frame → features (motion boxes, caret box, “outside event” box)
- Features → ROI tracking (a stable box with hysteresis)
- ROI tracking → timeline segments (FOCUS / HOLD / RELEASE)
- Segments → FFmpeg crop expressions (deterministic render)
Below is a concrete implementation blueprint that matches what you described.
- Typing mode: caret + nearby text motion defines ROI
- Focus ramp: ROI stable → slow push-in (ease-in)
- Hold: keep it tight, allow micro-drift smoothing
- Break: any significant motion outside ROI → release / widen (snap or quick ease)
- Return target: “recent position” = last wide anchor, or last stable ROI before focus
This is a state machine, not just a filter.
- take frame diff (grayscale)
- threshold + morphology
- get connected components
- filter tiny blobs
- union → motion bbox
The caret is a thin vertical bright line that blinks and “jumps” while typing.
Practical detection (works surprisingly well on phone recordings + screen captures):
- run a vertical-line detector in areas where text exists (or globally first)
- use a high aspect ratio filter (height >> width)
- optionally add a blink prior: it appears/disappears at ~1 Hz when idle, but during typing it moves frequently
Outcome: caret bbox is small but high-confidence when present.
When caret present:
- ROI = union(caret bbox, motion bbox) expanded by padding
- clamp ROI to sane min/max zoom sizes so it stays cinematic, not jittery
When caret absent:
- ROI = motion bbox expanded + smoothed
States:
- WIDE: default framing (or last anchor)
- FOCUS_RAMP: easing toward ROI zoom
- FOCUSED_HOLD: keep tight while ROI is stable
- RELEASE: fast ease back to wide anchor (or last good wide)
Key rules you explicitly said:
- “bounds those two motions in a box” → ROI tracker uses union(caret,motion)
- “slow push into focus” → FOCUS_RAMP uses easing and max-velocity constraints
- “when something outside bounds happens it will unfocus and snap back” → OUTSIDE_EVENT triggers RELEASE
Compute:
- outside_motion_area = area(motion_bbox - roi_bbox)
- or, more robust: any blob centroid outside the ROI by a margin
Trigger release when:
- outside area > threshold for M frames (hysteresis)
- OR a new motion blob appears far away (like opening a menu top-left)
You do not want per-frame cropping in FFmpeg if you can avoid it. Instead, emit segments like:
[
{"t0":0.00,"t1":1.40,"mode":"wide","roi":[0,0,1080,1920]},
{"t0":1.40,"t1":2.20,"mode":"focus_in","roi":[220,740,640,420]},
{"t0":2.20,"t1":5.60,"mode":"hold","roi":[240,760,620,400]},
{"t0":5.60,"t1":6.10,"mode":"release","roi":[0,0,1080,1920]}
]

Each segment gets:
- ROI (x,y,w,h) in source pixels
- mode (wide/focus/hold/release)
- easing params
This becomes your deterministic “edit decision list.”
Generate piecewise expressions for crop=w:h:x:y using nested if(between(t,...), ...).
That stays:
- single-pass
- deterministic
- portable
Below is a full “starter kit” layout that matches your style (file headers, ready to paste). It’s designed so you can swap in better detection later without changing the pipeline.
/tools/smartcrop/extract_segments.py
import json
import math
from dataclasses import dataclass, asdict
from typing import List, Tuple, Optional
import cv2
import numpy as np
@dataclass
class Segment:
t0: float
t1: float
mode: str # wide | focus_in | hold | release
roi: Tuple[int, int, int, int] # x,y,w,h
reason: str = ""
def _clamp(v, lo, hi):
return max(lo, min(hi, v))
def _bbox_union(a, b):
ax, ay, aw, ah = a
bx, by, bw, bh = b
x0 = min(ax, bx)
y0 = min(ay, by)
x1 = max(ax + aw, bx + bw)
y1 = max(ay + ah, by + bh)
return (x0, y0, x1 - x0, y1 - y0)
def _expand_bbox(b, pad, W, H):
x, y, w, h = b
x0 = _clamp(x - pad, 0, W - 1)
y0 = _clamp(y - pad, 0, H - 1)
x1 = _clamp(x + w + pad, 1, W)
y1 = _clamp(y + h + pad, 1, H)
return (x0, y0, x1 - x0, y1 - y0)
def _motion_bbox(prev_gray, gray, min_area=250) -> Optional[Tuple[int,int,int,int]]:
diff = cv2.absdiff(prev_gray, gray)
# Emphasize UI motion
diff = cv2.GaussianBlur(diff, (5, 5), 0)
_, th = cv2.threshold(diff, 22, 255, cv2.THRESH_BINARY)
th = cv2.morphologyEx(th, cv2.MORPH_OPEN, np.ones((3,3), np.uint8), iterations=1)
th = cv2.morphologyEx(th, cv2.MORPH_DILATE, np.ones((5,5), np.uint8), iterations=2)
contours, _ = cv2.findContours(th, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
boxes = []
for c in contours:
x, y, w, h = cv2.boundingRect(c)
if w * h >= min_area:
boxes.append((x, y, w, h))
if not boxes:
return None
# union
b = boxes[0]
for bb in boxes[1:]:
b = _bbox_union(b, bb)
return b
def _caret_bbox(gray, min_h=25, max_w=12) -> Optional[Tuple[int,int,int,int]]:
"""
Heuristic caret detector:
- Find thin vertical bright lines.
Works best if video has enough resolution and the UI caret is visible.
"""
# Edge emphasis
edges = cv2.Canny(gray, 80, 160)
# Close gaps vertically
edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, np.ones((1, 7), np.uint8), iterations=1)
contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
best = None
best_score = 0.0
for c in contours:
x, y, w, h = cv2.boundingRect(c)
if h < min_h or w > max_w:
continue
if h <= 0 or w <= 0:
continue
ar = h / float(w + 1e-6)
if ar < 4.0:
continue
# score: tall + thin + not too huge
score = ar * min(h, 200)
if score > best_score:
best_score = score
best = (x, y, w, h)
return best
def _bbox_area(b):
x, y, w, h = b
return max(0, w) * max(0, h)
def _bbox_intersection_area(a, b):
ax, ay, aw, ah = a
bx, by, bw, bh = b
x0 = max(ax, bx)
y0 = max(ay, by)
x1 = min(ax + aw, bx + bw)
y1 = min(ay + ah, by + bh)
iw = max(0, x1 - x0)
ih = max(0, y1 - y0)
return iw * ih
def extract_segments(
video_path: str,
out_json: str,
sample_fps: float = 10.0,
focus_min_hold: float = 0.45,
outside_break_frames: int = 4,
):
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
raise RuntimeError(f"Could not open video: {video_path}")
src_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
dur_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
dur_s = dur_frames / src_fps
step = max(1, int(round(src_fps / sample_fps)))
# State
mode = "wide"
wide_roi = (0, 0, W, H)
target_roi = wide_roi
stable_roi = wide_roi
segments: List[Segment] = []
seg_t0 = 0.0
outside_counter = 0
stable_counter = 0
prev_gray = None
frame_idx = 0
def cut(t_now, new_mode, new_roi, reason):
nonlocal seg_t0, mode, target_roi
if t_now > seg_t0 + 1e-6:
segments.append(Segment(seg_t0, t_now, mode, target_roi, reason=""))
seg_t0 = t_now
mode = new_mode
target_roi = new_roi
# store the reason on the *next* segment start for debugging
segments.append(Segment(t_now, t_now, new_mode, new_roi, reason=reason))
# We'll keep "debug marker segments" (t0==t1) with reason, and squash later.
markers: List[Segment] = []
while True:
ok, frame = cap.read()
if not ok:
break
if frame_idx % step != 0:
frame_idx += 1
continue
t = frame_idx / src_fps
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
if prev_gray is None:
prev_gray = gray
frame_idx += 1
continue
motion = _motion_bbox(prev_gray, gray)
caret = _caret_bbox(gray)
prev_gray = gray
frame_idx += 1
if motion is None:
# no motion: decay toward wide unless already holding focus
if mode in ("hold",):
stable_counter += 1
else:
stable_counter = 0
continue
# Build ROI candidate
roi = motion
reason = "motion"
if caret is not None:
roi = _bbox_union(motion, caret)
reason = "motion+caret"
roi = _expand_bbox(roi, pad=int(0.08 * min(W, H)), W=W, H=H)
# Enforce cinematic constraints (don’t zoom too tight or too wide)
min_w = int(W * 0.30)
min_h = int(H * 0.30)
max_w = int(W * 0.95)
max_h = int(H * 0.95)
x, y, w, h = roi
w = _clamp(w, min_w, max_w)
h = _clamp(h, min_h, max_h)
x = _clamp(x, 0, W - w)
y = _clamp(y, 0, H - h)
roi = (x, y, w, h)
# Determine if motion is "mostly inside" roi (stability)
inside = _bbox_intersection_area(motion, roi) / float(_bbox_area(motion) + 1e-6)
stable = inside > 0.75
# Outside event: motion bbox barely intersects the current stable_roi
outside = True
if stable_roi is not None:
inter = _bbox_intersection_area(motion, stable_roi)
outside = inter < 0.08 * _bbox_area(motion)
if mode == "wide":
stable_roi = roi
stable_counter = 1 if stable else 0
# if it looks like typing/caret or stable ROI for a moment -> focus in
            if caret is not None or stable_counter >= 2:
                segments.append(Segment(seg_t0, t, mode, target_roi))  # close the WIDE segment
                markers.append(Segment(t, t, "focus_in", roi, reason=f"enter_focus:{reason}"))
                mode = "focus_in"
                target_roi = roi
                seg_t0 = t
elif mode == "focus_in":
# If target changes slightly, allow it but smooth by latching stable_roi when stable
if stable:
stable_roi = roi
stable_counter += 1
# After minimum ramp time -> hold
            if (t - seg_t0) >= focus_min_hold:
                segments.append(Segment(seg_t0, t, mode, target_roi))  # close the FOCUS_IN segment
                markers.append(Segment(t, t, "hold", stable_roi, reason="focus_ramp_done"))
                mode = "hold"
                target_roi = stable_roi
                seg_t0 = t
                outside_counter = 0
elif mode == "hold":
if stable:
stable_roi = roi
target_roi = stable_roi
outside_counter = max(0, outside_counter - 1)
else:
# unstable motion can be outside event candidate
outside_counter += 1 if outside else 0
            if outside_counter >= outside_break_frames:
                segments.append(Segment(seg_t0, t, mode, target_roi))  # close the HOLD segment
                markers.append(Segment(t, t, "release", wide_roi, reason="outside_event"))
                mode = "release"
                target_roi = wide_roi
                seg_t0 = t
elif mode == "release":
# After quick release, go wide
            if (t - seg_t0) >= 0.25:
                segments.append(Segment(seg_t0, t, mode, target_roi))  # close the RELEASE segment
                markers.append(Segment(t, t, "wide", wide_roi, reason="release_done"))
                mode = "wide"
                target_roi = wide_roi
                seg_t0 = t
                stable_counter = 0
                outside_counter = 0
# Close last segment
t_end = dur_s
if t_end > seg_t0 + 1e-6:
segments.append(Segment(seg_t0, t_end, mode, target_roi))
# Merge with markers (debug)
timeline = segments + markers
timeline.sort(key=lambda s: (s.t0, s.t1))
# Remove zero-length markers in final if you want; for now keep for debugging.
with open(out_json, "w", encoding="utf-8") as f:
json.dump(
{
"video": video_path,
"width": W,
"height": H,
"fps": src_fps,
"segments": [asdict(s) for s in timeline],
},
f,
indent=2,
)
if __name__ == "__main__":
import argparse
ap = argparse.ArgumentParser()
ap.add_argument("--in", dest="inp", required=True)
ap.add_argument("--out", dest="out", required=True)
ap.add_argument("--sample-fps", type=float, default=10.0)
args = ap.parse_args()
    extract_segments(args.inp, args.out, sample_fps=args.sample_fps)

/tools/smartcrop/render_ffmpeg.py
import json
from typing import List, Dict, Tuple
def _roi_expr_for_segments(segments: List[Dict], key: str, default: str) -> str:
"""
Build nested if(between(t,t0,t1), value, ...) expressions.
key: "x" | "y" | "w" | "h"
"""
expr = default
# build from end to start to nest safely
for s in reversed(segments):
t0 = float(s["t0"])
t1 = float(s["t1"])
x, y, w, h = s["roi"]
val = {"x": x, "y": y, "w": w, "h": h}[key]
expr = f"if(between(t,{t0:.3f},{t1:.3f}),{val},{expr})"
return expr
def build_ffmpeg_cmd(video_in: str, segments_json: str, video_out: str, out_w=1080, out_h=1920) -> str:
data = json.load(open(segments_json, "r", encoding="utf-8"))
W = int(data["width"])
H = int(data["height"])
# Use only non-zero segments for actual crop (ignore markers)
segs = [s for s in data["segments"] if float(s["t1"]) > float(s["t0"]) + 1e-6]
# Expressions in source pixel coords
w_expr = _roi_expr_for_segments(segs, "w", str(W))
h_expr = _roi_expr_for_segments(segs, "h", str(H))
x_expr = _roi_expr_for_segments(segs, "x", "0")
y_expr = _roi_expr_for_segments(segs, "y", "0")
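    # NOTE (assumption): ffmpeg's crop filter re-evaluates x/y per frame, but the
    # w/h expressions may only be evaluated once at init. If the zoom does not
    # animate, drive w/h via sendcmd commands instead (as in the pan script).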
# Optional: enforce even sizes for encoders
# (Many encoders prefer even width/height)
vf = (
f"crop=w='floor(({w_expr})/2)*2':h='floor(({h_expr})/2)*2'"
f":x='floor(({x_expr})/2)*2':y='floor(({y_expr})/2)*2',"
f"scale={out_w}:{out_h}:flags=lanczos"
)
cmd = f'ffmpeg -y -i "{video_in}" -vf "{vf}" -c:v libx264 -crf 18 -preset veryfast -c:a copy "{video_out}"'
return cmd
if __name__ == "__main__":
import argparse
ap = argparse.ArgumentParser()
ap.add_argument("--in", dest="inp", required=True)
ap.add_argument("--segments", required=True)
ap.add_argument("--out", required=True)
ap.add_argument("--w", type=int, default=1080)
ap.add_argument("--h", type=int, default=1920)
args = ap.parse_args()
    print(build_ffmpeg_cmd(args.inp, args.segments, args.out, args.w, args.h))

Two upgrades (small, but huge in feel):
Right now the crop “teleports” at segment boundaries (because expressions step). To match your “slow push”, you do:
- keep segment ROI as the target
- compute animated ROI inside the segment using easing
- easiest: generate more sub-segments (every 0.05–0.1s) during ramps
So: for focus_in, split into ~10–20 micro-steps and the nested if() becomes a smooth ramp.
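A minimal sketch of that micro-stepping, assuming the segment dicts written by extract_segments.py ({"t0","t1","mode","roi"}); the split_focus_in helper and the smoothstep easing are hypothetical names, not part of the files above:

```python
# Hypothetical sketch: split one focus_in segment into eased micro-steps so the
# nested if() expressions become a smooth ramp instead of a single jump.
from typing import Dict, List, Tuple


def _ease_in_out(p: float) -> float:
    """Smoothstep easing: maps 0..1 -> 0..1 with zero velocity at both ends."""
    return p * p * (3.0 - 2.0 * p)


def _lerp_roi(a: Tuple[int, int, int, int], b: Tuple[int, int, int, int], p: float):
    return tuple(int(round(av + (bv - av) * p)) for av, bv in zip(a, b))


def split_focus_in(seg: Dict, from_roi: Tuple[int, int, int, int], step_s: float = 0.08) -> List[Dict]:
    """Expand a focus_in segment into ~10-20 short segments easing from from_roi to seg['roi']."""
    t0, t1 = float(seg["t0"]), float(seg["t1"])
    n = max(2, int(round((t1 - t0) / step_s)))
    out: List[Dict] = []
    for i in range(n):
        a = t0 + (t1 - t0) * i / n
        b = t0 + (t1 - t0) * (i + 1) / n
        p = _ease_in_out((i + 1) / n)  # eased progress at the end of this micro-step
        out.append({"t0": a, "t1": b, "mode": "focus_in",
                    "roi": list(_lerp_roi(from_roi, tuple(seg["roi"]), p))})
    return out
```

Feed the expanded list into _roi_expr_for_segments in place of the single focus_in segment.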
- Don’t change ROI unless it moves beyond a margin
- Keep “stable_roi” latched until there’s a true change
That stops jitter when typing causes tiny changes.
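A sketch of that latch as a standalone helper (latch_roi is a hypothetical name, and the default margin is just a starting value to tune):

```python
# Hypothetical sketch of ROI hysteresis: keep the latched ROI until the new
# candidate moves beyond a pixel margin, which suppresses typing micro-jitter.
from typing import Optional, Tuple

BBox = Tuple[int, int, int, int]  # x, y, w, h


def latch_roi(current: Optional[BBox], candidate: BBox, margin_px: int = 24) -> BBox:
    """Return the existing ROI unless the candidate differs by more than margin_px."""
    if current is None:
        return candidate
    moved = max(abs(candidate[i] - current[i]) for i in range(4))
    return candidate if moved > margin_px else current
```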
Keep a wide anchor stack:
- wide_anchor = last known good wide framing (or the last stable ROI before focus)
- on release, return there instead of full-frame
That looks like a human editor (“return to the last composition”), not a tool.
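A sketch of such an anchor stack (AnchorStack is a hypothetical helper; push on entering FOCUS_RAMP, pop in RELEASE to get the return framing):

```python
# Hypothetical sketch of a "wide anchor" stack: remember the framing in use
# before each focus, and return to it on release instead of the full frame.
from typing import List, Tuple

BBox = Tuple[int, int, int, int]  # x, y, w, h


class AnchorStack:
    def __init__(self, full_frame: BBox):
        self._full = full_frame
        self._stack: List[BBox] = []

    def push(self, wide_roi: BBox) -> None:
        """Call when entering FOCUS_RAMP: remember the current wide framing."""
        self._stack.append(wide_roi)

    def release_target(self) -> BBox:
        """Call on RELEASE: return (and drop) the most recent anchor, or the full frame."""
        return self._stack.pop() if self._stack else self._full
```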
You’ve already got the right ecosystem for this:
- media-sync-api: triggers jobs deterministically
- your asset pipeline: store segments.json next to the clip (MinIO-style object metadata vibes)
- explorer: preview “director cut” vs raw
If you tell me one detail, I’ll tailor the thresholds + ROI constraints:
- Is the source mostly phone portrait 1080×1920, or mixed aspect?
- And is the caret usually visible (white/blue caret), or not reliable?
Either way, you’re very close — the missing piece is just formalizing your instincts into that state machine + segment emitter, then generating a clean FFmpeg crop path.
Got it — this frame helps a lot. I’m going to treat it purely as a screen-recording composition sample, not the content itself, because what matters here is how your automatic “virtual camera” should behave.
Here’s what stands out visually from this type of frame and how it affects automation:
This screen has very predictable layout layers:
- Clock, signal, battery
- Almost never the focus
- Should usually be excluded from ROI detection
👉 Add a top exclusion mask (~8–10%) when computing motion.
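A sketch of that exclusion, assuming you apply it to the frame diff right after cv2.absdiff in either script (mask_status_bar is a hypothetical helper):

```python
# Hypothetical sketch: zero out the status-bar band before any motion analysis,
# so the clock/battery never attract the virtual camera.
import numpy as np


def mask_status_bar(diff: np.ndarray, top_fraction: float = 0.10) -> np.ndarray:
    """Return a copy of the frame-diff image with the top band zeroed out."""
    out = diff.copy()
    cutoff = int(out.shape[0] * top_fraction)  # rows belonging to the status bar
    out[:cutoff, :] = 0
    return out
```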
This is where:
- scrolling occurs
- typing occurs
- cursor moves
- gestures happen
This should be your primary ROI candidate zone.
Important because:
- typing motion originates here
- but viewer interest is usually just above it
- keyboard itself isn’t usually the focus
👉 Recommended rule:
If keyboard visible:
ROI center shifts upward slightly
Otherwise you’ll zoom into the keyboard instead of the text.
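A sketch of that rule, assuming you already have a signal for keyboard visibility and the y coordinate where the keyboard begins (both detections are out of scope here; bias_roi_above_keyboard is a hypothetical helper):

```python
# Hypothetical sketch: when the on-screen keyboard is visible, bias the ROI
# upward so the camera favors the text being typed rather than the keys.
from typing import Tuple

BBox = Tuple[int, int, int, int]  # x, y, w, h


def bias_roi_above_keyboard(roi: BBox, keyboard_top: int, shift_fraction: float = 0.15) -> BBox:
    """Nudge an ROI that overlaps the keyboard area upward and trim off the keys."""
    x, y, w, h = roi
    if y + h > keyboard_top:                       # ROI extends into the keyboard band
        y = max(0, y - int(h * shift_fraction))    # shift the window upward a bit
        if keyboard_top > y:
            h = min(h, keyboard_top - y)           # end the ROI where the keyboard starts
    return (x, y, w, h)
```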
For this specific kind of screen recording:
Easy to detect:
- horizontal edges moving vertically
- very stable signal
This is ideal for:
- slow cinematic push
- slight parallax tilt if you like that style
Harder but very valuable:
- thin vertical line
- intermittent blink
- small jumps while typing
When detected:
👉 It should dominate ROI weighting.
Because viewers subconsciously track the caret.
Huge motion event:
- treat as a camera transition trigger
- good moment to widen framing briefly
This mimics a human editor.
Based on this exact UI type:
Not full screen.
Better:
~90% frame
center weighted upward
Reason:
- looks cinematic
- avoids dead areas
- avoids status bar fixation
Camera should:
- Identify caret region + nearby text
- Expand ROI vertically
- Ease-in zoom (~0.4–0.8s)
- Slight perspective tilt optional
This matches how a human would emphasize writing.
Different behavior:
- wider ROI than typing
- slower push
- minimal tilt
Scrolling is informational, not focal.
Example:
- menu opens
- modal pops up
- overlay appears
You described it perfectly earlier:
unfocus and snap back to recent position
Implementation (sketched below):
- detect large new motion blob outside ROI
- quick ease-out (~200–300ms)
- optionally overshoot slightly (feels natural)
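A sketch of that release easing, using a standard back-ease for the slight overshoot (ease_out_back and release_roi are hypothetical helpers; clamp the result to frame bounds afterwards, since the overshoot can briefly leave them):

```python
# Hypothetical sketch of the release easing: a quick ~250 ms ease-out that
# overshoots the target slightly before settling, which reads as hand-held.
from typing import Tuple

BBox = Tuple[float, float, float, float]  # x, y, w, h


def ease_out_back(p: float, overshoot: float = 1.2) -> float:
    """Back-ease-out: rises past 1.0 briefly, then settles at 1.0 (p in 0..1)."""
    q = p - 1.0
    return q * q * ((overshoot + 1.0) * q + overshoot) + 1.0


def release_roi(t: float, t_start: float, duration: float, from_roi: BBox, to_roi: BBox) -> BBox:
    """Interpolate from the focused ROI back to the wide anchor over `duration` seconds."""
    p = min(1.0, max(0.0, (t - t_start) / duration))
    e = ease_out_back(p)
    # NOTE: because of the overshoot, clamp the resulting box to the frame bounds.
    return tuple(f + (g - f) * e for f, g in zip(from_roi, to_roi))
```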
These work extremely well for phone portrait captures:
pad_x ≈ 12–18% width
pad_y ≈ 20–28% height
Because phone UI elements are vertically dense.
Never zoom tighter than:
width >= 35% screen
height >= 35% screen
Otherwise it feels claustrophobic.
From strongest to weakest:
- Caret detection
- Text scroll motion
- UI pop-in/out
- Ambient motion/noise
That keeps the camera purposeful.
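A sketch of that priority order as a simple selector (choose_roi is a hypothetical helper; each argument is the bbox from the corresponding detector, or None if it didn't fire):

```python
# Hypothetical sketch of the signal hierarchy: take the strongest available
# signal in priority order (caret > scroll > UI pop-in > ambient motion).
from typing import Optional, Tuple

BBox = Tuple[int, int, int, int]  # x, y, w, h


def choose_roi(
    caret: Optional[BBox],
    scroll: Optional[BBox],
    ui_change: Optional[BBox],
    ambient: Optional[BBox],
) -> Tuple[Optional[BBox], str]:
    """Return (roi, source_name) from the highest-priority detector that fired."""
    for roi, name in ((caret, "caret"), (scroll, "scroll"), (ui_change, "ui"), (ambient, "ambient")):
        if roi is not None:
            return roi, name
    return None, "none"
```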
You’re already using perspective transforms.
For phone recordings specifically:
Example logic:
if focus_in:
tilt toward ROI slightly (2–4° equivalent)
if hold:
slowly flatten tilt
if release:
reverse tilt briefly then neutral
That subtle movement makes it feel filmed, not cropped.
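A sketch of how that could drive the perspective filter, blending between a flat framing and the tilted corner set from the pad=1200:2200 command earlier (the helpers are hypothetical, and the "reverse tilt" on release is simplified here to easing back to neutral):

```python
# Hypothetical sketch: blend between flat corners and the tilted corners used in
# the original perspective command, driven by the director state.
from typing import Dict

FLAT = {"x0": 0, "y0": 0, "x1": 1200, "y1": 0, "x2": 0, "y2": 2200, "x3": 1200, "y3": 2200}
TILTED = {"x0": 150, "y0": 150, "x1": 1050, "y1": 100, "x2": 100, "y2": 2050, "x3": 1100, "y3": 2100}


def blend_corners(amount: float) -> Dict[str, int]:
    """amount=0 -> no tilt, amount=1 -> the full tilt from the original command."""
    amount = max(0.0, min(1.0, amount))
    return {k: int(round(FLAT[k] + (TILTED[k] - FLAT[k]) * amount)) for k in FLAT}


def tilt_amount(state: str, progress: float) -> float:
    """Map the director state plus eased progress (0..1) to a tilt blend amount."""
    if state == "focus_in":
        return progress                # lean in while pushing in
    if state == "hold":
        return 1.0 - 0.5 * progress    # slowly flatten while holding
    if state == "release":
        return 0.5 * (1.0 - progress)  # ease back to neutral on release
    return 0.0


def perspective_args(corners: Dict[str, int]) -> str:
    """Format corners as perspective filter options, e.g. x0=150:y0=150:..."""
    return ":".join(f"{k}={v}" for k, v in corners.items())
```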
You mentioned:
- left/right off-center cuts
- top/bottom loop movements
- timed deterministic transitions
Those map directly to:
Define a few reusable camera compositions:
- Center focus
- Upper-third focus
- Lower-third focus
- Left-weighted
- Right-weighted
Then let ROI selection snap to the nearest anchor (sketched below).
This:
- prevents awkward framing
- keeps style consistent
- mimics editorial decisions
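A sketch of anchor snapping (the anchor boxes below are illustrative placeholders for a 1080×1920 frame, not tuned values):

```python
# Hypothetical sketch: a few named compositions for a 1080x1920 portrait frame,
# and a helper that snaps a detected ROI to whichever anchor center is nearest.
from typing import Dict, Tuple

BBox = Tuple[int, int, int, int]  # x, y, w, h

ANCHORS: Dict[str, BBox] = {
    "center":      (140, 480, 800, 960),
    "upper_third": (140, 160, 800, 960),
    "lower_third": (140, 800, 800, 960),
    "left":        (0,   480, 800, 960),
    "right":       (280, 480, 800, 960),
}


def _center(b: BBox) -> Tuple[float, float]:
    x, y, w, h = b
    return (x + w / 2.0, y + h / 2.0)


def snap_to_anchor(roi: BBox) -> Tuple[str, BBox]:
    """Return the named anchor whose center is closest to the ROI center."""
    cx, cy = _center(roi)

    def dist(item: Tuple[str, BBox]) -> float:
        ax, ay = _center(item[1])
        return (ax - cx) ** 2 + (ay - cy) ** 2

    name, box = min(ANCHORS.items(), key=dist)
    return name, box
```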
Given your existing stack:
- OBS multi-scene capture
- FFmpeg perspective tooling
- Browser overlays
- Media sync automation
- LAN-based video infra
I’d do:
Record raw screen.
Extract:
- motion ROIs
- caret events
- keyboard state
- scroll direction
Generate:
- segments.json
- camera anchor transitions
- perspective tilt parameters
FFmpeg:
- crop + scale
- perspective warp
- easing timing baked in
Fully deterministic.
Your approach is already correct. The missing pieces are:
- ignore status bar noise
- treat keyboard as contextual, not focal
- prioritize caret motion heavily
- snap back intelligently on UI changes
- constrain zoom sizes for cinematic feel
And honestly — your intuition about “bounding cursor + typing motion” is exactly what real automated editorial systems do.
If you want next, I can:
✅ Build a phone-screen specific auto-director config
✅ Show a fully automated FFmpeg perspective zoom example
✅ Add caret detection tuned for iOS captures
✅ Design OBS → FFmpeg deterministic pipeline
✅ Give you a “cinematic presets” pack (subtle, aggressive, documentary)
Just tell me.