Skip to content

Instantly share code, notes, and snippets.

@jin-zhe
Last active December 19, 2025 10:10
Show Gist options
  • Select an option

  • Save jin-zhe/53a8f5783aafdc6afb91b0e1e86432c4 to your computer and use it in GitHub Desktop.

Select an option

Save jin-zhe/53a8f5783aafdc6afb91b0e1e86432c4 to your computer and use it in GitHub Desktop.
Finds large files under a given directory (optionally matching a given extension) using a configurable file-size limit (default 10 MB). Good for identifying files for Git LFS.
#!/usr/bin/env bash
# Strict mode: abort on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# --- Default Values ---
# Initial settings; the option parser overwrites them for -d, -e, -l and -o.
OUTPUT_FILE="large_files_report.txt" # where the report is written
SIZE_LIMIT_MB=10                     # size limit, in MB
FILE_EXT=""                          # empty: no extension filter
TARGET_DIR="."                       # directory to scan
# --- Help Function ---
#######################################
# Print the command-line synopsis and terminate the script.
# Arguments: none
# Outputs:   the help text, on stdout
# Returns:   never returns; exits with status 1
#######################################
usage() {
  # One printf call; the format is applied once per argument, so every
  # argument becomes one output line.
  printf '%s\n' \
    "Usage: $0 -d <directory> [-e <extension>] [-l <limit_in_mb>] [-o <output_file>]" \
    " -d Directory to scan (required)" \
    " -e File extension to filter (optional, e.g., 'png', 'csv'). If omitted, checks all files." \
    " -l Size limit in MB (optional, default: 10)" \
    " -o Output filename (optional, default: large_files_report.txt)"
  exit 1
}
# --- Parse Arguments ---
# The leading ':' in the optstring puts getopts in silent mode: an unknown
# option is reported as '?' and a missing argument as ':', and in both cases
# OPTARG holds the offending option letter.
while getopts ":d:e:l:o:" opt; do
  case "${opt}" in
    \?)
      echo "Invalid option: -$OPTARG" >&2
      usage
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      usage
      ;;
    d)
      TARGET_DIR="$OPTARG"
      ;;
    e)
      FILE_EXT="$OPTARG"
      ;;
    l)
      SIZE_LIMIT_MB="$OPTARG"
      ;;
    o)
      OUTPUT_FILE="$OPTARG"
      ;;
  esac
done
# --- Validation ---
# Continue only when the target is a non-empty path that names an existing
# directory; otherwise report the problem and show the help text.
if ! [[ -n "$TARGET_DIR" && -d "$TARGET_DIR" ]]; then
  echo "Error: Directory '$TARGET_DIR' does not exist or was not provided." >&2
  usage
fi
# Convert MB to Bytes
#######################################
# Convert a whole number of megabytes into bytes.
# Arguments: $1 - size in MB, written as decimal digits only
# Outputs:   the byte count on stdout
# Returns:   0 on success, 1 when $1 is not a non-negative decimal integer
#######################################
mb_to_bytes() {
  local mb=$1
  # Check the text before it reaches arithmetic expansion: bash evaluates
  # non-numeric operands there as expressions (variable names, array
  # subscripts containing command substitutions), so user input must not be
  # passed through unchecked.
  if [[ ! "$mb" =~ ^[0-9]+$ ]]; then
    return 1
  fi
  # 10# forces base 10 so zero-padded input such as "08" is not read as octal.
  printf '%s\n' "$((10#$mb * 1024 * 1024))"
}

if ! THRESHOLD=$(mb_to_bytes "$SIZE_LIMIT_MB"); then
  echo "Error: Size limit '$SIZE_LIMIT_MB' must be a non-negative whole number of MB." >&2
  usage
fi
echo "--- Configuration ---"
echo "Target Directory: $TARGET_DIR"
echo "File Filter: ${FILE_EXT:-All files}"
echo "Size Limit: ${SIZE_LIMIT_MB}MB ($THRESHOLD bytes)"
echo "Output File: $OUTPUT_FILE"
echo "---------------------"
# --- Step 1: Find Files ---
echo "Scanning for files..."
# Candidate paths are collected, one per line, in a private temp file.
# mktemp picks an unpredictable name.
TEMP_LIST=$(mktemp)
# Remove the temp file on every exit path. The script runs under `set -e`,
# so a failing `find` below would otherwise leave the file behind.
trap 'rm -f -- "$TEMP_LIST"' EXIT
if [[ -n "$FILE_EXT" ]]; then
  # Case-insensitive match on the extension. One leading dot is stripped so
  # that "-e png" and "-e .png" behave the same.
  find "$TARGET_DIR" -type f -iname "*.${FILE_EXT#.}" > "$TEMP_LIST"
else
  # No filter: every regular file is a candidate.
  find "$TARGET_DIR" -type f > "$TEMP_LIST"
fi
# Some wc implementations pad the count with spaces; tr removes the padding.
TOTAL_FILES=$(wc -l < "$TEMP_LIST" | tr -d ' ')
if [[ "$TOTAL_FILES" -eq 0 ]]; then
  echo "No files found matching the criteria."
  rm -f -- "$TEMP_LIST"
  exit 0
fi
echo "Found $TOTAL_FILES candidate files. Checking sizes..."
# --- Step 2: Check Sizes with Progress Bar ---
# Create the report file, or empty it if it already exists.
: > "$OUTPUT_FILE"
# Counter of candidates handled so far; the size-check loop increments it.
CURRENT=0
# Start timestamp, in seconds since the epoch.
START_TIME=$(date +%s)
# Function to draw progress bar
#######################################
# Redraw a one-line text progress bar in place.
# Arguments:
#   $1 - number of items processed so far
#   $2 - total number of items
# Outputs:
#   "\rProgress: [<bar>] <percent>% (<current>/<total>)" on stdout, with no
#   trailing newline; the leading carriage return makes successive calls
#   overwrite the same terminal line.
# Returns:
#   0
#######################################
draw_progress_bar() {
  local current=$1
  local total=$2
  local width=40
  local percent=0
  local filled empty i
  local str=""

  # A zero total would make the division below fail, so it is shown as 0%.
  if ((total > 0)); then
    percent=$((current * 100 / total))
  fi
  # Clamp so the bar never grows past its fixed width.
  if ((percent > 100)); then
    percent=100
  elif ((percent < 0)); then
    percent=0
  fi

  filled=$((percent * width / 100))
  empty=$((width - filled))

  # '#' marks the completed share, '.' the remainder.
  for ((i = 0; i < filled; i++)); do str+="#"; done
  for ((i = 0; i < empty; i++)); do str+="."; done

  printf "\rProgress: [%s] %d%% (%d/%d)" "$str" "$percent" "$current" "$total"
}
# Determine stat command syntax once
# Start from the BSD (macOS) spelling, then switch to the GNU (Linux) one when
# `stat --version` succeeds.
STAT_CMD="stat -f%z"
if stat --version >/dev/null 2>&1; then
  STAT_CMD="stat -c%s"
fi
# Walk the candidate list; IFS= and -r keep each path exactly as written.
while IFS= read -r file; do
  # Plain assignment instead of ((CURRENT++)): post-increment evaluates to the
  # old value, so on the first pass (0) the (( )) command returns status 1,
  # which terminates the whole script under `set -e`.
  CURRENT=$((CURRENT + 1))
  # Update progress bar every 10 files or on the last file to speed up processing
  if ((CURRENT % 10 == 0 || CURRENT == TOTAL_FILES)); then
    draw_progress_bar "$CURRENT" "$TOTAL_FILES"
  fi
  # Skip if file was deleted during the process
  [[ -f "$file" ]] || continue
  # Get size. STAT_CMD holds a command plus its flag, so it is word-split on
  # purpose. If stat fails (e.g. the file vanished after the check above),
  # skip the entry instead of aborting the run.
  # shellcheck disable=SC2086
  size=$($STAT_CMD "$file") || continue
  if ((size > THRESHOLD)); then
    printf '%s\n' "$file" >> "$OUTPUT_FILE"
  fi
done < "$TEMP_LIST"
echo "" # Newline after progress bar
echo "Done! Report saved to: $OUTPUT_FILE"
# Cleanup
rm -f -- "$TEMP_LIST"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment