Skip to content

Instantly share code, notes, and snippets.

@ruben-arts
Last active February 11, 2026 12:49
Show Gist options
  • Select an option

  • Save ruben-arts/389749b62ce5c9defbbe3f6c9f613cca to your computer and use it in GitHub Desktop.

Select an option

Save ruben-arts/389749b62ce5c9defbbe3f6c9f613cca to your computer and use it in GitHub Desktop.
Duplication checking of a pixi environment
#!/usr/bin/env -S pixi exec -- python
"""
Calculate total size_in_bytes for paths with prefix_placeholder defined.
These files are duplicated on disk (not hardlinked).
"""
import argparse
import json
from pathlib import Path
def calculate_duplicated_size(directory: Path) -> dict:
    """Sum the on-disk size of duplicated files recorded in *directory*.

    Scans every conda-meta ``*.json`` record and counts only path entries
    that carry a ``prefix_placeholder``: those files are rewritten at install
    time and therefore cannot be hardlinked from the package cache.

    Returns a dict with ``total_size`` (bytes), ``total_files`` (entry count)
    and ``packages`` (per-package breakdown, largest first).
    """
    grand_total = 0
    grand_files = 0
    per_package = []
    for record_path in directory.glob("*.json"):
        try:
            with open(record_path, 'r') as fh:
                record = json.load(fh)
            name = record.get('name', 'unknown')
            version = record.get('version', 'unknown')
            if 'paths_data' in record and 'paths' in record['paths_data']:
                entries = record['paths_data']['paths']
            else:
                entries = []
            pkg_bytes = 0
            pkg_files = 0
            for entry in entries:
                # Entries without a prefix placeholder are hardlinkable — skip.
                if 'prefix_placeholder' not in entry:
                    continue
                pkg_bytes += entry.get('size_in_bytes', 0)
                pkg_files += 1
                grand_files += 1
            if pkg_bytes > 0:
                grand_total += pkg_bytes
                per_package.append({
                    'name': f"{name}-{version}",
                    'size': pkg_bytes,
                    'files': pkg_files
                })
        except (json.JSONDecodeError, KeyError) as exc:
            # Best-effort: report the broken record and keep scanning.
            print(f"Error processing {record_path.name}: {exc}")
            continue
    return {
        'total_size': grand_total,
        'total_files': grand_files,
        'packages': sorted(per_package, key=lambda p: p['size'], reverse=True)
    }
def format_size(num_bytes: float) -> str:
    """Format a byte count as a human-readable string, e.g. ``1.50 KB``.

    Uses binary (1024-based) units; anything past GB is reported as TB.

    Fixes: the parameter was named ``bytes`` (shadowing the builtin) and
    annotated ``int`` even though it is float-divided in the loop. All
    in-file callers pass the argument positionally, so the rename is safe.
    """
    size = float(num_bytes)
    for unit in ['B', 'KB', 'MB', 'GB']:
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    return f"{size:.2f} TB"
def find_conda_meta_dirs(workspace: Path) -> list[Path]:
    """Return every ``conda-meta`` directory below ``<workspace>/.pixi/envs``.

    One entry per installed pixi environment, sorted by path; empty list
    when the workspace has no ``.pixi/envs`` directory at all.
    """
    envs_root = workspace / ".pixi" / "envs"
    if not envs_root.exists():
        return []
    # An environment counts only if it actually contains a conda-meta dir.
    found = [
        child / "conda-meta"
        for child in envs_root.iterdir()
        if child.is_dir() and (child / "conda-meta").is_dir()
    ]
    return sorted(found)
def print_results(env_name: str, results: dict, show_packages: bool = True):
    """Pretty-print the duplication report for a single environment.

    When *show_packages* is true and there is anything to show, a table of
    the ten largest packages is printed first, then a short summary block.
    """
    packages = results['packages']
    if show_packages and packages:
        bar = '=' * 70
        print(f"\n{bar}")
        print(f"TOP PACKAGES IN {env_name}")
        print(bar)
        print(f"{'Package':<50} {'Size':<15} {'Files':<10}")
        print('-' * 70)
        # Only the ten largest offenders are listed per environment.
        for entry in packages[:10]:
            print(f"{entry['name']:<50} {format_size(entry['size']):<15} {entry['files']:<10}")
        hidden = len(packages) - 10
        if hidden > 0:
            print(f"\n... and {hidden} more packages")
    print(f"\n{env_name} Summary:")
    print(f" Duplicated size: {format_size(results['total_size'])} ({results['total_size']:,} bytes)")
    print(f" Duplicated files: {results['total_files']:,}")
    print(f" Packages with duplicated files: {len(results['packages'])}")
def main():
    """CLI entry point: report duplicated (non-hardlinked) file sizes
    across all pixi environments found in the workspace."""
    parser = argparse.ArgumentParser(
        description="Calculate size of duplicated files across Pixi environments"
    )
    parser.add_argument(
        "workspace",
        nargs="?",
        type=Path,
        default=Path.cwd(),
        help="Path to the workspace directory (default: current directory)",
    )
    parser.add_argument(
        "--env",
        type=str,
        help="Process only a specific environment (e.g., 'default')",
    )
    parser.add_argument(
        "--no-packages",
        action="store_true",
        help="Don't show per-package breakdown",
    )
    opts = parser.parse_args()

    workspace = opts.workspace.resolve()
    print("Calculating size of duplicated files (with prefix_placeholder)...")
    print(f"Workspace: {workspace}\n")

    meta_dirs = find_conda_meta_dirs(workspace)
    if not meta_dirs:
        print(f"No conda-meta directories found in {workspace}/.pixi/envs/")
        return

    # Narrow to the requested environment (directory name under envs/), if any.
    if opts.env:
        meta_dirs = [m for m in meta_dirs if m.parent.name == opts.env]
        if not meta_dirs:
            print(f"Environment '{opts.env}' not found")
            return

    print(f"Found {len(meta_dirs)} environment(s):")
    for m in meta_dirs:
        print(f" - {m.parent.name}")

    # Scan every environment, accumulating a cross-environment total.
    all_results = {}
    grand = {'total_size': 0, 'total_files': 0, 'total_packages': 0}
    for meta_dir in meta_dirs:
        env = meta_dir.parent.name
        report = calculate_duplicated_size(meta_dir)
        all_results[env] = report
        grand['total_size'] += report['total_size']
        grand['total_files'] += report['total_files']
        grand['total_packages'] += len(report['packages'])

    for env, report in all_results.items():
        print_results(env, report, show_packages=not opts.no_packages)

    # The aggregate section only adds information with multiple environments.
    if len(all_results) > 1:
        bar = '=' * 70
        print(f"\n{bar}")
        print("AGGREGATE SUMMARY (ALL ENVIRONMENTS)")
        print(bar)
        print(f"Total duplicated size: {format_size(grand['total_size'])} ({grand['total_size']:,} bytes)")
        print(f"Total duplicated files: {grand['total_files']:,}")
        print(f"Total packages with duplicated files: {grand['total_packages']}")
        print(f"Number of environments: {len(all_results)}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment