Last active
February 11, 2026 12:49
-
-
Save ruben-arts/389749b62ce5c9defbbe3f6c9f613cca to your computer and use it in GitHub Desktop.
Duplication checking of a pixi environment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env -S pixi exec -- python | |
| """ | |
| Calculate total size_in_bytes for paths with prefix_placeholder defined. | |
| These files are duplicated on disk (not hardlinked). | |
| """ | |
| import argparse | |
| import json | |
| from pathlib import Path | |
def calculate_duplicated_size(directory: Path) -> dict:
    """Sum the recorded sizes of prefix-placeholder files in a conda-meta directory.

    Every ``*.json`` file directly inside *directory* is parsed as a package
    record. Entries in ``paths_data.paths`` that carry a
    ``prefix_placeholder`` key are counted (the module docstring treats these
    as the files that are duplicated on disk rather than hard-linked), using
    their ``size_in_bytes`` value. A missing, null or non-integer size counts
    as 0 bytes.

    Files that cannot be read, are not valid UTF-8 JSON, or whose top-level
    value is not a JSON object are reported on stdout and skipped, so a single
    bad record never aborts the scan.

    Args:
        directory: Directory containing the package ``*.json`` records.

    Returns:
        A dict with:
          * ``'total_size'``  -- summed size in bytes over all listed packages,
          * ``'total_files'`` -- number of path entries with a placeholder,
          * ``'packages'``    -- list of ``{'name', 'size', 'files'}`` dicts
            for packages whose summed size is > 0, largest first.
    """
    total_size = 0
    file_count = 0
    package_details = []

    # Sorted so the processing order (and with it the order of equally sized
    # packages and of error messages) does not depend on the filesystem.
    for json_file in sorted(directory.glob("*.json")):
        try:
            # Read as UTF-8 explicitly instead of relying on the locale default.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error processing {json_file.name}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Error processing {json_file.name}: expected a JSON object")
            continue

        paths_data = data.get('paths_data')
        paths = paths_data.get('paths') if isinstance(paths_data, dict) else None
        if not isinstance(paths, list):
            # Record without usable path information: nothing to count.
            continue

        package_size = 0
        package_files = 0
        for path_entry in paths:
            # Only count entries with prefix_placeholder
            if not isinstance(path_entry, dict) or 'prefix_placeholder' not in path_entry:
                continue
            size = path_entry.get('size_in_bytes')
            if not isinstance(size, int):
                size = 0  # missing, null or malformed size
            package_size += size
            package_files += 1

        # Placeholder files are counted even when the package ends up with a
        # zero total and is therefore left out of the per-package list.
        file_count += package_files

        if package_size > 0:
            total_size += package_size
            package_details.append({
                'name': f"{data.get('name', 'unknown')}-{data.get('version', 'unknown')}",
                'size': package_size,
                'files': package_files,
            })

    return {
        'total_size': total_size,
        'total_files': file_count,
        'packages': sorted(package_details, key=lambda x: x['size'], reverse=True),
    }
def format_size(bytes: int) -> str:
    """Render a byte count as a two-decimal string with a 1024-based unit.

    The value is divided by 1024 until it drops below 1024 or the units
    ``B``, ``KB``, ``MB``, ``GB`` are exhausted; anything larger is reported
    in ``TB``.
    """
    # Work on a local so the parameter (which shadows the builtin) is only
    # read once.
    amount = bytes
    for suffix in ('B', 'KB', 'MB', 'GB'):
        if amount < 1024.0:
            return f"{amount:.2f} {suffix}"
        amount = amount / 1024.0
    return f"{amount:.2f} TB"
def find_conda_meta_dirs(workspace: Path) -> list[Path]:
    """Find all conda-meta directories in the workspace.

    Looks for ``<workspace>/.pixi/envs/<env>/conda-meta`` for every directory
    ``<env>`` directly under ``.pixi/envs``.

    Args:
        workspace: Root directory of the workspace.

    Returns:
        The matching ``conda-meta`` directories in sorted order; an empty
        list when ``.pixi/envs`` is missing or is not a directory.
    """
    pixi_envs_path = workspace / ".pixi" / "envs"
    # is_dir() rather than exists(): calling iterdir() on a regular file
    # would raise NotADirectoryError.
    if not pixi_envs_path.is_dir():
        return []

    candidates = (
        env_dir / "conda-meta"
        for env_dir in pixi_envs_path.iterdir()
        if env_dir.is_dir()
    )
    return sorted(conda_meta for conda_meta in candidates if conda_meta.is_dir())
def print_results(env_name: str, results: dict, show_packages: bool = True):
    """Write the report for one environment to stdout.

    When *show_packages* is true and at least one package is listed, a table
    of the first ten entries of ``results['packages']`` is printed, followed
    by a note on how many further packages were left out. The summary
    (total size, file count, package count) is always printed.
    """
    table_width = 70
    top_n = 10

    if show_packages and results['packages']:
        ranked = results['packages']
        banner = '=' * table_width
        print(f"\n{banner}")
        print(f"TOP PACKAGES IN {env_name}")
        print(f"{banner}")
        print(f"{'Package':<50} {'Size':<15} {'Files':<10}")
        print(f"{'-' * table_width}")

        # Show top 10 per env
        for row in ranked[:top_n]:
            human_size = format_size(row['size'])
            print(f"{row['name']:<50} {human_size:<15} {row['files']:<10}")

        remaining = len(ranked) - top_n
        if remaining > 0:
            print(f"\n... and {remaining} more packages")

    print(f"\n{env_name} Summary:")
    size_total = results['total_size']
    print(f" Duplicated size: {format_size(size_total)} ({size_total:,} bytes)")
    print(f" Duplicated files: {results['total_files']:,}")
    print(f" Packages with duplicated files: {len(results['packages'])}")
def main():
    """Command-line entry point.

    Parses the workspace path and options, locates the conda-meta directory
    of each environment under the workspace, computes the duplicated-size
    report for each one, prints the per-environment reports and, when more
    than one environment was processed, an aggregate summary.
    """
    cli = argparse.ArgumentParser(
        description="Calculate size of duplicated files across Pixi environments"
    )
    cli.add_argument(
        "workspace",
        nargs="?",
        type=Path,
        default=Path.cwd(),
        help="Path to the workspace directory (default: current directory)",
    )
    cli.add_argument(
        "--env",
        type=str,
        help="Process only a specific environment (e.g., 'default')",
    )
    cli.add_argument(
        "--no-packages",
        action="store_true",
        help="Don't show per-package breakdown",
    )
    options = cli.parse_args()

    workspace = options.workspace.resolve()
    print("Calculating size of duplicated files (with prefix_placeholder)...")
    print(f"Workspace: {workspace}\n")

    meta_dirs = find_conda_meta_dirs(workspace)
    if not meta_dirs:
        print(f"No conda-meta directories found in {workspace}/.pixi/envs/")
        return

    # Narrow down to the requested environment, if one was given.
    wanted_env = options.env
    if wanted_env:
        meta_dirs = [m for m in meta_dirs if m.parent.name == wanted_env]
        if not meta_dirs:
            print(f"Environment '{wanted_env}' not found")
            return

    print(f"Found {len(meta_dirs)} environment(s):")
    for meta_dir in meta_dirs:
        print(f" - {meta_dir.parent.name}")

    # Compute every report before printing any of them, so messages emitted
    # while scanning appear ahead of the reports.
    per_env = {
        meta_dir.parent.name: calculate_duplicated_size(meta_dir)
        for meta_dir in meta_dirs
    }

    show_breakdown = not options.no_packages
    for name, report in per_env.items():
        print_results(name, report, show_packages=show_breakdown)

    # The aggregate summary is only meaningful for multiple environments.
    if len(per_env) <= 1:
        return

    reports = list(per_env.values())
    grand_size = sum(report['total_size'] for report in reports)
    grand_files = sum(report['total_files'] for report in reports)
    grand_packages = sum(len(report['packages']) for report in reports)

    divider = '=' * 70
    print(f"\n{divider}")
    print("AGGREGATE SUMMARY (ALL ENVIRONMENTS)")
    print(f"{divider}")
    print(f"Total duplicated size: {format_size(grand_size)} ({grand_size:,} bytes)")
    print(f"Total duplicated files: {grand_files:,}")
    print(f"Total packages with duplicated files: {grand_packages}")
    print(f"Number of environments: {len(per_env)}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment