Skip to content

Instantly share code, notes, and snippets.

@LukeB42
Created March 11, 2026 14:52
Show Gist options
  • Select an option

  • Save LukeB42/7b7a19a663130309c555c926ad55ee58 to your computer and use it in GitHub Desktop.

Select an option

Save LukeB42/7b7a19a663130309c555c926ad55ee58 to your computer and use it in GitHub Desktop.
import os
import hashlib
import sys
def get_sha1(file_path):
    """Return the SHA-1 hex digest of the file at *file_path*.

    The file is read in 64 KiB chunks so arbitrarily large files are
    hashed without being loaded fully into memory.

    Args:
        file_path: Path to the file to hash.

    Returns:
        The 40-character lowercase hex digest, or ``None`` if the file
        cannot be opened or read (missing, permission denied, vanished
        between directory walk and open, ...).
    """
    sha1 = hashlib.sha1()
    try:
        with open(file_path, 'rb') as f:
            # Read in 64 KiB chunks: balances syscall overhead against
            # memory use for large files.
            for chunk in iter(lambda: f.read(65536), b""):
                sha1.update(chunk)
        return sha1.hexdigest()
    # IOError has been an alias of OSError since Python 3.3, so a single
    # OSError catch covers everything the old (OSError, IOError) tuple did.
    except OSError:
        return None
def find_duplicates(paths):
    """Recursively scan *paths* and collect files with identical content.

    Files are compared by SHA-1 digest (via ``get_sha1``); symlinks are
    skipped to avoid cycles and false positives. Each duplicate is also
    reported to stdout as it is found.

    Args:
        paths: Iterable of file-system paths (typically directories) to walk.
               Paths that do not exist are reported and skipped.

    Returns:
        A list of ``(duplicate_path, original_path)`` tuples, where
        ``original_path`` is the first file seen with that content.
    """
    hashes = {}  # {sha1_hash: first_file_path}
    duplicates = []
    for path in paths:
        if not os.path.exists(path):
            print(f"Skipping: {path} does not exist.")
            continue
        # os.walk recursively traverses directories
        for root, _, files in os.walk(path):
            for filename in files:
                full_path = os.path.join(root, filename)
                # Skip symlinks to avoid circularity or false positives
                if os.path.islink(full_path):
                    continue
                file_hash = get_sha1(full_path)
                # get_sha1 returns None for unreadable files; test the
                # sentinel explicitly rather than relying on truthiness.
                if file_hash is None:
                    continue
                if file_hash in hashes:
                    # Found a duplicate!
                    duplicates.append((full_path, hashes[file_hash]))
                    print(f"[DUPE] {full_path} == {hashes[file_hash]}")
                else:
                    hashes[file_hash] = full_path
    return duplicates
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Usage errors belong on stderr with a non-zero exit status so
        # shell scripts and CI can detect the misuse (the original printed
        # to stdout and exited 0).
        print("Usage: python3 dup_check.py <path1> [path2]", file=sys.stderr)
        sys.exit(1)
    # sys.argv[1:] captures all paths provided after the script name
    results = find_duplicates(sys.argv[1:])
    print(f"\nTotal duplicates found: {len(results)}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment