# See https://gist.github.com/maalrron/877b2edb23cc5d99d6a6b4c22f708e58 for more context.
# Run like: python dlMegascans.py && python validate_zips.py
import os
import zipfile
from pathlib import Path
import re
from multiprocessing import Pool

# EDIT THESE PATHS
# Root directory holding the downloaded Quixel zip archives.
ROOT_DIR = Path("./Quixel Zips").resolve()
# Directory that bad files get moved into; can be left unchanged.
TRASH_DIR = ROOT_DIR.with_name(ROOT_DIR.name + "_trash")
# Location of cache.txt (one asset id per line).
CACHEFILE = Path("cache.txt").resolve()
# Validation record generated by this script (ids that already passed checks).
VALIDATION_RECORD = Path("validated.txt").resolve()
###
def zip_is_valid(fname):
    """Return True if *fname* is a readable zip archive whose members pass CRC checks.

    Prints a diagnostic and returns False when the archive is corrupt or
    cannot be opened at all.
    """
    try:
        # Context manager closes the archive's file handle even on the
        # error paths (the original leaked the handle until GC).
        with zipfile.ZipFile(fname) as test_zip:
            # testzip() CRC-checks every member; returns the first bad name,
            # or None when everything checks out.
            if test_zip.testzip() is not None:
                print(f"'{fname}' is corrupt")
                return False
    except Exception as ex:
        # Broad on purpose: any failure to open/read means "not valid".
        print(f"Exception opening '{fname}':", ex)
        return False
    return True
def to_trash(fname):
    """Move *fname* out of ROOT_DIR into the same relative spot under TRASH_DIR."""
    source = Path(fname).resolve()
    # Preserve the directory layout of the original tree inside the trash tree.
    target = TRASH_DIR / source.relative_to(ROOT_DIR)
    target.parent.mkdir(parents=True, exist_ok=True)
    source.rename(target)
# Asset ids from the download cache, kept as a list in file order — the
# order matters when the cache is rewritten at the end of this script.
with open(CACHEFILE) as f:
    cache_ids = [line.strip() for line in f]

# Ids validated on a previous run.  A set: check_file() tests membership once
# per file (from 8 worker processes), so O(1) lookup beats the old list scan.
previously_validated_ids = set()
if VALIDATION_RECORD.exists():
    with open(VALIDATION_RECORD) as f:
        previously_validated_ids = {line.strip() for line in f}

# Collect every file under ROOT_DIR and count how many of them are zips.
files = []
zip_count = 0
for fpath in ROOT_DIR.glob("**/*"):
    if fpath.is_file():
        fname = str(fpath)
        files.append(fname)
        if fname.endswith(".zip"):
            zip_count += 1

# Megascans archives are named "<id>_<resolution>K_<type>_ms.zip".
# FIX: the '.' before "zip" is now escaped — unescaped it matched any character.
id_regex = re.compile(r'(?P<id>[a-zA-Z0-9]+)_(?P<resolution>\dK)_(?P<type>\w+)_ms\.zip')
def check_file(fname):
    """Validate a single file and return a ``(asset_id, is_valid)`` pair.

    ``asset_id`` is None when the file is not a zip or its name does not
    match the Megascans naming pattern.  Bad files are moved into the trash
    tree as a side effect.  Runs inside multiprocessing workers; it only
    reads the module-level globals, so no ``global`` declarations are needed.
    """
    if not fname.endswith(".zip"):
        print(f"'{fname}' is not a zip")
        to_trash(fname)
        return (None, False)

    match = id_regex.search(fname)
    if match is None:
        print(f"'{fname}' did not match regex")
        return (None, False)

    asset_id = match["id"]
    # Assets validated on a previous run skip the expensive checks entirely.
    if asset_id in previously_validated_ids:
        return (asset_id, True)

    # Files under 4 MiB are treated as broken/incomplete downloads.
    if Path(fname).stat().st_size < 4 * 1024 * 1024:
        to_trash(fname)
        return (asset_id, False)

    if not zip_is_valid(fname):
        to_trash(fname)
        return (asset_id, False)

    # Valid zip that the cache doesn't know about — report but keep it.
    if asset_id not in cache_ids:
        print(f"'{fname}', id {asset_id}, not in cache")
    return (asset_id, True)
# Trash non-zip files, files under 4 MiB, and zips that fail the validation
# test, fanning the per-file checks out over 8 worker processes.
# NOTE(review): this module-level Pool relies on fork-style process start;
# on a "spawn" platform (Windows/macOS default) the script would need an
# `if __name__ == "__main__":` guard — confirm the target platform.
bad_ids = set()
valid_zip_ids = set()
total_skipped = 0
i = 0
total_items = len(files)
# Emit a progress line roughly every 5% of the way through.
report_step = total_items * 5.0 / 100.0
# Open the validation record once for the whole run instead of re-opening it
# in append mode for every newly valid asset (the old per-asset open/close
# was pure overhead; flush() keeps the record durable mid-run).
with Pool(8) as pool, open(VALIDATION_RECORD, 'a') as record:
    for i, result in enumerate(pool.imap_unordered(check_file, files, chunksize=8)):
        (asset_id, is_valid) = result
        if asset_id is None:
            # Non-zip or unparseable name; check_file already handled it.
            continue
        elif is_valid:
            valid_zip_ids.add(asset_id)
            if asset_id not in previously_validated_ids:
                record.write(asset_id + '\n')
                record.flush()
            else:
                total_skipped += 1
        else:
            bad_ids.add(asset_id)
        j = i + 1
        # True exactly when j crosses a 5% boundary that i had not reached.
        if int(j / report_step) > int(i / report_step):
            print(f"completed {j}/{total_items}")

print(len(cache_ids), "lines in cache")
print(zip_count, "zips found")
print(f"found {len(bad_ids)} bad asset ids")
print(f"found {len(valid_zip_ids)} good asset ids")
print(f"was able to skip {total_skipped} assets that had been previously validated")
append_digit = 0


def bak_name(fpath, index=0):
    """Return *fpath* with a backup suffix: ".bak" for index <= 0, ".bakN" for index N > 0."""
    suffix = ".bak"
    if index > 0:
        suffix = f".bak{index}"
    return Path(f"{fpath}{suffix}")
# Back up the old cache under the first unused ".bak"/".bakN" name...
while bak_name(CACHEFILE, append_digit).exists():
    append_digit += 1
CACHEFILE.rename(bak_name(CACHEFILE, append_digit))

# ...then rewrite the cache keeping only ids whose zips validated,
# preserving the original line order.
with open(CACHEFILE, 'w') as f:
    f.writelines(item + '\n' for item in cache_ids if item in valid_zip_ids)