mirror of
https://github.com/nadimkobeissi/mkbsd.git
synced 2025-01-18 19:27:47 -05:00
refactor(mkbsd.py): replace async image download with synchronous requests for simplicity and add duplicate removal and zipping functionality
feat(mkbsd.py): add argparse for command-line options to zip downloads and remove duplicates chore: add requirements.txt for dependency management
This commit is contained in:
parent
82e50c64f0
commit
ebd7a47ebe
2 changed files with 165 additions and 65 deletions
229
mkbsd.py
229
mkbsd.py
|
@ -1,76 +1,175 @@
|
|||
# Licensed under the WTFPL License
|
||||
|
||||
import argparse
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import time
|
||||
import aiohttp
|
||||
import asyncio
|
||||
from urllib.parse import urlparse
|
||||
url = 'https://storage.googleapis.com/panels-api/data/20240916/media-1a-i-p~s'
|
||||
import re
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from urllib.parse import unquote
|
||||
|
||||
async def delay(ms):
    """Suspend the calling coroutine for ``ms`` milliseconds."""
    # asyncio.sleep takes seconds, so convert from milliseconds first.
    seconds = ms / 1000
    await asyncio.sleep(seconds)
|
||||
import imagehash
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
async def download_image(session, image_url, file_path):
|
||||
|
||||
# Usage: python mkbsd.py [--zip] [--zip-name CUSTOM_NAME] [--remove-duplicates]
|
||||
|
||||
|
||||
def fetch_json_data(url, timeout=30):
    """Fetch ``url`` over HTTP and return its body decoded as JSON.

    Args:
        url: Address of the JSON document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The decoded JSON payload.

    Raises:
        Exception: If the response status code is not 200. Errors raised by
            ``requests`` itself (connection failure, timeout, invalid JSON)
            propagate unchanged.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            f"Failed to fetch JSON data. Status code: {response.status_code}"
        )
    return response.json()
|
||||
|
||||
|
||||
def extract_urls(element):
    """Recursively collect every string stored under a ``"url"`` key.

    Dicts and lists are walked depth-first in iteration order; any other
    value contributes nothing.

    Args:
        element: A decoded JSON value (dict, list, or scalar).

    Returns:
        A list of URL strings in the order they were encountered.
    """
    urls = []
    if isinstance(element, dict):
        for key, value in element.items():
            # Only strings are usable as URLs. A non-string value under
            # "url" (None, a nested object, ...) is descended into instead of
            # being collected, so it cannot crash the downloader later.
            if key == "url" and isinstance(value, str):
                urls.append(value)
            else:
                urls.extend(extract_urls(value))
    elif isinstance(element, list):
        for item in element:
            urls.extend(extract_urls(item))
    return urls
|
||||
|
||||
|
||||
def download_file(url, timeout=30):
    """Download ``url`` into the ``downloads`` directory unless already there.

    The target name is the percent-decoded last path segment of the URL
    (query string dropped), passed through ``clean_filename``.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the file inside ``downloads``, or ``None`` when the
        download failed (the failure is printed, not raised).
    """
    file_name = os.path.basename(unquote(url.split("?")[0]))
    file_name = clean_filename(file_name)
    file_path = os.path.join("downloads", file_name)

    if os.path.exists(file_path):
        print(f"Skipping {url}")
        return file_path

    print(f"Downloading {url}")
    # Stream into a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that later runs would skip.
    partial_path = file_path + ".part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Do not save an HTTP error page under an image file name.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, file_path)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written, or it is already gone
        return None
    return file_path
|
||||
|
||||
|
||||
def clean_filename(filename):
    """Turn ``filename`` into a name that is safe to create on disk.

    ``~`` becomes a space, characters that are forbidden in file names on
    common platforms become ``_``, and every run of whitespace and/or
    underscores is then collapsed to a single space. Leading and trailing
    whitespace is stripped.

    Args:
        filename: Raw file name, e.g. the last segment of a URL path.

    Returns:
        The sanitized name (may be empty if nothing usable remains).
    """
    sanitized_name = filename.replace("~", " ")
    # Control characters (e.g. NUL from a percent-decoded "%00") make open()
    # fail, so they are replaced like the other forbidden characters.
    sanitized_name = re.sub(r'[<>:"/\\|?*\x00-\x1f\x7f]', "_", sanitized_name)
    sanitized_name = re.sub(r"[\s_]+", " ", sanitized_name).strip()
    return sanitized_name
|
||||
|
||||
|
||||
def zip_directory(path, zip_name):
    """Write every file below ``path`` into a deflate-compressed zip archive.

    Entries are stored under their path relative to ``path``. Directories and
    files are visited in sorted order so the archive layout is reproducible.

    Args:
        path: Directory whose contents are archived.
        zip_name: Path of the zip file to create (overwritten if present).
    """
    archive_path = os.path.abspath(zip_name)
    with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(path):
            dirs.sort()  # in-place so os.walk descends in sorted order
            for file in sorted(files):
                file_path = os.path.join(root, file)
                # The archive may live inside the directory being zipped;
                # never add the (still open) archive to itself.
                if os.path.abspath(file_path) == archive_path:
                    continue
                arcname = os.path.relpath(file_path, path)
                zipf.write(file_path, arcname)
    print(f"Created zip file: {zip_name}")
|
||||
|
||||
|
||||
def compute_hash(filepath):
    """Compute the perceptual hash of the image stored at ``filepath``.

    Args:
        filepath: Path of the image file to open.

    Returns:
        A ``(hash, filepath)`` tuple, where ``hash`` is the result of
        ``imagehash.phash`` with ``hash_size=8``, or ``None`` when the file
        could not be opened or hashed (the error is printed).
    """
    # Leftover statements from the removed async downloader (session.get /
    # await / a second error print) do not belong to this function and are
    # dropped; they referenced names that are undefined here.
    try:
        with Image.open(filepath) as img:
            return imagehash.phash(img, hash_size=8), filepath
    except Exception as e:
        print(f"Error processing {filepath}: {e}")
        return None
|
||||
|
||||
async def main():
|
||||
|
||||
def find_duplicate_images(directory, threshold=2):
    """Find pairs of images in ``directory`` that share a perceptual hash.

    Only files directly inside ``directory`` whose names end in ``.jpg``,
    ``.jpeg`` or ``.png`` (case-insensitive) are considered. Files are hashed
    in a process pool via ``compute_hash``; files that fail to hash are
    ignored.

    Args:
        directory: Directory to scan (not recursive).
        threshold: Accepted for interface compatibility; the body does not
            consult it, so only images with identical hashes are paired.

    Returns:
        A list of ``(path_a, path_b)`` tuples, one for every pair of files
        with the same hash. Within a pair the files keep their
        ascending-by-size order.
    """
    # Local import: the module header does not import itertools.
    from itertools import combinations

    image_files = [
        os.path.join(directory, f)
        for f in os.listdir(directory)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ]
    if not image_files:
        # Nothing to hash; avoid spawning worker processes for no work.
        return []

    image_files.sort(key=os.path.getsize)

    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(compute_hash, image_files)

    hash_groups = defaultdict(list)
    for hash_value, filepath in filter(None, results):
        hash_groups[hash_value].append(filepath)

    duplicates = []
    for filepaths in hash_groups.values():
        # combinations() yields the same (i, j) with i < j ordering as a
        # nested index loop, and nothing for single-member groups.
        duplicates.extend(combinations(filepaths, 2))

    return duplicates
|
||||
|
||||
|
||||
def remove_duplicates(duplicates):
    """Delete one file from each pair of duplicate images.

    For every pair the smaller file is removed; when sizes are equal the
    second file of the pair is removed. Failures are printed and do not stop
    processing of the remaining pairs.

    Args:
        duplicates: Iterable of ``(path_a, path_b)`` tuples.
    """
    removed = set()
    for image1, image2 in duplicates:
        # A group of three or more duplicates produces pairs that mention a
        # file deleted by an earlier pair; skip those instead of failing on
        # the missing path and printing a spurious error.
        if image1 in removed or image2 in removed:
            continue
        try:
            if os.path.getsize(image1) < os.path.getsize(image2):
                victim = image1
            else:
                victim = image2
            os.remove(victim)
        except OSError as e:
            print(f"Error removing duplicate: {e}")
        else:
            removed.add(victim)
            print(f"Removed duplicate: {victim}")
|
||||
|
||||
|
||||
def main():
    """Download every image referenced by the JSON index, then post-process.

    Command-line options:
        --zip                 Zip the ``downloads`` directory afterwards.
        --zip-name NAME       Archive name (default ``downloads.zip``); a
                              ``.zip`` suffix is appended when missing.
        --remove-duplicates   Delete perceptually identical images after the
                              download step.

    A failure to fetch the JSON index is printed and ends the run. A failure
    of an individual download is printed and the run continues.
    """
    parser = argparse.ArgumentParser(
        description="Download images from JSON data and remove duplicates."
    )
    parser.add_argument(
        "--zip", action="store_true", help="Create a zip file of the downloaded images"
    )
    parser.add_argument(
        "--zip-name",
        type=str,
        help="Custom name for the zip file (default: downloads.zip)",
    )
    parser.add_argument(
        "--remove-duplicates",
        action="store_true",
        help="Remove duplicate images after download",
    )
    args = parser.parse_args()

    json_url = "https://storage.googleapis.com/panels-cdn/data/20240730/all.json"
    # The statements of the removed aiohttp-based implementation that were
    # interleaved here are dropped; only the synchronous flow remains.
    try:
        json_data = fetch_json_data(json_url)
    except Exception as e:
        print(f"Error: {e}")
        return

    urls = extract_urls(json_data)
    print(f"Found {len(urls)} URLs")

    os.makedirs("downloads", exist_ok=True)

    with ThreadPoolExecutor(max_workers=10) as executor:
        pending = [(url, executor.submit(download_file, url)) for url in urls]
        # Collect every result so an exception raised inside a worker is
        # reported instead of being silently discarded.
        for url, future in pending:
            try:
                future.result()
            except Exception as e:
                print(f"Error downloading {url}: {e}")

    if args.remove_duplicates:
        print("Searching for duplicate images...")
        duplicates = find_duplicate_images("downloads")
        if duplicates:
            print(f"Found {len(duplicates)} pairs of duplicate images.")
            remove_duplicates(duplicates)
        else:
            print("No duplicate images found.")

    if args.zip:
        zip_name = args.zip_name if args.zip_name else "downloads.zip"
        if not zip_name.endswith(".zip"):
            zip_name += ".zip"
        zip_directory("downloads", zip_name)
|
||||
|
||||
def ascii_art():
    """Print the MKBSD banner, a blank line, and the start-up message."""
    # NOTE(review): runs of spaces inside the banner look collapsed in this
    # copy of the file - confirm the column alignment against upstream.
    banner = """
/$$ /$$ /$$ /$$ /$$$$$$$ /$$$$$$ /$$$$$$$
| $$$ /$$$| $$ /$$/| $$__ $$ /$$__ $$| $$__ $$
| $$$$ /$$$$| $$ /$$/ | $$ \\ $$| $$ \\__/| $$ \\ $$
| $$ $$/$$ $$| $$$$$/ | $$$$$$$ | $$$$$$ | $$ | $$
| $$ $$$| $$| $$ $$ | $$__ $$ \\____ $$| $$ | $$
| $$\\ $ | $$| $$\\ $$ | $$ \\ $$ /$$ \\ $$| $$ | $$
| $$ \\/ | $$| $$ \\ $$| $$$$$$$/| $$$$$$/| $$$$$$$/
|__/ |__/|__/ \\__/|_______/ \\______/ |_______/"""
    start_message = (
        "🤑 Starting downloads from your favorite sellout grifter's wallpaper app..."
    )
    print(banner)
    print("")
    print(start_message)
|
||||
|
||||
if __name__ == "__main__":
    ascii_art()
    # Presumably a pause so the banner can be read before output scrolls.
    time.sleep(5)
    # main() is a plain synchronous function, so it is called directly;
    # passing its return value to asyncio.run() would raise.
    main()
|
||||
|
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
|
@ -0,0 +1 @@
|
|||
imagehash
|
Loading…
Reference in a new issue