Update mkbsd.py

Refactor the image download script to track and skip previously downloaded files, based on a unique key and consistent file naming

- Added logic to use the unique key from the JSON data for consistent filenames and tracking
- Improved the file existence check to skip downloading files that already exist in the directory
- Updated the JSON list to store the keys of downloaded files for persistent tracking across runs
- Incorporated a delay between downloads for smoother processing
- Cleaned up old redundant code and ensured consistency in file naming and tracking
This commit is contained in:
Nabil Mohammed Nalakath 2024-09-29 02:18:54 +05:30 committed by GitHub
parent b87195fa08
commit 9b1264c54a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,11 +1,15 @@
# Licensed under the WTFPL License # Licensed under the WTFPL License
import os import os
import json
import time import time
import aiohttp import aiohttp
import asyncio import asyncio
from urllib.parse import urlparse from urllib.parse import urlparse, urlsplit
import hashlib
url = 'https://storage.googleapis.com/panels-api/data/20240916/media-1a-i-p~s' url = 'https://storage.googleapis.com/panels-api/data/20240916/media-1a-i-p~s'
downloaded_list_path = 'downloadedList.json'
async def delay(ms): async def delay(ms):
await asyncio.sleep(ms / 1000) await asyncio.sleep(ms / 1000)
@ -21,15 +25,40 @@ async def download_image(session, image_url, file_path):
except Exception as e: except Exception as e:
print(f"Error downloading image: {str(e)}") print(f"Error downloading image: {str(e)}")
def extract_name_from_url(url):
    """Build a filename stem of the form ``<prefix>-<base_name>`` from an image URL.

    The prefix comes from the first path segment that starts with ``a~``: the
    text between the first ``~`` and the next ``_`` (or the end of the
    segment), lowercased. If no such segment exists, the prefix is
    ``'unknown'``.

    The base name is the last path component, cut at the first ``.`` and then
    at the first ``~``, with every non-alphanumeric character removed, and
    lowercased.

    Args:
        url: The image URL as a string.

    Returns:
        The ``"<prefix>-<base_name>"`` string, or the MD5 hex digest of the
        URL if anything goes wrong while parsing it.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import re

    try:
        path = urlsplit(url).path
        name_with_extension = os.path.basename(path)
        # urlsplit() already drops the query string; this split is kept as a
        # harmless safeguard.
        name_without_query = name_with_extension.split('?')[0]

        # Get prefix (e.g., 'hytha', 'outrunyouth', etc.)
        prefix_part = next(
            (part for part in path.split('/') if part.startswith('a~')),
            None,
        )
        prefix = prefix_part.split('~')[1].split('_')[0].lower() if prefix_part else 'unknown'

        # Get base name. str.replace() treats its argument as a literal
        # substring, so the character-class pattern must go through re.sub()
        # to actually strip non-alphanumeric characters.
        stem = name_without_query.split('.')[0].split('~')[0]
        base_name = re.sub(r'[^a-zA-Z0-9]+', '', stem).lower()

        return f"{prefix}-{base_name}"
    except Exception as e:
        # Best-effort fallback: a stable name derived from the URL itself.
        print(f"Error extracting name from URL: {str(e)}")
        return hashlib.md5(url.encode()).hexdigest()
async def main(): async def main():
try: try:
# Load existing downloaded list
if os.path.exists(downloaded_list_path):
with open(downloaded_list_path, 'r') as f:
downloaded_list = json.load(f)
else:
downloaded_list = []
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.get(url) as response: async with session.get(url) as response:
if response.status != 200: if response.status != 200:
raise Exception(f"⛔ Failed to fetch JSON file: {response.status}") raise Exception(f"⛔ Failed to fetch JSON file: {response.status}")
json_data = await response.json() json_data = await response.json()
data = json_data.get('data') data = json_data.get('data')
if not data: if not data:
raise Exception('⛔ JSON does not have a "data" property at its root.') raise Exception('⛔ JSON does not have a "data" property at its root.')
@ -38,21 +67,42 @@ async def main():
os.makedirs(download_dir) os.makedirs(download_dir)
print(f"📁 Created directory: {download_dir}") print(f"📁 Created directory: {download_dir}")
file_index = 1 downloaded_count = 0
skipped_count = 0
for key, subproperty in data.items(): for key, subproperty in data.items():
if subproperty and subproperty.get('dhd'): if subproperty and subproperty.get('dhd'):
image_url = subproperty['dhd'] image_url = subproperty['dhd']
print(f"🔍 Found image URL!") image_name = f"{extract_name_from_url(image_url)}-{key}"
parsed_url = urlparse(image_url) ext = os.path.splitext(urlparse(image_url).path)[-1] or '.jpg'
ext = os.path.splitext(parsed_url.path)[-1] or '.jpg' file_path = os.path.join(download_dir, f"{image_name}{ext}")
filename = f"{file_index}{ext}"
file_path = os.path.join(download_dir, filename)
await download_image(session, image_url, file_path) # Check if file already exists
print(f"🖼️ Saved image to {file_path}") if os.path.exists(file_path):
if key not in downloaded_list:
downloaded_list.append(key)
print(f"✅ Found existing file, added key to list: {file_path}")
with open(downloaded_list_path, 'w') as f:
json.dump(downloaded_list, f, indent=2)
skipped_count += 1
else:
# Download the image if it doesn't exist
downloaded_count += 1
print(f"🔍 Found new image URL: {image_url}")
file_index += 1 await download_image(session, image_url, file_path)
await delay(250) print(f"🖼️ Saved image to {file_path}")
# Add key to downloaded list
downloaded_list.append(key)
with open(downloaded_list_path, 'w') as f:
json.dump(downloaded_list, f, indent=2)
print(f"📄 Updated downloaded list with key: {key}")
await delay(250)
print(f"🚀 Downloaded {downloaded_count} new images")
print(f"✅ Skipped {skipped_count} images that already exist")
except Exception as e: except Exception as e:
print(f"Error: {str(e)}") print(f"Error: {str(e)}")