import os
from PIL import Image
import imagehash
import matplotlib.pyplot as plt
import time
# Record the start time so the total runtime can be reported at the end of the script
start = time.time()
def get_image_hash(image_path):
    """Return the perceptual hash (pHash) of the image stored at ``image_path``.

    The image is opened inside a context manager, so the file handle is
    released as soon as the hash has been computed.
    """
    with Image.open(image_path) as picture:
        phash_value = imagehash.phash(picture)
    return phash_value
# Function to find duplicate images in a dataset
def find_duplicates(image_folder):
hashes = {} # Dictionary to store image hashes and their associated files
duplicates = [] # List to store paths of duplicate image pairs
count = 0 # Number of images trakced
# Iterate through the files in the USB
for image_filename in os.listdir(image_folder):
# Create full file path
image_path = os.path.join(image_folder, image_filename)
# Check if the file is an image (this is kinda optional, but makes it safe)
if image_filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
try:
image_hash = get_image_hash(image_path) # Calculate the hash for the current image
count = count+1
print(count)
except Exception as e:
print(f"Error processing image {image_filename}: {e}") # Skip file if there's an error
continue
# If the hash already exists, it's a duplicate
if image_hash in hashes:
duplicates.append((image_filename, hashes[image_hash])) # If the hash exists, add
else:
hashes[image_hash] = image_filename
return duplicates
def display_image_pairs(image_folder, duplicates):
    """Print the full paths of each duplicate pair.

    Every entry of ``duplicates`` is indexed at positions 0 and 1 to obtain
    two filenames, which are joined onto ``image_folder`` before printing.
    The side-by-side plot is kept below as commented-out demonstration code.
    """
    for pair in duplicates:
        paths = [os.path.join(image_folder, pair[idx]) for idx in (0, 1)]
        print("Duplicate: ", *paths)

        # Optional visual check - plot the two images side by side:
        # img1 = Image.open(paths[0])
        # img2 = Image.open(paths[1])
        # fig, axes = plt.subplots(1, 2, figsize=(10, 5))
        # axes[0].imshow(img1)
        # axes[0].set_title(f'Duplicate 1: {pair[0]}')
        # axes[0].axis('off')
        # axes[1].imshow(img2)
        # axes[1].set_title(f'Duplicate 2: {pair[1]}')
        # axes[1].axis('off')
        # plt.show()
# Root of the USB drive to scan (D:\) <-- this is the SPLASHSTICK
image_folder = 'D:\\'

# Collect the duplicate pairs, then either list them or report that none exist
duplicates = find_duplicates(image_folder)
if not duplicates:
    print("No duplicate images found.")
else:
    display_image_pairs(image_folder, duplicates)

# Report the time elapsed since `start` was recorded
end = time.time()
print("Runtime: ", end - start," seconds")