"""Classify an object in an image with a pre-trained deep-learning model and
convert the top-ranked label into sound (spoken aloud)."""
import torch
from torchvision import models
from torchvision import transforms
from PIL import Image
from gtts import gTTS
from io import BytesIO
import pygame
# Many different deep learning vision models are available.
# Big models take a long time to load and evaluate; mobilenet_v2 is a good
# speed/accuracy trade-off for interactive use.
#net = torch.hub.load('pytorch/vision:v0.4.2', 'squeezenet1_0', pretrained=True)
#net = torch.hub.load('pytorch/vision:v0.4.2', 'shufflenet_v2_x1_0', pretrained=True)
net = torch.hub.load('pytorch/vision:v0.4.2', 'mobilenet_v2', pretrained=True)
#net = torch.hub.load('pytorch/vision:v0.4.2', 'resnext50_32x4d', pretrained=True)
#net = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x8d_wsl')
#net = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x16d_wsl')
net.eval()  # inference mode: disables dropout, uses batch-norm running stats

# Load the ImageNet class names, one label per line, index-aligned with the
# model's output logits.
with open('imagenet_classes.txt', encoding='utf-8') as f:
    labels = [line.strip() for line in f]
# Load the test image and force RGB so ToTensor always yields 3 channels
# (JPEGs can be grayscale or CMYK, which would break Normalize below).
pil_image = Image.open('junco.jpeg').convert('RGB')  # replace with a camera capture later

# Standard ImageNet preprocessing: resize the short side to 256, center-crop
# to 224x224, convert to a [0,1] float tensor, then normalize with the
# ImageNet channel statistics the model was trained with.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
img_t = transform(pil_image)
# Add a batch dimension: (3, 224, 224) -> (1, 3, 224, 224).
batch = torch.unsqueeze(img_t, 0)
# Run the forward pass without building the autograd graph — we only need
# inference, so this saves memory and time.
with torch.no_grad():
    out = net(batch)

# Convert raw logits to percentage confidences.
percentage = torch.nn.functional.softmax(out, dim=1)[0] * 100

# Report the five most confident classes.
_, indices = torch.sort(out, descending=True)
for idx in indices[0][:5]:
    print(labels[idx], percentage[idx].item())
# Speak the top-ranked label: synthesize speech as in-memory MP3 with gTTS,
# then play it through pygame's music mixer.
pygame.mixer.init()
mp3_fp = BytesIO()
# Use keyword arguments: in gTTS >= 2.1 the second positional parameter is
# `tld`, not `lang`, so gTTS(text, 'en') would silently misconfigure the
# request instead of selecting English.
tts = gTTS(text=labels[indices[0][0]], lang='en')
tts.write_to_fp(mp3_fp)
mp3_fp.seek(0)  # rewind so pygame reads the MP3 stream from the start
pygame.mixer.music.load(mp3_fp)
pygame.mixer.music.play()
# play() is asynchronous; block until playback finishes so the script does
# not exit (cutting the audio off) immediately.
while pygame.mixer.music.get_busy():
    pygame.time.wait(100)