!pip install mediapipe
import math
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
import cv2
from scipy.io import wavfile
from google.colab.patches import cv2_imshow
import mediapipe as mp
import matplotlib.pyplot as plt
import matplotlib as mpl
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cap = cv2.VideoCapture('/content/drive/MyDrive/Project Docments/lip sync/1.mp4')
mpDraw = mp.solutions.drawing_utils
mpFaceMesh = mp.solutions.face_mesh
faceMesh = mpFaceMesh.FaceMesh(max_num_faces=1)
X_data = []
Y_data = []
Z_data = []
while True:
    success, img = cap.read()
    if not success:   # stop when the video runs out of frames
        break
    # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = faceMesh.process(img)
    if results.multi_face_landmarks:
        for faceLms in results.multi_face_landmarks:
            mpDraw.draw_landmarks(img, faceLms)
            # tempx, tempy, tempz = [], [], []
            # for lm in faceLms.landmark:
            #     tempx.append(lm.x)
            #     tempy.append(lm.y)
            #     tempz.append(lm.z)
            # X_data.append(tempx)
            # Y_data.append(tempy)
            # Z_data.append(tempz)
            # print(np.array(X_data).shape)
    cv2_imshow(img)
    # print(img.shape)
    cv2.waitKey(1)
X_data = []
Y_data = []
Z_data = []
aud = []
for files in vid_ids:                    # vid_ids: shared base names of the video/audio pairs
    video = vid_dir + '/' + files + '.mp4'
    audio = aud_dir + '/' + files + '.wav'
    cap = cv2.VideoCapture(video)        # reading the video
    Fs, audio = wavfile.read(audio)      # reading the audio
    # finding the fps
    (major_ver, minor_ver, subminor_ver) = cv2.__version__.split('.')
    if int(major_ver) < 3:
        fps = cap.get(cv2.cv.CV_CAP_PROP_FPS)
    else:
        fps = cap.get(cv2.CAP_PROP_FPS)
    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # creating face points
    for i in range(length):
        success, img = cap.read()
        if not success:
            break
        results = faceMesh.process(img)
        if results.multi_face_landmarks:
            tempx = []
            tempy = []
            tempz = []
            for faceLms in results.multi_face_landmarks:
                for lm in faceLms.landmark:
                    tempx.append(lm.x)
                    tempy.append(lm.y)
                    tempz.append(lm.z)
            X_data.append(tempx)
            Y_data.append(tempy)
            Z_data.append(tempz)
    cap.release()
    # creating audio: keep the first channel and concatenate across files
    aud = np.concatenate((aud, audio.T[0]))
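Since all files share the same fps and duration, each video frame spans a fixed number of audio samples. A hypothetical alignment step (not in the original code; samples_per_frame and frame_audio are illustrative names) would slice the waveform per frame:

samples_per_frame = int(Fs / fps)       # audio samples spanned by one video frame
frame_audio = [aud[i * samples_per_frame : (i + 1) * samples_per_frame]
               for i in range(len(X_data))]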
vid_dir ----->
    '1.mp4'
    '2.mp4'
    '3.mp4'
    etc...
aud_dir ----->
    '1.wav'
    '2.wav'
    '3.wav'
    etc...
Here 1, 2, 3 are the file names. You can name them anything you prefer, but make sure each video has a corresponding audio file with exactly the same name. All files should have the same fps and the same duration.
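The helper get_unique is called below but never defined in this section. A minimal sketch, assuming it is meant to flatten MediaPipe's connection sets (frozensets of (start, end) landmark-index pairs) into a sorted list of unique landmark indices:

def get_unique(connections):
    # flatten the (start, end) pairs and keep each landmark index once, in order
    return sorted({index for pair in connections for index in pair})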
mpDraw = mp.solutions.drawing_utils
mpFaceMesh = mp.solutions.face_mesh
connection_lips = get_unique(mpFaceMesh.FACEMESH_LIPS)
connection_face = get_unique(mpFaceMesh.FACEMESH_FACE_OVAL)
with mpFaceMesh.FaceMesh(static_image_mode=True, max_num_faces=1, refine_landmarks=True, min_detection_confidence=0.5) as faceMesh:
    success, img = cap.read()
    results = faceMesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    for face_landmark in results.multi_face_landmarks:
        lms = face_landmark.landmark
        d = {}
        for index in connection_lips:
            x = int(lms[index].x * img.shape[1])   # landmarks are normalised; scale to pixels
            y = int(lms[index].y * img.shape[0])
            d[index] = (x, y)
        # draw the lip points
        for index in connection_lips:
            cv2.circle(img, (d[index][0], d[index][1]), 2, (0, 255, 0), -1)
    cv2_imshow(img)
# for image face points: show the 36 face-oval landmarks one at a time to pick anchor points
mpDraw = mp.solutions.drawing_utils
mpFaceMesh = mp.solutions.face_mesh
connection_lips = get_unique(mpFaceMesh.FACEMESH_LIPS)
connection_face = get_unique(mpFaceMesh.FACEMESH_FACE_OVAL)
print(connection_lips)
# connection_face2 = [connection_face[0], connection_face[3], connection_face[6], connection_face[10], connection_face[12], connection_face[20], connection_face[24], connection_face[27], connection_face[29], connection_face[32]]
with mpFaceMesh.FaceMesh(static_image_mode=True, max_num_faces=1, refine_landmarks=True, min_detection_confidence=0.5) as faceMesh:
    for i in range(36):
        success, img = cap.read()
        results = faceMesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        for face_landmark in results.multi_face_landmarks:
            lms = face_landmark.landmark
            d = {}
            for index in connection_face:
                x = int(lms[index].x * img.shape[1])
                y = int(lms[index].y * img.shape[0])
                d[index] = (x, y)
                cv2.circle(img, (x, y), 2, (255, 0, 0), -1)   # every oval point in blue
            index = connection_face[i]                         # highlight the i-th point in green
            x = int(lms[index].x * img.shape[1])
            y = int(lms[index].y * img.shape[0])
            d[index] = (x, y)
            cv2.circle(img, (x, y), 2, (0, 255, 0), -1)
        print(i)
        cv2_imshow(img)
# for image face points: lip landmarks relative to the midpoint of four face-oval anchors
mpDraw = mp.solutions.drawing_utils
mpFaceMesh = mp.solutions.face_mesh
connection_lips = get_unique(mpFaceMesh.FACEMESH_LIPS)
connection_face = get_unique(mpFaceMesh.FACEMESH_FACE_OVAL)
print(connection_lips)
facepoints = [connection_face[3], connection_face[10], connection_face[20], connection_face[24]]
with mpFaceMesh.FaceMesh(static_image_mode=True, max_num_faces=1, refine_landmarks=True, min_detection_confidence=0.5) as faceMesh:
    success, img = cap.read()
    results = faceMesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    for face_landmark in results.multi_face_landmarks:
        lms = face_landmark.landmark
        x1, y1 = midpoint(lms[facepoints[0]], lms[facepoints[1]], lms[facepoints[2]], lms[facepoints[3]], img)
        d = {}
        for index in connection_lips:
            x = int(lms[index].x * img.shape[1] - x1)   # lip point relative to the face midpoint
            y = int(lms[index].y * img.shape[0] - y1)
            d[index] = (x, y)
            cv2.circle(img, (x, y), 2, (255, 0, 0), -1)
        cv2.circle(img, (x1, y1), 2, (0, 255, 0), -1)   # the midpoint itself in green
    cv2_imshow(img)
def midpoint(point1, point2, point3, point4, img):
    # pixel-space centre of four normalised landmarks
    x = int((point1.x * img.shape[1] + point2.x * img.shape[1] + point3.x * img.shape[1] + point4.x * img.shape[1]) / 4)
    y = int((point1.y * img.shape[0] + point2.y * img.shape[0] + point3.y * img.shape[0] + point4.y * img.shape[0]) / 4)
    return x, y

def angleTheta(x, y):
    # direction of (x, y) from the origin, in radians
    return math.atan2(y, x)

def Facetransform(x0, y0, theta):
    # rotate (x0, y0) so the direction theta maps onto the +y axis,
    # i.e. align the face axis vertically
    x1 = int(x0 * math.sin(theta) - y0 * math.cos(theta))
    y1 = int(x0 * math.cos(theta) + y0 * math.sin(theta))
    return x1, y1
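A quick check of what Facetransform does (the values here are just an illustrative example): angleTheta gives the direction of a point from the origin, and rotating by that angle maps the point onto the +y axis, up to integer truncation.

dx, dy = 30, 40
theta = angleTheta(dx, dy)
print(Facetransform(dx, dy, theta))   # approximately (0, 50): the reference direction becomes "straight down"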
# for image face points: lips in the face-aligned coordinate system (translate + rotate)
mpDraw = mp.solutions.drawing_utils
mpFaceMesh = mp.solutions.face_mesh
connection_lips = get_unique(mpFaceMesh.FACEMESH_LIPS)
connection_face = get_unique(mpFaceMesh.FACEMESH_FACE_OVAL)
print(connection_lips)
facepoints = [connection_face[3], connection_face[10], connection_face[20], connection_face[24]]
with mpFaceMesh.FaceMesh(static_image_mode=True, max_num_faces=1, refine_landmarks=True, min_detection_confidence=0.5) as faceMesh:
    success, img = cap.read()
    results = faceMesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    for face_landmark in results.multi_face_landmarks:
        lms = face_landmark.landmark
        x1, y1 = midpoint(lms[facepoints[0]], lms[facepoints[1]], lms[facepoints[2]], lms[facepoints[3]], img)
        theta = angleTheta(int(lms[facepoints[1]].x * img.shape[1] - x1), int(lms[facepoints[1]].y * img.shape[0] - y1))
        d = {}
        for index in connection_lips:
            x = int(lms[index].x * img.shape[1] - x1)   # translate to the face midpoint
            y = int(lms[index].y * img.shape[0] - y1)
            x, y = Facetransform(x, y, theta)           # rotate into the face-aligned frame
            x = int(x + x1)                             # translate back for display
            y = int(y + y1)
            d[index] = (x, y)
            cv2.circle(img, (x, y), 2, (255, 0, 0), -1)
        print(math.tan(theta))
        cv2.circle(img, (x1, y1), 2, (0, 255, 0), -1)
    cv2_imshow(img)
Here is the lip movement relative to the face. The lip points have been transformed into the coordinate system defined on the face, so when they are plotted again the lips correspond to an upright face rather than the tilted face in the frame. In other words, this is the lip motion expressed relative to the face.
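For training, the face-aligned lip coordinates of each frame can be flattened into one fixed-length vector, matching the audio-to-landmark setup in the references below. A minimal sketch reusing d and connection_lips from the block above (lip_features is an illustrative name):

lip_features = []
for index in connection_lips:
    lip_features.extend(d[index])      # append (x, y) for each lip landmark
lip_features = np.array(lip_features)  # one frame -> one feature vector
print(lip_features.shape)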
References:
https://blog.floydhub.com/a-beginners-guide-on-recurrent-neural-networks-with-pytorch/
https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
https://discuss.pytorch.org/t/one-to-many-lstm/96932/2
http://grail.cs.washington.edu/projects/AudioToObama/siggraph17_obama.pdf
https://google.github.io/mediapipe/solutions/face_mesh.html