Pitch estimation is the process of determining the fundamental frequency of a sound signal.
Pitch Estimation using the Fourier Transform
To estimate the pitch of a sound using the Fourier Transform, the first step is to take a short window of the sound signal and apply the Fourier Transform to it. The resulting frequency-domain representation is then analyzed to identify the frequency with the highest amplitude. This frequency is considered to be the fundamental frequency, or pitch, of the sound.
It's important to note that this method is not always accurate and can be affected by various factors such as noise, multiple sources, and non-harmonic sounds. Also, it's sensitive to the window size and windowing function used.
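As a minimal sketch of this idea (assuming a mono frame x stored as a column vector and its sampling rate fs; both names are placeholders, not part of the activity code below), the dominant frequency of a single window can be estimated as follows:
% Minimal sketch: pick the strongest frequency bin of one window
L = length(x); % x: a short mono frame (placeholder)
X = abs(fft(x))/L; % magnitude spectrum
X = X(1:floor(L/2)+1); % keep the one-sided (non-negative frequency) part
f = (0:floor(L/2))*(fs/L); % frequency axis in Hz
[~,k] = max(X); % bin with the highest amplitude
f0 = f(k) % taken as the pitch estimate of the window
As noted above, the strongest bin may be the DC component or a harmonic rather than the true fundamental, so this estimate should be treated with care.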
Pitch Estimation using the Autocorrelation Method
The first step is to divide the given speech signal into 30-40 ms frames. The autocorrelation sequence of each frame is then computed. The pitch period is the time lag corresponding to the largest peak away from the central (zero-lag) peak of the autocorrelation sequence, and the pitch is its reciprocal.
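A minimal sketch of this per-frame computation (assuming a mono frame x at sampling rate fs, that xcorr from the Signal Processing Toolbox is available, and a 50-400 Hz pitch range; these are assumptions, not part of the activity code below):
% Minimal sketch: pitch of one frame from its autocorrelation
r = xcorr(x); % lags -(N-1)..(N-1), central peak at index N
r = r(length(x):end); % keep non-negative lags; r(1) is lag 0
min_lag = round(fs/400); % skip lags shorter than the highest expected pitch
[~,k] = max(r(min_lag+1:end)); % largest peak away from the central peak
lag = min_lag + k - 1; % pitch period in samples
f0 = fs/lag % pitch estimate in Hz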
Activity 3.1
Pitch Estimation using the Fourier Transform
[y,fs] = audioread("cirk-barabannaya-drob-i-marsh_-silachi.mp3");
y = y(:,1); % Keep only the first channel (mono)
dt = 1/fs;
t = 0:dt:(length(y)*dt)-dt;
figure()
plot(t,y);
title('Song Signal');
xlabel('Time (s)');
ylabel('Amplitude');
figure()
sample_array = 1:length(y);
plot(sample_array,y);
% Break the signal into non-overlapping analysis windows
window_time = 0.5; % window length in seconds
window_samples = round(fs*window_time); % window length in samples
n_windows = floor(length(sample_array)/window_samples);
M(n_windows) = struct('cdata',[],'colormap',[]);
h = figure;
h.Visible = 'on';
aviobj = VideoWriter('myMovie_2.avi');
open(aviobj)
pitch_arr = zeros(1,n_windows);
for i=1:n_windows
filtered_samples = (i-1)*window_samples+1 : i*window_samples; % samples of the i-th non-overlapping window
filtered_signal = y(filtered_samples);
% Fourier Transform
L = size(filtered_signal,1);
Fn = fs/2;
FTy = fft(filtered_signal)/L;
Fv = linspace(0, 1, fix(L/2)+1)*Fn;
Iv = 1:numel(Fv);
[~,max_index] = max(abs(FTy(Iv,:))*2);
max_frequency = Fv(max_index(1));
pitch_arr(i) = max_frequency;
%figure()
subplot(3,1,1)
plot(sample_array,y);
yl = ylim;
xl = xlim;
xBox = [(i-1)*window_samples+1, i*window_samples, i*window_samples, (i-1)*window_samples+1];
yBox = [yl(1),yl(1),yl(2),yl(2)];
patch(xBox, yBox, 'black', 'FaceColor', 'black', 'FaceAlpha', 0.5);
xlabel('Samples')
ylabel('Amplitude')
title("Non Overlapping Window")
grid on
subplot(3,1,2)
t_sample = 0:dt:(length(filtered_signal)*dt)-dt;
plot(t_sample,filtered_signal);
xlabel('Time')
ylabel('Amplitude')
title("Amplitude Vartion of Window")
grid on
subplot(3,1,3)
stem(Fv, abs(FTy(Iv,:))*2);
dim = [.7 .7 .3 .3];
annotate_txt = sprintf("Pitch : %6.0f Hz",max_frequency);
t = annotation('textbox',dim,'String', ...
annotate_txt,'FitBoxToText','on');
xlabel('Frequency')
ylabel('Amplitude')
title("Frequency Vartion of Window")
grid on
t.Color = "white";
t.BackgroundColor = "black";
drawnow
pause(0.1)
M(i) = getframe;
frame = getframe(gcf);
writeVideo(aviobj,frame);
end
close(aviobj);
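Once the loop has filled pitch_arr, the per-window estimates can also be viewed as a pitch track over time (this plot is an optional addition and is not part of the recorded movie):
% Plot the estimated pitch of each non-overlapping window against time
window_centres = ((1:n_windows)-0.5)*window_time; % window centres in seconds
figure()
stem(window_centres, pitch_arr)
xlabel('Time (s)')
ylabel('Estimated pitch (Hz)')
title('Pitch Estimate per Window')
grid on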
Activity 3.2
Pitch Estimation using the Autocorrelation Method
[y,Fs]= audioread('cirk-barabannaya-drob-i-marsh_-silachi.mp3');
y = y(1:round(0.03*Fs),1); % take a ~30 ms mono segment, matching the frame size described above
max_value = max(abs(y));
y=y/max_value;
t=(1/Fs:1/Fs:(length(y)/Fs))*1000;
subplot(2,1,1);
plot(t,y);
title('A 30 millisecond segment of speech')
xlabel('time in milliseconds');
% Compute the autocorrelation sequence: autocor(l+1) = sum over u of y(u)*y(u+l)
autocor = zeros(1,length(y)); % preallocate
for l=0:(length(y)-1)
sum1=0;
for u=1:(length(y)-l)
sum1 = sum1 + y(u)*y(u+l);
end
autocor(l+1)=sum1;
end
kk=(1/Fs:1/Fs:(length(autocor)/Fs))*1000;
subplot(2,1,2);
plot(kk,autocor);
title('Autocorrelation of the 30 millisecond segment of speech')
xlabel('time in milliseconds');
% Search for the autocorrelation peak within a plausible pitch range
% (lags between Fs/400 and Fs/50 samples, i.e. pitches between 50 Hz and 400 Hz)
min_lag = round(Fs/400);
max_lag = round(Fs/50);
auto = autocor(min_lag+1:max_lag+1); % autocor(l+1) holds lag l
[max1, sample_no] = max(auto);
lag_samples = min_lag + sample_no - 1; % lag of the largest peak away from lag 0
pitch_period_To = lag_samples*(1/Fs)
pitch_freq_Fo = 1/pitch_period_To