#code begins here
#importing relevant libraries
import numpy as np
import pandas as pd
import matplotlib . pyplot as plt
# Load the unemployment table from its comma-separated file.
datapandas = pd.read_csv('unemploymentrate.csv')
# Keep the raw cell values as a NumPy array so that rows and columns can be
# addressed by position.
datanumpy = datapandas.values
# The first 36 rows of column 4 are taken as the unemployment rate of each
# state / union territory.
DataUnemprate = datanumpy[:36, 4]
# Load the crime-rate table from its comma-separated file.
datapandas = pd.read_csv('crimerate.csv')
datanumpy = datapandas.values
# The crime file lists the states / union territories in a different order
# from the unemployment file: Delhi sits at a different position, and
# Uttarakhand and Uttar Pradesh are interchanged. Column 9 is therefore cut
# into pieces which are re-joined in the order of the unemployment file.
# Row 29 belongs to none of the pieces and is left out.
crime_column = datanumpy[:, 9]
DataCrimeTillChatt = crime_column[:5]
DataCrimeStateDelhi = crime_column[34:35]
DataCrimeTillTripura = crime_column[5:26]
DataCrimeUttarakhand = crime_column[27:28]
DataCrimeUttarpradesh = crime_column[26:27]
DataCrimeWestbengal = crime_column[28:29]
DataCrimeUT1 = crime_column[30:34]
DataCrimeUT2 = crime_column[35:37]
DataCrimerate = np.concatenate([
    DataCrimeTillChatt,
    DataCrimeStateDelhi,
    DataCrimeTillTripura,
    DataCrimeUttarakhand,
    DataCrimeUttarpradesh,
    DataCrimeWestbengal,
    DataCrimeUT1,
    DataCrimeUT2,
])
# Both arrays are now meant to follow the same state / UT ordering.
# N is the combined number of states and union territories (36).
N = 36
# 1. Sample means of the unemployment and crime rates.
# Each total is accumulated element by element over the first N entries.
SumOFunemp = sum(DataUnemprate[k] for k in range(N))
SumOFcrime = sum(DataCrimerate[k] for k in range(N))
meanOFunemp = SumOFunemp / N
print("mean of unemployment rate is ", meanOFunemp)
meanOFcrime = SumOFcrime / N
print("mean of crime rate is ", meanOFcrime)
# 2. Sample variance and standard deviation of both series.
# First add up the squared deviations of every entry from its mean.
sumU = sum((DataUnemprate[k] - meanOFunemp) ** 2 for k in range(N))
sumC = sum((DataCrimerate[k] - meanOFcrime) ** 2 for k in range(N))
# Dividing the sum of squares by N-1 (rather than N) gives the variance of
# the sample.
UnempVariance = sumU / (N - 1)
CrimeVariance = sumC / (N - 1)
# The sample standard deviation is the square root of the sample variance.
unempStandardD = UnempVariance ** 0.5
crimeStandardD = CrimeVariance ** 0.5
print("standard deviation of unemployment is", unempStandardD)
print("standard deviation of crime rate is", crimeStandardD)
# 3. Sample covariance of crime rate and unemployment rate: the sum of the
# products of the paired deviations from the means, divided by N-1.
Sumcov = sum(
    (DataCrimerate[k] - meanOFcrime) * (DataUnemprate[k] - meanOFunemp)
    for k in range(N)
)
cov = Sumcov / (N - 1)
print(cov)
# 4. Pearson correlation coefficient (CorrelCoeff): the covariance divided by
# the product of the two standard deviations.
# No square root is taken here: the standard deviations already are the roots
# of the variances, and rooting the covariance would both distort the value
# and fail (nan / complex result) whenever the covariance is negative.
den = crimeStandardD * unempStandardD
CorrelCoeff = cov / den
print("Correlation coefficient between unemployment rate and crime rate is", CorrelCoeff)
# Plotting histograms straight from DataCrimerate and DataUnemprate fails.
# Presumably this is because those arrays were never given an explicit
# numeric type when they were created (not verified here).
# To plot the histograms, the first 36 values of each are therefore copied
# into new arrays whose datatype is explicitly float.
unemprateperstate = np.array([DataUnemprate[k] for k in range(36)], dtype=float)
crimerateperstate = np.array([DataCrimerate[k] for k in range(36)], dtype=float)
# 5. Histogram of the unemployment rate: 12 bins, each 1 unit wide, over 0-12.
plt.hist(unemprateperstate, bins=list(range(13)), rwidth=0.95, color="orange")
# Title and axis labels.
plt.title('Number of states vs Unemployment rate')
plt.xlabel('Unemployment rate')
plt.ylabel('Number of states')
# Mark the estimated mean with a dashed vertical line and label it.
plt.axvline(meanOFunemp, color="indigo", linestyle='dashed', linewidth=2)
plt.text(meanOFunemp+0.02, 10, 'Mean of unemployment rate', color='indigo')
# The standard deviation is expressed in the units of the x-axis (the rate),
# not in numbers of states, so it is marked with a vertical line on the
# x-axis as well; its label is placed just right of the line, below the mean
# label.
plt.axvline(unempStandardD, color="blue", linestyle='dashed', linewidth=2)
plt.text(unempStandardD+0.02, 8, 'Standard Deviation of unemployment rate', color='blue')
plt.show()
# 6. Histogram of the crime rate: 10 bins, each 100 units wide, over 0-1000.
plt.hist(
    crimerateperstate,
    bins=list(range(0, 1001, 100)),
    rwidth=0.95,
    color="green",
)
# Axis labels and title.
plt.xlabel('Crime rate (per 100,000)')
plt.ylabel('Number of states')
plt.title('Number of states vs Crime rate')
# Mark the estimated mean of the crime rate with a dashed vertical line.
plt.axvline(meanOFcrime, color="indigo", linestyle='dashed', linewidth=2)
plt.text(meanOFcrime+0.02, 10, 'Mean of crimerate', color='indigo')
# The standard deviation is marked as a vertical line (on the x-axis) because
# its value is large compared with the number of states on the y-axis.
plt.axvline(crimeStandardD, color="blue", linestyle='dashed', linewidth=2)
plt.text(crimeStandardD+0.02, 16, 'Standard Deviation of crimerate', color='blue')
plt.show()
# 7. Scatter plot: unemployment rate on the x-axis, crime rate on the y-axis.
x, y = DataUnemprate, DataCrimerate
plt.scatter(x, y, alpha=0.5, color='blue')
# Title and axis labels.
plt.title('Scatter Plot of unemployment rate and crime rate')
plt.xlabel('Unemployment-rate')
plt.ylabel('Crime-rate (per 100,000)')
plt.show()
# 8. 2D histogram on a 30 x 30 grid of bins, with the unemployment rate on
# the x-axis and the crime rate on the y-axis.
plt.hist2d(
    unemprateperstate,
    crimerateperstate,
    bins=(30, 30),
    cmap=plt.cm.Reds,
)
# Title and axis labels.
plt.title("2D Histogram showing the unemployment and crime rates")
plt.xlabel("Unemployment rate")
plt.ylabel("Crime Rate (per 100,000)")
plt.show()
#End of code