This page contains the code we wrote to study biodiversity. We have published it on our website so that anyone who needs it can access it.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Importing libraries which are to be used
import pandas as pd
import numpy as np
# In[2]:
# GBIF occurrence export (https://www.gbif.org/occurrence/), tab-separated.
main_df = pd.read_csv('lomdi.csv', sep='\t')
main_df.head()
# In[3]:
# Keep only the columns the analysis needs.
main_df = main_df[['species', 'decimalLatitude', 'decimalLongitude',
                   'day', 'month', 'year']]
main_df.head()
# In[4]:
# Count missing values per column.
main_df.isnull().sum()
# In[5]:
# Drop incomplete rows and renumber the index from 0.
main_df = main_df.dropna().reset_index(drop=True)
main_df.isnull().sum()
# In[6]:
# Turn the date parts into strings ("6", "2020") so they can later be joined
# into the "M-YYYY" keys used by the station datasets.
for _col in ('day', 'month', 'year'):
    main_df[_col] = main_df[_col].astype(int).astype(str)
main_df.head()
# In[7]-[10]:
def _load_globe_csv(path, dedupe_cols=None):
    """Load one GLOBE measurement CSV (https://datasearch.globe.gov/).

    Drops empty rows, rewrites 'measured_on' from what appears to be
    'DD-MM-YYYY' into 'M-YYYY' (month without a leading zero), optionally
    drops duplicate station/date rows, and indexes the frame by the
    rewritten date so `.loc[time]` selects a month's readings.
    """
    df = pd.read_csv(path).dropna()
    df['measured_on'] = df['measured_on'].astype(str)
    # chars 3:5 = month, chars 6: = year — assumes 'DD-MM-YYYY'; TODO confirm
    df['measured_on'] = df['measured_on'].apply(
        lambda x: str(int(x[3:5])) + '-' + str(x[6:]))
    if dedupe_cols is not None:
        df = df.drop_duplicates(subset=dedupe_cols)
    return df.set_index('measured_on')

# Air temperature, soil temperature, precipitation and aerosol datasets.
temp_df = _load_globe_csv('air_temp.csv')
soil_temp_df = _load_globe_csv('soil_temp.csv')
rain_df = _load_globe_csv('precipitation.csv')
# Aerosol data contains duplicate (date, station) rows; keep the first.
aerosol_df = _load_globe_csv(
    'aerosol.csv', dedupe_cols=['measured_on', 'latitude', 'longitude'])
aerosol_df.head()
# In[11]:
from math import cos, asin, sqrt

def distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in km between two lat/lon points (haversine)."""
    deg = 0.017453292519943295  # pi / 180: degrees -> radians
    hav = (0.5 - cos((lat2 - lat1) * deg) / 2
           + cos(lat1 * deg) * cos(lat2 * deg)
           * (1 - cos((lon2 - lon1) * deg)) / 2)
    return 12742 * asin(sqrt(hav))  # 12742 km = 2 * mean Earth radius

def closest(data, v):
    """Return the station dict in `data` nearest to occurrence record `v`."""
    def _dist_to(p):
        return distance(v['decimalLatitude'], v['decimalLongitude'],
                        p['latitude'], p['longitude'])
    return min(data, key=_dist_to)
# In[12]:
# Attach the nearest station's reading of each parameter to every occurrence.
def _nearest_reading(station_df, time, v, columns):
    """Values of `columns` at the station closest to occurrence `v` at `time`.

    Returns a list with one value per column, or None when the dataset has
    no rows for `time` or the lookup fails for any other reason — the
    enrichment is deliberately best-effort, as in the original notebook,
    but the exceptions are now narrowed instead of a bare `except`.
    """
    try:
        at_time = station_df.loc[time]
        points = at_time[['latitude', 'longitude']].to_dict('records')
        nearest = closest(points, v)
        # NOTE(review): matching by latitude alone, as the original did;
        # two stations sharing a latitude would be ambiguous.
        match = at_time[at_time['latitude'] == nearest['latitude']]
        return [match[col].iloc[0] for col in columns]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Month absent, single-row frame (Series), etc. — skip silently.
        return None

for i in range(main_df.shape[0]):
    # 'M-YYYY' key matching the station datasets' index.
    time = main_df.at[i, 'month'] + '-' + main_df.at[i, 'year']
    v = {'decimalLatitude': main_df.at[i, 'decimalLatitude'],
         'decimalLongitude': main_df.at[i, 'decimalLongitude']}
    rain = _nearest_reading(rain_df, time, v, ['precipitation monthlies'])
    if rain is not None:
        main_df.at[i, 'rain'] = rain[0]
    temp = _nearest_reading(temp_df, time, v, ['average temp (deg C)'])
    if temp is not None:
        main_df.at[i, 'temp'] = temp[0]
    soil = _nearest_reading(soil_temp_df, time, v, ['average temp (deg C)'])
    if soil is not None:
        main_df.at[i, 'soil_temp'] = soil[0]
    aerosol = _nearest_reading(aerosol_df, time, v,
                               ['optical thickness', 'transmission percent'])
    if aerosol is not None:
        main_df.at[i, 'optical_thickness'] = aerosol[0]
        main_df.at[i, 'transmission_percent'] = aerosol[1]
main_df.head()
# In[13]:
# Occurrences with no matching station data for their month leave NaN in the
# newly added columns — count them first.
main_df.isnull().sum()
# In[14]:
# Keep only fully-populated rows.
main_df = main_df.dropna()
main_df.isnull().sum()
# In[15]:
# Mean and population standard deviation (ddof=0, matching np.std) of every
# environmental parameter over the surviving occurrences.
temp_mean = main_df['temp'].mean()
temp_stdv = main_df['temp'].std(ddof=0)
soil_temp_mean = main_df['soil_temp'].mean()
soil_temp_stdv = main_df['soil_temp'].std(ddof=0)
rain_mean = main_df['rain'].mean()
rain_stdv = main_df['rain'].std(ddof=0)
optical_thickness_mean = main_df['optical_thickness'].mean()
optical_thickness_stdv = main_df['optical_thickness'].std(ddof=0)
transmission_percent_mean = main_df['transmission_percent'].mean()
transmission_percent_stdv = main_df['transmission_percent'].std(ddof=0)
# In[16]:
# Band of mean +/- 1.5 stdv for each parameter (~87% of a normal
# distribution; the original comment's "70 percent ... Haversine Formula"
# appears to be a mix-up — TODO confirm the intended coverage).
def _band(mean, stdv):
    return [mean - 1.5 * stdv, mean + 1.5 * stdv]

temp_range = _band(temp_mean, temp_stdv)
soil_temp_range = _band(soil_temp_mean, soil_temp_stdv)
rain_range = _band(rain_mean, rain_stdv)
optical_thickness_range = _band(optical_thickness_mean, optical_thickness_stdv)
transmission_percent_range = _band(transmission_percent_mean, transmission_percent_stdv)
print(f'Temprature range --- from {temp_range[0]} to {temp_range[1]}')
print(f'Soil Temprature range --- from {soil_temp_range[0]} to {soil_temp_range[1]}')
print(f'Rain range --- from {rain_range[0]} to {rain_range[1]}')
print(f'Optical Thickness range --- from {optical_thickness_range[0]} to {optical_thickness_range[1]}')
print(f'Transmission Percent range --- from {transmission_percent_range[0]} to {transmission_percent_range[1]}')
# In[17]:
# Scatter every occurrence on a Google map (centre coordinates look like
# Dehradun, India — TODO confirm).
import gmplot

latitude_list = main_df['decimalLatitude'].to_numpy()
longitude_list = main_df['decimalLongitude'].to_numpy()
gmap4 = gmplot.GoogleMapPlotter(30.3164945, 78.03219179999999, 13)
gmap4.scatter(latitude_list, longitude_list, '#FF0000', size=40, marker=False)
# NOTE(review): hard-coded user-specific output path — consider parameterising.
gmap4.draw("C:\\Users\\cmgupta159\\Desktop\\lomdi.html")
# In[18]:
import matplotlib.pyplot as plt
import seaborn as sns
# In[19]:
# Occurrence density by air temperature, with the 1.5-stdv band marked.
graph1 = sns.kdeplot(data=main_df['temp'], shade=True)
for edge in temp_range:
    graph1.axvline(edge)
graph1.set_xlabel("Temprature")
graph1.set_ylabel("Fraction")
plt.text(29, .05, f'mean - {round(temp_mean, 2)}')
plt.text(29, .045, f'stdv - {round(temp_stdv, 2)}')
plt.savefig('kde_temp_lomdi.png')
plt.show()
# In[20]:
# Occurrence density by soil temperature, with the 1.5-stdv band marked.
graph2 = sns.kdeplot(data=main_df['soil_temp'], shade=True)
for edge in soil_temp_range:
    graph2.axvline(edge)
plt.xlabel("Soil Temprature")
plt.ylabel("Fraction")
plt.text(29, .04, f'mean - {round(soil_temp_mean, 2)}')
plt.text(29, .035, f'stdv - {round(soil_temp_stdv, 2)}')
plt.savefig('soil_temp_lomdi.png')
plt.show()
# In[21]:
# Occurrence density by aerosol optical thickness, with the 1.5-stdv band.
graph = sns.kdeplot(data=main_df['optical_thickness'], shade=True)
for edge in optical_thickness_range:
    graph.axvline(edge)
plt.xlabel("Optical Thickness")
plt.ylabel("Fraction")
plt.text(6, 3.1, f'mean - {round(optical_thickness_mean, 2)}')
plt.text(6, 3.5, f'stdv - {round(optical_thickness_stdv, 2)}')
plt.savefig('optical_lomdi.png')
plt.show()
# In[22]:
# Occurrence density by transmission percent, with the 1.5-stdv band.
graph = sns.kdeplot(data=main_df['transmission_percent'], shade=True)
for edge in transmission_percent_range:
    graph.axvline(edge)
plt.xlabel("Transmission Percent")
plt.ylabel("Fraction")
plt.text(0, .05, f'mean - {round(transmission_percent_mean, 2)}')
plt.text(0, .055, f'stdv - {round(transmission_percent_stdv, 2)}')
plt.savefig('trans_lomdi.png')
plt.show()
# In[23]:
# Raw occurrence counts by air temperature (histogram companion to the KDE).
plt.hist(main_df['temp'])
plt.xlabel("Temprature")
plt.ylabel("No. of Occurrences in Data")
plt.text(25, 3000, f'mean - {round(temp_mean, 2)}')
plt.text(25, 2700, f'stdv - {round(temp_stdv, 2)}')
plt.savefig('hist_temp_lomdi.png')
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Importing libraries
import pandas as pd
import numpy as np
# In[2]:
# Gridded vegetation data the map is built from (one cell per 0.25 degrees).
df = pd.read_csv('vege.csv', header=None)
# Work on a copy so the raw grid stays available.
df_copy = df.copy()
df.head()
# In[3]:
# Ocean cells are flagged 99999 in the source grid; map ocean -> 0 and
# land -> 100 in one vectorised pass. The original per-cell loop was
# O(rows*cols) in Python and used chained indexing (df_copy[j][i] = ...),
# which pandas may silently apply to a temporary copy.
df_copy = pd.DataFrame(np.where(df.to_numpy() == 99999, 0, 100))
# In[4]:
arr = np.array(df_copy)
# In[5]:
# Render the land/ocean grid as an image.
import matplotlib.pyplot as plt

plt.imsave('map0.25.png', arr)
# In[6]:
# Build a land mask: ocean (99999) -> NaN, land -> 1. Vectorised instead of
# the original O(rows*cols) per-cell loop with chained assignment.
df = pd.DataFrame(np.where(df.to_numpy() == 99999, np.nan, 1.0))
df.head()
# In[7]:
# Write the mask with neither header nor index: the consumer script reads
# helper.csv back with header=None, so the header row the original emitted
# would be ingested as a data row and shift the whole grid down by one.
df.to_csv('helper.csv', index=False, header=False)
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Importing Libraries
from PIL import Image
import numpy as np
import pandas as pd
# In[2]:
# Load the land/ocean PNG produced earlier and keep a writable copy of the
# RGBA pixel array (np.asarray on a PIL image is read-only).
img = Image.open("map0.25.png")
numpydata = np.asarray(img)
arr = numpydata.copy()
numpydata
# In[3]-[6]:
# Gridded datasets from https://neo.sci.gsfc.nasa.gov/ — presumably all on
# the same 0.25-degree grid as the map (TODO confirm).
vege_df = pd.read_csv('vege.csv', header=None)
rain_df = pd.read_csv('rain.csv', header=None)
temp_df = pd.read_csv('temp.csv', header=None)
optical_df = pd.read_csv('optical.csv', header=None)
optical_df.head()
# In[7]:
helper_df = pd.read_csv('helper.csv',header=None)
rain_land = helper_df*rain_df
optical_land = helper_df*optical_df
temp_land = helper_df*temp_df
vege_land = helper_df*vege_df
# In[8]:
# Paint white every grid cell whose vegetation, temperature and aerosol
# values all fall inside the favourable ranges (bounds presumably come from
# the 1.5-stdv ranges computed for the species — TODO confirm).
# Vectorised replacement for the original O(rows*cols) per-cell loop whose
# bare `except: pass` hid every error; NaN (ocean) compares False, so ocean
# cells are never selected, exactly as before.
_VEGE_LO = 0.5
_TEMP_LO, _TEMP_HI = 1.641749188574094, 26.505227476389237
_OPT_LO, _OPT_HI = 0, 0.5
try:
    rows = min(arr.shape[0], helper_df.shape[0])
    cols = min(arr.shape[1], helper_df.shape[1])
    veg = vege_land.to_numpy()[:rows, :cols]
    tmp = temp_land.to_numpy()[:rows, :cols]
    opt = optical_land.to_numpy()[:rows, :cols]
    favourable = ((veg > _VEGE_LO)
                  & (tmp > _TEMP_LO) & (tmp < _TEMP_HI)
                  & (opt > _OPT_LO) & (opt < _OPT_HI))
    arr[:rows, :cols][favourable] = np.array([255, 255, 255, 255])
except (TypeError, ValueError):
    # Best-effort, as in the original: malformed (non-numeric) grids or an
    # incompatible image shape are skipped rather than crashing.
    pass
# In[9]:
# Save the suitability map.
import matplotlib.pyplot as plt

plt.imsave('inter_map.png', arr)
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 22 18:28:21 2020
@author: Raman Agarwal
"""
import random, pylab, numpy
import pandas as pd
from matplotlib import pyplot as plt
import datetime
from get_connector import get_connector
# Matplotlib global styling for every graph this script produces.
pylab.rcParams.update({
    'lines.linewidth': 4,        # line width
    'axes.titlesize': 20,        # title font size
    'axes.labelsize': 20,        # axis-label font size
    'xtick.labelsize': 16,       # x-axis tick number size
    'ytick.labelsize': 16,       # y-axis tick number size
    'xtick.major.size': 7,       # x-axis tick mark size
    'ytick.major.size': 7,       # y-axis tick mark size
    'lines.markersize': 10,      # marker size
    'legend.numpoints': 1,       # examples shown in legends
})
# Fixed seed so the random train/test splits are reproducible.
random.seed(0)
def isNaN(num):
    """True iff `num` is NaN — NaN is the only value not equal to itself."""
    return not (num == num)
def getDataPd(filepath1, header1=None):
    """Read a CSV file into a DataFrame.

    `header1` mirrors pandas' `header` argument. The default is [0] (first
    row is the header), as in the original, but expressed via a None
    sentinel because a mutable list default is shared across calls.
    """
    if header1 is None:
        header1 = [0]
    return pd.read_csv(filepath1, header=header1)
def rSquared(observed, predicted):
    """Coefficient of determination (R^2) of `predicted` vs `observed`.

    Both arguments are numpy arrays of equal length; uses the population
    variance of the observed values as the normaliser.
    """
    squared_residuals = (predicted - observed) ** 2
    mean_sq_error = squared_residuals.sum() / len(observed)
    return 1 - mean_sq_error / numpy.var(observed)
class envVariable(object):
    """One environmental measurement: location, year and value."""

    def __init__(self, lat1, long1, year1, data1):
        self.latitude = float(lat1)
        self.longitude = float(long1)
        year_str = str(year1)
        if len(year_str) > 4:
            # Full date string: extract the 4-digit year at the position
            # given by the module-level `yearIndex` (computed from the
            # date-format template). The original sliced `year1` directly,
            # which crashed for non-string values such as floats.
            self.year = int(year_str[yearIndex:yearIndex + 4])
        else:
            self.year = int(year1)
        self.data = float(data1)

    def getloc(self):
        return (self.latitude, self.longitude)

    def getyear(self):
        return self.year

    def getdata(self):
        return self.data


def getEnvData(df, varHeader,
               yearHeader, latHeader, longHeader, isDataYearly=True):
    """Turn DataFrame rows into per-variable lists of envVariable objects.

    One inner list per entry of `varHeader`, in row order. When
    `isDataYearly` is False, each list is replaced by one envVariable per
    distinct year holding the mean location and mean value for that year,
    in ascending year order. (Single-pass grouping replaces the original
    O(years * rows) rescan.)
    """
    length1 = df.shape[0]
    variables = [
        [envVariable(df[latHeader][i], df[longHeader][i],
                     df[yearHeader][i], df[header][i])
         for i in range(length1)]
        for header in varHeader
    ]
    if not isDataYearly:
        averaged = []
        for series in variables:
            by_year = {}
            for obs in series:
                by_year.setdefault(obs.getyear(), []).append(obs)
            yearly = []
            for year in sorted(by_year):
                group = by_year[year]
                count = len(group)
                lat = sum(o.getloc()[0] for o in group) / count
                lon = sum(o.getloc()[1] for o in group) / count
                val = sum(o.getdata() for o in group) / count
                yearly.append(envVariable(lat, lon, year, val))
            averaged.append(yearly)
        variables = averaged
    return variables
def splitData(xVals, yVals):
    """Randomly split paired x/y data into equal train and test halves.

    Returns (trainX, trainY, testX, testY); the pairing between x and y is
    preserved. Uses the module-level `random` state, so seeding makes the
    split reproducible.
    """
    # Membership tests were O(n) against the original list; a set is O(1).
    toTrain = set(random.sample(range(len(xVals)), len(xVals) // 2))
    trainX, trainY, testX, testY = [], [], [], []
    for i, (x, y) in enumerate(zip(xVals, yVals)):
        if i in toTrain:
            trainX.append(x)
            trainY.append(y)
        else:
            testX.append(x)
            testY.append(y)
    return trainX, trainY, testX, testY
# Output configuration: graphs land at outputGraphs/<timestamp><title>.png.
outPath = 'outputGraphs'
fileType = '.png'
# Project helper — presumably returns the platform path separator; TODO confirm.
connector1 = get_connector()
curTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

# NOAA airport station observations.
myFilePath = 'airPortData.csv'
myDataframe = getDataPd(myFilePath)
tempData = []
precipData = []
varNames = ['TAVG']
plotTitles = ['Temperature']
yearH = 'DATE'
dateFormat = 'DD-MM-YYYY'
# Position of the 4-digit year inside the date-format template; envVariable
# slices full date strings at this offset.
yearIndex = dateFormat.find('YYYY')
latH = 'LATITUDE'
longH = 'LONGITUDE'
futureYears = [2030, 2040, 2050, 2060, 2070, 2080]
# Raw rows are per-observation, so average them into one value per year.
envVar = getEnvData(myDataframe, varNames, yearH, latH, longH, isDataYearly=False)
# Fit, select and plot a polynomial trend for every environmental variable,
# then extrapolate it over `futureYears`.
for k in range(len(envVar)):
    # Collect (year, value) pairs for this variable.
    xvals, yvals = [], []
    for data1 in envVar[k]:
        xvals.append(data1.getyear())
        yvals.append(data1.getdata())
    # Drop pairs where either side is NaN. The pop index is offset by the
    # number of items already removed (i) because the lists shrink as we pop;
    # this relies on indexToPop being in ascending order, which it is.
    indexToPop = []
    for i in range(len(xvals)):
        if isNaN(xvals[i]) or isNaN(yvals[i]):
            indexToPop.append(i)
    for i in range(len(indexToPop)):
        xvals.pop(indexToPop[i] - i)
        yvals.pop(indexToPop[i] - i)
    # Cross-validate polynomial degrees 1-4 over `numSubsets` random
    # half/half train-test splits, recording the test-half R^2 each time.
    numSubsets = 100
    dimensions = (1, 2, 3, 4)
    rSquares = {}
    rSqMeanStd = []
    for d in dimensions:
        rSquares[d] = []
    for f in range(numSubsets):
        trainX, trainY, testX, testY = splitData(xvals, yvals)
        for d in dimensions:
            model = pylab.polyfit(trainX, trainY, d)
            estYVals = pylab.polyval(model, testX)
            rSquares[d].append(rSquared(testY, estYVals))
    print('Curves for -> ' + plotTitles[k])
    print('Mean R-squares for test data')
    # Mean and stdv of the cross-validated R^2 for each degree.
    for d in dimensions:
        mean = round(sum(rSquares[d]) / len(rSquares[d]), 4)
        sd = round(numpy.std(rSquares[d]), 4)
        rSqMeanStd.append((d, mean, sd))
        print('For dimensionality', d, 'mean =', mean,
              'Std =', sd)
    # Greedy degree selection: take the highest degree whose mean R^2 is
    # still improving over the previous one, stopping at the first decline
    # (not necessarily the global maximum).
    bestFitDegInd = 0
    for i in range(1, len(rSqMeanStd)):
        if rSqMeanStd[i][1] > rSqMeanStd[i - 1][1]:
            bestFitDegInd = i
        else:
            break
    print('----- The best fit polynomial is -> ' + str(rSqMeanStd[bestFitDegInd][0]) + ' Degree Poly.')
    print('With R2 mean value = ' + str(rSqMeanStd[bestFitDegInd][1])
          + 'and std = ' + str(rSqMeanStd[bestFitDegInd][2]) + ' ----')
    # Refit the chosen degree on ALL the data and evaluate over both the
    # observed years and the future extrapolation years.
    model = pylab.polyfit(xvals, yvals, rSqMeanStd[bestFitDegInd][0])
    estvals = pylab.polyval(model, xvals + futureYears)
    plt.plot(xvals, yvals, label='Observed Data')
    plt.plot(xvals + futureYears, estvals, '--', color='red', label='Best fit curve')
    # Annotate the final extrapolated point, e.g. "(2080, 23.4)".
    lastPointStr = '(' + str(futureYears[-1]) + ', ' + str(estvals[-1].round(1)) + ')'
    plt.annotate(lastPointStr, (futureYears[-1], estvals[-1]))
    plt.grid()
    plt.legend()
    plt.ylabel(plotTitles[k])
    plt.xlabel('Years')
    plt.title(label=plotTitles[k])
    # e.g. outputGraphs/20200922182821Temperature.png
    currOutPath = outPath + connector1 + curTime + plotTitles[k] + fileType
    plt.savefig(currOutPath, bbox_inches="tight")
    plt.show()