from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd  # needed here: this cell uses pd before any other cell imports it

# Read brain_size.csv (semicolon-separated, "." marks a missing value),
# preview the first rows and plot a histogram of the VIQ column.
data = pd.read_csv('brain_size.csv', sep=';', na_values=".")
print(data.head())
data['VIQ'].hist()
plt.show()
Unnamed: 0 Gender FSIQ VIQ PIQ Weight Height MRI_Count
0 1 Female 133 132 124 118.0 64.5 816932
1 2 Male 140 150 124 NaN 72.5 1001121
2 3 Male 139 123 150 143.0 73.3 1038437
3 4 Male 133 129 128 172.0 68.8 965353
4 5 Female 137 132 134 147.0 65.0 951545
# Two-tailed one-sample t-test: H0 is "the mean VIQ equals 110".
# The p-value returned by ttest_1samp is already two-sided, so it is
# compared with alpha = 0.05 directly (it must not be compared with alpha/2).
print(stats.ttest_1samp(data['VIQ'], popmean=110))
# p-value = 0.533 is greater than 0.05, so we fail to reject the null hypothesis
Ttest_1sampResult(statistic=0.6293461053092635, pvalue=0.5327920500038907)
# Two-tailed one-sample t-test: H0 is "the mean VIQ equals 100".
# The p-value returned by ttest_1samp is already two-sided, so it is
# compared with alpha = 0.05 directly (it must not be compared with alpha/2).
print(stats.ttest_1samp(data['VIQ'], popmean=100))
# p-value = 0.002 is less than 0.05, so we reject the null hypothesis
Ttest_1sampResult(statistic=3.3074146385401786, pvalue=0.002030117404781822)
# One-tailed one-sample t-test: H1 is "the mean VIQ is less than 130".
m = 130
results = stats.ttest_1samp(data['VIQ'], m)
print(results)
alpha = 0.05
# ttest_1samp reports a two-sided p-value; halving it gives the one-sided
# p-value, which only supports H1 when the statistic lies in the lower tail.
if results.statistic < 0 and results.pvalue / 2 < alpha:
    print("Reject null hypothesis, mean is less than {}".format(m))
else:
    print("Accept null hypothesis")
Ttest_1sampResult(statistic=-4.726790961152567, pvalue=2.9541289074597695e-05)
Reject null hypothesis, mean is less than 130
# One-tailed one-sample t-test: H1 is "the mean VIQ is greater than 80".
m = 80
results = stats.ttest_1samp(data['VIQ'], m)
print(results)
alpha = 0.05
# ttest_1samp reports a two-sided p-value; halving it gives the one-sided
# p-value, which only supports H1 when the statistic lies in the upper tail.
if results.statistic > 0 and results.pvalue / 2 < alpha:
    print("reject null hypothesis, mean is greater than {}".format(m))
else:
    print("accept null hypothesis")
Ttest_1sampResult(statistic=8.663551705002009, pvalue=1.2618678412297256e-10)
reject null hypothesis, mean is greater than 80
import scipy.stats as s # provides some additional predefined statistics functions.
import pandas as pd
import numpy as np
from math import *
from functools import reduce
# Load the tab-separated before/after data set, then show the first rows
# and the summary statistics.
# The separator is passed by keyword: read_csv only accepts the path
# positionally in current pandas releases.
df = pd.read_csv('p4_data1.tsv', sep='\t')
print(df.head())
print(df.describe())
Consumer Before After
0 1 3.15 3.42
1 2 2.33 2.70
2 3 2.95 3.20
3 4 2.15 2.50
4 5 3.33 3.40
Consumer Before After
count 12.000000 12.000000 12.000000
mean 6.500000 2.752500 2.930833
std 3.605551 0.643529 0.559114
min 1.000000 1.950000 2.100000
25% 3.750000 2.150000 2.475000
50% 6.500000 2.665000 2.950000
75% 9.250000 3.245000 3.405000
max 12.000000 3.950000 3.800000
# Two-sample (independent) t-test: H0 is "Before and After have equal means".
# NOTE(review): Before/After look like paired measurements per consumer --
# confirm whether a paired test (ttest_rel) is the intended analysis.
results = s.ttest_ind(df['Before'], df['After'])
print(results)
# pvalue = 0.48 which is greater than 0.05, so we fail to reject the null hypothesis
alpha = 0.05
# ttest_ind returns a two-sided p-value, so the decision depends only on
# p < alpha; the sign of the statistic is irrelevant for a two-sided test.
if results.pvalue < alpha:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")
Ttest_indResult(statistic=-0.7246599369194594, pvalue=0.47629775671675556)
accept null hypothesis
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats as stests
# Load the blood-pressure data set; print summary statistics for the two
# blood-pressure columns followed by the first five rows.
df = pd.read_csv('blood_pressure.csv')
print(df.loc[:, ['bp_before', 'bp_after']].describe())
print(df.head())
bp_before bp_after
count 120.000000 120.000000
mean 156.450000 151.358333
std 11.389845 14.177622
min 138.000000 125.000000
25% 147.000000 140.750000
50% 154.500000 149.500000
75% 164.000000 161.000000
max 185.000000 185.000000
patient sex agegrp bp_before bp_after
0 1 Male 30-45 143 153
1 2 Male 30-45 163 170
2 3 Male 30-45 153 168
3 4 Male 30-45 153 142
4 5 Male 30-45 146 141
# Paired-sample t-test: H0 is "the mean of bp_before - bp_after is zero".
results = stats.ttest_rel(df['bp_before'], df['bp_after'])
print(results)
# pvalue = 0.0011 which is less than 0.05, so we reject the null hypothesis
alpha = 0.05
# ttest_rel returns a two-sided p-value, so the decision depends only on
# p < alpha; the sign of the statistic is irrelevant for a two-sided test.
if results.pvalue < alpha:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")
Ttest_relResult(statistic=3.3371870510833657, pvalue=0.0011297914644840823)
reject null hypothesis
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats as stests
# Load the blood-pressure data set; print the first five rows followed by
# summary statistics for the two blood-pressure columns.
df = pd.read_csv('blood_pressure.csv')
print(df.head(5))
print(df.loc[:, ['bp_before', 'bp_after']].describe())
patient sex agegrp bp_before bp_after
0 1 Male 30-45 143 153
1 2 Male 30-45 163 170
2 3 Male 30-45 153 168
3 4 Male 30-45 153 142
4 5 Male 30-45 146 141
bp_before bp_after
count 120.000000 120.000000
mean 156.450000 151.358333
std 11.389845 14.177622
min 138.000000 125.000000
25% 147.000000 140.750000
50% 154.500000 149.500000
75% 164.000000 161.000000
max 185.000000 185.000000
# Two-sided z-test of H0 "mean(bp_before) - mean(bp_after) == 0".
# NOTE(review): both columns come from the same patients; confirm that
# treating them as two separate samples is the intended analysis.
ztest, pval1 = stests.ztest(
    x1=df['bp_before'],
    x2=df['bp_after'],
    value=0,
    alternative='two-sided',
)
print(float(pval1))
# pvalue = 0.0022 which is less than 0.05, so we reject the null hypothesis
0.002162306611369422
import scipy.stats as s # provides some additional predefined statistics functions.
import pandas as pd
import numpy as np
from math import *
from functools import reduce
# Load the tab-separated before/after data set and show the first rows.
# The separator is passed by keyword: read_csv only accepts the path
# positionally in current pandas releases.
df = pd.read_csv('p4_data1.tsv', sep='\t')
print(df.head())
Consumer Before After
0 1 3.15 3.42
1 2 2.33 2.70
2 3 2.95 3.20
3 4 2.15 2.50
4 5 3.33 3.40
# Variances of the two columns (pandas .var()).
var_before = df['Before'].var()
var_after = df['After'].var()
print("Var Before: ", var_before)
print("Var After: ", var_after)

# F statistic: the larger variance goes in the numerator, so f >= 1.
if var_before > var_after:
    f = var_before / var_after
else:
    f = var_after / var_before
print(f)
Var Before: 0.41412954545454556
Var After: 0.31260833333333327
1.3247552969516028
# The F distribution's degrees of freedom must follow the samples actually
# used as numerator and denominator of f (the larger variance is on top).
if var_before > var_after:
    dfn, dfd = df['Before'].count() - 1, df['After'].count() - 1
else:
    dfn, dfd = df['After'].count() - 1, df['Before'].count() - 1
# sf is the survival function (1 - cdf), i.e. the upper-tail p-value of f.
p_value_var_test = s.f.sf(f, dfn, dfd)
print(p_value_var_test)
# pvalue = 0.32 which is greater than 0.05, so we fail to reject the null hypothesis
0.32449424960699924