Statement of Completion#3229ddf5
Introduction to Inferential Statistics
easy
Correlation
Resolution
Activities
Project.ipynb
Covariance¶
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import math
np.random.seed(1)
# 1000 random integers between 0 and 50
x = np.random.randint(0, 50, 1000)
pi = math.pi
def PointsInCircum(r,n=10):
return [(math.cos(2*pi/n*x)*r,math.sin(2*pi/n*x)*r) for x in range(0,n+1)]
test_list=PointsInCircum(1,10)
# using list comprehension to
# perform Unzipping
res = [[ i for i, j in test_list ],
[ j for i, j in test_list ]]
# Positive Correlation with some noise
y_p = x + np.random.normal(0, 10, 1000)
y_n=100 - x + np.random.normal(0, 5, 1000)
x_r = [0,1,2,3,4,0,1,2,3,4,0,1,2,3,4]
y_r = [1,1,1,1,1,3,3,3,3,3,2,2,2,2,2]
fig, axs = plt.subplots(2, 2,figsize=(8,8))
axs[0, 0].scatter(x, y_p)
axs[0, 0].set_title('Positive covariance')
axs[0, 1].scatter(x, y_n)
axs[0, 1].set_title('Negative covariance')
axs[1, 0].scatter(x_r ,y_r)
axs[1, 0].set_title('Zero covariance')
axs[1, 1].scatter(res[0],res[1])
axs[1, 1].set_title('Zero covariance')
for ax in axs.flat:
ax.set(xlabel='x', ylabel='y')
# Hide x labels and tick labels for top plots and y ticks for right plots.
for ax in axs.flat:
ax.label_outer()
Correlation¶
Example 1: Define a function to calculate correlation from scratch
In [2]:
np.random.seed(1)
# 1000 random integers between 0 and 50
x = np.random.randint(0, 30, 1000)
# Positive Correlation with some noise
y = x + np.random.normal(0, 10, 1000)
def cov(x,y):
if len(x) != len(y):
return
n = len(x)
xy = [x[i]*y[i] for i in range(n)]
mean_x = sum(x)/float(n)
mean_y = sum(y)/float(n)
return (sum(xy) - n*mean_x * mean_y) / float(n)
# following code is can also be used to calculate the same result
#return sum([(x[i]-mean_x)*(y[i]-mean_y) for i in range(n)])
def sd(x):
if len(x) == 0:
return 0
n = len(x)
mean_x = sum(x)/float(n)
variance = sum( [(x[i] - mean_x)**2 for i in range(n)])/float(n)
return variance**0.5
def corr(x,y):
if len(x) != len(y):
return
correlation = cov(x,y) / float(sd(x)*sd(y))
return correlation
print ('Correlation : ' , np.round(corr(x,y),2) )
Correlation : 0.67
In [3]:
np.corrcoef(x,y)
Out[3]:
array([[1. , 0.67259004], [0.67259004, 1. ]])
Activity 1:¶
Determine the correlation for the following sample.
In [4]:
np.random.seed(1)
data1 = np.random.randint(0, 50, 1000)
data2= data1+ np.random.normal(0, 10, 1000)
In [5]:
np.round(corr(data1,data2),2)
Out[5]:
0.82
Activity 2:¶
Determine the correlation for the following sample.
In [6]:
np.random.seed(1)
x = np.random.randint(0, 50, 1000)
y_p = 100- x + np.random.normal(0, 10, 1000)
In [7]:
np.round(corr(x,y_p),2)
Out[7]:
-0.82
Activity 3:¶
For the iris dataset, how to determine $\rho_{XY}$ of sepal_length and sepal_width using pandas
In [8]:
import seaborn as sns
import pandas as pd
df = pd.read_csv("iris.csv")
df.sample(5)
Out[8]:
Unnamed: 0 | sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|---|
147 | 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
57 | 57 | 4.9 | 2.4 | 3.3 | 1.0 | versicolor |
114 | 114 | 5.8 | 2.8 | 5.1 | 2.4 | virginica |
15 | 15 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
33 | 33 | 5.5 | 4.2 | 1.4 | 0.2 | setosa |
In [9]:
sns.scatterplot(x='sepal_length',y='sepal_width',data=df)
Out[9]:
<Axes: xlabel='sepal_length', ylabel='sepal_width'>
In [10]:
df[[ 'sepal_length','sepal_width']].corr()
Out[10]:
sepal_length | sepal_width | |
---|---|---|
sepal_length | 1.00000 | -0.11757 |
sepal_width | -0.11757 | 1.00000 |
In [ ]: