%matplotlib inline
import numpy as np
from pylab import *
def de_mean(x):
    """Return the deviation of every element of ``x`` from the mean of ``x``.

    The result is a new list with one entry per element of ``x``, in the
    same order.
    """
    x_mean = mean(x)
    return [xi - x_mean for xi in x]
def covariance(x, y):
    """Return the sample covariance of ``x`` and ``y``.

    Computed as the dot product of the two mean-centred sequences divided
    by ``n - 1``, where ``n`` is ``len(x)``.

    Returns NaN when ``x`` has fewer than two elements, because the
    ``n - 1`` divisor is then zero or negative and the result is undefined.
    """
    n = len(x)
    if n < 2:
        # Otherwise an empty input would be divided by -1 and a single
        # element by zero.
        return float("nan")
    return dot(de_mean(x), de_mean(y)) / (n - 1)
# Two separately drawn samples of 1000 values each from normal distributions:
# page speeds (mean 3.0, std 1.0) and purchase amounts (mean 50.0, std 10.0).
pageSpeeds = np.random.normal(loc=3.0, scale=1.0, size=1000)
purchaseAmount = np.random.normal(loc=50.0, scale=10.0, size=1000)
scatter(pageSpeeds, purchaseAmount)
covariance(pageSpeeds, purchaseAmount)
-0.019528192170968867
# Redraw the purchase amounts and divide them element-wise by page speed, so
# each amount now depends on the corresponding page speed.
purchaseAmount = np.random.normal(loc=50.0, scale=10.0, size=1000) / pageSpeeds
scatter(pageSpeeds, purchaseAmount)
covariance(pageSpeeds, purchaseAmount)
-8.8565771898786672
def correlation(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Computed as ``covariance(x, y)`` divided by the product of the two
    standard deviations.

    Returns NaN when either standard deviation is zero, because the
    coefficient is undefined for a constant input.
    """
    # covariance() divides by n - 1, so the standard deviations must use the
    # same sample normalisation (ddof=1). With the default ddof=0 the result
    # is inflated by n / (n - 1) and can fall outside [-1, 1].
    stddevx = np.std(x, ddof=1)
    stddevy = np.std(y, ddof=1)
    if stddevx == 0 or stddevy == 0:
        return float("nan")
    return covariance(x, y) / stddevx / stddevy
# Correlation between page speed and purchase amount.
correlation(x=pageSpeeds, y=purchaseAmount)
-0.62897824783314804
# Cross-check against NumPy's correlation-coefficient matrix.
np.corrcoef(x=pageSpeeds, y=purchaseAmount)
array([[ 1. , -0.62834927],
[-0.62834927, 1. ]])
# Make the purchase amount an exact linear function of page speed.
purchaseAmount = 100 - 3 * pageSpeeds
scatter(pageSpeeds, purchaseAmount)
correlation(pageSpeeds, purchaseAmount)
-1.0010010010010009
Remember, correlation does not imply causality!