In atmospheric and ocean data analysis, a correlation coefficient map is often drawn. In Python, Numpy and Scipy have functions to calculate the correlation coefficient, so you can use them to draw a map of the correlation coefficient. However, there are multiple ways to do this, so here we will compare their performance.
def crr_loop_scipy(x1, x2):
ret = numpy.empty([x1.shape[1], x2.shape[1]])
for jj in xrange(ret.shape[1]):
for ii in xrange(ret.shape[0]):
ret[ii,jj], _ = scipy.stats.pearsonr(x1[:,ii], x2[:,jj])
return ret
def crr_loop_numpy(x1, x2):
ret = numpy.empty([x1.shape[1], x2.shape[1]])
for jj in xrange(ret.shape[1]):
for ii in xrange(ret.shape[0]):
ret[ii,jj] = numpy.corrcoef(x1[:,ii], x2[:,jj])[1,0]
return ret
def crr_numpy(x1, x2):
return numpy.corrcoef(x1, x2, rowvar=False)[0:x1.shape[1], x1.shape[1]:]
def crr_original(x1, x2):
crscov = np.dot(x1.T, x2) / x1.shape[0]
std1 = np.std(x1, axis=0).reshape(x1.shape[1], 1)
std2 = np.std(x2, axis=0).reshape(x2.shape[1], 1)
crsstd = np.dot(std1, std2.T)
return crscov / crsstd
def generate_data(nx):
nt = 1000
data1 = numpy.random.randn(nt, nx)
data2 = numpy.random.randn(nt, nx)
return data1, data2
python
import time
def measure_time(f, *args):
start = time.time()
f(*args)
return time.time() - start
def measure_mean_time(f, *args):
n = 2
return numpy.mean([measure_time(f, *args) for i in xrange(n)])
Result is,
Was overwhelmingly fast. Also, since scipy is faster than numpy, it may be better to use scipy instead of aiming at the correlation coefficient map like this time.
import numpy as np
from scipy import stats
import time
def crr_loop_scipy(x1, x2):
ret = np.empty([x1.shape[1], x2.shape[1]])
for jj in xrange(ret.shape[1]):
for ii in xrange(ret.shape[0]):
ret[ii,jj], _ = stats.pearsonr(x1[:,ii], x2[:,jj])
return ret
def crr_loop_numpy(x1, x2):
ret = np.empty([x1.shape[1], x2.shape[1]])
for jj in xrange(ret.shape[1]):
for ii in xrange(ret.shape[0]):
ret[ii,jj] = np.corrcoef(x1[:,ii], x2[:,jj])[1,0]
return ret
def crr_numpy(x1, x2):
return np.corrcoef(x1, x2, rowvar=False)[0:x1.shape[1], x1.shape[1]:]
def crr_original(x1, x2):
crscov = np.dot(x1.T, x2) / x1.shape[0]
std1 = np.std(x1, axis=0).reshape(x1.shape[1], 1)
std2 = np.std(x2, axis=0).reshape(x2.shape[1], 1)
crsstd = np.dot(std1, std2.T)
return crscov / crsstd
def generate_data(nx):
nt = 1000
data1 = np.random.randn(nt, nx)
data2 = np.random.randn(nt, nx)
return data1, data2
def measure_time(f, *args):
start = time.time()
f(*args)
return time.time() - start
def measure_mean_time(f, *args):
n = 2
return np.mean([measure_time(f, *args) for i in xrange(n)])
def experiment(nxs, crrs):
mean_times = np.empty([len(crrs), len(nxs)])
for icrr, crr in enumerate(crrs):
for ix, nx in enumerate(nxs):
x1, x2 = generate_data(nx)
mean_times[icrr, ix] = measure_mean_time(crr, x1, x2)
return mean_times
if __name__ == '__main__:
import matplotlib.pyplot as plt
nxs1 = range(200, 200 + 1000, 200)
nxs2 = range(200, 200 + 10000, 600)
mean_times1 = experiment(nxs1, [crr_loop_scipy, crr_loop_numpy])
mean_times2 = experiment(nxs2, [crr_numpy, crr_original])
lebels1 = ['loop_sci', 'loop_np']
labels2 = ['np', 'original']
for label1, label2, mean_time1, mean_time2 in zip(lebels1, labels2, mean_times1, mean_times2):
plt.plot(nxs1, mean_time1, label=label1)
plt.plot(nxs2, mean_time2, label=label2)
plt.xlabel('nx')
plt.ylabel('time (sec)')
plt.title('nt=1000')
plt.xlim([0, nxs2[-1]])
plt.ylim([0, 80])
plt.legend()
plt.show()
Recommended Posts