Rumor's red book, Machine Learning Professional Series "Anomaly Detection and Change Detection" (http://ide-research.net/book/support.html#kodansha) This is an article about writing the graph in Chapter 1 in Python. ..
Like this, I draw sample accuracy and ROC curve with animation.
The entire code is here [https://github.com/matsuken92/anomaly_detection_change_detection/blob/master/anomal_detection_chap.01.ipynb)
Only simple explanations are given here, so if you want to know more details, please buy the book!
I generated similar data in Python, not exactly the same data, or searched for and plotted the data. (Especially, it was hard to find the electrocardiogram data [^ 1] ...: sweat_smile :)
[^ 1]: Please note that the abnormal red part is processed, so it is not the correct electrocardiogram data. The URL to get the data is described on GitHub.
The entire code that renders this is here
From Fig. 1.2, I tried to visualize the relevance by connecting the anomaly degree and the indicator function and drawing.
Furthermore, the branch point threshold is calculated as the point where the value of the density function is the same, and $ \ tau $ in that case is represented by the horizontal bar in the fourth graph.
The entire code that renders this is here
#Random number generation
rd.seed()
n = 1000
d_0 = rd.normal(135, 18, n) #Normal data
d_1 = rd.normal(80, 30, n) #Abnormal data
#Calculation of mean and standard deviation
m_0 = np.mean(d_0)
sd_0 = np.sqrt(np.var(d_0))
m_1 = np.mean(d_1)
sd_1 = np.sqrt(np.var(d_1))
#X-axis data generation
xx = np.linspace(0,300, 5000)
#Normal distribution density function
density_0 = st.norm.pdf(xx, m_0, sd_0)
density_1 = st.norm.pdf(xx, m_1, sd_1)
#Abnormality calculation
abnormaly_score = np.log(density_1) - np.log(density_0)
def balance_position(x_min, x_max, f1, f2, EPS=0.00001):
if abs(f1(x_max) - f2(x_max)) > EPS:
center = (x_min + x_max)/2.
if np.sign(f1(x_max) - f2(x_max)) * np.sign(f1(center) - f2(center)) < 0:
x_min = center
else:
x_max = center
x_max = balance_position(x_min, x_max, f1, f2)
else:
return x_max
return x_max
mark = balance_position(0, 200, lambda x:st.norm.pdf(x, m_0, sd_0), lambda x: st.norm.pdf(x, m_1, sd_1))
print "mark:", mark
tau_pos = np.argsort(np.abs(xx - mark))[0]
print "tau pos:", tau_pos
tau = abnormaly_score[tau_pos]
print "tau:", tau
tau2_pos = np.max(np.argsort(np.abs(abnormaly_score - tau))[0:2])
print "tau2_pos:",tau2_pos
tau2 = abnormaly_score[tau2_pos]
print "tau2:",tau2
mark2 = xx[tau2_pos]
print "mark2:",mark2
#----------------Drawing process-----------------#
n_row = 5 #Number of rows in the graph
plt.subplots(n_row, 1, sharex=True,figsize=(12,12))
gs = gridspec.GridSpec(n_row, 1, height_ratios=[3,3,3,3,3])
axs = [plt.subplot(gs[i]) for i in range(n_row) ]
#First area drawing
axs[0].hist(d_1, bins=40, color="r", alpha=0.6, range=(0,300), label="data 1")
axs[0].hist(d_0, bins=40, color="b", alpha=0.6, range=(0,300), label="data 0")
axs[0].legend(loc='best')
#Second area drawing
axs[1].plot(xx, get_density(d_1, xx), "r--", alpha=.4 , lw=2, label=r"density of data 1")
axs[1].plot(xx, get_density(d_0, xx), "b--", alpha=.4 , lw=2, label=r"density of data 0")
axs[1].legend(loc='best')
axs[1].plot([0,300],[0,0],"k")
axs[1].plot(xx, density_1, c="r", alpha=.5 )
axs[1].plot(xx, density_0, c="b", alpha=.5 )
axs[1].fill_between(xx[0:tau_pos], density_0[0:tau_pos], color="lightblue", zorder = 500, alpha=.6)
axs[1].set_ylim(0,np.max(density_0)*1.1)
axs[1].plot([mark ,mark],[-100,200], "k--", lw=.5)
axs[1].plot([mark2 ,mark2],[-100,200], "k--", lw=.5)
#Third area drawing
axs[2].plot(xx, -np.log(density_0), c="b", alpha=.5, label=r"$\ln p({\bf x}'|y=0,\mathcal{D})$")
axs[2].plot(xx, np.log(density_1), c="r", alpha=.5, label=r"$-\ln p({\bf x}'|y=1,\mathcal{D})$")
axs[2].plot([mark ,mark],[-110,200], "k--", lw=.5)
axs[2].plot([mark2 ,mark2],[-110,200], "k--", lw=.5)
axs[2].set_ylim(-25,40)
axs[2].legend(loc='best')
#4th area drawing
axs[3].plot(xx, abnormaly_score, c="purple", alpha=.6, label=r"$$ \ln{ p({\bf x}'|y=1,\mathcal{D})\over p({\bf x}'|y=0,\mathcal{D})} $$")
axs[3].set_ylim(-5,5)
axs[3].plot([mark ,mark],[-100,200], "k--", lw=.5)
axs[3].plot([mark2 ,mark2],[-100,200], "k--", lw=.5)
axs[3].plot([0 ,300],[tau, tau], "k", lw=.5)
axs[3].legend(loc='best')
#Fifth area drawing
axs[4].fill_between(xx[0:tau_pos], np.ones_like(xx[0:tau_pos]), color="blue", zorder = 500, alpha=.6)
axs[4].fill_between(xx[tau2_pos:], np.ones_like(xx[tau2_pos:]), color="blue", zorder = 500, alpha=.6)
axs[4].plot([mark, mark],[-110,200], "k--", lw=.5)
axs[4].plot([mark2, mark2],[-110,200], "k--", lw=.5)
axs[4].text(10, 1.3, r"$I[a(x) \ge \tau]$")
axs[4].set_ylim(0,5)
#Fixed range of x in all areas
for ax in axs:
ax.set_xlim(0,300)
plt.subplots_adjust(hspace=0)
Similar to Figure 1.2, the density function of the normal distribution estimated from the histogram, kernel density, and mean / variance is drawn and the "information amount" is plotted. The degree of anomaly simply increases as the distance from the average of the data increases.
The entire code that renders this is here
#For unlabeled data
#Random number generation
n = 1000
data = rd.normal(80, 15, n)
#Calculation of mean and standard deviation
m = np.mean(data)
sd = np.sqrt(np.var(data))
#X-axis data generation
xx = np.linspace(0,300, 5000)
#Normal distribution density function
density = st.norm.pdf(xx, m, sd)
#----------------Drawing process-----------------#
n_row = 3 #Number of rows in the graph
xx = np.linspace(0,300, 5000) #X-axis data generation
plt.subplots(n_row, 1, sharex=True,figsize=(12,10))
gs = gridspec.GridSpec(n_row, 1, height_ratios=[3,3,3])
axs = [plt.subplot(gs[i]) for i in range(n_row) ]
#First area drawing
axs[0].hist(data, bins=50, range=(0,300), label="data: a", color="b", alpha=0.5)
axs[0].set_ylim(0,200)
axs[0].legend(loc='best')
#Second area drawing
axs[1].plot(xx, get_density(data, xx), lw=2, linestyle="--", label="density of the data", color="b", alpha=0.4)
axs[1].plot(xx, density, lw=1, label=r"estimated density of norm dist: $p({\bf x}'|\mathcal{D})$", color="b", alpha=0.5)
axs[1].legend(loc='best')
#Third area drawing
axs[2].plot(xx, -np.log(density), lw=1, label=r"information:$-\ln p({\bf x}'|\mathcal{D})$", color="b", alpha=0.5)
axs[2].legend(loc='best')
#Fixed range of x in all areas
for ax in axs:
ax.set_xlim(0,300)
plt.subplots_adjust( hspace=0)
I tried to animate the relationship between normal sample accuracy, abnormal sample accuracy and ROC curve. In addition, the details of the ROC curve were previously explained in "[Statistics] What is the ROC curve? ](Http://qiita.com/kenmatsu4/items/550b38f4fa31e9af6f4f) "is also written, so please refer to it.
In this figure, the sample accuracy is shown below.
r_0 = \log(1 + x) \\
r_1 = \log(e -x)
The graph in the second row represents the F value, and the definition is as follows.
f \equiv { 2r_0 r_1 \over r_0 + r_1 }
Since the ROC curve is $ (X, Y) = (1 --r_0 (\ tau), \ r_1 (\ tau)) $, the y coordinate value of the point that traces the red line uses $ 1-y $. The coordinates are displayed on the animation, and the number on the right side of the two corresponds to this.
The entire code that renders this is here
def animate(nframe):
global num_frame
sys.stdout.write(str(int(float(nframe)/num_frame*100)) + "%, ")
plt.clf()
#Minimum and maximum values of x
xmin = 0
xmax = np.e -1
#Number of divisions of x
sx = num_frame * 2
#present location
pos = nframe * 2
#x-axis generation
xx = np.linspace(xmin, xmax, sx)
#Specimen accuracy
cx1 = np.log(1+xx)
cx2 = np.log(np.e -xx)
#First graph drawing-----------------------
plt.subplot(311)
plt.title("Sample accuracy. x={0:.2f}".format(xx[pos]))
plt.xlim(xmin, xmax)
plt.ylim(0,1)
#Draw a curve
plt.plot(xx,cx1,linewidth=2)
plt.plot(xx,cx2,linewidth=2)
#Drawing points and coordinate values
plt.scatter(xx[pos],cx1[pos], c="g", s=40, zorder=100)
plt.text(xx[pos]+.01,cx1[pos]-.05,"{0:.3f}, {1:.3f}".format(cx1[pos], 1-cx1[pos]), size=16)
plt.scatter(xx[pos],cx2[pos], c="b", s=40, zorder=100)
plt.text(xx[pos]-.20,cx2[pos]-.05,"{0:.3f}".format(cx2[pos]), size=16)
#Dotted line drawing
plt.plot([xx[pos], xx[pos]], [0,1], "k--", alpha=.5, lw=2 )
plt.plot([0, xx[pos]], [cx1[pos],cx1[pos]], "k--", alpha=.5, lw=2 )
plt.plot([0, xx[pos]], [cx2[pos],cx2[pos]], "k--", alpha=.5, lw=2 )
plt.text(.08, .42, r"normal:$r_0$", color="r", size=16)
plt.text(1.2, .42, r"anomalous:$r_1$", color="#2F79B0", size=16)
#Second graph drawing-----------------------
plt.subplot(312)
plt.title("F-value")
plt.xlim(xmin, xmax)
plt.ylim(0,1)
F = 2*cx1*cx2/(cx1+cx2)
F_pos = 2*cx1[pos]*cx2[pos]/(cx1[pos]+cx2[pos])
plt.scatter(xx[pos], F_pos, c="g", alpha=1, s=50)
plt.plot(xx, F)
plt.plot([xx[pos], xx[pos]], [0,1], "k--", alpha=.5, lw=2 )
plt.tight_layout()
#Third graph drawing-----------------------
plt.subplot(313)
plt.title("ROC Curve.")
plt.xlim(0,1)
plt.ylim(0,1)
#Drawing curves and points
plt.plot(1-cx1, cx2, linewidth=2, color="b", alpha=.4)
plt.scatter(1-cx1[pos],cx2[pos], c="b", s=40, zorder=100)
plt.text(1-cx1[pos]+.01,cx2[pos]-.05, "{0:.3f}, {1:.3f}".format(1-cx1[pos], cx2[pos]), size=16)
plt.xlabel(r"$1-r_0$", size=16)
plt.ylabel(r"$r_1$", size=16)
plt.tight_layout()
num_frame = 50
fig = plt.figure(figsize=(5,10))
anim = ani.FuncAnimation(fig, animate, frames=num_frame, blit=True)
anim.save('ROC_curve2.gif', writer='imagemagick', fps=3, dpi=60)
"Abnormality detection and change detection" by Tsuyoshi Ide and Masashi Sugiyama (Machine Learning Professional Series) http://ide-research.net/book/support.html
Recommended Posts