2. Multivariate analysis spelled out in Python 7-3. Decision tree [regression tree]

⑴ Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor #A class that creates a regression tree model

⑵ Data acquisition and reading

#Load the Boston housing data as a Bunch with data, target and feature_names.
#load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so when the
#import is unavailable the same arrays are rebuilt from the original data source.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    #After a 22-line header, the raw file stores each sample on two physical lines:
    #even rows carry the first 11 values, odd rows the last 2 features and the target
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    boston_dataset = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )
else:
    boston_dataset = load_boston()

**Build a regression tree model that predicts house prices from the 13 explanatory variables that characterize each house.**

#Build a DataFrame of the explanatory variables, one named column per feature
boston = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
)

#Inspect the features: first 5 rows, column names, then the shape
for summary in (boston.head(5), boston.columns, boston.shape):
    print(summary)


#Append the objective variable as the 'MEDV' column
boston['MEDV'] = boston_dataset.target

#Inspect again now that the objective column is present: first 5 rows, then the shape
for summary in (boston.head(5), boston.shape):
    print(summary)


⑶ Data division

#Helper that splits arrays into training and test subsets
from sklearn.model_selection import train_test_split

#Work on the underlying NumPy array rather than the DataFrame
array = boston.values

#Columns 0-12 are the explanatory variables; column 13 is the objective variable
X = array[:, :13]
Y = array[:, 13]

#Reserve 30% of the samples for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
)

⑷ Construction of regression tree model

#Configure a regression tree limited to at most 20 leaf nodes
reg = DecisionTreeRegressor(max_leaf_nodes=20)

#Fit on the training split; the value returned by fit() is kept as the model
model = reg.fit(X=X_train, y=Y_train)


⑸ Evaluation of regression tree model

➀ Confirm the validity of the forecast

#Import Python standard pseudo-random number module
import random

#Pick one sample index at random (named sample_id so the builtin id() is not shadowed)
sample_id = random.randrange(X.shape[0])

#Take that sample as a single-row 2-D array, the input shape predict() is given here;
#-1 lets NumPy infer the number of features instead of hard-coding 13
x = X[sample_id].reshape(1, -1)

#Predict the house price from the explanatory variables
YHat = model.predict(x)

#Put the sample's explanatory variables in a DataFrame and append the predicted value
df = pd.DataFrame(x, columns=boston_dataset.feature_names)
df["Predicted Price"] = YHat

#Show the result; previously the DataFrame was built but never displayed
print(df)




➁ Check the coefficient of determination as an indicator of generalization performance

#r2_score computes the coefficient of determination (R^2)
from sklearn.metrics import r2_score

#Predict prices for the test samples
YHat = model.predict(X_test)

#Score the predictions against the true test targets and report the result
r2 = r2_score(y_true=Y_test, y_pred=YHat)
print("R^2 = ", r2)


⑹ Visualization of regression tree model

#Import sklearn tree module
from sklearn import tree

#Module to display images in Notebook
from IPython.display import Image

#Module for visualizing decision tree model
import pydotplus

#Convert decision tree model to DOT data.
#class_names is omitted: it applies only to classification trees, and a bare
#string is rejected by scikit-learn's parameter validation in recent releases.
#feature_names is passed as a plain list, which every release accepts.
dot_data = tree.export_graphviz(model,
                                out_file=None,
                                feature_names=list(boston_dataset.feature_names),
                                filled=True)

#Draw a diagram
graph = pydotplus.graph_from_dot_data(dot_data)

#View diagram (renders inline when this is the last expression of a notebook cell)
Image(graph.create_png())


