[PYTHON] Preprocessing of prefecture data

Completed code


import re
import pandas as pd
import numpy as np
import codecs
import matplotlib.pyplot as plt
from sklearn import linear_model

# Load the car data CSV into `df`. The file is decoded as Shift-JIS and
# undecodable bytes are dropped (error mode "ignore").
# NOTE(review): `<Car data>` is a placeholder left in the article, not valid
# Python; replace it with the path to the CSV file before running.
with codecs.open(<Car data>, "r", "Shift-JIS", "ignore") as file:   #http://qiita.com/niwaringo/items/d2a30e04e08da8eaa643
    df = pd.read_table(file, delimiter=",")

# Work on a copy so the raw frame `df` stays untouched.
dfx = df.copy()

# The columns from position 5 onward hold numbers stored as text. Strip the
# "," and "-" characters from every cell, then convert to numeric.
#  - `convert_objects` was deprecated and has since been removed from pandas;
#    `pd.to_numeric(..., errors="coerce")` is the replacement and, like
#    `convert_numeric=True`, turns unparseable cells (e.g. the empty string
#    left after stripping) into NaN.
#  - The columns are assigned by label instead of through `.iloc` so they are
#    replaced outright and really take the numeric dtype; current pandas
#    writes `.iloc[:, a:] = ...` into the existing object columns in place.
car_value_columns = dfx.columns[5:]
dfx[car_value_columns] = dfx[car_value_columns].apply(
    lambda column: pd.to_numeric(
        column.map(lambda x: re.sub('-', '', re.sub(',', '', x))),
        errors='coerce',
    )
)

# Load the resident-area CSV into `df2`. The file is decoded as Shift-JIS and
# undecodable bytes are dropped (error mode "ignore").
with codecs.open('Resident area.csv', "r", "Shift-JIS", "ignore") as file:
    df2 = pd.read_table(file, delimiter=",")

# The columns from position 3 onward hold numbers stored as text. Strip the
# "," and "-" characters from every cell, then convert to numeric.
#  - `convert_objects` has been removed from pandas; use
#    `pd.to_numeric(..., errors="coerce")`, which turns unparseable cells
#    into NaN just as `convert_numeric=True` did.
#  - Assign by column label rather than through `.iloc` so the columns are
#    replaced outright and really take the numeric dtype.
area_value_columns = df2.columns[3:]
df2[area_value_columns] = df2[area_value_columns].apply(
    lambda column: pd.to_numeric(
        column.map(lambda x: re.sub('-', '', re.sub(',', '', x))),
        errors='coerce',
    )
)

nan = float('nan')

# Result table: one row of R^2 scores per prefecture, seeded with a single
# all-NaN row.
# NOTE(review): the line that opened this table (and possibly further result
# columns) was lost from the source; only the five surviving columns are
# restored here. Confirm against the original article.
static_data = pd.DataFrame({
    'Resident area_signal': [nan],
    'Composite function_population_area': [nan],
    'Composite function_Automobile_area': [nan],
    'Composite function_population_Resident area': [nan],
    'Composite function_Automobile_Resident area': [nan],
})

# Rows are collected here and concatenated once after the loop;
# DataFrame.append was removed in pandas 2.0.
result_frames = [static_data]

for iteritem in range(1, 48):  # prefecture codes 1..47
    iter_shape = df2[df2['Prefecture code'] == iteritem]

    # Rows of this prefecture whose municipality name ends with one of the
    # wanted suffixes and whose business type is exactly 'Total'.
    iter_data = dfx[
        (dfx['Prefecture code'] == iteritem)
        & (
            dfx['Municipality'].str.contains('city$')
            | dfx['Municipality'].str.contains('Total$')
            | dfx['Municipality'].str.contains('town$')
            | dfx['Municipality'].str.contains('village$')
        )
        & (dfx['Business type'].str.contains('^Total$'))
    ]

    # Normalise municipality names so they can be used as the join key.
    iter_data2 = iter_data.copy()
    iter_data2['Municipality'] = (
        iter_data2['Municipality']
        .apply(lambda x: re.sub(r'City total$', 'city', x))
        .apply(lambda x: re.sub(r'^.*county', '', x))
    )

    merged = pd.merge(iter_shape, iter_data2, on='Municipality')
    if merged.empty:
        # Nothing to fit for this prefecture; LinearRegression.fit would
        # raise on zero samples.
        continue

    # Derived feature columns. The labels contain spaces, so they cannot be
    # written as keyword arguments and are passed as an unpacked dict.
    # NOTE(review): '総area' and 'Total area' look like two renderings of the
    # same column label; confirm both against the CSV header.
    merged = merged.assign(**{
        'Composite function_population': np.nan,
        'Composite function_Automobile_area':
            (merged['Total total'] ** (2 / 3)) * (merged['総area'] ** (1 / 3)),
        'Composite function_population_area':
            (merged['population総数'] ** (2 / 3)) * (merged['総area'] ** (1 / 3)),
        'Composite function_Automobile_Resident area':
            (merged['Total total'] ** (2 / 3)) * (merged['Resident area'] ** (1 / 3)),
        'Composite function_population_Resident area':
            (merged['population総数'] ** (2 / 3)) * (merged['Resident area'] ** (1 / 3)),
        'Area square root': np.sqrt(merged['Total area']),
        'Resident area square root': np.sqrt(merged['Resident area']),
        # `np.float` was removed from NumPy; the builtin float is equivalent.
        'Signal estimation': np.around(0.0027 * merged['Total total'].astype(float), 0),
    })

    # One regression through the origin (no intercept) per candidate feature.
    people_signal = linear_model.LinearRegression(fit_intercept=False)
    car_signal = linear_model.LinearRegression(fit_intercept=False)
    shape_signal = linear_model.LinearRegression(fit_intercept=False)
    liveshape_signal = linear_model.LinearRegression(fit_intercept=False)
    people_shape = linear_model.LinearRegression(fit_intercept=False)
    car_shape = linear_model.LinearRegression(fit_intercept=False)
    people_liveshape = linear_model.LinearRegression(fit_intercept=False)
    car_liveshape = linear_model.LinearRegression(fit_intercept=False)

    # scikit-learn expects 2-D inputs, hence the reshape to a single column.
    # NOTE(review): `people` is built from 'Total area'; going by its name it
    # presumably should use the population column - confirm.
    people = np.array(merged['Total area']).reshape(-1, 1)
    car = np.array(merged['Total total']).reshape(-1, 1)
    shape = np.array(merged['Area square root']).reshape(-1, 1)
    liveshape = np.array(merged['Resident area square root']).reshape(-1, 1)
    peopleShape = np.array(merged['Composite function_population_area']).reshape(-1, 1)
    carShape = np.array(merged['Composite function_Automobile_area']).reshape(-1, 1)
    peopleLiveShape = np.array(merged['Composite function_population_Resident area']).reshape(-1, 1)
    carLiveShape = np.array(merged['Composite function_Automobile_Resident area']).reshape(-1, 1)
    y_data = np.array(merged['Signal estimation']).reshape(-1, 1)

    # Every model has to be fitted before it can be scored; the fit calls
    # were missing from the source.
    for model, features in (
        (people_signal, people),
        (car_signal, car),
        (shape_signal, shape),
        (liveshape_signal, liveshape),
        (people_shape, peopleShape),
        (car_shape, carShape),
        (people_liveshape, peopleLiveShape),
        (car_liveshape, carLiveShape),
    ):
        model.fit(features, y_data)

    # R^2 of each fitted model, as a one-row frame labelled with the
    # prefecture code so rows stay identifiable.
    # NOTE(review): the result columns for people_signal, car_signal and
    # shape_signal are not present in the surviving source and are therefore
    # not reported here.
    df_result = pd.DataFrame({
        'Resident area_signal': [liveshape_signal.score(liveshape, y_data)],
        'Composite function_population_area': [people_shape.score(peopleShape, y_data)],
        'Composite function_Automobile_area': [car_shape.score(carShape, y_data)],
        'Composite function_population_Resident area': [people_liveshape.score(peopleLiveShape, y_data)],
        'Composite function_Automobile_Resident area': [car_liveshape.score(carLiveShape, y_data)],
    }, index=[iteritem])

    result_frames.append(df_result)

static_data = pd.concat(result_frames)


Read csv data

# Load the car data CSV into `df`. The file is decoded as Shift-JIS and
# undecodable bytes are dropped (error mode "ignore").
# NOTE(review): `<Car data>` is a placeholder left in the article, not valid
# Python; replace it with the path to the CSV file before running.
with codecs.open(<Car data>, "r", "Shift-JIS", "ignore") as file:   #http://qiita.com/niwaringo/items/d2a30e04e08da8eaa643
    df = pd.read_table(file, delimiter=",")

Filter the data into a form that is easy to work with

# Keep only the rows where
#   - the Regional Transport Bureau name contains 'North Sea', and
#   - the municipality name ends with 'city', 'Total', 'town' or 'village'
#     (`$` anchors the pattern to the end of the string), and
#   - the business type contains 'Total'.
bureau_names = df['Regional Transport Bureau']
municipality_names = df['Municipality']
business_types = df['Business type']

in_north_sea_bureau = bureau_names.str.contains('North Sea')
has_wanted_suffix = (
    municipality_names.str.contains('city$')
    | municipality_names.str.contains('Total$')
    | municipality_names.str.contains('town$')
    | municipality_names.str.contains('village$')
)
is_total_business = business_types.str.contains('Total')

hokkaido_data = df[in_north_sea_bureau & has_wanted_suffix & is_total_business]

Align the names between the tables so they can be joined (search and replace)

Source: http://sinhrks.hatenablog.com/entry/2014/12/06/233032

Adding the `.str` accessor to a Series makes it possible to apply a string operation to all of its elements at once.

# Among the rows whose bureau name starts with 'North Sea', strip the leading
# 'North Sea道' from the name. The result is a new Series; `df` is not modified.
#  - The column label contains spaces, so it must be selected with brackets
#    (attribute access is a syntax error).
#  - regex=True is required: pandas >= 2.0 treats the pattern as a literal
#    string by default, which would make the '^' anchor never match.
df.loc[
    df['Regional Transport Bureau'].str.contains('^North Sea'),
    'Regional Transport Bureau',
].str.replace(r'^North Sea道', '', regex=True)

Convert number strings that use a comma as the thousands separator (every three digits) to numbers

# Assigning through `.iloc` writes into `dfx` itself rather than into a
# temporary copy. The right-hand side is wrapped in parentheses so the chained
# calls can span several lines (without them the statement is a syntax error).
dfx.iloc[:, 4:] = (
    dfx.iloc[:, 4:]
    .applymap(lambda x: re.sub(',', '', x))   # drop the "," separators
    .applymap(lambda x: re.sub('-', '', x))   # drop the "-" characters
)

- `applymap` returns a new copy, so assign the result back. - `applymap` applies the function given as its argument to every element. - `re.sub(pattern, repl, string)` replaces every match of `pattern` in `string` with `repl`. - `lambda x: x ** 2` is an anonymous function.

Linear regression

from sklearn import linear_model

# Linear regression forced through the origin (fit_intercept=False fixes the
# intercept at 0).
lm = linear_model.LinearRegression(fit_intercept=False)

x_data = merged['Resident area']
y_data = merged['Total population']

# scikit-learn expects the inputs as 2-D arrays (one row per sample), so
# convert to NumPy and reshape each series into a single column.
x_data = np.array(x_data).reshape(-1, 1)
y_data = np.array(y_data).reshape(-1, 1)

# The model must be fitted before coef_, intercept_ or score() can be used;
# this call was missing.
lm.fit(x_data, y_data)

print(lm.coef_)                   # Partial regression coefficient
print(lm.intercept_)              # => 0.0, because the intercept is fixed at 0
print(lm.score(x_data, y_data))   # Coefficient of determination (R^2)


References: http://qiita.com/irs/items/4ebbfd8bef63db1892fb http://qiita.com/Attsun/items/5af3efdc241aa2fd3959 http://sinhrks.hatenablog.com/entry/2015/01/28/073327 — About the anonymous function `lambda`: http://www.lifewithpython.com/2013/01/python-anonymous-function-lambda.html

