Private Python handbook (updated from time to time)

from __future__ import division, unicode_literals

General

#Count occurrences of each element
import numpy as np
from collections import defaultdict
cnt_dict = defaultdict(int)

data = np.random.randint(low=0, high=5, size=500)
for d in data:
    cnt_dict[d] += 1

print cnt_dict

out


defaultdict(<type 'int'>, {0: 90, 1: 113, 2: 94, 3: 96, 4: 107})
#Library version check
from distutils.version import LooseVersion

#Example of use (assumes tensorflow has been imported as tf)
assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
#More convenient counting method
import numpy as np
from collections import Counter

data1 = np.random.randint(low=0, high=5, size=300)
cnt1 = Counter(data1)
print cnt1

data2 = np.random.randint(low=0, high=10, size=500)
cnt2 = Counter(data2)
print cnt2

print cnt1 + cnt2

out


Counter({3: 65, 0: 64, 1: 60, 4: 60, 2: 51})
Counter({4: 58, 8: 58, 1: 55, 6: 54, 0: 53, 2: 49, 3: 47, 5: 46, 7: 40, 9: 40})
Counter({4: 118, 0: 117, 1: 115, 3: 112, 2: 100, 8: 58, 6: 54, 5: 46, 7: 40, 9: 40})
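Counter also exposes the most frequent elements directly; a quick sketch using cnt1 from above:

#Top three most common values and their counts
print cnt1.most_common(3)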
#Pickle
import cPickle as pickle
def unpickle(filename):
    with open(filename, 'rb') as fo:
        _dict = pickle.load(fo)
    return _dict

def to_pickle(filename, obj):
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, -1)
        # pickle.Pickler(f, 2).dump(obj)
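A round trip with the helpers above (the path is just an example):

obj = {"a": 1, "b": [1, 2, 3]}
to_pickle("../data/obj.pkl", obj)
print unpickle("../data/obj.pkl")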
#Extracting links from web pages
from bs4 import BeautifulSoup
import requests

url = 'http://headlines.yahoo.co.jp/rss/list'
res = requests.get(url)
news_all = BeautifulSoup(res.text, "html.parser")  #the page is HTML, not XML
for link in news_all.find_all('a'):
    print link.get('href')
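To keep only the feed URLs, filter the hrefs; a sketch that assumes the RSS links contain 'rss' in their path:

rss_links = [a.get('href') for a in news_all.find_all('a')
             if a.get('href') and 'rss' in a.get('href')]
print len(rss_links)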

#Save & load with HDF5 (deepdish)
import deepdish as dd

dd.io.save("../data/df_test.h5", df_test)
df_test = dd.io.load("../data/df_test.h5")
#Set the number of decimal places displayed (IPython magic)
%precision 4
np.pi

out


3.1416

Plotting related

#Standard import block
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime as dt
import sys
plt.style.use('ggplot')

#When using Tex
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

Pandas DataFrame

Convert string columns that look like numbers to numeric types

# http://stackoverflow.com/questions/21197774/assign-pandas-dataframe-column-dtypes

In [11]: df
Out[11]: 
   x  y
0  a  1
1  b  2

In [12]: df.dtypes
Out[12]: 
x    object
y    object
dtype: object

In [13]: df.convert_objects(convert_numeric=True)
Out[13]: 
   x  y
0  a  1
1  b  2

In [14]: df.convert_objects(convert_numeric=True).dtypes
Out[14]: 
x    object
y     int64
dtype: object
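convert_objects was deprecated and later removed from pandas; a sketch of the per-column replacement with pd.to_numeric (pandas >= 0.17):

#errors='ignore' leaves unparseable columns (like x) untouched
df2 = df.apply(lambda col: pd.to_numeric(col, errors='ignore'))
print df2.dtypes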
#Handling categorical variables (factor type in R)
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

df = sns.load_dataset("tips")
for c in ['sex', 'smoker', 'day', 'time',]:
    df["c{}".format(c)] = pd.Categorical.from_array(df[c]).codes
df.head()

out


   total_bill   tip     sex smoker  day    time  size  csex  csmoker  cday     ctime  
0       16.99  1.01  Female     No  Sun  Dinner     2     0        0     2         0 
1       10.34  1.66    Male     No  Sun  Dinner     3     1        0     2         0 
2       21.01  3.50    Male     No  Sun  Dinner     3     1        0     2         0
3       23.68  3.31    Male     No  Sun  Dinner     2     1        0     2         0
4       24.59  3.61  Female     No  Sun  Dinner     4     0        0     2         0
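pd.Categorical.from_array was later deprecated; the same integer codes come from the category dtype (a sketch, pandas >= 0.15):

for c in ['sex', 'smoker', 'day', 'time']:
    df["c{}".format(c)] = df[c].astype('category').cat.codes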

Date related

from datetime import datetime
now = datetime.now()
now.strftime("%Y-%m-%d %a %H:%M:%S")

out


'2015-08-13 Thu 16:41:25'
from dateutil.parser import parse
parse("2015-3-25 21:43:15")

out


datetime.datetime(2015, 3, 25, 21, 43, 15)
datestrs = ['2011/7/6 12:00:00', None, '2011/8/6 21:00:00']
pd.to_datetime(datestrs)

out


DatetimeIndex(['2011-07-06 12:00:00', 'NaT', '2011-08-06 21:00:00'], dtype='datetime64[ns]', freq=None, tz=None)

#Date duplication check
dates = pd.DatetimeIndex(['2000/1/1', '2000/1/2', '2000/1/2', '2000/1/2','2000/1/3'])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts.index.is_unique

out


False
dup_ts.groupby(level=0).count()

out


2000-01-01    1
2000-01-02    3
2000-01-03    1
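To drop the duplicated timestamps while keeping the first occurrence, a sketch with Index.duplicated:

dup_ts[~dup_ts.index.duplicated()]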
#Date data generation by specifying a range
dft = pd.date_range(start='2000-1-1', end='2001-1-1', freq='H')
dft

out


DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:00:00',
               '2000-01-01 02:00:00', '2000-01-01 03:00:00',
               '2000-01-01 04:00:00', '2000-01-01 05:00:00',
               '2000-01-01 06:00:00', '2000-01-01 07:00:00',
               '2000-01-01 08:00:00', '2000-01-01 09:00:00', 
               ...
               '2000-12-31 15:00:00', '2000-12-31 16:00:00',
               '2000-12-31 17:00:00', '2000-12-31 18:00:00',
               '2000-12-31 19:00:00', '2000-12-31 20:00:00',
               '2000-12-31 21:00:00', '2000-12-31 22:00:00',
               '2000-12-31 23:00:00', '2001-01-01 00:00:00'],
              dtype='datetime64[ns]', length=8785, freq='H', tz=None)
#Fill in missing dates (resample)
dates = pd.DatetimeIndex(['2000/1/1', '2000/1/5', '2000/1/8', '2000/1/9'])
df = pd.DataFrame(np.random.normal(0,1,size=len(dates)), columns=["num"], index=dates)
print "[Before]"
print df
df = df.resample('D')
print "[After]"
print df

out


[Before]
                 num
2000-01-01  1.201939
2000-01-05  0.522156
2000-01-08  1.800669
2000-01-09 -0.834700

[After]
                 num
2000-01-01  1.201939
2000-01-02       NaN
2000-01-03       NaN
2000-01-04       NaN
2000-01-05  0.522156
2000-01-06       NaN
2000-01-07       NaN
2000-01-08  1.800669
2000-01-09 -0.834700
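Note: from pandas 0.18 resample() returns a Resampler rather than a DataFrame; the equivalent call there would be:

df = df.resample('D').asfreq()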
#Localization
dates = pd.DatetimeIndex(['2000/1/1', '2000/1/5', '2000/1/8', '2000/1/9'])
print repr(dates.tz)
print dates
#Localize to Japan time (e.g. 00:00:00 is interpreted as Japan time)
dates = dates.tz_localize("Japan")
print dates
#Convert to US East Coast time (the underlying instant does not change)
print dates.tz_convert('US/Eastern')

out


None
DatetimeIndex(['2000-01-01', '2000-01-05', '2000-01-08', '2000-01-09'], dtype='datetime64[ns]', freq=None, tz=None)
DatetimeIndex(['2000-01-01 00:00:00+09:00', '2000-01-05 00:00:00+09:00',
               '2000-01-08 00:00:00+09:00', '2000-01-09 00:00:00+09:00'],
              dtype='datetime64[ns]', freq=None, tz='Japan')
DatetimeIndex(['1999-12-31 10:00:00-05:00', '2000-01-04 10:00:00-05:00',
               '2000-01-07 10:00:00-05:00', '2000-01-08 10:00:00-05:00'],
              dtype='datetime64[ns]', freq=None, tz='US/Eastern')
rng = pd.period_range('2014/1/1', '2015/3/31', freq='M')
print rng

ser = pd.Series(np.random.randn(rng.size), index=rng)
print ser

values = ['2014Q3','2014Q4','2015Q1', '2015Q2']
index = pd.PeriodIndex(values, freq='Q-DEC')
df = pd.DataFrame(np.random.randn(index.size), index=index)
print df

out


PeriodIndex(['2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06',
             '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12',
             '2015-01', '2015-02', '2015-03'],
            dtype='int64', freq='M')
2014-01    0.273280
2014-02   -0.231141
2014-03    0.251094
2014-04   -1.217927
2014-05    0.341373
2014-06   -0.931357
2014-07   -0.414243
2014-08   -1.876341
2014-09    1.152908
2014-10   -0.473921
2014-11    0.527473
2014-12   -0.529911
2015-01   -0.656616
2015-02    0.742319
2015-03   -0.268112
Freq: M, dtype: float64
               0
2014Q3  0.011621
2014Q4 -0.029027
2015Q1 -0.222156
2015Q2 -0.749983
#Calendar-year (CY) quarters
values = ['2014Q3','2014Q4','2015Q1', '2015Q2']
index = pd.PeriodIndex(values, freq='Q-DEC')
print index
print index.asfreq('M',how='start')
print index.asfreq('M',how='end')
print index.asfreq('D',how='start')
print index.asfreq('D',how='end')

out


PeriodIndex(['2014Q3', '2014Q4', '2015Q1', '2015Q2'], dtype='int64', freq='Q-DEC')
PeriodIndex(['2014-07', '2014-10', '2015-01', '2015-04'], dtype='int64', freq='M')
PeriodIndex(['2014-09', '2014-12', '2015-03', '2015-06'], dtype='int64', freq='M')
PeriodIndex(['2014-07-01', '2014-10-01', '2015-01-01', '2015-04-01'], dtype='int64', freq='D')
PeriodIndex(['2014-09-30', '2014-12-31', '2015-03-31', '2015-06-30'], dtype='int64', freq='D')
#Fiscal-year (FY) quarters, year ending in March
values = ['2014Q3','2014Q4','2015Q1', '2015Q2']
index = pd.PeriodIndex(values, freq='Q-MAR')
print index
print index.asfreq('M',how='start')
print index.asfreq('M',how='end')
print index.asfreq('D',how='start')
print index.asfreq('D',how='end')

out


PeriodIndex(['2014Q3', '2014Q4', '2015Q1', '2015Q2'], dtype='int64', freq='Q-MAR')
PeriodIndex(['2013-10', '2014-01', '2014-04', '2014-07'], dtype='int64', freq='M')
PeriodIndex(['2013-12', '2014-03', '2014-06', '2014-09'], dtype='int64', freq='M')
PeriodIndex(['2013-10-01', '2014-01-01', '2014-04-01', '2014-07-01'], dtype='int64', freq='D')
PeriodIndex(['2013-12-31', '2014-03-31', '2014-06-30', '2014-09-30'], dtype='int64', freq='D')
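A PeriodIndex can also be converted back to concrete timestamps; a quick sketch:

print index.to_timestamp(how='start')
print index.to_timestamp(how='end')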
# time zone change from UTC to JST (for a naive datetime)
import datetime, pytz
utc = pytz.timezone('UTC')
jst = pytz.timezone('Asia/Tokyo')
now = datetime.datetime.utcnow()  #naive timestamp taken in UTC
updated = now.replace(tzinfo=utc).astimezone(jst)
print "time:{}".format(updated)

out


time:2015-08-22 02:46:23.844806+09:00

Time series

ts = pd.Series(np.random.randn(1000), index=pd.date_range('2010/1/1', periods=1000))
ts = ts.cumsum()
ts.plot(color="b", alpha=0.5, figsize=(10,6))

#Simple moving average
pd.rolling_mean(ts, 40, center=True).plot(style='-', c='r', alpha=0.8,)
pd.rolling_mean(ts, 180, center=True).plot(style='-', c='blue', alpha=0.9,zorder=100)

(figure: ts_001.png — cumulative series with 40- and 180-point moving averages)
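In pandas >= 0.18 rolling_mean is replaced by the rolling() method; the same smoothing would read:

ts.rolling(40, center=True).mean().plot(style='-', c='r', alpha=0.8)
ts.rolling(180, center=True).mean().plot(style='-', c='blue', alpha=0.9, zorder=100)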

#You can slice by date!
ts['2010/12/31':]

#Correlogram drawing
import statsmodels.tsa.stattools as stt

plt.figure(figsize=(10,5)) 
acf = stt.acf(np.array(ts), nlags=60)      #ACF calculation (pass nlags by keyword; the second positional arg is unbiased)
plt.bar(range(len(acf)), acf, width = 0.3) #display
plt.show()


pcf = stt.pacf(np.array(ts), nlags=50)
plt.figure(figsize=(10,5))
plt.bar(range(len(pcf)), pcf, width = 0.3) 
plt.show()

(figures: acf.png, pacf.png — ACF and PACF correlograms)

# ARMA(3, 0)Process sample generation
from statsmodels.tsa.arima_process import arma_generate_sample
ar_params = np.array([0.30, 0.50, -0.10])
ma_params = np.array([0.00])
ar_params = np.r_[1, -ar_params]  #AR polynomial: coefficients enter with flipped sign
ma_params = np.r_[1, ma_params]   #MA polynomial: coefficients are not negated
nobs = 250
y = arma_generate_sample(ar_params, ma_params, nobs)
ts = pd.Series(y, index=pd.date_range('2010/1/1', periods=nobs))

ts.plot(color="b", alpha=0.5, figsize=(10,6))

plt.figure(figsize=(10,5)) 
acf = stt.acf(np.array(ts), nlags=60)        #ACF calculation
ts_acf = pd.Series(acf, index=pd.date_range('2010/1/1', periods=len(acf)))
ts_acf.plot(kind='bar', figsize=(10,5), color="b", alpha=0.5)
plt.show()

pacf = stt.pacf(np.array(ts), 50)
ts_pacf = pd.Series(pacf, index=pd.date_range('2010/1/1', periods=len(pacf)))
ts_pacf.plot(kind='bar', figsize=(10,5), color="g", alpha=0.5)
plt.show()

(figures: arma_3_0_ts.png, arma_3_0_acf.png, arma_3_0_pacf.png — simulated ARMA(3, 0) series with its ACF and PACF)

import statsmodels.graphics.tsaplots as tsaplots

fig = plt.figure(figsize=(12,5)) 
ax = fig.add_subplot(111)
tsaplots.plot_acf(ts, ax=ax, color="g")

plt.show()

(figure: arma_3_0_acf_intvl.png — ACF with confidence intervals)

#ARMA test
from statsmodels.tsa import arima_model
arma = arima_model.ARMA(y, order = [3,0]).fit()
print arma.summary()

out


                              ARMA Model Results                              
==============================================================================
Dep. Variable:                      y   No. Observations:                  250
Model:                     ARMA(3, 0)   Log Likelihood                -357.274
Method:                       css-mle   S.D. of innovations              1.009
Date:                Thu, 13 Aug 2015   AIC                            724.548
Time:                        17:57:45   BIC                            742.155
Sample:                             0   HQIC                           731.634
                                                                              
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.0262      0.187      0.140      0.889        -0.341     0.393
ar.L1.y        0.2256      0.063      3.586      0.000         0.102     0.349
ar.L2.y        0.4945      0.057      8.699      0.000         0.383     0.606
ar.L3.y       -0.0569      0.064     -0.895      0.371        -0.181     0.068
                                    Roots                                    
=============================================================================
                 Real           Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            1.2968           +0.0000j            1.2968            0.0000
AR.2           -1.5205           +0.0000j            1.5205            0.5000
AR.3            8.9145           +0.0000j            8.9145            0.0000
-----------------------------------------------------------------------------
#Confirmation of ARMA residuals
resid = arma.resid

plt.figure(figsize=(15,5))
plt.bar(range(len(resid)), resid, width=0.5)
plt.show()

plt.figure(figsize=(15,5))
acf = stt.acf(resid, nlags=len(resid))
plt.bar(range(len(acf)), acf, width=0.5, color="g")
plt.show()

fig = plt.figure(figsize=(15,5)) 
ax = fig.add_subplot(111)
tsaplots.plot_acf(resid, ax=ax, color="pink")

plt.show()

(figures: resid_plot.png, resid_acf.png, resid_acf2.png — ARMA residuals and their ACF)

# Ljung-Box Q-statistic for autocorrelation parameters
lbs = stt.q_stat(acf[1:], len(ts))           #statsmodels expects the ACF excluding lag 0 as input
plt.figure(figsize=(12,6))
plt.bar(range(len(lbs[1])), lbs[1])

(figure: lbs.png — Ljung-Box p-values by lag)

#Fill in missing values with 0
df_data.fillna(0)   #returns a new DataFrame; the original is unchanged
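Other fill strategies work the same way; for example, forward-filling the last observation (df_data assumed defined elsewhere):

df_data.fillna(method='ffill')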

Spark

import os, sys
from datetime import datetime as dt
print "loading PySpark setting..."
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))

out



loading PySpark setting...
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.5.0
      /_/

Using Python version 2.7.10 (default, May 28 2015 17:04:42)
SparkContext available as sc, HiveContext available as sqlContext.
#Split data for Cross Validation
import numpy as np
from sklearn.datasets import load_iris
from pyspark.mllib.regression import LabeledPoint

def parsePoint(vec):
    return LabeledPoint(vec[0], vec[1:])

iris = load_iris()
dat = np.column_stack([iris.target[:], iris.data[:,0], iris.data[:,2]])
data = sc.parallelize(dat)   #RDD conversion
parsedData = data.map(parsePoint)  #Convert each row to a LabeledPoint

#Divided into training data and test data
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
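A quick sanity check that the split is roughly 70/30:

print trainingData.count(), testData.count()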


