In data-analysis competitions such as Kaggle and Signate, acting quickly at the start matters, so I keep my frequently used Jupyter Notebook templates organized here to speed up that initial work. Updated from time to time.
2020.5.25 Random number setting change (functionalization)
import template
import pandas as pd
import numpy as np
import pandas_profiling as pdp
import lightgbm as lgb
import random
from numba import jit
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.dates import DateFormatter
%matplotlib inline
import seaborn as sns
def seed_everything(seed):
    """Fix every random seed (random, NumPy, PYTHONHASHSEED, and
    TensorFlow when it is already loaded) for reproducibility.

    Parameters
    ----------
    seed : int
        Seed value applied to all random-number sources.
    """
    # Imported locally: at this point in the notebook `os`/`sys`
    # have not been imported yet (os is imported further below),
    # so the original raised NameError here.
    import os
    import sys

    random.seed(seed)
    np.random.seed(seed)
    # Makes hash()-based ordering (e.g. set iteration) deterministic.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The original tested `"tr" in sys.modules`, which never matches
    # the TensorFlow module name; check "tensorflow" instead, and
    # import it lazily so the template also works without TensorFlow.
    if "tensorflow" in sys.modules:
        import tensorflow as tf
        tf.random.set_seed(seed)


seed_everything(28)
#Maximum rows/columns pandas displays before abbreviating output
#with "..." (here: 100 rows, 50 columns).
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
#Automatically reload edited modules before executing code.
%load_ext autoreload
%autoreload 2
import os
# windows
if os.name == 'nt':
path = '../input/data/'
import japanize_matplotlib
sns.set(font="IPAexGothic")
elif os.name == 'posix':
# Kaggle
if 'KAGGLE_DATA_PROXY_TOKEN' in os.environ.keys():
path = '/kaggle/input/'
# Google Colab
else:
from google.colab import drive
drive.mount('/content/drive')
!ls drive/My\ Drive/'Colab Notebooks'/xxx #xxx rewrite
path = "./drive/My Drive/Colab Notebooks/xxx/input/data/" #xxx rewrite
#Check the remaining time of the session
!cat /proc/uptime | awk '{print $1 /60 /60 /24 "days (" $1 / 60 / 60 "h)"}'
print(os.name)
print(path)
Identify the platform running Python
import os

# Skeleton for platform-specific setup: replace each `pass` (xxx)
# with your own code. The original branches contained only comments
# and therefore did not parse; `pass` placeholders make the template
# valid Python.
# Windows = local machine.
if os.name == 'nt':
    pass  # xxx: local Windows setup
elif os.name == 'posix':
    # Kaggle kernels expose this proxy token in the environment.
    if 'KAGGLE_DATA_PROXY_TOKEN' in os.environ.keys():
        pass  # xxx: Kaggle setup
    # Google Colab
    else:
        pass  # xxx: Colab setup

print(os.name)
When you edit an imported library or module, it is automatically reloaded at the next execution — no kernel restart needed.
%load_ext autoreload
%autoreload 2
reference https://qiita.com/Accent/items/f6bb4d4b7adf268662f4
If you want to speed things up, it's important to find the bottleneck first. These tools are meant for notebooks such as Jupyter. Easy: %%time is useful when you want the processing time of a whole cell. Detailed: %lprun is useful when you want the processing time of each individual line.
4.1 %%time — put it at the beginning of the cell. It displays the execution time of the entire cell.
%%time
def func(num):
    """Return the sum 0 + 1 + ... + (num - 1).

    Deliberately written as a plain Python loop so that %%time has
    something measurable to report.
    """
    total = 0  # renamed from `sum`, which shadowed the builtin
    for i in range(num):
        total += i
    return total


out = func(10000)
4.2 %lprun — outputs the execution time of each line. %%prun reports per module/function, which can be hard to interpret; %lprun is easier to understand because it reports line by line.
Below, 3 steps. Step0. Installation Step1. Load Step2. Execution
Skip if already installed. Commands for Google Colab and the Kaggle cloud:
!pip install line_profiler
%load_ext line_profiler
def func(num):
    """Sum 0..num-1 with an explicit loop so %lprun can show
    per-line timings.
    """
    total = 0  # renamed from `sum`, which shadowed the builtin
    for i in range(num):
        total += i
    return total
%lprun -f func out = func(10000)
If you are using a cloud platform such as Google Colab, changing system settings can be difficult. japanize_matplotlib makes this relatively easy because it automatically installs the Japanese fonts and packages on import.
Note that seaborn also sets the font when importing, so execute sns.set at the end.
import seaborn as sns
# Importing japanize_matplotlib registers Japanese fonts with matplotlib.
import japanize_matplotlib
sns.set(font="IPAexGothic") ### seaborn resets fonts when configured, so be sure to run this last
When displaying the DataFrame of pandas, it is abbreviated (...) after a certain number of rows / columns. Set the maximum number of displays to control omission.
#Maximum rows/columns pandas displays before abbreviating output
#with "..." (here: 100 rows, 50 columns).
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
The type is automatically set from the range of numerical values in the data frame.
Reference by @gemartin https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
def reduce_mem_usage(df, use_float16=False):
    """Iterate through all the columns of a dataframe and downcast each
    numeric column to the smallest dtype that can hold its value range,
    to reduce memory usage. Object columns become pandas categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Modified in place and also returned.
    use_float16 : bool, default False
        Allow downcasting floats to float16. Keep False when the frame
        will be saved to feather, which does not support float16.

    Returns
    -------
    pandas.DataFrame
        The same frame with downcast dtypes.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        # Skip datetime and categorical columns: they have no numeric
        # range to downcast. The isinstance check replaces the
        # deprecated pandas.api.types.is_categorical_dtype.
        if is_datetime(df[col]) or isinstance(df[col].dtype, pd.CategoricalDtype):
            continue
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Integer limits are exact, so use inclusive bounds: a
                # column whose max is exactly iinfo(int8).max still fits
                # in int8 (the original strict </> comparisons needlessly
                # promoted such boundary columns to the next wider type).
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                # Floats keep strict bounds: values at the very edge of a
                # narrower float range lose precision when downcast.
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            # Object columns (e.g. strings) are stored far more compactly
            # as categoricals when they contain repeated values.
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
If you do not fix the seed of the random number, the prediction result will change every time and the effect will be difficult to understand, so fix it.
Module settings related to random numbers
import numpy as np
import random
import os
def seed_everything(seed):
    """Fix every random seed (random, NumPy, PYTHONHASHSEED, and
    TensorFlow when it is already loaded) for reproducibility.

    Parameters
    ----------
    seed : int
        Seed value applied to all random-number sources.
    """
    import sys  # not imported elsewhere in this snippet

    random.seed(seed)
    np.random.seed(seed)
    # Makes hash()-based ordering (e.g. set iteration) deterministic.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The original tested `"tr" in sys.modules`, which never matches
    # the TensorFlow module name; check "tensorflow" instead, and
    # import it lazily so the template also works without TensorFlow.
    if "tensorflow" in sys.modules:
        import tensorflow as tf
        tf.random.set_seed(seed)
LightGBM parameters related to random numbers
# Pin every LightGBM seed-related parameter to the same value so that
# bagging, feature sampling and data shuffling are all reproducible.
lgb_params = dict(
    random_state=28,
    bagging_fraction_seed=28,
    feature_fraction_seed=28,
    data_random_seed=28,
    seed=28,
)
Recommended Posts