[PYTHON] Date feature generation memo

Notes on code for generating features and target variables from date-type data without using featuretools.

datefeaturetool.py


import datetime
import numpy as np
import pandas as pd

class DeltaDate():
    """
    Date feature generation.

    Measures the time elapsed between each date in a pandas Series/DataFrame
    and a fixed cutoff date, in whole days, months or years, and derives
    "within n units" indicator features from that difference.

    Two date representations are handled, and the cutoff is expected to use
    the same one as the data:
      * datetime64[ns] data with a pandas.Timestamp / datetime.date cutoff
      * numeric YYYYMMDD data (e.g. 20200202) with a numeric cutoff
    """

    # Accepted spellings for each supported frequency.
    _DAY_FREQS = ('d', 'D', 'day', 'Day')
    _MONTH_FREQS = ('m', 'M', 'month', 'Month')
    _YEAR_FREQS = ('y', 'Y', 'year', 'Year')

    def __init__(self, cutoff_date):
        """
        cutoff_date: datetime.date(2020, 2, 2) or pandas.Timestamp('2020-02-02')
                     int, numpy.int64, float, numpy.float64 (YYYYMMDD number)

        Plain datetime.date / datetime.datetime values are converted to
        pandas.Timestamp; every other value is stored unchanged and is
        validated when a delta is actually computed.
        """
        # pandas.Timestamp is itself a datetime.date subclass, so it must be
        # excluded explicitly to avoid a pointless conversion (and message).
        is_plain_date = (isinstance(cutoff_date, datetime.date)
                         and not isinstance(cutoff_date, pd.Timestamp))
        if is_plain_date:
            self.cutoff_date = pd.to_datetime(cutoff_date)
            print('cutoff_date converted from datetime.date type to pandas.Timestamp type.')
        else:
            self.cutoff_date = cutoff_date

    @staticmethod
    def _label(name):
        """Return a Series/column label as text ('' when the label is None)."""
        return '' if name is None else str(name)

    @staticmethod
    def _split_yyyymmdd(value):
        """Split a numeric YYYYMMDD value into integer (year, month, day)."""
        value = int(value)
        return value // 10000, value // 100 % 100, value % 100

    @staticmethod
    def _whole_periods(year_gap, month_gap, day_gap, unit):
        """
        Turn calendar-field differences (cutoff minus date) into the number
        of completed months or years.

        year_gap, month_gap, day_gap: pandas.Series
        unit: 'month' or 'year'
        """
        if unit == 'month':
            delta = year_gap * 12 + month_gap
            # The last month is incomplete only when the cutoff's day-of-month
            # has not yet reached the start day-of-month.
            incomplete = day_gap < 0
        else:
            delta = year_gap
            # The last year is incomplete while the anniversary (month, day)
            # has not been reached yet.
            incomplete = (month_gap < 0) | ((month_gap == 0) & (day_gap < 0))
        return delta.mask(incomplete, delta - 1)

    def delta_date_1d(self, dates, freq='d', past_or_future='past'):
        """
        Elapsed time from each date to the cutoff date, for a single Series.

        dates: pandas.Series
            dtype: datetime64[ns]
                   int64, float64 (YYYYMMDD numbers)
        freq: 'day', 'month' or 'year' (also 'd'/'D', 'm'/'M', 'y'/'Y', 'Day', ...)
        past_or_future: 'past' or 'future'
            'f' / 'future' flips the sign so that dates after the cutoff
            give positive values; any other value means 'past'.

        return pandas.Series (np.int64) named 'elapsed_<original name>'

        raises ValueError: freq is not a supported spelling
        raises TypeError: self.cutoff_date is neither a pandas.Timestamp nor a number
        """
        if freq in self._DAY_FREQS:
            unit = 'day'
        elif freq in self._MONTH_FREQS:
            unit = 'month'
        elif freq in self._YEAR_FREQS:
            unit = 'year'
        else:
            raise ValueError("freq must be 'day', 'month' or 'year'")

        cutoff = self.cutoff_date
        is_number = (isinstance(cutoff, (int, float, np.integer, np.floating))
                     and not isinstance(cutoff, bool))

        if isinstance(cutoff, pd.Timestamp):
            if unit == 'day':
                delta = (cutoff - dates).dt.days
            else:
                delta = self._whole_periods(cutoff.year - dates.dt.year,
                                            cutoff.month - dates.dt.month,
                                            cutoff.day - dates.dt.day,
                                            unit)
        elif is_number:
            cut_year, cut_month, cut_day = self._split_yyyymmdd(cutoff)
            if unit == 'day':
                cutoff_ts = pd.Timestamp(year=cut_year, month=cut_month, day=cut_day)
                # Float-typed dates stringify as '20200202.0'; drop the
                # trailing '.0' so they still match the %Y%m%d format.
                as_text = dates.astype(str).str.replace(r'\.0$', '', regex=True)
                parsed = pd.to_datetime(as_text, format='%Y%m%d')
                delta = (cutoff_ts - parsed).dt.days
            else:
                delta = self._whole_periods(cut_year - dates // 10000,
                                            cut_month - dates // 100 % 100,
                                            cut_day - dates % 100,
                                            unit)
        else:
            raise TypeError('cutoff_date must be a pandas.Timestamp, a datetime.date '
                            'or a YYYYMMDD number.')

        if past_or_future in ['f', 'future']:
            delta = -delta
            print('delta for the future.')

        delta.name = 'elapsed_' + self._label(delta.name)

        return delta

    def delta_date(self, dates, freq='d', past_or_future='past'):
        """
        Elapsed time from each date to the cutoff date.

        dates: pandas.Series or pandas.DataFrame
            dtype: datetime64[ns]
                   int64, float64 (YYYYMMDD numbers)
        freq: 'day', 'month' or 'year'
        past_or_future: 'past' or 'future'

        return pandas.Series for a Series input, pandas.DataFrame (one
               'elapsed_<column>' column per input column) for a DataFrame input

        raises TypeError: dates is neither a Series nor a DataFrame
        """
        if isinstance(dates, pd.Series):
            return self.delta_date_1d(dates, freq, past_or_future)
        if isinstance(dates, pd.DataFrame):
            per_column = [self.delta_date_1d(dates[col], freq, past_or_future)
                          for col in dates]
            if not per_column:
                # pd.concat refuses an empty list; keep the index instead.
                return pd.DataFrame(index=dates.index)
            return pd.concat(per_column, axis=1)
        raise TypeError('dates must be pandas.Series or pandas.DataFrame.')

    def _within_flags(self, delta, dates, limit):
        """Build the 0/1/NaN indicator for one threshold and label it."""
        flags = delta.mask(delta > limit, 0)
        flags = flags.mask(delta <= limit, 1)
        flags = flags.mask(delta < 0)  # negative elapsed time -> NaN
        if isinstance(flags, pd.DataFrame):
            flags.columns = ['within' + str(limit) + self._label(c)
                             for c in dates.columns]
        else:
            flags.name = 'within' + str(limit) + self._label(dates.name)
        return flags

    def within_date(self, dates, within, freq='d', past_or_future='past'):
        """
        Indicator features telling whether each date lies within a given
        number of days/months/years of the cutoff date.

        dates: pandas.Series or pandas.DataFrame
            dtype: datetime64[ns]
                   int64, float64 (YYYYMMDD numbers)
        within: int, or list/tuple of int (Within n days, within n months, within n years)
        freq: 'day', 'month' or 'year'
        past_or_future: 'past' or 'future'

        return pandas.Series / pandas.DataFrame (0: over, 1: within, np.nan: minus)
            Columns are named 'within<n><original name>'. A list of
            thresholds always yields a DataFrame with one block of columns
            per threshold.
        """
        # The elapsed values do not depend on the threshold: compute them once.
        delta = self.delta_date(dates, freq, past_or_future)

        if isinstance(within, (list, tuple)):
            blocks = [self._within_flags(delta, dates, n) for n in within]
            return pd.concat(blocks, axis=1)

        return self._within_flags(delta, dates, within)

if __name__ == '__main__':
    # Demo: two date columns, parsed from 'YYYY-M-D' text into datetime64.
    raw_dates = {
        'date1': ['2017-8-1', '2020-2-2'],
        'date2': ['2018-12-15', '2019-3-31'],
    }
    df = pd.DataFrame({
        column: pd.to_datetime(values, format='%Y-%m-%d')
        for column, values in raw_dates.items()
    })

    # Cutoff given as a plain datetime.date; DeltaDate converts it itself.
    deltadate = DeltaDate(datetime.date(2020, 2, 28))

    # Elapsed days per column, then "within 12 / 24 months" indicators.
    result = deltadate.delta_date(df, freq='d')
    within = deltadate.within_date(df, [12, 24], freq='m')

Recommended Posts

Date feature generation memo
Feature generation with pandas group by
Data analysis before kaggle's titanic feature generation
Automatic PowerPoint generation with python-pptx (personal memo)