[PYTHON] [Pandas sample code] Create and aggregate sample data that looks like a purchase log

Sample data creation

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from math import ceil 

# create sample data ---------------
## function
def repeat_copy_and_random_choice(values_list, sample_length):
    repeat_rate = 5
    repeat_copied = values_list * ceil(sample_length / len(values_list) * repeat_rate)
    random_choiced = np.random.choice(repeat_copied, sample_length)
    return random_choiced

## parameters
data_length = 100
users = [c for c in 'ABCDEFGHIJ']
items = [c for c in 'abcdefghijklmnopqrstuvwxyz']
order_dates =  list(np.arange(datetime(2019,9,1), datetime(2019,9,30), timedelta(days=1)).astype(datetime))
item_price_master = pd.DataFrame()

## DataFrame
df = pd.DataFrame()
df['order_date'] = repeat_copy_and_random_choice(order_dates, data_length)
df['user'] = repeat_copy_and_random_choice(users, data_length)
df['item'] = repeat_copy_and_random_choice(items, data_length)
df['quantity'] = np.random.randint(1,10, data_length)
item_price_master['item'] = items
item_price_master['unit_price'] = [np.ceil(x * 1000) for x in np.random.random(len(items))]
df = df.merge(item_price_master, on='item', how='left')
df = df.sort_values(by=['order_date', 'user', 'item']).reset_index(drop=True)

Such data image.png Image of "purchase date, user name, product name, number of purchases, unit price"

Calculation example 1: Total purchase amount / total purchase days for each user

## purchase amount / (date & user)
(df
.assign(price=lambda xdf: xdf['unit_price'] * xdf['quantity'])
.groupby('user')
.agg({
    'order_date':pd.Series.nunique,
    'price':pd.Series.sum,
    })
.assign(price_per_date=lambda xdf: (xdf['price'] / xdf['order_date']).astype(int))
.sort_values(by='price_per_date', ascending=False)
)

## purchase amount / (date & user): Another way of writing (a confusing but reminder)
def tmp1(srs):
    x = int(srs['price'] / srs['order_date'])
    return pd.Series(data=[srs['order_date'], srs['price'], x],
                    index=['order_date', 'price', 'price_per_date'])

(df
.assign(price=lambda xdf: xdf['unit_price'] * xdf['quantity'])
.groupby('user')
.agg({
    'order_date':pd.Series.nunique,
    'price':pd.Series.sum,
    })
.apply(tmp1, axis=1)
.sort_values(by='price_per_date', ascending=False)
)

The result looks like this image.png

Calculation example 2: Median purchase interval days per user

def calc_med_diff_date(xdf):
    #Leave only where the dates are different
    flags = (
        (xdf['order_date_prev'].notnull()) &
        (xdf['order_date']!=xdf['order_date_prev'])
    )
    tmp = xdf.loc[flags, :]
    avg_diff_date = (tmp['order_date'] -  tmp['order_date_prev']).median()
    return avg_diff_date

(df
.sort_values(by=['user', 'order_date'], ascending=True)
.assign(order_date_prev=lambda xdf: xdf.groupby('user')['order_date'].shift(1))
[['user','order_date','order_date_prev']]  #You don't have to do this
.groupby('user')
.apply(calc_med_diff_date)
.sort_values(ascending=True)
)

The result looks like this image.png

end

Recommended Posts

[Pandas sample code] Create and aggregate sample data that looks like a purchase log
Create code that outputs "A and pretending B" in python
Create test data like that with Python (Part 1)
Randomly sample MNIST data to create a dataset
Create a decision tree from 0 with Python and understand it (3. Data analysis library Pandas edition)
Create a QR code that displays "Izumi Oishi" by scratch
Create applications, register data, and share with a single email