import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from math import ceil
# create sample data ---------------
## function
def repeat_copy_and_random_choice(values_list, sample_length, *, repeat_rate=5):
    """Draw ``sample_length`` random values, with replacement, from ``values_list``.

    The candidates are tiled into a pool holding at least
    ``sample_length * repeat_rate`` entries, and ``np.random.choice`` then
    samples from that pool using NumPy's global random state.

    Args:
        values_list: Non-empty sequence of candidate values.
        sample_length: Number of values to draw.
        repeat_rate: Oversampling factor used when sizing the tiled pool.

    Returns:
        The array of ``sample_length`` drawn values produced by
        ``np.random.choice``.

    Raises:
        ValueError: If ``values_list`` is empty.
    """
    # Materialise as a list so that ``*`` below repeats the sequence instead of
    # multiplying element-wise (which is what an ndarray would do).
    candidates = list(values_list)
    if not candidates:
        raise ValueError("values_list must not be empty")

    copies = ceil(sample_length / len(candidates) * repeat_rate)
    pool = candidates * copies
    return np.random.choice(pool, sample_length)
## parameters
data_length = 100
users = list('ABCDEFGHIJ')
items = list('abcdefghijklmnopqrstuvwxyz')
# One entry per day; np.arange excludes the stop value, so the last date is 2019-09-29.
order_dates = np.arange(
    datetime(2019, 9, 1), datetime(2019, 9, 30), timedelta(days=1)
).astype(datetime).tolist()

## DataFrame
# The columns are listed in draw order: every call below consumes NumPy's
# global random state, so the order of the entries determines the values.
df = pd.DataFrame({
    'order_date': repeat_copy_and_random_choice(order_dates, data_length),
    'user': repeat_copy_and_random_choice(users, data_length),
    'item': repeat_copy_and_random_choice(items, data_length),
    'quantity': np.random.randint(1, 10, data_length),
})

# Price master: one random unit price per item, rounded up after scaling by 1000.
item_price_master = pd.DataFrame({
    'item': items,
    'unit_price': np.ceil(np.random.random(len(items)) * 1000),
})

# Attach the unit price to every order line, then put the rows in a fixed order.
df = (
    df
    .merge(item_price_master, on='item', how='left')
    .sort_values(by=['order_date', 'user', 'item'])
    .reset_index(drop=True)
)
# The data looks like this: purchase date, user name, product name, purchase quantity, unit price.
## purchase amount / (date & user)
# Per user: total spend divided by the number of distinct order dates,
# truncated to an integer and listed from highest to lowest.
(
    df
    .assign(price=df['unit_price'] * df['quantity'])
    .groupby('user')
    .agg({'order_date': 'nunique', 'price': 'sum'})
    .assign(
        price_per_date=lambda totals: (
            totals['price'] / totals['order_date']
        ).astype(int)
    )
    .sort_values(by='price_per_date', ascending=False)
)
## purchase amount / (date & user): an alternative way to write it (harder to follow, but kept as a reminder)
def tmp1(srs):
    """Extend one aggregated row with its price per order date.

    Args:
        srs: Row-like Series with an ``'order_date'`` entry and a ``'price'``
            entry. ``'order_date'`` is used as the divisor.

    Returns:
        A ``pandas.Series`` indexed by ``'order_date'``, ``'price'`` and
        ``'price_per_date'``, where ``'price_per_date'`` is
        ``price / order_date`` truncated to an integer.
    """
    order_date = srs['order_date']
    price = srs['price']
    # int() drops the fractional part of the quotient (truncation, not rounding).
    price_per_date = int(price / order_date)
    return pd.Series(
        data=[order_date, price, price_per_date],
        index=['order_date', 'price', 'price_per_date'],
    )
(
    df
    .assign(price=df['unit_price'] * df['quantity'])
    .groupby('user')
    .agg({'order_date': 'nunique', 'price': 'sum'})
    # Row-wise variant: tmp1 rebuilds each row and appends price_per_date.
    .apply(tmp1, axis='columns')
    .sort_values(by='price_per_date', ascending=False)
)
# The result looks like this
def calc_med_diff_date(xdf):
    """Return the median gap between an order date and the previous order date.

    Args:
        xdf: DataFrame with an ``'order_date'`` column and an
            ``'order_date_prev'`` column.

    Returns:
        The median of ``order_date - order_date_prev`` over the rows where
        ``order_date_prev`` is present and differs from ``order_date``.
    """
    # Keep only rows whose date actually changed. The explicit not-null check
    # is needed because a missing previous date also compares as "different".
    has_previous = xdf['order_date_prev'].notnull()
    date_changed = xdf['order_date'] != xdf['order_date_prev']
    changed_rows = xdf.loc[has_previous & date_changed, :]

    median_gap = (changed_rows['order_date'] - changed_rows['order_date_prev']).median()
    return median_gap
(
    df
    .sort_values(by=['user', 'order_date'])
    # Previous order date within each user, so consecutive orders can be compared.
    .assign(order_date_prev=lambda ordered: ordered.groupby('user')['order_date'].shift())
    # Optional: narrowing to these columns is not required.
    .loc[:, ['user', 'order_date', 'order_date_prev']]
    .groupby('user')
    .apply(calc_med_diff_date)
    .sort_values()
)
# The result looks like this
# End
# Recommended Posts