Beiträge von Hisabisa
Für One-Hot-Encoding gibt es category_encoders und pandas `get_dummies()`, aber ich finde beide in den folgenden Punkten schwierig zu verwenden.
Keines der beiden Werkzeuge traf genau die Stelle, an der es mich juckte – es fehlte immer genau das, was ich brauchte.
Ich habe deshalb meinen eigenen One-Hot-Encoder erstellt, um dieses Problem zu lösen.
class BaseEncoder():
    """Common interface for the encoders defined in this file.

    Subclasses override ``fit``, ``transform`` and ``fit_transform``; the
    versions here only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def fit(self, Xs=None):
        """Learn the encoding from ``Xs`` (must be overridden)."""
        raise NotImplementedError('not implemented')

    def transform(self, Xs=None):
        """Encode ``Xs`` with the learned encoding (must be overridden)."""
        raise NotImplementedError('not implemented')

    def fit_transform(self, Xs=None):
        """Fit on ``Xs`` and return its encoding (must be overridden)."""
        raise NotImplementedError('not implemented')


class OneHotEncoder(BaseEncoder):
    """One-hot encoder for a single categorical column.

    Library requirements: pandas, and mojimoji when ``force_hankaku`` is True.
    Every value is converted with ``str()`` before encoding, so missing values
    appear as the strings ``"nan"`` / ``"None"``.

    Args:
        col_name (str | None): Prefix of the output columns. When None it is
            taken from the name of the data passed to ``fit``; if that has no
            name either, ``"onehotEncode"`` is used.
        categories (list | None): Fixed list of categories to encode. When
            given, ``fit`` uses this list (in the given order) instead of the
            categories found in the training data.
        handle_unknown (str): ``"summarize"`` puts categories not known from
            ``fit`` into one extra "unknown" column; ``"ignore"`` gives them
            no column.
        handle_nan (str): ``"onehot"`` gives missing values their own column;
            ``"ignore"`` gives them no column.
        col_order (str): Order of the learned categories: ``"name"`` (sorted
            by category name), ``"count_asc"`` or ``"count_des"`` (sorted by
            frequency, ties broken by name).
        col_name_type (str): ``"category"`` names columns
            ``<col_name>_<category>``; ``"index"`` names them
            ``<col_name>_<position>`` with rare = -1, nan = -2, unknown = -3.
        force_hankaku (bool): Apply ``mojimoji.zen_to_han`` to every value
            in both ``fit`` and ``transform``.
        return_type (str): ``"df"`` returns a ``pd.DataFrame``, ``"np"``
            returns its ``.values``.
        handle_rare (int | float | list | None): A number is a frequency
            threshold (share of the training rows); categories at or below it
            are merged into one "rare" column. A list names the categories to
            merge. None disables rare handling.
        dummy (str | bool | None): A string names the category that gets no
            column (dummy encoding); True drops the first category in column
            order.

    Attributes set by ``fit``:
        encode_map (dict): category -> output column name
            (``"DUMMY_CATEGORY"`` for the dropped dummy category).
        encode_map_inv (dict): output column name -> list of categories.
        new_cols (list): output column names in output order.
        dummy_category: the category dropped by dummy encoding, or None.
    ``transform`` appends newly seen unknown values to ``unknown_categories``.
    """

    # String forms that missing values take after str() conversion.
    _NAN_TOKENS = ("nan", "None")
    # Placeholder "column" for the category dropped by dummy encoding.
    _DUMMY_COLUMN = "DUMMY_CATEGORY"

    def __init__(self,
                 col_name=None,
                 categories=None,
                 handle_unknown="summarize",
                 handle_nan="onehot",
                 col_order="name",
                 col_name_type="category",
                 force_hankaku=True,
                 return_type="df",
                 handle_rare=None,
                 dummy=None,
                 ):
        super().__init__()
        self.col_name = col_name

        # Fix: the original test was inverted and rejected every list.
        if categories is not None and type(categories) is not list:
            raise TypeError(f"[Error] argument categories is invalid , shuold be list, but>> {categories}")
        self.categories = categories
        # Remember the user-supplied list separately: ``self.categories`` is
        # overwritten by a data-driven fit and must not leak into a re-fit.
        self._fixed_categories = None if categories is None else list(categories)

        self._check_choice("handle_unknown", handle_unknown, ["summarize", "ignore"])
        self.handle_unknown = handle_unknown
        self._check_choice("handle_nan", handle_nan, ["onehot", "ignore"])
        self.handle_nan = handle_nan
        self._check_choice("col_order", col_order, ["name", "count_asc", "count_des"])
        self.col_order = col_order
        self._check_choice("col_name_type", col_name_type, ["category", "index"])
        self.col_name_type = col_name_type

        checks = [bool]
        if type(force_hankaku) not in checks:
            raise TypeError(f"[Error] argument force_hankaku should be {checks} type , but {force_hankaku}")
        self.force_hankaku = force_hankaku
        if force_hankaku:
            # Fail at construction time (as before) when mojimoji is missing.
            import mojimoji  # noqa: F401

        self._check_choice("return_type", return_type, ["df", "np"])
        self.return_type = return_type

        # type() rather than isinstance() on purpose: bool must be rejected.
        checks = [int, float, list]
        if type(handle_rare) not in checks and handle_rare is not None:
            raise TypeError(f"[Error] argument handle_rare should be {checks} type or None, but {handle_rare}")
        if type(handle_rare) in [int, float]:
            if handle_rare >= 1 or handle_rare <= 0:
                print(f"[Warning] handle_rare may be meaningless value >> {handle_rare}")
            # Fix: an int threshold used to match neither the float nor the
            # list branch of fit(), so nothing was encoded at all.
            handle_rare = float(handle_rare)
        # -1. is a threshold no frequency can fall under, i.e. "nothing is rare".
        self.handle_rare = handle_rare if handle_rare is not None else -1.

        checks = [str, bool]
        if type(dummy) not in checks and dummy is not None:
            # Fix: the message used to name the wrong argument.
            raise TypeError(f"[Error] argument dummy should be {checks} type or None , but {dummy}")
        self.dummy = dummy

        self.encode_map = {}
        self.unknown_categories = []
        self.dummy_category = None

    @staticmethod
    def _check_choice(arg_name, value, choices):
        """Raise ValueError unless ``value`` is one of ``choices``."""
        if value not in choices:
            raise ValueError(f"[Error] argument {arg_name} is invalid , shuold be {choices}, but {value}")

    def _normalize_tokens(self, values):
        """Return ``values`` as a list of str, half-width if force_hankaku."""
        tokens = [str(v) for v in values]
        if self.force_hankaku:
            import mojimoji
            tokens = [mojimoji.zen_to_han(t) for t in tokens]
        return tokens

    def _to_str_series(self, Xs):
        """Return ``Xs`` as an object Series of normalized strings.

        The index and name of ``Xs`` are kept; ``Xs`` itself is not modified.
        """
        import pandas as pd
        raw = pd.Series(Xs)
        return pd.Series(self._normalize_tokens(raw), index=raw.index,
                         name=raw.name, dtype=object)

    def _special_column(self, label, index_code):
        """Name of a rare/nan/unknown column for the current col_name_type."""
        if self.col_name_type == "index":
            return f"{self.col_name}_{index_code}"
        return f"{self.col_name}_{label}"

    def _build_maps(self, kept, rare, nan_seen):
        """Fill encode_map, encode_map_inv, new_cols and dummy_category.

        kept: ordered categories that get their own column.
        rare: categories merged into the single rare column.
        nan_seen: missing-value tokens that were observed.
        """
        by_index = self.col_name_type == "index"
        encode_map = {}
        new_cols = []
        self.dummy_category = None

        for c_ind, c in enumerate(kept):
            # ``is True`` so that a string ``dummy`` never selects position 0.
            if (self.dummy is True and c_ind == 0) or (self.dummy == c):
                self.dummy_category = c
                # Keep the dummy in the map so transform() treats it as known.
                encode_map[c] = self._DUMMY_COLUMN
                continue
            onehot_name = f"{self.col_name}_{c_ind}" if by_index else f"{self.col_name}_{c}"
            encode_map[c] = onehot_name
            new_cols.append(onehot_name)

        if rare:
            rare_col = self._special_column("rareCategory", -1)
            for c in rare:
                encode_map[c] = rare_col
            if rare_col not in new_cols:
                new_cols.append(rare_col)

        if self.handle_nan == "onehot" and nan_seen:
            nan_col = self._special_column("nan", -2)
            for token in nan_seen:
                encode_map[token] = nan_col
            if nan_col not in new_cols:
                new_cols.append(nan_col)

        if self.handle_unknown == "summarize":
            new_cols.append(self._special_column("unknownCategory", -3))

        encode_map_inv = {}
        for category, column in encode_map.items():
            encode_map_inv.setdefault(column, []).append(category)

        self.encode_map = encode_map
        self.encode_map_inv = encode_map_inv
        self.new_cols = new_cols

    def fit(self, Xs):
        """Learn the category -> column mapping from ``Xs``.

        ``Xs`` is anything ``pd.Series`` accepts (Series, list, 1-D array).
        Re-fitting replaces the previous mapping. Returns None.
        """
        _Xs = self._to_str_series(Xs)

        # Column-name prefix: explicit argument, else Series name, else default.
        if self.col_name is None:
            self.col_name = _Xs.name
        if self.col_name is None:
            self.col_name = "onehotEncode"
            print(f"[Warning] column name is {self.col_name}")

        list_mode = type(self.handle_rare) is list
        rare_listed = self._normalize_tokens(self.handle_rare) if list_mode else []

        if self._fixed_categories is not None:
            # User-supplied categories: keep the given order, drop duplicates.
            # No frequencies exist here, so a numeric threshold marks nothing rare.
            cats = list(dict.fromkeys(self._normalize_tokens(self._fixed_categories)))
            kept = [c for c in cats
                    if c not in self._NAN_TOKENS and c not in rare_listed]
            rare = rare_listed
            nan_seen = [t for t in self._NAN_TOKENS if t in cats]
            self._build_maps(kept, rare, nan_seen)
            return

        # Relative frequency of every value; sorted by name first so that the
        # stable count sorts below break ties deterministically.
        vc = _Xs.value_counts(dropna=False, normalize=True).sort_index()
        if self.col_order == "count_asc":
            vc = vc.sort_values(ascending=True, kind="mergesort")
        elif self.col_order == "count_des":
            vc = vc.sort_values(ascending=False, kind="mergesort")

        freqs = [(c, f) for c, f in vc.items() if c not in self._NAN_TOKENS]
        if list_mode:
            kept = [c for c, _ in freqs if c not in rare_listed]
            rare = rare_listed
        else:
            kept = [c for c, f in freqs if f > self.handle_rare]
            rare = [c for c, f in freqs if f <= self.handle_rare]
        nan_seen = [t for t in self._NAN_TOKENS if t in vc.index]

        self._build_maps(kept, rare, nan_seen)
        self.categories = list(self.encode_map.keys())

    def transform(self, Xs):
        """One-hot encode ``Xs`` with the mapping learned by ``fit``.

        Returns a DataFrame that carries the index of ``Xs`` (a fresh
        RangeIndex for lists/arrays) and the columns of ``new_cols``, or its
        ``.values`` when ``return_type == "np"``.

        Raises:
            AttributeError: if ``fit`` has not been called.
        """
        import pandas as pd
        if not hasattr(self, "encode_map_inv"):
            raise AttributeError("[Error] fit must be called before transform")

        _Xs = self._to_str_series(Xs)

        # Build columns positionally (numpy arrays), so the result does not
        # depend on how the index of Xs would align with a fresh RangeIndex.
        columns = {}
        for col, cats in self.encode_map_inv.items():
            if col == self._DUMMY_COLUMN:
                continue
            columns[col] = _Xs.isin(cats).to_numpy().astype("int64")

        if self.handle_unknown == "summarize":
            # Fresh list: the original appended to self.categories in place,
            # which grew it on every call.
            known_cats = list(self.encode_map)
            if self.handle_nan == "ignore":
                known_cats += list(self._NAN_TOKENS)
            is_unknown = (~_Xs.isin(known_cats)).to_numpy()
            columns[self._special_column("unknownCategory", -3)] = is_unknown.astype("int64")
            # Record new unknown values in order of first appearance.
            for cat in dict.fromkeys(_Xs.to_numpy()[is_unknown]):
                if cat not in self.unknown_categories:
                    self.unknown_categories.append(cat)

        res_df = pd.DataFrame(columns, index=_Xs.index)
        if self.return_type == "np":
            return res_df.values
        return res_df

    def fit_transform(self, Xs):
        """Fit on ``Xs`` and return the encoding of the same data."""
        self.fit(Xs)
        return self.transform(Xs)
Erstellen Sie Beispieldaten für Training und Test wie folgt
Der Test enthält Kategorien (Elefanten, Vögel usw.), die beim Lernen nicht gefunden werden (der erstellte Encoder hat auch die Funktion, diese in die unbekannte Kategorie einzuteilen).
# generate sample category
# The snippet imports everything it uses so it can be run on its own.
import random

import numpy as np
import pandas as pd

# Fixed seed so the shuffled row order is reproducible.
random.seed(42)
# Training values: 'animal' (vals1) and 'feature' (vals2), 40 rows each,
# including both np.nan and None as missing values.
vals1 = ['salamander'] * 10 + ['snake'] * 8 + ['cameleon'] * 5 + ['rizard'] * 7 + ['frog'] * 2 + ['jellyfish'] * 3 + [np.nan] * 3 + [None] * 2
vals2 = ['cute'] * 4 + ['cool'] * 12 + ['colurful'] * 3 + ['nice'] * 2 + ['Wonderful'] * 3 + ['foooo'] * 3 + ['Excellent'] * 3 + [np.nan] * 6 + [None] * 4
# Test values: 40 rows each, containing categories that do not occur in the
# training values (e.g. 'turtle', 'bird', 'elephant', 'good', 'OK').
vals3 = ['salamander'] * 13 + ['snake'] * 5 + ['cameleon'] * 7 + ['rizard'] * 5 + ['turtle'] * 3 + ['bird'] * 1 + ['elephant'] * 1 + ["jellyfish"] * 2 + [np.nan] * 1 + [None] * 2
vals4 = ['cute'] * 4 + ['cool'] * 12 + ['colorful'] * 3 + ['nice'] * 2 + ['Wonderful'] * 3 + ['foooo'] * 3 + ['Excellent'] * 1 + ['good'] * 1 + ['OK'] * 1 + [np.nan] * 3 + [None] * 7
random.shuffle(vals1)
random.shuffle(vals2)
random.shuffle(vals3)
random.shuffle(vals4)
train_df = pd.DataFrame({'animal' : vals1, 'feature' : vals2})
test_df = pd.DataFrame({'animal' : vals3, 'feature' : vals4})
Wenden wir das One-Hot-Encoding auf die Spalte `animal` an.
# Create an encoder instance with the default settings.
ohe = OneHotEncoder()
# Fit the encoder on the training data.
ohe.fit(train_df['animal'])
# Actually encode: pass the training data to transform.
ohe.transform(train_df['animal'])
Verketten wir das Ergebnis mit den Originaldaten (`pd.concat`) und sehen es uns an.
# Show the training columns side by side with the one-hot encoding of 'animal'.
pd.concat([train_df, ohe.transform(train_df['animal'])], axis=1)
Auch NaN-Werte werden korrekt one-hot-kodiert (Spalte `animal_nan`), und es gibt zusätzlich eine Spalte für unbekannte Kategorien (`animal_unknownCategory`).
Schauen wir uns die Testdaten an
# Encode the test 'animal' column with the same encoder and show it next to the test columns.
pd.concat([test_df, ohe.transform(test_df['animal'])], axis=1)
Da dieselben Spalten wie bei den Trainingsdaten erzeugt werden, lässt sich das Ergebnis direkt mit LightGBM, Elastic Net usw. verwenden.
Eine solche Funktion gab es erstaunlicherweise kaum, daher habe ich sie implementiert: `ohe.encode_map` liefert die Zuordnung von Kategorie zu Spaltenname als dict.
# Dict filled by fit(): category -> name of its one-hot column.
ohe.encode_map
Sie können auch die umgekehrte Version sehen
# Inverse dict built by fit(): column name -> list of categories mapped to it.
ohe.encode_map_inv
# List of output column names collected by fit().
ohe.new_cols
Geben Sie handle_nan = "ignore" an.
# New encoder with handle_nan="ignore", fitted on the training 'animal' column.
ohe = OneHotEncoder(handle_nan="ignore")
ohe.fit(train_df['animal'])
Die NaN-Spalte ist verschwunden.
Mit z. B. handle_rare = 0.1 werden alle Kategorien, deren Anteil an den Trainingsdaten höchstens 0.1 (also 10 %) beträgt, zu einer seltenen Kategorie zusammengefasst.
Sehen Sie, wie oft Tiere erscheinen
versuche zu kodieren
# New encoder with a numeric handle_rare threshold of 0.1, fitted on the training 'animal' column.
ohe = OneHotEncoder(handle_rare=0.1)
ohe.fit(train_df['animal'])
Die Spalte rareCategory wurde hinzugefügt.
Wenn Sie sich encode_map ansehen, können Sie sehen, was selten wurde
Wenn Sie stattdessen eine Liste von Kategorien an handle_rare übergeben, werden genau diese Kategorien in rareCategory codiert.
# New encoder with an explicit list of rare categories, fitted on the training 'animal' column.
ohe = OneHotEncoder(handle_rare=["cameleon", "frog"])
ohe.fit(train_df['animal'])
Mit handle_unknown = "ignore" werden unbekannte Kategorien nicht codiert.
# New encoder with handle_unknown="ignore", fitted on the training 'animal' column.
ohe = OneHotEncoder(handle_unknown="ignore")
ohe.fit(train_df['animal'])
Wenn col_name_type = "index" gesetzt ist, wird es zu einem Index (wie category_encoders).
# New encoder with col_name_type="index", fitted on the training 'animal' column.
ohe = OneHotEncoder(col_name_type="index")
ohe.fit(train_df['animal'])
Standardmäßig ist der Spaltenname des Datenrahmens das Präfix, Sie können ihn jedoch mit col_name = "XXXX" ändern.
(Wenn Sie anstelle eines Datenrahmens einen Numpy-Wert eingeben, wird onehotEncode zum Präfix.)
# New encoder with the column-name prefix set to "new_col", fitted on the training 'animal' column.
ohe = OneHotEncoder(col_name="new_col")
ohe.fit(train_df['animal'])
Wenn Sie eine Dummy-Codierung wünschen (eine Codierung, die die Anzahl der Features reduziert, indem eine Kategorie keine eigene Spalte erhält), setzen Sie dummy = True.
# New encoder with dummy=True, fitted on the training 'animal' column.
ohe = OneHotEncoder(dummy=True)
ohe.fit(train_df['animal'])
Dummy-Kategorien finden Sie in ohe.dummy_category
Mit dummy = "xxx" wird die Kategorie "xxx" als Dummy-Kategorie verwendet.
Ich möchte eine Bibliothek mit Kategorie-Encodern erstellen, die genau dort hilft, wo es einen juckt, also auch die kleinen, lästigen Sonderfälle abdeckt.
Ich bin froh, wenn Sie sich mit den oben genannten Informationen einen Eindruck verschaffen können
Neben den oben genannten Funktionen gibt es noch andere detaillierte Funktionen, aber ich bin es leid zu schreiben. Wenn ich mehr Likes bekomme, plane ich, eine Bibliothek zu erstellen und eine Verwendung wie git zusammenzustellen.
Recommended Posts