[Python] pandas Code that is likely to be reused

import

import pandas as pd

Data confirmation

# Show first 5 lines
print(df.head(5))
# Show the last 5 lines
print(df.tail(5))

DataFrame

Empty data frame

# Empty data frame
df = pd.DataFrame(index=[], columns=[])
df = pd.DataFrame(index=[], columns=range(15))
 df = pd.DataFrame (0, index = range (4), columns = range (15)) Fill with # 0

# Create an empty data frame from an existing data frame
df_blank = df.iloc[0:0]

Columns name replacement

# Replace column names with serial numbers (because it is easy to process with for)
df.columns=list(range(len(df.columns)))
# Replace column names with alphabets (because it is easy to match with Excel)
df.columns=([chr(ord('A') + i) for i in range(26)]+["AA"]+["AB"])

Rename line (Index)

# Reassign index from 0
df=df.reset_index()
# Reassign index from 1 (because it is easy to match with Excel)
df.index = df.index + 1
# Assign serial numbers to lines other than the index
serial_num = pd.RangeIndex(start=0, stop=len(df.index) , step=1)
df['No'] = serial_num

Columns processing

Column extraction

df['A']
df=df[["F","Z"]]
# Show only arbitrary columns
 df [['column name']]
# Lines 1, 2 and 4
df.iloc[[1,2,4]]
# 0-2nd column
df.iloc[[0,2]]

Add column

# If you specify no column, the column will be added.
df['job'] = ["Engineer"]
 df ['new_column'] = 0 # Fill the initial value with 0 and add

Condition extraction

df=df[df['A'] > 450]
df=df[df['A']=='P1'] 
# Specify multiple conditions
 [Note] An error will occur if (&, |, ~) is not used instead of (and, or, not).
 [Note] When using a comparison operator, an error will occur unless you enclose each condition in parentheses.
df=df[(df['D']==0.0) | (df['D']==0.1)]

Conditional assignment

x=0 → y  x<>0 →x 
dfsum["z"]=dfsum["y"].where(dfsum["x"] ==0, dfsum["x"])
# Substitution
 gf2 ["Batting average"] = gf2 ['Hits'] / gf2 ['At bats']

Rename column

# Convert y to sales
df.rename(columns={'y': 'sales'}, inplace=True)
df.columns = ['year_orig', 'pop', 'rate', '%']

Delete column

 df.drop (5) # Delete line 5
 df.drop ([5,6]) # Delete lines 5,6
df.drop(['remarks'], axis=1, inplace=True)
# Eliminate duplicate column contents
 df = set (column df2.A)

Column value assignment / update

df['A'].apply(lambda x: x * 2)   #=A1*2
 df ['A']. Apply (lambda x: 1 if x> 0 else 0) #IF (A1> 0,1,0)

slice

# If expand = True, it will be split into two, but if not, it will be split in the list format.
print(df['a'].str[:2],expand=True)

Type change

# Initially it is of type object
df=df.astype(float)
df1["G"]=df1["G"].astype(int)

option	Description
inplace=True	It will be a destructive change, and the original data will also change

Row (Index) processing

[Note] Since it starts from 0, it becomes Excel line-1.

Row extraction

# Extract rows for a specific section
df[1:3]
# Extract only 100 rows
df.loc[100]
# Extract from line 100 to line 105
df[100:106]
# Extract only the first 10 lines
df[:10]

Add line

 If you specify no row, the row will be added.
df.loc['ONE'] = 0
# Add value for each row
df.loc['FIVE'] = ['A5', 'B5', 'C5']

Delete row

# Remove duplicate lines
df=df.drop(range(0,3))
df.drop(range(0,3),inplace=True )

Replacement

df['col1'] = df['col1'].str.replace('x', 'z')

Join

# Vertical connection
df=pd.concat([df, df1])
# Horizontal connection
df=pd.concat([df, df1], axis=1)

[Note] If the Index is different, it will not be combined right next to it, so if you want to put it right next to it, reassign the Index.

merge

 dfsum = pd.merge (dfsum, gf2, on ='key', how = "outer")

group

 gf = df3.groupby ('player name')
 gf1 = gf.agg ({'Home base': max,'Stolen base':'max','Batting average':'max'})
gf1=gf1.reset_index()

cell

# Substitution
df.loc[99,0] ='▲'
df.loc[:, 2:8] = 0

[Note] SettingWithCopyWarning may occur if df.loc [99] [0] is written (chained indexing). The value does not change

sort

df.sort_values(['sales', 'temperature'], ascending=False)
sorted(df)

option	Description
ascending=True	descending order
ascending=False	ascending order

conversion

# List → DataFrame
pd.DataFrame(data=list)

# Data part → list
s = pd.Series([0, 1, 2])
print(s)
 0    0
 1    1
 2    2
 dtype: int64
list= s.values.tolist()

display

# Do not display columns and indexes
print(df.to_string(index=False, header=False))

File reading / writing

Excel file reading

df = pd.read_excel('sample.xlsx',sheet_name='sheet1')

option	Description
sheet_name=""	Specify the sheet name
header=None	If specified, read without header
	If not specified, the first row is automatically inserted as the column name
header=2	Read the specified line as a header. Lines before that are ignored. Line 0 starts from this point

Excel file export

df.to_excel('sample.xlsx',sheet_name='sheet1', encoding='cp932',index=False, header=False)

option	Description
sheet_name=""	Specify the sheet name
index=False	Delete line name and export
header=False	Delete column names and export

read csv file

df=pd.read_csv("sample.csv",encoding="cp932",header=None)
df=pd.read_csv("sample.csv",encoding="shift_jis")

df1=pd.read_csv(sample.csv,encoding='cp932',names=col_names,low_memory=False )

[Note] If the windows extension character string is mixed, it is necessary to set the character code as cp932. [Note] Enter low_memory = False when reading a heavy file.

Export csv file

df.to_csv('sample.csv',encoding='cp932')

Read TAB delimited file

df=pd.read_csv("sample.DAT",encoding='cp932', sep='\t',names=col_names )

Read text file

df=pd.read_table("sample.DAT",encoding='cp932',header=None)

Text file writing

df.to_csv("employee.csv", sep=",")

Missing data

# Replace NaN in column A with '0.0'
df.fillna(value={'A': 0.0}, inplace=True)
# Delete rows with NaN in column A
df.dropna(subset=[A], axis=0, inplace=True)
# Replace'-' in column A with 0
df['A'] = df['A'].replace('--', 0).astype(float)

# Fill NaN with the value in the upper cell
df=df.fillna(method='ffill')

Visualization

import matplotlib.pyplot as plt

It feels good to change the plot style by doing the following

plt.style.use('ggplot')

notation	Details of operation
plt.title('Graph title')	Set title
plt.xlabel('X-axis label')	Set x-axis name
plt.ylabel('Y-axis label')	Set y-axis name
plt.xlim(Minimum value of X,Maximum value of X)	Specify x-axis range
plt.ylim(Minimum value of Y,Maximum value of Y)	Specify y-axis range
plt.legend()	Show legend
plt.grid()	Show grid lines