import pandas as pd
# Show first 5 lines
print(df.head(5))
# Show the last 5 lines
print(df.tail(5))
DataFrame
# Empty data frame
df = pd.DataFrame(index=[], columns=[])
df = pd.DataFrame(index=[], columns=range(15))
df = pd.DataFrame (0, index = range (4), columns = range (15)) Fill with # 0
# Create an empty data frame from an existing data frame
df_blank = df.iloc[0:0]
# Replace column names with serial numbers (because it is easy to process with for)
df.columns=list(range(len(df.columns)))
# Replace column names with alphabets (because it is easy to match with Excel)
df.columns=([chr(ord('A') + i) for i in range(26)]+["AA"]+["AB"])
# Reassign index from 0
df=df.reset_index()
# Reassign index from 1 (because it is easy to match with Excel)
df.index = df.index + 1
# Assign serial numbers to lines other than the index
serial_num = pd.RangeIndex(start=0, stop=len(df.index) , step=1)
df['No'] = serial_num
df['A']
df=df[["F","Z"]]
# Show only arbitrary columns
df [['column name']]
# Lines 1, 2 and 4
df.iloc[[1,2,4]]
# 0-2nd column
df.iloc[[0,2]]
# If you specify no column, the column will be added.
df['job'] = ["Engineer"]
df ['new_column'] = 0 # Fill the initial value with 0 and add
df=df[df['A'] > 450]
df=df[df['A']=='P1']
# Specify multiple conditions
[Note] An error will occur if (&, |, ~) is not used instead of (and, or, not).
[Note] When using a comparison operator, an error will occur unless you enclose each condition in parentheses.
df=df[(df['D']==0.0) | (df['D']==0.1)]
x=0 → y x<>0 →x
dfsum["z"]=dfsum["y"].where(dfsum["x"] ==0, dfsum["x"])
# Substitution
gf2 ["Batting average"] = gf2 ['Hits'] / gf2 ['At bats']
# Convert y to sales
df.rename(columns={'y': 'sales'}, inplace=True)
df.columns = ['year_orig', 'pop', 'rate', '%']
df.drop (5) # Delete line 5
df.drop ([5,6]) # Delete lines 5,6
df.drop(['remarks'], axis=1, inplace=True)
# Eliminate duplicate column contents
df = set (column df2.A)
df['A'].apply(lambda x: x * 2) #=A1*2
df ['A']. Apply (lambda x: 1 if x> 0 else 0) #IF (A1> 0,1,0)
# If expand = True, it will be split into two, but if not, it will be split in the list format.
print(df['a'].str[:2],expand=True)
# Initially it is of type object
df=df.astype(float)
df1["G"]=df1["G"].astype(int)
option | Description |
---|---|
inplace=True | It will be a destructive change, and the original data will also change |
[Note] Since it starts from 0, it becomes Excel line-1.
# Extract rows for a specific section
df[1:3]
# Extract only 100 rows
df.loc[100]
# Extract from line 100 to line 105
df[100:106]
# Extract only the first 10 lines
df[:10]
If you specify no row, the row will be added.
df.loc['ONE'] = 0
# Add value for each row
df.loc['FIVE'] = ['A5', 'B5', 'C5']
# Remove duplicate lines
df=df.drop(range(0,3))
df.drop(range(0,3),inplace=True )
df['col1'] = df['col1'].str.replace('x', 'z')
# Vertical connection
df=pd.concat([df, df1])
# Horizontal connection
df=pd.concat([df, df1], axis=1)
[Note] If the Index is different, it will not be combined right next to it, so if you want to put it right next to it, reassign the Index.
dfsum = pd.merge (dfsum, gf2, on ='key', how = "outer")
gf = df3.groupby ('player name')
gf1 = gf.agg ({'Home base': max,'Stolen base':'max','Batting average':'max'})
gf1=gf1.reset_index()
# Substitution
df.loc[99,0] ='▲'
df.loc[:, 2:8] = 0
[Note] SettingWithCopyWarning may occur if df.loc [99] [0] is written (chained indexing). The value does not change
df.sort_values(['sales', 'temperature'], ascending=False)
sorted(df)
option | Description |
---|---|
ascending=True | descending order |
ascending=False | ascending order |
# List → DataFrame
pd.DataFrame(data=list)
# Data part → list
s = pd.Series([0, 1, 2])
print(s)
0 0
1 1
2 2
dtype: int64
list= s.values.tolist()
# Do not display columns and indexes
print(df.to_string(index=False, header=False))
df = pd.read_excel('sample.xlsx',sheet_name='sheet1')
option | Description |
---|---|
sheet_name="" | Specify the sheet name |
header=None | If specified, read without header |
If not specified, the first row is automatically inserted as the column name | |
header=2 | Read the specified line as a header. Lines before that are ignored. Line 0 starts from this point |
df.to_excel('sample.xlsx',sheet_name='sheet1', encoding='cp932',index=False, header=False)
option | Description |
---|---|
sheet_name="" | Specify the sheet name |
index=False | Delete line name and export |
header=False | Delete column names and export |
df=pd.read_csv("sample.csv",encoding="cp932",header=None)
df=pd.read_csv("sample.csv",encoding="shift_jis")
df1=pd.read_csv(sample.csv,encoding='cp932',names=col_names,low_memory=False )
[Note] If the windows extension character string is mixed, it is necessary to set the character code as cp932. [Note] Enter low_memory = False when reading a heavy file.
df.to_csv('sample.csv',encoding='cp932')
df=pd.read_csv("sample.DAT",encoding='cp932', sep='\t',names=col_names )
df=pd.read_table("sample.DAT",encoding='cp932',header=None)
df.to_csv("employee.csv", sep=",")
# Replace NaN in column A with '0.0'
df.fillna(value={'A': 0.0}, inplace=True)
# Delete rows with NaN in column A
df.dropna(subset=[A], axis=0, inplace=True)
# Replace'-' in column A with 0
df['A'] = df['A'].replace('--', 0).astype(float)
# Fill NaN with the value in the upper cell
df=df.fillna(method='ffill')
import matplotlib.pyplot as plt
It feels good to change the plot style by doing the following
plt.style.use('ggplot')
notation | Details of operation |
---|---|
plt.title('Graph title') | Set title |
plt.xlabel('X-axis label') | Set x-axis name |
plt.ylabel('Y-axis label') | Set y-axis name |
plt.xlim(Minimum value of X,Maximum value of X) | Specify x-axis range |
plt.ylim(Minimum value of Y,Maximum value of Y) | Specify y-axis range |
plt.legend() | Show legend |
plt.grid() | Show grid lines |
Recommended Posts