[Python] Google Forms response aggregation and analysis tool
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
def quantify(csv_pass):
    """Read a form-response CSV and encode every answer as an integer.

    Args:
        csv_pass: Path to the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        A tuple ``(hashs, replaced_value)``:

        * ``hashs`` maps each question (column name) to a dict
          ``{answer: code}``. Codes are consecutive integers starting at 0,
          assigned in order of first appearance, so no two answers to the
          same question share a code.
        * ``replaced_value`` is the data frame without the ``'Time stamp'``
          column, with every answer replaced by its code.

    Raises:
        KeyError: If the file has no ``'Time stamp'`` column.

    Side effects:
        Prints the number of responses (rows) in the file.

    Notes:
        Column names are used exactly as they appear in the file.
    """
    df = pd.read_csv(csv_pass)
    num_respose = len(df)
    print(num_respose)

    # The time stamp is not an answer, so it is removed before encoding.
    df = df.drop(columns='Time stamp')

    hashs = {}
    replaced_value = df.copy()
    for question in df.columns:
        answers = df[question]
        # Series.unique() keeps first-appearance order and reports a missing
        # answer (NaN) only once; a plain ``in`` test on individual cells can
        # register NaN repeatedly because NaN != NaN.
        mapping = {
            answer: code for code, answer in enumerate(answers.unique())
        }
        hashs[question] = mapping
        # map() looks every cell up exactly once, so answers that are
        # themselves small integers cannot collide with the codes.
        replaced_value[question] = answers.map(mapping)
    return hashs, replaced_value
# Load the survey CSV and quantify its answers. The resulting `hashs` is read
# as a module-level global by pie() and cross_tabulation() below.
hashs,replaced_value=quantify("./G1 Survey Form for Students.csv")
#Pie chart creation algorithm
def pie(replaced_value):
    """Draw one pie chart per question showing the share of each answer.

    Args:
        replaced_value: Data frame whose answers have been replaced by
            integer codes.

    Returns:
        None. For every column except ``"Time stamp"`` the column name, the
        slice labels and the slice counts are printed, and a pie chart is
        shown with ``plt.show()``.

    Notes:
        The answer wording for each code is taken from the module-level
        ``hashs`` dict (``{question: {answer: code}}``).
    """
    for column in replaced_value.columns:
        # The time stamp is not a question, so no chart is drawn for it.
        if column == "Time stamp":
            continue

        counts = replaced_value.groupby(column).size()

        # Look each label up by its code instead of pairing counts and
        # answers by position, so a slice can never receive another
        # answer's label if the two orderings differ.
        code_to_answer = {
            code: answer for answer, code in hashs[column].items()
        }
        labels = [code_to_answer.get(code, code) for code in counts.index]
        values = counts.tolist()

        print(column)
        print(labels)
        print(values)

        fig, ax = plt.subplots()
        ax.pie(values, labels=labels, autopct="%1.1f %%")
        plt.show()
def cross_tabulation(replaced_value):
    """Interactively build a two-way cross tabulation of two questions.

    The available column names are printed, then two column names are read
    from standard input: first the question used for the rows of the table,
    then the question used for its columns.

    Args:
        replaced_value: Data frame whose answers have been replaced by
            integer codes.

    Returns:
        A data frame of counts with an extra ``'Total'`` column holding the
        row sums. Row and column labels are converted from codes back to the
        answer wording using the module-level ``hashs`` dict.

    Raises:
        KeyError: If an entered name is not a known question.
    """
    print(replaced_value.columns)
    print("Enter the question you want to be on the front side from the column name above")
    row_input = input()
    print("Enter the question you want to start from the column name above")
    column_input = input()

    def resolve(name):
        # Accept the name exactly as typed, or with accidental surrounding
        # whitespace removed; otherwise fail with a readable message.
        for candidate in (name, name.strip()):
            if candidate in hashs and candidate in replaced_value.columns:
                return candidate
        raise KeyError(
            f"{name!r} is not one of the available questions: "
            f"{list(replaced_value.columns)}"
        )

    row_question = resolve(row_input)
    column_question = resolve(column_input)

    # Invert {answer: code} so the numeric labels can be turned back into
    # the answer wording.
    row_labels = {code: answer for answer, code in hashs[row_question].items()}
    column_labels = {
        code: answer for answer, code in hashs[column_question].items()
    }

    table = pd.crosstab(
        replaced_value[row_question], replaced_value[column_question]
    )
    # Add total column
    table['Total'] = table.sum(axis=1)
    # To display proportions instead of counts, pass normalize=True to
    # pd.crosstab above.
    table = table.rename(row_labels, axis=0)
    table = table.rename(column_labels, axis=1)

    # Visual gap and separator between the prompts and the returned table.
    for _ in range(5):
        print("")
    print("-------------------------------------------------------------------------------------------------")
    return table
# Reference for pandas.crosstab: https://deepage.net/features/pandas-crosstab.html
# Run the interactive cross tabulation on the quantified answers
# (cross_tabulation() prompts for two column names on standard input).
cross_tab=cross_tabulation(replaced_value)
# Bare expression: shows the table when evaluated in a notebook/REPL cell; it
# has no visible effect when the file is executed as a script.
cross_tab