Data analysis: Easily apply descriptive and inferential statistics to CSV data in Python

Introduction

Descriptive and inferential statistics are often used in data analysis to understand the data before building machine learning models. To support that step, I wrote source code that reads CSV data into a pandas DataFrame and makes it easy to apply descriptive statistics and statistical inference to it.

Data preparation

Prepare a CSV file as input. The sample data used while developing the source code is shown below.

x1,x2,x3,x4,x5
1,11,1,1,1
2,12,1,1,2
3,13,1,1,3
4,14,1,1,4
5,150,1,1,5
5,150,1,1,5
4,160,1,1,4
3,180,1,1,3
2,180,1,1,2
1,190,2,2,2

Source code

The modularized source code is shown below.

# Names used by StatisticalTests; imported here so the snippet runs on its own.
import math

import numpy as np
import scipy.stats as st


class StatisticalTests():
    """Print descriptive statistics and common hypothesis tests for tabular data.

    Every public method is a classmethod that writes a human-readable report
    to standard output and returns ``None``.  All tests use a 5% significance
    level and all intervals a 95% confidence level.
    """

    # Significance level for the accept/reject verdicts and the matching
    # confidence level for the intervals.
    _ALPHA = 0.05
    _CONFIDENCE = 0.95

    @classmethod
    def basic_info(cls, df):
        """Print the first three rows, the dtypes and ``describe()`` of *df*."""
        print('Basic statistics------------------start')
        print('df.head(3)-------------')
        print(df.head(3))
        print('df.dtypes-------------')
        print(df.dtypes)
        print("df.describe(include='all')-------------")
        print(df.describe(include='all'))

    @classmethod
    def t_interval(cls, df):
        """Print the 95% t confidence interval of the mean of every column."""
        print('Mother mean 95%Confidence interval-------------------start')
        # DataFrame.iteritems() was removed in pandas 2.0; items() is the
        # long-standing equivalent.
        for column_name, s in df.items():
            u2 = s.var(ddof=1)  # unbiased estimate of the population variance
            m = s.mean()  # sample mean
            n = len(s) - 1  # degrees of freedom
            se = math.sqrt(u2 / len(s))  # standard error of the mean

            # The confidence level is passed positionally because SciPy
            # renamed the keyword from ``alpha`` to ``confidence``.
            ci1, ci2 = st.t.interval(cls._CONFIDENCE, df=n, loc=m, scale=se)
            print(f'Column name= {column_name} //Mother mean 95%Confidence interval CI= '
                  f'[{ci1:.2f} , {ci2:.2f}] //Specimen average[{m}]')

    @classmethod
    def shapiro(cls, df):
        """Print the Shapiro-Wilk normality test result for every column."""
        print('Shapiro-Wilk test(Test of normality)------------------start')
        for column_name, s in df.items():
            _, p = st.shapiro(s)
            if p >= cls._ALPHA:
                print(f'Column name= {column_name} //p-value= {p:.3f} '
                      f'//Test result:Adopting the null hypothesis, it cannot be said that there is no normality')
            else:
                print(f'Column name= {column_name} //p-value= {p:.3f} '
                      f'//Test result:Reject the null hypothesis, no normality')

    @classmethod
    def levene(cls, xa, xb):
        """Print the mean-centered Levene test for equal variances of two samples."""
        print('Between 2 groups:Mother mean 95%Test of homoscedasticity by Levene test-------------------start')
        _, p = st.levene(xa, xb, center='mean')
        if p >= cls._ALPHA:
            print(f'p-value= {p:.3f} //Test result:Adopting the null hypothesis, it cannot be said that the two samples are not homoscedastic.')
        else:
            print(f'p-value= {p:.3f} //Test result:Rejecting the null hypothesis, the two samples are not homoscedastic')

    @classmethod
    def ttest_rel(cls, xa, xb):
        """Print a paired (dependent-samples) t-test of *xa* against *xb*.

        The null hypothesis is that the two sample means do not differ.
        "Paired" means the same subjects are measured twice, e.g. the same
        person before and after taking a drug.
        """
        print('Between 2 groups:Corresponding t-test-------------------start')
        t, p = st.ttest_rel(xa, xb)
        # Standard error of the mean of the pairwise differences, computed
        # directly so it stays defined when the two means are equal (t == 0).
        diff = np.asarray(xa, dtype=float) - np.asarray(xb, dtype=float)
        with np.errstate(divide='ignore', invalid='ignore'):
            se = np.sqrt(diff.var(ddof=1) / np.float64(diff.size))
        # Swapping the samples only flips the sign of t, so report |t|
        # instead of running the test a second time.
        cls._report_mean_difference(abs(t), p, xa, xb, se)

    @classmethod
    def ttest_ind_equal_var_true(cls, xa, xb):
        """Print an independent two-sample t-test assuming equal variances.

        The null hypothesis is that the two sample means do not differ.
        "Independent" means the two samples come from different subjects.
        """
        print('Between 2 groups:No support(Between 2 groupsに等分散性あり)t-test-------------------start')
        t, p = st.ttest_ind(xa, xb, equal_var=True)
        se = cls._standard_error_independent(xa, xb, equal_var=True)
        cls._ttest_ind(abs(t), p, xa, xb, se=se)

    @classmethod
    def ttest_ind_equal_var_false(cls, xa, xb):
        """Print an independent two-sample t-test without assuming equal variances (Welch).

        The null hypothesis is that the two sample means do not differ.
        "Independent" means the two samples come from different subjects.
        """
        print('Between 2 groups:No support(Between 2 groupsに等分散性なし)t-test-------------------start')
        t, p = st.ttest_ind(xa, xb, equal_var=False)
        se = cls._standard_error_independent(xa, xb, equal_var=False)
        cls._ttest_ind(abs(t), p, xa, xb, se=se)

    @staticmethod
    def _standard_error_independent(xa, xb, equal_var):
        """Return the standard error of the difference of two independent sample means."""
        a = np.asarray(xa, dtype=float)
        b = np.asarray(xb, dtype=float)
        # NumPy floats keep degenerate sizes from raising ZeroDivisionError;
        # the result is simply nan/inf in that case.
        na, nb = np.float64(a.size), np.float64(b.size)
        with np.errstate(divide='ignore', invalid='ignore'):
            va, vb = a.var(ddof=1), b.var(ddof=1)
            if equal_var:
                pooled = ((na - 1) * va + (nb - 1) * vb) / (na + nb - 2)
                return np.sqrt(pooled * (1 / na + 1 / nb))
            return np.sqrt(va / na + vb / nb)

    @classmethod
    def _ttest_ind(cls, t, p, xa, xb, se=None):
        """Print the report for an independent-samples t-test.

        *se* is the standard error of the mean difference.  When omitted it is
        recovered from ``|mean difference| / t``, which is undefined for
        ``t == 0`` and then reported as nan.
        """
        if se is None:
            mu = abs(xa.mean() - xb.mean())
            se = mu / t if t != 0 else float('nan')
        cls._report_mean_difference(t, p, xa, xb, se)

    @classmethod
    def _report_mean_difference(cls, t, p, xa, xb, se):
        """Print p-value, t value, mean difference, its 95% CI and the verdict."""
        mu = abs(xa.mean() - xb.mean())
        # NOTE(review): n1 + n2 - 2 is the degrees of freedom of the pooled
        # (equal-variance) test only; the paired test has n - 1 and Welch's
        # test uses the Welch-Satterthwaite value.  Kept as-is so the printed
        # intervals match the published sample output -- confirm before
        # relying on the interval for paired/Welch tests.
        n = len(xa) + len(xb) - 2
        ci1, ci2 = st.t.interval(cls._CONFIDENCE, df=n, loc=mu, scale=se)
        print(f'p-value={p:.3f} //t value= {t:.2f}')
        print(f'//Difference in mean= {mu:.2f} //Standard error of difference= {se:.2f}')
        print(f'//95 of mean difference%Confidence interval CI= [{ci1:.2f} , {ci2:.2f}]')
        if p >= cls._ALPHA:
            print('//Test result:Adopting the null hypothesis, it cannot be said that there is a significant difference between the mean values of the two samples.')
        else:
            print('//Test result:Rejecting the null hypothesis, there is a significant difference in the mean of the two samples')

    @classmethod
    def chisquare(cls, sample, answer):
        """Print a chi-square goodness-of-fit test of *sample* against *answer*.

        *sample* holds the observed frequencies and *answer* the expected
        ones; both must provide ``tolist()``.  The alternative hypothesis is
        that the observed data do not fit the theoretical distribution.
        """
        print('Goodness of fit test-------------------start')
        observed = sample.tolist()
        expected = answer.tolist()

        p = st.chisquare(observed, f_exp=expected)[1]
        if p >= cls._ALPHA:
            print(f'p-value= {p:.3f} //Test result:It is not possible to adopt the null hypothesis and conclude that it does not fit the theoretical distribution.')
        else:
            print(f'p-value= {p:.3f} //Test result:We reject the null hypothesis and conclude that it does not fit the theoretical distribution.')

    @classmethod
    def chi2_contingency(cls, df):
        """Print a chi-square test of independence for the contingency table *df*.

        Example table (rows are groups, columns are outcome counts)::

                              cancer   no cancer
            smoking group       30        70
            non-smoking group   20        80
        """
        print('Test of independence-------------------start')
        p = st.chi2_contingency(df.values)[1]
        if p >= cls._ALPHA:
            print(f'p-value= {p:.3f} //Test result:Adopting the null hypothesis, we cannot conclude that the two variables are not independent.')
        else:
            print(f'p-value= {p:.3f} //Test result:Rejecting the null hypothesis, we conclude that the two variables are not independent.')

    @classmethod
    def pearsonr(cls, xa, xb):
        """Print Pearson's correlation coefficient of *xa* and *xb* and its test.

        The null hypothesis is that the population correlation is zero; the
        alternative hypothesis is that it is not zero.
        """
        print('Test of correlation coefficient-------------------start')
        r, p = st.pearsonr(xa.values, xb.values)
        if p >= cls._ALPHA:
            print(f'Correlation coefficient= {r:.3f} //p-value= {p:.3f} //Test result:Adopt the null hypothesis. It cannot be said that there is a correlation.')
        else:
            print(f'Correlation coefficient= {r:.3f} //p-value= {p:.3f} //Test result:Reject the null hypothesis. There is a correlation.')

Execution result

An example of running the modularized source code above is shown below. The output gives a quick overview of the CSV data.

Basic statistics------------------start
df.head(3)-------------
   x1  x2  x3  x4  x5
0   1  11   1   1   1
1   2  12   1   1   2
2   3  13   1   1   3
df.dtypes-------------
x1    int64
x2    int64
x3    int64
x4    int64
x5    int64
dtype: object
df.describe(include='all')-------------
              x1          x2         x3         x4        x5
count  10.000000   10.000000  10.000000  10.000000  10.00000
mean    3.000000  106.000000   1.100000   1.100000   3.10000
std     1.490712   81.493013   0.316228   0.316228   1.37032
min     1.000000   11.000000   1.000000   1.000000   1.00000
25%     2.000000   13.250000   1.000000   1.000000   2.00000
50%     3.000000  150.000000   1.000000   1.000000   3.00000
75%     4.000000  175.000000   1.000000   1.000000   4.00000
max     5.000000  190.000000   2.000000   2.000000   5.00000
Mother mean 95%Confidence interval-------------------start
Column name= x1 //Mother mean 95%Confidence interval CI= [1.93 , 4.07] //Specimen average[3.0]
Column name= x2 //Mother mean 95%Confidence interval CI= [47.70 , 164.30] //Specimen average[106.0]
Column name= x3 //Mother mean 95%Confidence interval CI= [0.87 , 1.33] //Specimen average[1.1]
Column name= x4 //Mother mean 95%Confidence interval CI= [0.87 , 1.33] //Specimen average[1.1]
Column name= x5 //Mother mean 95%Confidence interval CI= [2.12 , 4.08] //Specimen average[3.1]
Shapiro-Wilk test(Test of normality)------------------start
Column name= x1 //p-value= 0.341 //Test result:Adopting the null hypothesis, it cannot be said that there is no normality
Column name= x2 //p-value= 0.004 //Test result:Reject the null hypothesis, no normality
Column name= x3 //p-value= 0.000 //Test result:Reject the null hypothesis, no normality
Column name= x4 //p-value= 0.000 //Test result:Reject the null hypothesis, no normality
Column name= x5 //p-value= 0.410 //Test result:Adopting the null hypothesis, it cannot be said that there is no normality
Between 2 groups:Mother mean 95%Test of homoscedasticity by Levene test-------------------start
p-value= 0.000 //Test result:Rejecting the null hypothesis, the two samples are not homoscedastic
Between 2 groups:Mother mean 95%Test of homoscedasticity by Levene test-------------------start
p-value= 0.813 //Test result:Adopting the null hypothesis, it cannot be said that the two samples are not homoscedastic.
Between 2 groups:Corresponding t-test-------------------start
p-value=0.003 //t value= 4.01
//Difference in mean= 103.00 //Standard error of difference= 25.70
//95 of mean difference%Confidence interval CI= [49.01 , 156.99]
//Test result:Rejecting the null hypothesis, there is a significant difference in the mean of the two samples
Between 2 groups:Corresponding t-test-------------------start
p-value=0.343 //t value= 1.00
//Difference in mean= 0.10 //Standard error of difference= 0.10
//95 of mean difference%Confidence interval CI= [-0.11 , 0.31]
//Test result:Adopting the null hypothesis, it cannot be said that there is a significant difference between the mean values of the two samples.
Between 2 groups:No support(Between 2 groupsに等分散性あり)t-test-------------------start
p-value=0.001 //t value= 4.00
//Difference in mean= 103.00 //Standard error of difference= 25.77
//95 of mean difference%Confidence interval CI= [48.85 , 157.15]
//Test result:Rejecting the null hypothesis, there is a significant difference in the mean of the two samples
Between 2 groups:No support(Between 2 groupsに等分散性あり)t-test-------------------start
p-value=0.878 //t value= 0.16
//Difference in mean= 0.10 //Standard error of difference= 0.64
//95 of mean difference%Confidence interval CI= [-1.25 , 1.45]
//Test result:Adopting the null hypothesis, it cannot be said that there is a significant difference between the mean values of the two samples.
Between 2 groups:No support(Between 2 groupsに等分散性なし)t-test-------------------start
p-value=0.003 //t value= 4.00
//Difference in mean= 103.00 //Standard error of difference= 25.77
//95 of mean difference%Confidence interval CI= [48.85 , 157.15]
//Test result:Rejecting the null hypothesis, there is a significant difference in the mean of the two samples
Between 2 groups:No support(Between 2 groupsに等分散性なし)t-test-------------------start
p-value=0.878 //t value= 0.16
//Difference in mean= 0.10 //Standard error of difference= 0.64
//95 of mean difference%Confidence interval CI= [-1.25 , 1.45]
//Test result:Adopting the null hypothesis, it cannot be said that there is a significant difference between the mean values of the two samples.
Goodness of fit test-------------------start
p-value= 0.000 //Test result:We reject the null hypothesis and conclude that it does not fit the theoretical distribution.
Goodness of fit test-------------------start
p-value= 1.000 //Test result:It is not possible to adopt the null hypothesis and conclude that it does not fit the theoretical distribution.
Test of independence-------------------start
p-value= 0.142 //Test result:Adopting the null hypothesis, we cannot conclude that the two variables are not independent.
Test of independence-------------------start
p-value= 0.000 //Test result:Rejecting the null hypothesis, we conclude that the two variables are not independent.
Test of independence-------------------start
p-value= 1.000 //Test result:Adopting the null hypothesis, we cannot conclude that the two variables are not independent.
Test of correlation coefficient-------------------start
Correlation coefficient= 0.165 //p-value= 0.649 //Test result:Adopt the null hypothesis. It cannot be said that there is a correlation.
Test of correlation coefficient-------------------start
Correlation coefficient= 0.979 //p-value= 0.000 //Test result:Reject the null hypothesis. There is a correlation.

Summary

- It was confirmed that CSV data can be read into a pandas DataFrame and that descriptive statistics and statistical inference can then be applied easily.
- It was found that applying descriptive statistics and statistical inference helps in understanding the CSV data.

References

- Comparison of test results of 2 classes of t-test with Python
- Summary of statistical hypothesis test using python, confidence interval estimation method