[PYTHON] Relationship and approximation error of binomial distribution, Poisson distribution, normal distribution, hypergeometric distribution

Overview

Reference book

Basic Statistics

Roughly the relationship of each distribution

Hypergeometric and binomial distributions

In the hypergeometric distribution

On the other hand, in the binomial distribution

The hypergeometric distribution can be well approximated by the binomial distribution if the total number (the number of balls in the box) is sufficiently larger than the number of extracts.

Binomial distribution, Poisson distribution and normal distribution

The binomial distribution is

Calculation of distribution and confirmation of approximation error

Python code (function part)

import scipy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib

# functions ------------------------
def binomial(n, x, p):
    '''
Binomial distribution
Event with probability p occurs x times in n trials
Each trial is independent
Example: An event where 3 is rolled on the dice (probability p)=1/6) but n dice=X when shaken 10 times=5 times)
        nCx * p^x * (1-p)^(n-x)
    '''
    return (
        sc.special.comb(n, x)
        * np.power(p, x)
        * np.power(1 - p, n - x)
    )

def poisson(n, x, p):
    '''
Poisson distribution
        p->When 0, the binomial distribution becomes a Poisson distribution.
    '''
    mean = n * p
    return (
        np.power(mean, x)
        * np.exp(-mean)
        / sc.special.factorial(x)
    )

def gaussian(n, x, p):
    '''
normal distribution
        n->When oo, the binomial distribution is normal
    '''
    mean = n * p
    variance = np.power(n * p * (1 - p), 1/2)
    return (
        1.0 
        / (np.power(2 * np.pi, 1/2) * variance)
        / np.exp(
            np.power(x - mean, 2)
            / (2 * np.power(variance, 2))
            )
    )

def hypergeometric(n, x, p, N):
    '''
Hypergeometric distribution
Each trial is not independent
It is necessary to consider all N for non-restoration extraction
    '''
    target_size = N * p  #Number of hits in the whole
    return (
        sc.special.comb(target_size, x)  #X combinations per
        * sc.special.comb(N - target_size, n - x)  #Lost n-x combinations
        / sc.special.comb(N, n)  #All combinations of n
    )


def calc_distributions(n, p, cols, N=0):
    '''
    ex.:
    n = 30  #Number of trials
    p = 0.16  #Percentage of hits
    N = 1000  #All (used only in hypergeometric distributions)
    '''
    #Prepare an array for storing results
    bi_arr = np.zeros(n + 1, dtype=np.float)
    po_arr = np.zeros(n + 1, dtype=np.float)
    ga_arr = np.zeros(n + 1, dtype=np.float)

    for x in np.arange(n + 1):
        #Store the result in an array
        bi_arr[x] = binomial(n, x, p) 
        po_arr[x] = poisson(n, x, p) 
        ga_arr[x] = gaussian(n, x, p) 

    #Store the result in a data frame
    df = pd.DataFrame()
    df['Binomial distribution'] = bi_arr
    df['Poisson distribution'] = po_arr
    df['normal distribution'] = ga_arr

    if N > 0:
        hy_arr = np.zeros(n + 1, dtype=np.float)
        for x in np.arange(n + 1):
            hy_arr[x] = hypergeometric(n, x, p, N)
        df['Hypergeometric distribution'] = hy_arr

    return df[cols]


def visualize(df,n, p, N=0):
    plot_params = {
        'linestyle':'-', 
        'markersize':5,
        'marker':'o'
    }
    colors = [
        'black',
        'blue',
        'green',
        'purple',
    ]
    
    plt.close(1)
    figure_ = plt.figure(1, figsize=(8,4))    
    axes_ = figure_.add_subplot(111)  #Create Axes
    for i, col in enumerate(df.columns):
        if i == 0:
            plot_params['linewidth'] = 2
            plot_params['alpha'] = 1.0
        else:
            plot_params['linewidth'] = 1
            plot_params['alpha'] = 0.4
        plot_params['color'] = colors[i]
        axes_.plot(
            df.index.values, df[col].values,
            label = col,
            **plot_params,
        )

    plt.legend()
    plt.xlabel('Number of hits x')
    plt.ylabel('Probability of winning x times out of n times')
    title = 'Percentage of hits p:{p:.2f},Number of trials n:{n}'.format(p=p, n=n)
    if 'Hypergeometric distribution' in df.columns:
        title += ',All N:{N}'.format(N=N)
    plt.title(title)
    xmax = n * p * 2
    if xmax<10: xmax = 10;
    plt.xlim([0, xmax])
    #(The expression "number of times" is not good considering non-restoration extraction)

Execute under different conditions, check error

Roll the dice 100 times and how many times 1 comes out

n = 100; p = 0.167
df = calc_distributions(n, p, cols=['Binomial distribution', 'Poisson distribution', 'normal distribution'])
visualize(df, n, p)

image.png Binomial distribution and normal distribution match well

Shake the 100 facet 100 times and how many times 1 will appear

n = 100; p = 0.01
df = calc_distributions(n, p, cols=['Binomial distribution', 'Poisson distribution', 'normal distribution'])
visualize(df, n, p)

image.png Binomial distribution and Poisson distribution match well

Of the 5,000 inhabitants, 10% are children. How many children are there out of 100 selected?

n = 100; p = 0.1; N = 5000
df = calc_distributions(n, p, cols=['Hypergeometric distribution','Binomial distribution'], N=N)
visualize(df, n, p, N)

image.png Hypergeometric distribution and binomial distribution match well

10% of the 500 inhabitants are children. How many children are there out of 100 selected?

n = 100; p = 0.1; N = 500
df = calc_distributions(n, p, cols=['Hypergeometric distribution','Binomial distribution'], N=N)
visualize(df, n, p, N)

image.png The error is noticeable compared to the case of N = 5000

end

Recommended Posts

Relationship and approximation error of binomial distribution, Poisson distribution, normal distribution, hypergeometric distribution
Verification of normal distribution
Understanding the meaning of complex and bizarre normal distribution formulas
[Statistics] Let's visualize the relationship between the normal distribution and the chi-square distribution.
pix2 pix tensorflow2 Record of trial and error