Plonger dans les statistiques avec Python. Partie 2. Distribution des élèves

Bonne journée, habraledi et habragentelmen! Dans cet article, nous allons continuer notre plongée dans les statistiques avec Python. Si quelqu'un a raté le début de la plongée, voici un lien vers la première partie . Sinon, je recommande toujours de garder le livre ouvert de Sarah Boslaf, Statistics for All, à portée de main. Je recommande également d'exécuter le bloc-notes pour expérimenter le code et les graphiques.





Comme l'a dit Andrew Lang, « pour un politicien , les statistiques sont comme un réverbère pour un drogué ivre: un accessoire plutôt qu'un éclairage ». Il est peu probable que vous appreniez beaucoup de nouvelles connaissances ici, mais j'espère que cet article vous aidera à comprendre comment utiliser Python pour faciliter l'auto-étude des statistiques.





Pourquoi inventer de nouvelles allocations?

Imaginez ... alors, avant d'imaginer quoi que ce soit, refaisons toutes les importations nécessaires:





import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
sns.set()
rcParams['figure.figsize'] = 10, 6
%config InlineBackend.figure_format = 'svg'
np.random.seed(42)
      
      



, , . , , - , . 1000 100- , . :





gen_pop = np.trunc(stats.norm.rvs(loc=80, scale=5, size=1000))
gen_pop[gen_pop>100]=100
print(f'mean = {gen_pop.mean():.3}')
print(f'std = {gen_pop.std():.3}')
      
      



mean = 79.5
std = 4.95
      
      



, , . 80 5 . , , , , , - .





, . , - . , - , ? - . , 10 , :





[89, 99, 93, 84, 79, 61, 82, 81, 87, 82]

Z- :





Z = \ frac {\ bar {x} - \ mu} {\ frac {\ sigma} {\ sqrt {n}}}

\ bar {x}- , \ mu \ sigma , n- . :





sample = np.array([89,99,93,84,79,61,82,81,87,82])
sample.mean()
      
      



83.7
      
      



Z-:





z = 10**0.5*(sample.mean()-80)/5
z
      
      



2.340085468524603
      
      



p-value:





1 - (stats.norm.cdf(z) - stats.norm.cdf(-z))
      
      



0.019279327322753836
      
      



, , : Z- 0 2 , .. 10 , , , 0.02. , 10 , "" N (80, 5 ^ {2}), , 10 "" 83.7 2%. , , , , . .





- 10 , , :





sample.std(ddof=1)
      
      



10.055954565441423
      
      



ddof std

\ sigma, , s, . :





\ begin {align *} & \ sigma = {\ sqrt {{\ frac {1} {n}} \ sum _ {i = 1} ^ {n} \ left (x_ {i} - {\ mu} \ right ) ^ {2}}} \\ & \\ & s = {\ sqrt {{\ frac {1} {n - 1}} \ sum _ {i = 1} ^ {n} \ left (x_ {i} - {\ bar {x}} \ right) ^ {2}}} \ end {align *}

, , \ sigma . - \ mu n, s n - 1. n - 1 n? , \ mu \ bar {x} - s , \ sigma . n - 1 , - .





, std() NumPy ddof, 0, std() , , ddof=1. . , 10000 10 N (80, 5 ^ {2}) , , ddof=0 . ddof=1 , - , ddof=0:





fig, ax = plt.subplots(nrows=1, ncols=2, figsize = (12, 5))

for i in [0, 1]:
    deviations = np.std(stats.norm.rvs(80, 5, (10000, 10)), axis=1, ddof=i)
    sns.histplot(x=deviations ,stat='probability', ax=ax[i])
    ax[i].vlines(5, 0, 0.06, color='r', lw=2)
    ax[i].set_title('ddof = ' + str(i), fontsize = 15)
    ax[i].set_ylim(0, 0.07)
    ax[i].set_xlim(0, 11)
fig.suptitle(r'$s={\sqrt {{\frac {1}{10 - \mathrm{ddof}}}\sum _{i=1}^{10}\left(x_{i}-{\bar {x}}\right)^{2}}}$',
             fontsize = 20, y=1.15);
      
      



, Z-? , - , . 5000 10 , N (80, 5) :





deviations  = np.std(stats.norm.rvs(80, 5, (5000, 10)), axis=1, ddof=1)
sns.histplot(x=deviations ,stat='probability');
      
      



, , 10- . . , , 10 2%, , ( ) 10 0. , , : 10- , .





, , , : - , - , . , "" N (80, 10 ^ {2}), Z- p-value 10- :





z = 10**0.5*(sample.mean()-80)/10
p = 1 - (stats.norm.cdf(z) - stats.norm.cdf(-z))
print(f'z = {z:.3}')
print(f'p-value = {p:.4}')
      
      



z = 1.17
p-value = 0.242
      
      



, N (80, 5 ^ {2}) , , , .. , N (80, 10 ^ {2}), . 2%, 25%. , - \ sigma s.





, ? ! , : (, , - )





Z = \ frac {\ bar {x} - \ mu} {\ frac {\ sigma} {\ sqrt {n}}} \;  ;  \; \; \;  T = \ frac {\ bar {x} - \ mu} {\ frac {s} {\ sqrt {n}}}.

T-, Z- , \ sigma , s. 10000 N (80, {5} ^ 2), Z- T-, :





fig, ax = plt.subplots(nrows=1, ncols=2, figsize = (12, 5))

N = 10000
samples = stats.norm.rvs(80, 5, (N, 10))
statistics = [lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/5,
              lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/np.std(x, axis=1, ddof=1)]
title = 'ZT'
bins = np.linspace(-6, 6, 80, endpoint=True)

for i in range(2):
    values = statistics[i](samples)
    sns.histplot(x=values ,stat='probability', bins=bins, ax=ax[i])
    p = values[(values > -2)&(values < 2)].size/N
    ax[i].set_title('P(-2 < {} < 2) = {:.3}'.format(title[i], p))
    ax[i].set_xlim(-6, 6)
    ax[i].vlines([-2, 2], 0, 0.06, color='r');
      
      



- :





import matplotlib.animation as animation

fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (18, 8))

def animate(i):
    for ax in axes:
        ax.clear()
    N = 10000
    samples = stats.norm.rvs(80, 5, (N, 10))
    statistics = [lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/5,
                  lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/np.std(x, axis=1, ddof=1)]
    title = 'ZT'
    bins = np.linspace(-6, 6, 80, endpoint=True)

    for j in range(2):
        values = statistics[j](samples)
        sns.histplot(x=values ,stat='probability', bins=bins, ax=axes[j])
        p = values[(values > -2)&(values < 2)].size/N
        axes[j].set_title(r'$P(-2\sigma < {} < 2\sigma) = {:.3}$'.format(title[j], p))
        axes[j].set_xlim(-6, 6)
        axes[j].set_ylim(0, 0.07)
        axes[j].vlines([-2, 2], 0, 0.06, color='r')
    return axes

dist_animation = animation.FuncAnimation(fig, 
                                      animate, 
                                      frames=np.arange(7),
                                      interval = 200,
                                      repeat = False)

dist_animation.save('statistics_dist.gif',
                 writer='imagemagick', 
                 fps=1)
      
      



, , . ? -, , . , , ? , , N (0, 1) [-2\sigma; 2\sigma] 95.5% . Z- , T- , 92-93% . , , - , :





statistics = [lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/5,
              lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/np.std(x, axis=1, ddof=1)]

quantity = 50
N=10000
result = []
for i in range(quantity):
    samples = stats.norm.rvs(80, 5, (N, 10))
    Z = statistics[0](samples)
    p_z = Z[(Z > -2)&((Z < 2))].size/N
    T = statistics[1](samples)
    p_t = T[(T > -2)&((T < 2))].size/N
    result.append([p_z, p_t])

result = np.array(result)
fig, ax = plt.subplots()

line1, line2 = ax.plot(np.arange(quantity), result)
ax.legend([line1, line2], 
          [r'$P(-2\sigma < {} < 2\sigma)$'.format(i) for i in 'ZT'])
ax.hlines(result.mean(axis=0), 0, 50, color='0.6');
      
      



50 . , , , , . ? , ! Z- T- , . , T- ? , - . , , - , , , . , . , - , , \sigma s.





Z-, \bar{x} , s - . 10000 N(80, 5) 10 , :





#     ,
#    svg  png:
#%config InlineBackend.figure_format = 'png'

N = 10000
samples = stats.norm.rvs(80, 5, (N, 10))

means = samples.mean(axis=1)
deviations = samples.std(ddof=1, axis=1)
T = statistics[1](samples)
P = (T > -2)&((T < 2))

fig, ax = plt.subplots()

ax.scatter(means[P], deviations[P], c='b', alpha=0.7,
           label=r'$\left | T \right | < 2\sigma$')
ax.scatter(means[~P], deviations[~P], c='r', alpha=0.7,
           label=r'$\left | T \right | > 2\sigma$')

mean_x = np.linspace(75, 85, 300)
s = np.abs(10**0.5*(mean_x - 80)/2)
ax.plot(mean_x, s, color='k',
        label=r'$\frac{\sqrt{n}(\bar{x}-\mu)}{2}$')
ax.legend(loc = 'upper right', fontsize = 15)
ax.set_title('   \n  ',
             fontsize=15)
ax.set_xlabel(r'   ($\bar{x}$)',
              fontsize=15)
ax.set_ylabel(r'   ($s$)',
              fontsize=15);
      
      



, . , , \bar{x} s , .. . , , N(\mu, \sigma^{2}), \left | \bar{x} - \mu \right | \left | s - \sigma \right |. , ( ) , :





\left | \bar{x} - \mu \right | >\frac{2\sigma s}{\sqrt{n}}

, \sigma=1, .. , , , \mu = 80, n = 10 :





\left | \bar{x} - 80 \right | >\frac{2s}{\sqrt{10}}

, [-2\sigma; 2\sigma], , 92,5% .





? , . , ( ) 100- . , , ( ). 10- 82- , 2- . , , . \mu=80, , .. \sigma = s = 2? Z-:





Z = \frac{\bar{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{82 - 80}{\frac{2}{\sqrt{10}}} \approx 3.16

N(80, 2^{2}) p-value:





z = 10**0.5*(82-80)/2
p = 1 - (stats.norm.cdf(z) - stats.norm.cdf(-z))
print(f'p-value = {p:.2}')
      
      



p-value = 0.0016
      
      



10 82- 2%. N(80, 2^{2}). , \sigma = s = 2, , , .





, , , . ( \left | \bar{x} - \mu \right |) ( s).





10 . 82 , , , 9- . ? :





z = 10**0.5*(82-80)/9
p = 1 - (stats.norm.cdf(z) - stats.norm.cdf(-z))
print(f'p-value = {p:.2}')
      
      



p-value = 0.48
      
      



10 \bar{x} = 82 s = 9 N(80, 9^{2}) . , , - .





, , . :





import matplotlib.animation as animation

fig, ax = plt.subplots(figsize = (15, 9))

def animate(i):
    ax.clear()
    N = 10000
    
    samples = stats.norm.rvs(80, 5, (N, i))

    means = samples.mean(axis=1)
    deviations = samples.std(ddof=1, axis=1)
    T = i**0.5*(np.mean(samples, axis=1) - 80)/np.std(samples, axis=1, ddof=1)
    P = (T > -2)&((T < 2))
    
    prob = T[P].size/N
    ax.set_title(r' $s$  $\bar{x}$  $n = $' + r'${}$'.format(i),
                 fontsize = 20)
    ax.scatter(means[P], deviations[P], c='b', alpha=0.7,
               label=r'$\left | T \right | < 2\sigma$')
    ax.scatter(means[~P], deviations[~P], c='r', alpha=0.7,
               label=r'$\left | T \right | > 2\sigma$')

    mean_x = np.linspace(75, 85, 300)
    s = np.abs(i**0.5*(mean_x - 80)/2)
    ax.plot(mean_x, s, color='k',
            label=r'$\frac{\sqrt{n}(\bar{x}-\mu)}{2}$')
    ax.legend(loc = 'upper right', fontsize = 15)
    ax.set_xlim(70, 90)
    ax.set_ylim(0, 10)
    ax.set_xlabel(r'   ($\bar{x}$)',
              fontsize='20')
    ax.set_ylabel(r'   ($s$)',
              fontsize='20')
    return ax

dist_animation = animation.FuncAnimation(fig, 
                                      animate, 
                                      frames=np.arange(5, 21),
                                      interval = 200,
                                      repeat = False)

dist_animation.save('sigma_rel.gif',
                 writer='imagemagick', 
                 fps=3)
      
      



, \bar{x} s \mu \sigma N(\mu, \sigma^{2}), . n Z-, n .





! , ? - , , . , , 10- :





[89,99,93,84,79,61,82,81,87,82]

\bar{x} = 83.7, s = 10.06, , , , , N(80, 5^{2}). Z-, T-, Z- , \sigma s. , - N(80, 5^{2}), N(80, 10.06^{2})- ? s?: , N(80, 1^{2}), N(80, 5^{2}), N(80, 7^{2}) \sigma?





, . , - . , N(80, 5^{2}), , , 10 , 10 . , N(80, 5^{2}) . , - , .





: \bar{x} = 83.7, s = 10.06, N(80, \sigma^{2}) \sigma. , , 83<\bar{x}<84 9.5<s<10.5:





N = 10000
sigma = np.linspace(5, 20, 151)
prob = []

for i in sigma:
    p = []
    for j in range(10):
        samples = stats.norm.rvs(80, i, (N, 10))
        means = samples.mean(axis=1)
        deviations = samples.std(ddof=1, axis=1)
        p_m = means[(means >= 83) & (means <= 84)].size/N
        p_d = deviations[(deviations >= 9.5) & (deviations <= 10.5)].size/N
        p.append(p_m*p_d)
    prob.append(sum(p)/len(p))
prob = np.array(prob)

fig, ax = plt.subplots()
ax.plot(sigma, prob)
ax.set_xlabel(r'    ($\sigma$)',
              fontsize=20)
ax.set_ylabel('',
              fontsize=20);
      
      



, \sigma \approx 10. , , \sigma, s. - , , , , . .





T-?

, - - . 1% , - . , , . , - -. ?





- ! , - , "" t-. , , , . , , 1943 , 50% . , - .





, "" . , ( !) , "" , :





t = \frac{\bar{x}-\mu}{\frac{s}{\sqrt{n}}}

t-, . , , "", , , , , - . " ", "t-" , .





:





t={\frac  {Y_{0}}{{\sqrt  {{\frac  {1}{n}}\sum \limits _{{i=1}}^{n}Y_{i}^{2}}}}},

" " . Y_{i} , .. {\displaystyle Y_{i}\sim {\mathcal {N}}(0,1),;i=0,\ldots ,n}, n, .. , . - , :





t\sim {\mathrm  {t}}(n)

, :





import matplotlib.animation as animation

fig, ax = plt.subplots(figsize = (15, 9))

def animate(i):
    ax.clear()
    N = 15000
    
    x = np.linspace(-5, 5, 100)
    ax.plot(x, stats.norm.pdf(x, 0, 1), color='r')
    
    samples = stats.norm.rvs(0, 1, (N, i))
    
    t = samples[:, 0]/np.sqrt(np.mean(samples[:, 1:]**2, axis=1))
    t = t[(t>-5)&(t<5)]
    sns.histplot(x=t, bins=np.linspace(-5, 5, 100), stat='density', ax=ax)
    
    ax.set_title(r'  $t(n)$  n = ' + str(i), fontsize = 20)
    ax.set_xlim(-5, 5)
    ax.set_ylim(0, 0.5)
    return ax

dist_animation = animation.FuncAnimation(fig, 
                                      animate, 
                                      frames=np.arange(2, 21),
                                      interval = 200,
                                      repeat = False)

dist_animation.save('t_rel_of_df.gif',
                 writer='imagemagick', 
                 fps=3)
      
      



, n, , , N(0, 1). , , :





{\ displaystyle f_ {t} (y) = {\ frac {\ Gamma \ left ({\ frac {n + 1} {2}} \ right)} {{\ sqrt {n \ pi}} \, \ Gamma \ left ({\ frac {n} {2}} \ right)}} \, \ left (1 + {\ frac {y ^ {2}} {n}} \ right) ^ {- {\ frac {n +1} {2}}}}

SciPy:





import matplotlib.animation as animation

fig, ax = plt.subplots(figsize = (15, 9))

def animate(i):
    ax.clear()
    N = 15000
    
    x = np.linspace(-5, 5, 100)
    ax.plot(x, stats.norm.pdf(x, 0, 1), color='r')
    ax.plot(x, stats.t.pdf(x, df=i))
    
    ax.set_title(r'  $t(n)$  n = ' + str(i), fontsize = 20)
    ax.set_xlim(-5, 5)
    ax.set_ylim(0, 0.45)
    return ax

dist_animation = animation.FuncAnimation(fig, 
                                      animate, 
                                      frames=np.arange(2, 21),
                                      interval = 200,
                                      repeat = False)

dist_animation.save('t_pdf_rel_of_df.gif',
                 writer='imagemagick', 
                 fps=3)
      
      



, n( df ) . - , , n, .





t-

t- SciPy :





sample = np.array([89,99,93,84,79,61,82,81,87,82])

stats.ttest_1samp(sample, 80)
      
      



Ttest_1sampResult(statistic=1.163532240174695, pvalue=0.2745321678073461)
      
      



:





T = 9**0.5*(sample.mean() -80)/sample.std()
T
      
      



1.163532240174695
      
      



, n = 10, df, n - 1. , 1 , , , . :





T = 10**0.5*(sample.mean() -80)/sample.std(ddof=1)
T
      
      



1.1635322401746953
      
      



, t- , p-value? , - , p-value Z-, t-:





t = stats.t(df=9)
fig, ax = plt.subplots()
x = np.linspace(t.ppf(0.001), t.ppf(0.999), 300)
ax.plot(x, t.pdf(x))
ax.hlines(0, x.min(), x.max(), lw=1, color='k')
ax.vlines([-T, T], 0, 0.4, color='g', lw=2)
x_le_T, x_ge_T = x[x<-T], x[x>T]
ax.fill_between(x_le_T, t.pdf(x_le_T), np.zeros(len(x_le_T)), alpha=0.3, color='b')
ax.fill_between(x_ge_T, t.pdf(x_ge_T), np.zeros(len(x_ge_T)), alpha=0.3, color='b')

p = 1 - (t.cdf(T) - t.cdf(-T))
ax.set_title(r'$P(\left | T \right | \geqslant {:.3}) = {:.3}$'.format(T, p));
      
      



, p-value 27%, .. , - , \ alpha = 0,05, p-value , 5 . , , - , \ alpha, 0.95:





\ textrm {CI} _ {1 - \ alpha} = \ bar {x} \ pm \ left (t _ {\ frac {\ alpha} {2}, \ textrm {df}} \ right) \ left (\ frac {s} {\ sqrt {n}} \ right)

SciPy, interval loc () scale () :





sample_loc = sample.mean()
sample_scale = sample.std(ddof=1)/10**0.5
ci = stats.t.interval(0.95, df=9, loc=sample_loc, scale=sample_scale)
ci
      
      



(76.50640345566619, 90.89359654433382)
      
      



, \ bar {x} = 83,7, , \ alpha = 0,95 [76,5;  90,9]. , , \ bar {x}, .





, , , ( ). , , t- , t- , t- .





Bien sûr, je voudrais insérer un gif à la fin, mais je veux terminer avec la phrase d'Herbert Spencer: " Le plus grand objectif de l'éducation n'est pas la connaissance, mais l'action ", alors lancez vos anacondas et passez à l' action ! Cela est particulièrement vrai pour les autodidactes comme moi.





J'appuie sur F5 et j'attends vos commentaires avec impatience!








All Articles