import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
sns.set()
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)
The Normal distribution¶
# Plotting the standard normal distribution
range = np.arange(-3,3,0.001)
plt.plot(range, stats.norm.pdf(range, 0, 1));
# Here we have both the pdf and cdf and notice that we here use samples
x = np.linspace(-3,3,1000)
mu = 0
sigma = 1
y_pdf = stats.norm.pdf(x,mu,sigma)
y_cdf = stats.norm.cdf(x,mu,sigma)
plt.plot(x, y_pdf, label='pdf')
plt.plot(x, y_cdf, label='cdf')
plt.legend();
# Let us create a UDF which we can use to plot later on
def plot_normal(x_range, mu=0, sigma=1, cdf=False, **kwargs):
'''
Plots the normal distribution function for a given x range
If mu and sigma are not provided, standard normal is plotted
If cdf=True cumulative distribution is plotted
Passes any keyword arguments to matplotlib plot function
'''
x = x_range
if cdf:
y = stats.norm.cdf(x, mu, sigma)
else:
y = stats.norm.pdf(x, mu, sigma)
plt.plot(x, y, **kwargs)
# As you can see, we get exactly the same as above
x = np.linspace(-3, 3, 1000)
plot_normal(x, label='pdf')
plot_normal(x, cdf=True, label='cdf')
plt.legend();
x = 13
mu = 10
sigma = np.sqrt(4)
prob = stats.norm.sf(x,mu,sigma)
print('a) P(X > ' + repr(x) + ')' + ' = ' + repr(round(prob,4)))
n = np.linspace(0, 20, 1000)
plot_normal(n,mu,sigma, label='pdf')
plt.legend();
a) P(X > 13) = 0.0668
x = np.linspace(-3, 3, 1000)
plot_normal(x, label='pdf')
plot_normal(x, cdf=True, label='cdf')
plt.legend();
# Let us see what happens when we change our parameters
x = np.linspace(-5, 5, 1000)
plot_normal(x, -2, 1, color='red', lw=2, ls='-', alpha=0.5)
plot_normal(x, 2, 1, color='blue', lw=2, ls='-', alpha=0.5)
plot_normal(x, 0, 1, color='green', lw=2, ls='-', alpha=0.5);
x = np.linspace(-3, 3, 1000)
plot_normal(x, label='pdf')
plt.legend();
# Let us create a UDF which will enable us to shade areas under the curve
def draw_z_score(x, cond, mu, sigma, title):
y = stats.norm.pdf(x, mu, sigma)
z = x[cond]
plt.plot(x, y)
plt.fill_between(z, 0, stats.norm.pdf(z, mu, sigma))
plt.title(title)
plt.show()
x = np.linspace(-3,3,1000)
z0 = -0.75
draw_z_score(x, x<z0, 0, 1, 'z<-0.75')
z0 = 0.75
draw_z_score(x, (-z0 < x) & (x < z0), 0, 1, '-0.75<z<0.75')
z0 = -0.75
z1 = 1.8
draw_z_score(x, (z0 < x) & (x < z1), 0, 1, '-0.75<z<1.8')
z0 = -1.96
z1 = 1.96
draw_z_score(x, (z0 < x) & (x < z1), 0, 1, '-1.96<z<1.96')
Example 4-13/14 page 125-126¶
x = 13
mu = 10
sigma = np.sqrt(4)
n = np.linspace(0, 20, 1000)
prob = stats.norm.sf(x,mu,sigma)
print('P(X > ' + repr(x) + ')' + ' = ' + repr(round(prob,4)))
draw_z_score(n, x<=n, mu, sigma, 'X > 13')
x1 = 9
x2 = 11
prob2 = stats.norm.cdf(x2, mu, sigma)-stats.norm.cdf(x1, mu, sigma)
print('P('+ repr(x1) + ' < X < ' + repr(x2) + ')' +
' = ' + repr(round(prob2,4)))
draw_z_score(n, (x1 <= n) & (n <= x2), mu, sigma, '9 < X < 11')
P(X > 13) = 0.0668
P(9 < X < 11) = 0.3829
Normal distribution: histogram and PDF¶
Explore the normal distribution: a histogram built from samples and the PDF (probability density function).
# Sample from a normal distribution using numpy's random number generator
samples = np.random.normal(size=10000)
# Compute a histogram of the sample
bins = np.linspace(-3, 3, 30)
histogram, bins = np.histogram(samples, bins=bins, normed=True)
bin_centers = 0.5*(bins[1:] + bins[:-1])
# Compute the PDF on the bin centers from scipy distribution object
pdf = stats.norm.pdf(bin_centers)
plt.plot(bin_centers, histogram, label="Histogram of samples")
plt.plot(bin_centers, pdf, label="PDF")
plt.legend()
plt.show()
# Same as above but a different type of histogram. Let's change n.
mu, sigma = 0, 1
s = np.random.normal(mu, sigma,1000)
# Create the bins and histogram
count, bins, ignored = plt.hist(s, 30, density=True)
# Plot the distribution curve
pdf = stats.norm.pdf(bins)
plt.plot(bins, pdf, label="PDF")
plt.show()
Importing data - in this case from Excel¶
# Notice that I do not use headers
df = pd.read_excel('C:/Users/RIB/OneDrive/Documents/Arbejde/Stochastic modeling and processes/Old/Excel/EX06_14.XLS', header=None)
df.head()
| 0 | |
|---|---|
| 0 | 88.5 |
| 1 | 94.7 |
| 2 | 84.3 |
| 3 | 90.1 |
| 4 | 89.0 |
df.describe()
| 0 | |
|---|---|
| count | 83.000000 |
| mean | 90.533735 |
| std | 2.888382 |
| min | 83.400000 |
| 25% | 88.600000 |
| 50% | 90.400000 |
| 75% | 92.200000 |
| max | 100.300000 |
df.shape
(83, 1)
df.ndim
2
df.dtypes
0 float64 dtype: object
# To plot this data, we need to sort and then fit the data to a normal
# distribution, basically, we insert the values for mu and std.
x = sorted(df[0])
plot_normal(x, np.mean(x),np.std(x))
# Plot the distribution curve and add a histogram - a few tweeks
pdf = stats.norm.pdf(x, np.mean(x), np.std(x))
plt.plot(x, pdf, '-o', label="PDF")
plt.hist(df[0], density=True, rwidth=0.93)
plt.xlabel('Motor Fuel Octane Ratings')
plt.ylabel('Probability')
plt.legend()
plt.show()
# Plot the distribution curve and histogram
pdf = stats.norm.pdf(sorted(df[0]), np.mean(df[0]), np.std(df[0]))
plt.plot(sorted(df[0]), pdf, '-o', color='g')
plt.hist(df[0], density=True, rwidth=0.9)
plt.show()
Normal Probability Plot¶
For many applications, it will be important to make sure that our data is normally distributed, especially if we have small sample sizes. We can do this in many ways, but one of the most commonly used ways is a normal probability plot.
stats.probplot(df[0], plot=plt)
plt.ylabel('Motor Fuel Octane Ratings')
plt.show()
Plotting a Kernel Density Estimate (KDE)¶
In this tutorial, you’ve been working with samples, statistically speaking. Whether the data is discrete or continuous, it’s assumed to be derived from a population that has a true, exact distribution described by just a few parameters.
A kernel density estimation (KDE) is a way to estimate the probability density function (PDF) of the random variable that “underlies” our sample. KDE is a means of data smoothing.
# Now, above we fitted using a normal distribution, here we kind of
# make a new distribution from the data and plot it.
df.plot.kde();
fig, ax = plt.subplots()
df.plot.kde(ax=ax, legend=False, title='Histogram')
df.plot.hist(density=True, ax=ax, legend=False, rwidth=0.9)
ax.set_ylabel('Probability')
ax.grid(axis='y')
ax.set_facecolor('#d8dcd6')
Plotting the cdf and cumulative histogram¶
cdf = stats.norm.cdf(sorted(df[0]), np.mean(df[0]), np.std(df[0]))
plt.plot(sorted(df[0]), cdf, '-o', color='g')
plt.hist(df[0], density=True, rwidth=0.9, cumulative = True)
plt.show()
Comparing data¶
df2 = pd.read_excel('C:/Users/RIB/OneDrive/Documents/Arbejde/Stochastic modeling and processes/Smoking_and_Cancer.xlsx')
df2.head()
| State | # Cigarettes Cigarettes sold | Deaths per 100K bladder cancer | Deaths per 100K lung cancer | Deaths per 100K kidney cancer | Deaths per 100K leukemia | |
|---|---|---|---|---|---|---|
| 0 | AL | 18.20 | 2.90 | 17.05 | 1.59 | 6.15 |
| 1 | AZ | 25.82 | 3.52 | 19.80 | 2.75 | 6.61 |
| 2 | AR | 18.24 | 2.99 | 15.98 | 2.02 | 6.94 |
| 3 | CA | 28.60 | 4.46 | 22.07 | 2.66 | 7.06 |
| 4 | CT | 31.10 | 5.11 | 22.83 | 3.35 | 7.20 |
df2.describe()
| # Cigarettes Cigarettes sold | Deaths per 100K bladder cancer | Deaths per 100K lung cancer | Deaths per 100K kidney cancer | Deaths per 100K leukemia | |
|---|---|---|---|---|---|
| count | 44.000000 | 44.000000 | 44.000000 | 44.000000 | 44.000000 |
| mean | 24.914091 | 4.121136 | 19.653182 | 2.794545 | 6.829773 |
| std | 5.573286 | 0.964925 | 4.228122 | 0.519080 | 0.638259 |
| min | 14.000000 | 2.860000 | 12.010000 | 1.590000 | 4.900000 |
| 25% | 21.230000 | 3.207500 | 16.437500 | 2.457500 | 6.532500 |
| 50% | 23.765000 | 4.065000 | 20.320000 | 2.845000 | 6.860000 |
| 75% | 28.097500 | 4.782500 | 22.807500 | 3.112500 | 7.207500 |
| max | 42.400000 | 6.540000 | 27.270000 | 4.320000 | 8.280000 |
# Note - the colors do not match, haven't figured out how to do that yet!
fig, ax = plt.subplots()
df2.plot.kde(ax=ax, title='Histogram')
df2.plot.hist(density=True, ax=ax)
ax.set_ylabel('Probability')
ax.grid(axis='x')
ax.set_facecolor('#d8dcd6')
Boxplots¶
fig, ax = plt.subplots()
ax.set_title('Basic Plot')
ax.boxplot(df[0])
plt.show()
df3 = pd.read_excel('C:/Users/RIB/OneDrive/Documents/Arbejde/Stochastic modeling and processes/Old/Excel/EX06_12.XLS', header=None)
fig, ax = plt.subplots()
ax.set_title('Basic Plot')
ax.boxplot(df3[0])
plt.show()
df4 = pd.read_excel('C:/Users/RIB/OneDrive/Documents/Arbejde/Stochastic modeling and processes/Old/Excel/EX06_57.XLS')
df4.head()
| High Dose | Control | |
|---|---|---|
| 0 | 16.1 | 297.1 |
| 1 | 134.9 | 491.8 |
| 2 | 52.7 | 1332.9 |
| 3 | 14.4 | 1172.0 |
| 4 | 124.3 | 1482.7 |
df4.shape
(60, 2)
df4.isna().sum()
High Dose 38 Control 0 dtype: int64
df.isnull().any().any()
False
df.isnull().any()
0 False dtype: bool
df4['High Dose'].dropna()
0 16.1 1 134.9 2 52.7 3 14.4 4 124.3 5 99.0 6 24.3 7 16.3 8 15.2 9 47.7 10 12.9 11 72.7 12 126.7 13 46.4 14 60.3 15 23.5 16 43.6 17 79.4 18 38.0 19 58.2 20 26.5 21 25.1 Name: High Dose, dtype: float64
fig, ax = plt.subplots()
ax.set_title('Basic Plot')
ax.boxplot(df4['High Dose'].dropna())
plt.show()
fig, ax = plt.subplots()
ax.set_title('Basic Plot')
ax.boxplot(df4['Control'].dropna())
plt.show()
================================ The Exponential Distribution¶
# Like above, we will create a UDF which we can use to plot
def plot_exponential(x_range, mu=0, sigma=1, cdf=False, **kwargs):
x = x_range
if cdf:
y = stats.expon.cdf(x, mu, sigma)
else:
y = stats.expon.pdf(x, mu, sigma)
plt.plot(x, y, **kwargs)
# Like above, we will create a UDF which we can use to plot
def plot_exponential(x_range, cdf=False, **kwargs):
x = x_range
if cdf:
y = stats.expon.cdf(x)
else:
y = stats.expon.pdf(x)
plt.plot(x, y, **kwargs)
# Plotting with default mu
x = np.linspace(0, 5, 5000)
plot_exponential(x, color='red', lw=2, ls='-', alpha=0.5, label='pdf')
plot_exponential(x, cdf=True, color='blue', lw=2, ls='-', alpha=0.5, label='cdf')
plt.legend();
x = np.linspace(0, 5, 5000)
pdf = stats.expon.pdf(x)
r = stats.expon.rvs(size=5000)
plt.plot(x, pdf)
plt.hist(r, density=True, rwidth=0.9)
plt.show()
# Exercise from slide
prob2 = stats.expon.cdf(x=2,scale=1/0.4)
prob1 = stats.expon.cdf(x=1,scale=1/0.4)
y1 = stats.expon.pdf(np.arange(0,1,0.001), scale=1/0.4)
y2 = stats.expon.pdf(np.arange(1,2,0.001), scale=1/0.4)
y3 = stats.expon.pdf(np.arange(2,10,0.001), scale=1/0.4)
x1 = np.arange(0,1,0.001)
x2 = np.arange(1,2,0.001)
x3 = np.arange(2,10,0.001)
plt.fill_between(x1, y1, facecolor='blue', alpha=0.5)
plt.fill_between(x2, y2, facecolor='red', alpha=0.5)
plt.fill_between(x3, y3, facecolor='blue', alpha=0.5)
plt.text(x=0.3, y=0.2, s= round(prob1,3))
plt.text(x=1.3, y=0.1, s= round(prob2-prob1,3))
plt.text(x=2.5, y=0.05, s= round(1-prob2,3))
plt.show()
def draw_exp(x, cond, mu, title):
y = stats.expon.pdf(x, scale=mu)
z = x[cond]
plt.plot(x, y)
plt.fill_between(z, 0, stats.expon.pdf(z, scale=mu))
plt.title(title)
plt.show()
x = np.arange(0,10,0.001)
x1 = 1
x2 = 2
draw_exp(x, (x1 < x) & (x < x2), 1/0.4, '1 < X < 2')
Ex 5¶
Let $X$ denote the time in hours from the start of the interval until the first log-on.
# I could actually import the expon method!!!!
from scipy.stats import expon
mu=1/25
x = 0.1
prob1 = stats.expon.sf(x, 0, mu)
print('P(X > ' + repr(x) + ')' + ' = ' + repr(round(prob1,4)))
n = np.arange(0,0.3,0.01)
draw_exp(n, (x <= n), 1/25, 'X > 6 min')
x1 = 2/60
x2 = 3/60
prob2 = stats.expon.cdf(x2, 0, mu)-stats.expon.cdf(x1, 0, mu)
print('P('+ repr(round(x1,3)) + ' < X < ' + repr(round(x2,3)) + ')' +
' = ' + repr(round(prob2,4)))
draw_exp(n, (x1 <= n) & (n <= x2), 1/25, '2 min < X < 3 min')
P(X > 0.1) = 0.0821
P(0.033 < X < 0.05) = 0.1481
print('x = ' + repr(round(60*stats.expon.isf(0.9, 0, mu),4)) + ' min')
x = 0.2529 min
Ex 6¶
mu = 1.4
x = 0.5
prob1 = stats.expon.cdf(x, 0, mu)
print('P(X < ' + repr(x) + ')' + ' = ' + repr(round(prob1,4)))
n = np.arange(0,5,0.01)
draw_exp(n, (n <= x), 1.4, 'X < 30 sec')
P(X < 0.5) = 0.3003
Ex 7¶
mu=1/2
x = 40/60
prob1 = stats.expon.cdf(x, 0, mu)
print('P(X < ' + repr(round(x,3)) + ')' + ' = ' + repr(round(prob1,4)))
n = np.arange(0,5,0.01)
draw_exp(n, (n <= x), 1/2, 'X < 40 sec')
P(X < 0.667) = 0.7364
print('x = ' + repr(round(stats.expon.ppf(0.9, 0, mu),4)) + ' min')
x = 1.1513 min