- Probability mass functions
- Cumulative distribution functions
- Comparing distribution
- Modeling distributions
Probability mass functions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from empiricaldist import Pmf, Cdf
from scipy.stats import norm
gss = pd.read_hdf('./dataset/gss.hdf5', 'gss')
gss.head()
Make a PMF
year = gss['year']
# Compute the PMF for year
pmf_year = Pmf.from_seq(year, normalize=False)
# Print the result
print(pmf_year)
Plot a PMF
# Select the age column
age = gss['age']
# Make a PMF of age
pmf_age = Pmf.from_seq(age)
# Plot the PMF
pmf_age.bar()
# Label the axes
plt.xlabel('Age')
plt.ylabel('PMF')
Cumulative distribution functions
Make a CDF
# Select the age column
age = gss['age']
# Compute the CDF of age
cdf_age = Cdf.from_seq(age)
# Calculate the CDF of 30
print(cdf_age(30))
print(1 - cdf_age(30))
Compute IQR
income = gss['realinc']
cdf_income = Cdf.from_seq(income)
# Calculate the 75th percentile
percentile_75th = cdf_income.inverse(0.75)
# Calculate the 25th percentile
percentile_25th = cdf_income.inverse(0.25)
# Calculate the interquartile range
iqr = percentile_75th - percentile_25th
# Print the interquartile range
print(iqr)
Plot a CDF
# Plot it
cdf_income.plot()
# Label the axes
plt.xlabel('Income (1986 USD)')
plt.ylabel('CDF')
Comparing distribution
Extract education levels
# Select educ
educ = gss['educ']
# Bachelor`s degree
bach = (educ >= 16)
# Associate degree
assc = ((educ >= 14) & (educ < 16))
# High school (12 or fewer years of education)
high = (educ <= 12)
print(high.mean())
Plot income CDFs
income = gss['realinc']
# Plot the CDFs
Cdf.from_seq(income[high]).plot(label='High school')
Cdf.from_seq(income[assc]).plot(label='Associate')
Cdf.from_seq(income[bach]).plot(label='Bachelor')
# Label the axes
plt.xlabel('Income (1986 USD)')
plt.ylabel('CDF')
plt.legend()
plt.savefig('../images/income-cdf.png')
Modeling distributions
Distribution of income
# Extract realinc and compute its log
log_income = np.log10(income)
# Compute mean and standard deviation
mean = np.mean(log_income)
std = np.std(log_income)
print(mean, std)
# Make a norm object
dist = norm(mean, std)
Comparing CDFs
# Evaluate the model CDF
xs = np.linspace(2, 5.5)
ys = dist.cdf(xs)
# Plot the model CDF
plt.plot(xs, ys, color='gray')
# Create and plot the Cdf of log_income
Cdf.from_seq(log_income).plot()
# Label the axes
plt.xlabel('log10 of realinc')
plt.ylabel('CDF')
# Evaluate the model CDF
xs = np.linspace(2, 5.5)
ys = dist.pdf(xs)
# Plot the model CDF
plt.plot(xs, ys, color='gray')
# Plot the data KDE
sns.kdeplot(log_income)
# Label the axes
plt.xlabel('log10 of realinc')
plt.ylabel('CDF')