##########################################################
### Book Title: Practicing R for Statistical Computing
### Authors: (i) Muhammad Aslam
### (ii) Muhammad Imdad Ullah
### Publisher: Springer Nature Singapore Pte Ltd, 2023
##########################################################
######### Chapter 7 ###################
##### Probability and Probability Distributions ##########
# sample space of a coin toss experiment
S <- data.frame(coin = c("Head", "Tail"))
library(prob)
# a coin tossed single time
tosscoin(1)
# a coin tossed three times
tosscoin(3)
# roll of a fair die having six sides
rolldie(1)
# roll of a fair die 3 times having 4-sided die
rolldie(3, nsides = 4)
urnsamples(1:3, size = 2, replace = TRUE, ordered = TRUE)
urnsamples(1:3, size = 2, replace = FALSE, ordered = TRUE)
urnsamples(1:3, size = 2, replace = F, ordered = F)
urnsamples(1:3, size = 2, replace = T, ordered = F)
S <- tosscoin(3, makespace = TRUE)
# extract first 3 rows
S[1:3, ]
# extract row 2 and 5
S[c(2, 5), ]
# extract rows where toss1 has H
subset(S, toss1 == "H")
S <- cards()
# suit is Heart
subset(S, suit == "Heart")
subset(S, rank %in% 7:9)
S <- rolldie(3)
# sum of face on each die > 15
subset(S, X1 + X2 + X3 > 15)
# face on third die is > 5
subset(S, X3 > 5)
# even number on each die
subset(S, (X1 %% 2 == 0 & X2 %% 2 == 0 & X3 %% 2 == 0) == T)
S <- rolldie(3)
subset(S, isin(S, c(2, 2, 6), ordered = TRUE))
S <- cards()
# all Diamonds cards
A = subset(S, suit == "Diamond")
# cards from 7 to 9
B = subset(S, rank %in% 7:9)
# Union of sets
union(A, B)
# Intersection of sets
intersect(A, B)
# Different between sets
setdiff(A, B)
setdiff(B, A)
# Complement of A
setdiff(S, A)
# Complement of B
setdiff(S, B)
# ways to select 3 items from 10 items?
choose(k = 3, n = 10)
# ways to select = 3 items from 100 items?
choose(k = 3, n = 100)
nsamp(n = 20, k = 3, replace = F, ordered = T)
# with replacement and with ordered samples
nsamp (n =3, k = 2, replace = T, ordered = T)
# without replacement and with ordered samples
nsamp (n =3, k = 2, replace = F, ordered = T)
# without replacement and ordered samples
nsamp (n =3, k = 2, replace = F, ordered = F)
# with replacement and without ordered samples
nsamp (n =3, k = 2, replace = T, ordered = F)
n <- c(3, 4, 3)
k <- c(1, 1, 1)
x <- nsamp(n, k, replace = T, ordered = T)
prod(x)
# combination
combn(x = 1:5, m = 3)
combn(x = c("T1", "T2", "T3", "T4"), m = 3)
outcomes <- rolldie(1)
probspace(outcomes, probs = rep(1/6, times = 6))
probspace(1:6, probs = rep(1/6, times = 6))
probspace(1:6)
probspace(rolldie(1))
probspace(rolldie(1), makespace = TRUE)
tosscoin(1, makespace = TRUE)
tosscoin(3, makespace = TRUE)
cards(makespace = TRUE)
probspace(subset(cards(), rank %in% 7:9), makespace=TRUE)
rolldie(1, makespace = TRUE)
probspace(tosscoin(1), probs = c(.6, .4))
probspace(rolldie(1), probs = c(.6, .4))
probspace(rolldie(1), probs = c(.6, .4))
probspace(rolldie(1), probs = c(.167, .167, .167, .167, .167, .193))
S <- cards(makespace = TRUE)
# Draw of a heart card
A <- subset(S, suit == "Heart")
# Probability of Event A
Prob(A)
# Draw of numbered card 7, 8, and 9
B <- subset(S, rank %in% 7:9)
# Probability of Event B
Prob(B)
# Probability of all events
Prob(S)
Prob(interset(x, subset(x, event)))
Prob(intersect(S, subset(S, suit == "Heart")))
Prob(intersect(S, subset(S, rank %in% 7:9)))
# sample space when events are equally likely
S <- rolldie(2, makespace = TRUE)
# Event A (outcomes are same)
A <- subset(S, X1 == X2)
B <- subset(S, X1 + X2 >= 8)
# Conditional probability P(A | B)
Prob(A, given = B)
# Conditional probability P(B | A)
Prob(B, given = A)
# Conditional probability P(A | B)
Prob(S, X1 == X2, given = (X1 + X2 >= 8))
# Conditional probability P(B | A)
Prob(S, X1 + X2 >= 8, given = (X1 == X2))
df <- cards()
# sample space
S <- urnsamples(df, size = 2)
# probability space
P <- probspace(S)
Prob(P, all(rank == "A"))
# urn that contain 7 red & 3 green balls
urn <- rep(c("red", "green"), times = c(7, 3))
# load prob package
library(prob)
# sample space
S <- urnsamples (urn, size = 3, replace = F, ordered = T)
# probability space
P <- probspace(S)
# computation of required probability
Prob(P, X1 == "red" & X2 == "red" & X3 == "red" )
Prob(P, isrep(P, "red", 3))
Prob(P, isin(P, c("red", "green", "red"), ordered = T ))
Prob(P, isin(P, c("red", "green", "red"), ordered = F))
S <- tosscoin(10, makespace = T)
T <- subset(S, isrep(S, vals = "T", nrep =10))
PH <- 1 - Prob(T); PH # Probability of at least one had
iidspace(c("H", "T"), ntrials = 3, probs = c(0.7, 0.3))
iidspace(c("1", "2", "3", "4", "5", "6"), ntrials = 2,
probs = c(.1, .2, .3, .2, .1,.1))
iidspace(c(1:6), ntrials = 2, probs = c(.1, .2, .3, .2, .1, .1))
# Bayes Rule
# prior probabilities
prior <- c(0.6, 0.3, 0.1)
# probability of misfiling by employees (conditional probabilities)
prob <- c(0.003, 0.007, 0.01)
# Posterior probability
post <- prior * cprob
post / sum(post)
newprior <- post
# misfiled all 5 documents
post <- newprior * cprob ^ 5
post / sum(post)
# Random Variable
S <- rolldie(3, nsides = 6, makespace = TRUE)
S <- addrv(S, U = X1 - X2 + X3)
Prob(S, U > 5)
Prob(S, U <= 3)
S <- rolldie(3, nsides = 4, makespace = TRUE)
S <- addrv(S, U = X1 - X2 + X3)
S <- addrv(S, FUN = max, invars = c("X1", "X2", "X3"), name = "V")
S <- addrv(S, FUN = sum, invars = c("X1", "X2", "X3"), name = "W")
# Marginal Distributions
marginal(S, vars = "W")
# mean and variance of random variable
x <- c(0, 1, 2, 3)
p <- c(1/8, 3/8, 3/8, 1/8)
m <- sum(x * p) # mean of prob. dist.
v <- sum((x - m)^ 2 * p) # variance of prob. dist.
sd<- sqrt(v) # standard deviation
m
v
sd
F <- cumsum(f)
library(distrEx)
X <- DiscreteDistribution(supp = 0:3, prob = c(1, 3, 3, 1)/8 )
E(X)
var(X)
sd(X)
# Probability distributions
hist(rnorm(1000, mean = 5, sd = 2), main = "Normal")
hist(exp(rnorm(1000, mean = 0, sd = .5)), main = "Lognormal")
hist(rpois(1000, 3), main = "Poisson")
hist(c(rnorm(500, 2, 2), rnorm(500, 10, 2)), main = "Bimodal")
# Computing p-values
1 - pnorm(1.95) # two-tailed p-value
# quantile of normal distribution
qnorm(0.975)
2*pt(-2.43, df = 13) # two-tailed p-value
# quantile for t-dist.
qt(0.25, df = 13)
1 - pchisq(5.1, 1) # p-value
# quantile for chi-square dist.
qchisq(0.975, 1)
# Binomial Distribution
dbinom(x = 3, size = 4, prob = 1/2)
dbinom(0:3, size = 3, prob = 1/2)
p <- dbinom(0:3, size = 3, prob = 1/2)
plot(x = 0:3, y = p, type = "h", xlim = c(-1, 4),
ylim = c(0, 1), col = "blue", ylab = "Probabilities",
xlab = "Number of heads", lwd = 2,
main = "Binomial Probability Distribution")
points(0:3, p, pch = 16, col = "red")
p <- dbinom(0:20, size = 20, prob = 1/2)
plot(x = 0:20, y = p, type = "h", xlim = c(-1, 21),
ylim = c(0, 0.3), col = "blue", ylab = "Probabilities",
xlab = "Number of heads", lwd = 2,
main = "Binomial Probability Distribution")
points(0:20, p, pch = 16, col = "red")
sum(dbinom(0:4, size = 10, prob = 1/2))
pbinom(4, size = 10, prob = 1/2)
sum(dbinom(5:8, size = 10, prob = 1/2))
pbinom(q = 8, size = 10, prob = 1/2) - pbinom(q = 4, size = 10, prob = 1/2)
diff(pbinom(c(4, 8), size = 10, prob = 1/2))
diff(pbinom(c(4, 8), size = 10, prob = 1/2))
sum (dbinom(8:10, size = 10, prob = 1/2))
1 - pbinom(7, size = 10, prob = 1/2)
pbinom(q = c(3, 5), size = 10, prob = 0.5)
diff(pbinom(q = c(3, 5), size = 10, prob = 0.5))
qbinom(p = 0.25, size = 50, prob = 0.5)
rbinom(n = 10, size = 100, prob = 0.4)
x <- c(4, 7, 9, 11, 12)
ecdf(x)
plot(ecdf(x))
epdf <- function(x) function(t) {
sum(x %in% t)/length(x)
}
epdf(x)(0)
# Hypergeometric Distribution
# Three defective items, P(X = 3)
dhyper(x = 3, m = 15, n = 200, k = 5)
p <- as.data.frame(dhyper(0:5, m = 15, n = 200, k = 5))
rownames(p) <- 0:5
round(p, 5)
phyper(q = 2, m = 15, n = 200, k = 5)
phyper(q = 1, m = 15, n = 200, k = 5, lower.tail = FALSE)
1 - phyper(q = 1, m = 15, n = 200, k = 5)
qhyper(p = 0.9968, m = 15, n = 200, k = 5)
rhyper(nn = 10, m = 15, n = 200, k = 5)
# Geometric Distribution
dgeom(x = 3, prob = 1/6)
pgeom(q = 3, prob = 1/6)
sum(dgeom(x = 0:3, prob = 1/6))
qgeom(p = 0.95, prob = 0.35)
rgeom(n = 3, prob = 1/6)
# negative Binomial Distribution
dnbinom(x = 10, size = 2, prob = 0.1)
pnbinom(q = 5, size = 2, prob = 0.35, lower.tail = FALSE)
qnbinom(p = 0.2337986, size = 2, prob = 0.35, lower.tail = F)
rnbinom(n = 10, size = 2, prob = 0.5)
# Poisson Distribution
dpois(x = 10, lambda = 4)
ppois(q = 10,lambda = 4)
sum(dpois(x = 0:10, lambda = 4))
ppois(q = 20, lambda = 4) - ppois(q = 10, lambda = 4)
diff(ppois(c(10, 20), lambda = 4))
qpois(p = seq(0, 0.99, 0.1), lambda = 4)
rpois(n = 100, lambda = 4)
# PDF of 3X^2
f <- function(x){
3 * x ^2
}
integrate(f, lower = 0.14, upper = 0.71)
# E(X) = 3x^3
EXfunc <- function(x){
3 * x^3
}
EX <- integrate(EXfunc, lower = 0, upper = 1); EX
# EX^2 = x^2 * 3x^2
EX2func <- function(x) {
x^2 * 3*x^2
}
EX2 <- integrate(EX2func, lower = 0, upper = 1)
# Variance
var <- EX2$value - EX$value^2; var
library(distr)
f <- function(x){
3*x^2
}
X <- AbscontDistribution(d = f, low1 = 0, up1 = 1)
p(X)(0.71) - p(X)(.14)
# for mean and variance
library(distrEx)
E(X)
var(X)
g <- function(x) 3/x^4
integrate(g, lower = 1, upper = Inf)
integrate(g, lower = 3, upper = 7)
# E(X) = -x^-3
EXfunc<-function(x) x * g(x)
EX <- integrate(EXfunc, lower = 1, upper = Inf); EX
EX2func <- function(x) x^2 * g(x)
EX2 <- integrate(EX2func, lower = 1, upper = Inf)
var = EX2$value - (EX$value^2); var
x <- seq(from = 0, to = 5, length = 100)
ylim <- c(0, 1)
plot(x, dunif(x, min = 2, max = 4), type = "l", ylim = ylim)
# Uniform Distribution
# P(x < 5 )
punif(q = 5, min = 2, max = 10)
# P(x > 5)
1 - punif(q = 5, min = 2, max = 10)
# P(3 < x < 5)
punif(q = 5, min = 2, max = 10) - punif(q = 3, min = 2, max = 10)
# or
diff(punif(c(3, 5), 2, 10))
# P(X < x) = 0.75
qunif(p = 0.75, min = 2, max = 10)
# p = 0, 1, 2, ..., 10
qunif(p = seq(from = 0, to = 1, by = 0.1), min = 2 , max = 10)
# uniform numbers between 1 and 3
runif(10, min = 1, max = 3)
# Normal Distribution
x <- seq(-4, 4, by = 0.1)
dy <- dnorm(x, mean = 0, sd = 1)
plot(x, dy)
pnorm(q = 510, mean = 500, sd = sqrt(20))
pnorm(q = 510, mean = 500, sd = sqrt(20), lower.tail = FALSE )
x <- seq(-4, 4, by = 0.1)
px <- pnorm(x, mean = 0, sd = 1)
plot(x, dx)
qnorm(0.95, mean = 100, sd = 15)
qnorm(0.05, mean = 0, sd = 1)
qnorm(0.95, mean = 0, sd = 1)
qnorm(0.025, mean = 0, sd = 1)
qnorm(0.975, mean = 0, sd = 1)
y <- rnorm(n = 1000, mean = 0, sd = 1)
hist(y, main = "Standard Normal Distribution", probability = TRUE)
xx <- seq(min(y), max(y), length = 100)
lines(xx, dnorm(xx, mean = 0, sd = 1))
# Exponential Distribution
x <- seq(from = 0, to = 6, length = 100)
ylim <- c(0, 1)
plot(x, dexp(x, rate = 1/2), type = "l", ylim = ylim)
# P(X < 10000)
pexp(q = 10000, rate = 1/30000)
# P(X > 15000)
pexp(q = 15000, rate = 1/30000, lower.tail = FALSE)
# P(15000 < X < 25000)
pexp(q = 25000, rate = 1/30000) - pexp(q = 15000, rate = 1/30000 )
diff(pexp(q = c(15000, 25000), rate = 1/30000))
qexp(p = 0.2834687, rate = 1/30000)
rexp(n = 1000, rate = 1)
# Gamma Distribution
x <- seq(from = 0, to = 6, length = 100)
ylim <- c(0, 1)
plot(x, dgamma(x, shape = 2, rate = 1), ylim = ylim, type = "l")
pgamma(q = 3, shape = 10, rate = 0.25)
qgamma(p = 0.7576078, shape = 10, rate = 0.25)
rgamma(n = 10, shape = 10, scale = 0.25)
# Chi-Square Distribution
x <- seq(from = -20, to = 20, by = 0.5)
y <- dchisq(x, df = 10)
plot(x, y)
pchisq(q = 2, df = 10)
pchisq(q = 3, df = 10)
1 - pchisq(q = 3, df = 10)
x = c(2, 4, 5, 6)
pchisq(q = x, df = 6)
qchisq(p = 0.05, df = 10)
qchisq(p = 0.95, df = 10)
qchisq(p = c(0.005, 0.025, 0.05), df = 90)
rchisq(n = 3, df = 10)
rchisq(n = 100, df = 20)
# F-Distribution
x <- seq(from = 0, to = 20, by = 0.001)
y <- df(x, df1 = 5, df2 = 2)
plot(x, y, type = "l", lwd = 2)
# lower-tailed
pf(q = 4, df1 = 5, df2 = 2)
# upper-tailed
pf(q = 4, df1 = 5, df2 = 2, lower.tail = F)
# or
1 - pf(q = 4, df1 = 5, df2 = 2)
# two-tailed
2 * pf(q = 4, df1 = 5, df2 = 2, lower.tail = F)
# lower-tail
alpha = 0.05
qf(alpha, df1 = 10, df2 = 15)
# upper-tail
qf(alpha, df1 = 10, df2 = 15, lower.tail = F)
qf(p = alpha/2, df1 = 10, df2 = 15, lower.tail = F)
qf(p = alpha/2, df1 = 10, df2 = 15, lower.tail = T)
rf(n = 50, df1 = 15, df2 = 20)
x <- rf(n = 100000, df1 = 15, df2 = 20)
hist(x, breaks = 100, xlab = "F", freq = F, main = "Histogram of F")
# t-distribution
x <- seq(from = -5, to = 5,by = 0.01)
y <- dt(x, df = 10)
plot(x, y, type = "l", lwd = 2)
pt(q = 3, df = 5)
pt(q = -3, df = 5)
1 - pt(q = 3, df = 5) # or pt(3, df = 5, lower.tail = F)
x = c(-3, -4, -2, -1)
pt((mean(x) - 2)/sd(x), df = 20)
qt(p = c(0.025, 0.95, 0.975, 0.99), df = 5)
rt(n = 3, df = 10)
rt(n = 30, df = 20)
# distr package
library(distr)
X <- Binom(size = 3, prob = 1/2); X
# PMF of X evaluated at X = 1
d(X)(1)
# CDF of X evaluated at X = 2
p(X)(2)
library(distr)
X <- Norm(mean = 0, sd = 1)
Y <- 3 - 4 * X
Y
plot(Y)
Y <- sin(exp(X) + 27)
p(Z)(0.5)
plot(Y)
# distriEx package
X <- Binom(size = 3, prob = 0.45)
library(distriEx)
E(x)
var(X)
sd(X)
IQR(X)
mad(X)
# Examining Univariate Data set
attach(faithful)
fivenum(eruptions)
summary(eruptions)
stem(eruptions)
hist(eruptions)
hist(eruptions, seq(1.6, 5.2, 0.2), prob = TRUE)
lines(density(eruptions, bw = 0.1))
# show actual data point
rug(eruptions)
plot( ecdf (eruptions), do.points = FALSE, verticals = TRUE)
long3mn <- eruptions[eruptions > 3]
plot(ecdf(long3mn), do.points = FALSE, verticals = TRUE)
x <- seq (3, 5.4, 0.01)
lines (x, pnorm(x, mean = mean(long3mn), sd = sqrt(var(long3mn))), lty = 3)
# set a square figure region
par (pty = "s")
qqnorm(long3mn)
qqline(long3mn)
long3mn <- eruptions[eruptions > 3]
shapiro.test(long3mn)
ks.test(long3mn, "pnorm", mean=mean(long3mn), sd=sqrt(var(long3mn)))
######## For Exercise's Questions and Answer, please visit the book ######
######## For functions in Tables, please visit the book ######