Below are simulated four distributions (n = 100 each), all with similar measures of center (mean = 0) and spread (s.d. = 1), but with distinctly different shapes.
n
);s
, Johnson distribution with skewness 2.2 and kurtosis 13);k
, Johnson distribution with skewness 0 and kurtosis 30);mm
, two normals with mean -0.95 and 0.95 and standard deviation 0.31).#install.packages("SuppDists")
#library(SuppDists)
# this is used later to generate the s and k distributions
findParams <- function(mu, sigma, skew, kurt) {
value <- .C("JohnsonMomentFitR", as.double(mu), as.double(sigma),
as.double(skew), as.double(kurt - 3), gamma = double(1),
delta = double(1), xi = double(1), lambda = double(1),
type = integer(1), PACKAGE = "SuppDists")
list(gamma = value$gamma, delta = value$delta,
xi = value$xi, lambda = value$lambda,
type = c("SN", "SL", "SU", "SB")[value$type])
}
# Generate sample data -------------------------------------------------------
set.seed(141079)
# normal
n <- rnorm(100)
# right-skew
s <- rJohnson(100, findParams(0, 1, 2.2, 13))
# leptikurtic
k <- rJohnson(100, findParams(0, 1, 0, 30))
# mixture
mm <- rnorm(100, rep(c(-1, 1), each = 50) * sqrt(0.9), sqrt(0.1))
Let's see what our descriptive statistics look like:
four_wide <- data.frame(cbind(n, s, k, mm))
psych::describe(four_wide)
vars n mean sd median trimmed mad min max range skew kurtosis
n 1 100 0.07 1.09 0.13 0.12 1.14 -3.07 2.83 5.91 -0.36 0.11
s 2 100 0.74 1.01 0.43 0.58 0.83 -0.48 4.49 4.97 1.48 2.20
k 3 100 0.00 0.79 -0.07 -0.05 0.63 -2.17 2.77 4.94 0.64 1.38
mm 4 100 -0.06 0.99 -0.03 -0.06 1.41 -1.63 1.49 3.12 0.00 -1.65
se
n 0.11
s 0.10
k 0.08
mm 0.10
What do you notice? For which distributions are the standard measures of central tendency, spread, and shape more accurate?
What you want to look for:
hist()
#2 x 2 histograms in base r graphics
par(mar = c(3.0, 3.0, 1, 1))
par(mfrow=c(2,2))
hist(n)
hist(s)
hist(k)
hist(mm)
What makes these histograms difficult to compare? A few things:
geom_histogram()
The nice thing about ggplot
is that we can use facet_wrap
, and the default protects us from these ills: the x- and y-axes are the same, and the size of the binwidths are also the same. But it means we need to make data in long-format such that our data from the four distributions is stacked on top of each other in one column (rather than in seperate columns for each distribution) and have a variable (column) in our dataset that specifies which of the four distributions the data is drawn from (n
, s
, k
, mm
).
four <- data.frame(
dist = factor(rep(c("n", "s", "k", "mm"),
each = 100),
c("n", "s", "k", "mm")),
vals = c(n, s, k, mm)
)
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) + #no y needed for visualization of univariate distributions
geom_histogram(fill = "white", colour = "black") + #easier to see for me
coord_cartesian(xlim = c(-5, 5)) + #use this to change x/y limits!
facet_wrap(~ dist) #this is one factor variable with 4 levels
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
geom_histogram(binwidth = )
Always change the binwidths on a histogram. Sometimes the default in ggplot
works great, sometimes it does not.
Super narrow:
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = .1, fill = "white", colour = "black") + #super narrow bins
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Super wide:
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = 2, fill = "white", colour = "black") + #super wide bins
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Just right? Pretty close to the default for this data.
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = .2, fill = "white", colour = "black") +
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
+ geom_rug()
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = .2, fill = "white", colour = "black") +
geom_rug() +
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
What you want to look for:
boxplot()
Note that if varwidth
is TRUE, the boxes are drawn with widths proportional to the square-roots of the number of observations in the groups. It doesn't matter here since all 4 distributions contain 100 values.
#Just Boxplot
par(mar = c(2.1, 2.1, .1, .1))
boxplot(vals ~ dist,
data = four,
ylim = c(-4,4),
varwidth = TRUE) #vary width by n
geom_boxplot()
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot() +
scale_x_discrete(name="") +
scale_y_continuous(name="") +
coord_cartesian(ylim = c(-4,4))
geom_boxplot(notch = TRUE)
ggplot notches: "Notches are used to compare groups; if the notches of two boxes do not overlap, this is strong evidence that the medians differ." (Chambers et al., 1983, p. 62)
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot(notch = T) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4,4))
+ stat_summary()
Here we add a diamond for the mean (see other possible shape codes here).
ggplot(four, aes(x = dist, y = vals)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
shape = 18,
size = 4,
colour = "lightseagreen") +
coord_cartesian(ylim = c(-4, 4))
Options:
stripchart()
par(mar = c(2.1, 2.1, .1, .1))
stripchart(vals ~ dist,
data = four,
pch = 16,
ylim = c(-4,4),
method = "jitter",
vertical = TRUE,
col = rgb(32, 178, 170, 100, max = 255))
From statmethods.net: You can use the col2rgb()
function to get the rbg values for R colors. For example, col2rgb("lightseagreen")
yeilds r = 32, g = 178, b = 170. Then add the alpha transparency level as the 4th number in the color vector. Alpha values range from 0 (fully transparent) to 255 (opaque). You must also specify max = 255. See help(rgb)
for more information.
geom_jitter() + stat_summary()
This is the ggplot corollary to a stripchart.
ggplot(four, aes(x = dist, y = vals)) +
geom_jitter(position = position_jitter(height = 0, width = .1),
fill = "lightseagreen",
colour = "lightseagreen",
alpha = .5) +
stat_summary(fun.y = median,
fun.ymin = median,
fun.ymax = median,
geom = "crossbar",
width = 0.5) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
beeswarm()
# install.packages("beeswarm")
# library(beeswarm)
#par(mfrow = c(1,1))
par(mar = c(2.1, 2.1, .1, .1))
beeswarm(vals ~ dist,
data = four,
pch = 20,
col="lightseagreen",
ylim=c(-4,4))
This is a beeswarm-like ggplot- not exactly the same, but gives you the same idea.
ggplot(four, aes(x = dist, y = vals)) +
geom_dotplot(stackdir = "center",
binaxis = "y",
binwidth = .1,
binpositions = "all",
stackratio = 1.5,
fill = "lightseagreen",
colour = "lightseagreen") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
https://github.com/eclarke/ggbeeswarm
install.packages("ggbeeswarm")
library(ggbeeswarm)
ggplot(four, aes(x = dist, y = vals)) +
geom_quasirandom(fill = "lightseagreen",
colour = "lightseagreen") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
ggplot(four, aes(x = dist, y = vals)) +
geom_quasirandom(fill = "lightseagreen",
colour = "lightseagreen",
method = "smiley") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
Note that these recommendations do not apply if your data is "big". You will know your data is too big if you try the below methods and you can't see many of the individual points (typically, N > 100).
boxplot()
, beeswarm(add = TRUE)
#install.packages("beeswarm")
#library(beeswarm)
par(mar = c(2.1, 2.1, .1, .1))
boxplot(vals ~ dist, #make the boxplot first
data = four,
outline = FALSE, #avoid double-plotting outliers-beeswarm will plot them too
ylim = c(-4, 4))
beeswarm(vals ~ dist,
data = four,
pch = 20,
col = "lightseagreen",
ylim = c(-4, 4),
add = TRUE) #this is how you layer on top of the boxplot
boxplot()
, stripchart(add = TRUE)
par(mar = c(2.1, 2.1, .1, .1))
boxplot(vals ~ dist, #make the boxplot first
data = four,
outline = FALSE, #avoid double-plotting outliers-beeswarm will plot them too
ylim = c(-4, 4))
stripchart(vals ~ dist,
data = four,
vertical = TRUE,
method = "jitter",
pch = 16,
add = TRUE,
ylim = c(-4, 4),
col = rgb(32, 178, 170, 100, max = 255))
geom_boxplot() + geom_dotplot()
This is my personal pick for EDA when I have small - medium data (N < 100).
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot(outlier.shape = NA) +
geom_dotplot(binaxis = 'y',
stackdir = 'center',
stackratio = 1.5,
binwidth = .1,
binpositions = "all",
dotsize = 1,
alpha = .75,
fill = "lightseagreen",
colour = "lightseagreen",
na.rm = TRUE) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
geom_boxplot() + geom_jitter()
I left the outliers in to demonstrate the jittered points only go left to right because I set the jitter height = 0
.
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot(width = .5) + #left the outliers in here, so they are double-plotted
geom_jitter(fill = "lightseagreen",
colour = "lightseagreen",
na.rm = TRUE,
position = position_jitter(height = 0, width = .1),
alpha = .5) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
A few ways to do this:
geom_density()
ggplot(four, aes(x = vals)) +
geom_density(fill = "lightseagreen") +
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Instead of doing a facet_wrap
, I could make just one plot showing all four distributions. To make each distribution a different color, set the fill
within the aes
, and assign it to the factor variable dist
. Since now all four will be plotted on top of each other, add an alpha
level to make the color fill transparent (0 = completely transparent; 1 = completely opaque).
# Density plots with semi-transparent fill
ggplot(four, aes(x = vals, fill = dist)) +
geom_density(alpha = .5)
+ geom_histogram()
These are pretty easy to make in ggplot
. However, note that the y-axis is different from if you just plotted the histogram. In fact, when interpreting this plot, the y-axis is only meaningful for reading density. It is meaningless for interpreting the histogram.
ggplot(four, aes(x = vals)) +
geom_histogram(aes(y = ..density..),
binwidth = .5,
colour = "black",
fill = "white") +
geom_density(alpha = .5, fill = "lightseagreen") +
coord_cartesian(xlim = c(-5,5)) +
facet_wrap(~ dist)
geom_violin()
ggplot(four, aes(x = dist, y = vals)) +
geom_violin(colour = "lightseagreen",
fill = "lightseagreen",
alpha = .5,
na.rm = TRUE,
scale = "count") + # max width proportional to sample size
coord_cartesian(ylim = c(-4, 4))
geom_violin() + geom_boxplot()
This is my personal pick for EDA when I have large data (N > 100).
ggplot(four, aes(x = dist, y = vals)) +
geom_boxplot(outlier.size = 2,
colour = "lightseagreen",
fill = "black",
na.rm = TRUE,
width = .1) +
geom_violin(alpha = .2,
fill = "lightseagreen",
colour = NA,
na.rm = TRUE) +
coord_cartesian(ylim = c(-4, 4))
Note that it is just as easy to layer a univariate scatterplot over a violin plot, just by replacing the geom_boxplot
with a different geom as shown abobe. Lots of combination plots are possible!
vioplot()
Includes equivalent of boxplot.
#install.packages("vioplot")
#library(vioplot)
par(mar = c(2.1, 2.1, .1, .1))
with(four, vioplot(vals[dist == "n"],
vals[dist == "s"],
vals[dist == "k"],
vals[dist == "mm"],
horizontal = FALSE,
names = c("n", "s", "k", "mm"),
col = "lightseagreen",
ylim = c(-4, 4)))
beanplot()
The default beanlines
for each bean is the mean- you can also use the median or quantiles. The default overallline
is the mean, again you can use the median instead.
#install.packages("beanplot")
#library(beanplot)
par(mar = c(2.1, 2.1, .1, .1))
beanplot(vals ~ dist,
data = four,
ylim = c(-4, 4),
method = "jitter", #handling overlapping beans
col = c("lightblue", "lightseagreen", "lightseagreen"),
border = "lightblue")
The more general stat_summary
function applies a summary function to the variable mapped to y at each x value.
The simplest summary function is mean_se
, which returns the mean and the mean plus its standard error on each side. Thus, stat_summary
will calculate and plot the mean and standard errors for the y variable at each x value.
The default geom is "pointrange" which places a dot at the central y value and extends lines to the minimum and maximum y values. Other geoms you might consider to display summarized data:
geom_errorbar
geom_pointrange
geom_linerange
geom_crossbar
There are a few summary functions from the Hmisc
package which are reformatted for use in stat_summary()
. They all return aesthetics for y
, ymax
, and ymin
.
mean_cl_normal()
mean_sdl()
mean_cl_boot()
median_hilow()
ggplot(four, aes(x = dist, y = vals)) +
stat_summary(fun.data = "mean_se")
ggplot(four, aes(x = dist, y = vals)) +
stat_summary(fun.y = "mean", geom = "point") +
stat_summary(fun.y = "max", geom = "point", shape = 21)
ggplot(four, aes(x = dist, y = vals)) +
stat_summary(fun.data = median_hilow)
You may have noticed two different arguments that are potentially confusing: fun.data
and fun.y
. If the function returns three values, specify the function with the argument fun.data
. If the function returns one value, specify fun.y
. See below.
x <- c(1, 2, 3)
mean(x) # use fun.y
[1] 2
mean_cl_normal(x) # use fun.data
y ymin ymax
1 2 -0.4841377 4.484138
Confidence limits may give us a better idea than standard error limits of whether two means would be deemed statistically different when modeling, so we frequently use mean_cl_normal
or mean_cl_boot
in addition to mean_se
.
Using the ToothGrowth
dataset
data(ToothGrowth)
tg <- ToothGrowth
# Standard error of the mean
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_se") +
ggtitle("Mean +/- SE")
# Connect the points with lines
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_se") +
stat_summary(fun.y = mean, geom = "line") +
ggtitle("Mean +/- SE")
# Use 95% confidence interval instead of SEM
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_cl_normal") +
stat_summary(fun.y = mean, geom = "line") +
ggtitle("Mean with 95% confidence intervals")
# The errorbars overlapped, so use position_dodge to move them horizontally
pd <- position_dodge(0.1) # move them .05 to the left and right
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_cl_normal", position = pd) +
stat_summary(fun.y = mean, geom = "line", position = pd) +
ggtitle("Mean with 95% confidence intervals")
Not the best example for this geom, but another good one for showing variability...
# ribbon geom
ggplot(tg, aes(x = dose, y = len, colour = supp, fill = supp)) +
stat_summary(fun.y = mean, geom = "line") +
stat_summary(fun.data = mean_se, geom = "ribbon", alpha = .5)
install.packages("Rmisc")
library(Rmisc)
sum_tg <- summarySE(tg,
measurevar="len",
groupvars=c("supp","dose"))
sum_tg
supp dose N len sd se ci
1 OJ 0.5 10 13.23 4.459709 1.4102837 3.190283
2 OJ 1.0 10 22.70 3.910953 1.2367520 2.797727
3 OJ 2.0 10 26.06 2.655058 0.8396031 1.899314
4 VC 0.5 10 7.98 2.746634 0.8685620 1.964824
5 VC 1.0 10 16.77 2.515309 0.7954104 1.799343
6 VC 2.0 10 26.14 4.797731 1.5171757 3.432090
ggplot(sum_tg, aes(x = dose, y = len, colour = supp)) +
geom_errorbar(aes(ymin=len-se, ymax=len+se, group=supp),
width=.1,
position=pd) +
geom_line(aes(group=supp), position=pd) +
geom_point(position=pd)
If you must...
# Standard error of the mean; note positioning
ggplot(tg, aes(x = factor(dose), y = len, fill = supp)) +
stat_summary(fun.y = mean, geom = "bar", position = position_dodge(width = .9)) +
stat_summary(fun.data = mean_se, geom = "linerange", position = position_dodge(width = .9)) +
ggtitle("Mean +/- SE")
# Use 95% confidence interval instead of SEM
ggplot(tg, aes(x = factor(dose), y = len, fill = supp)) +
stat_summary(fun.y = mean, geom = "bar", position = position_dodge(width = .9)) +
stat_summary(fun.data = mean_cl_boot, geom = "linerange", position = position_dodge(width = .9)) +
ggtitle("Mean with 95% confidence intervals")
More help here: http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/