Title:  Data and Functions for the Book Modern Statistical Graphics 

Description:  A companion to the Chinese book ``Modern Statistical Graphics''. 
Authors:  Yihui Xie [aut, cre] , Peng Zhao [aut], Lijia Yu [ctb], Xiangyun Huang [ctb] 
Maintainer:  Yihui Xie <[email protected]> 
License:  GPL 
Version:  0.8.1 
Built:  20241022 05:15:36 UTC 
Source:  https://github.com/yihui/msg 
Datasets and functions for the Chinese book “Modern Statistical Graphics”.
Yihui Xie <https://yihui.org>
This function evaluates the transformation of the original data matrix for
t
from pi
to pi
, and uses matplot
to draw the
curves.
andrews_curve( x, n = 101, type = "l", lty = 1, lwd = 1, pch = NA, xlab = "t", ylab = "f(t)", ... )
andrews_curve( x, n = 101, type = "l", lty = 1, lwd = 1, pch = NA, xlab = "t", ylab = "f(t)", ... )
x 
a data frame or matrix 
n 
number of xaxis values at which f(t) is evaluated 
type , lty , lwd , pch , xlab , ylab , ...

passed to

a matrix of coefficients for each observation at different t values
Yihui Xie <https://yihui.org>
https://en.wikipedia.org/wiki/Andrews_plot
andrews_curve(iris[, 5], col = as.integer(iris[, 5]))
andrews_curve(iris[, 5], col = as.integer(iris[, 5]))
The players in the rows assisted the ones in the columns.
http://www.basketballgeek.com/data/
data(assists) if (require("sna")) { set.seed(2011) gplot(assists, displaylabels = TRUE, label.cex = 0.7) }
data(assists) if (require("sna")) { set.seed(2011) gplot(assists, displaylabels = TRUE, label.cex = 0.7) }
The data was generated from two independent random varialbes (standard Normal distribution) and further points on a circle were added to the data. The order of the data was randomized.
A data frame with 20000 observations on the following 2 variables.
the first random variable with the xaxis coordinate of the circle
the second random variable with the yaxis coordinate of the circle
See the example section for the code to generate the data.
https://yihui.org/en/2008/09/toseeacircleinapileofsand/
data(BinormCircle) ## original plot: cannot see anything plot(BinormCircle) ## transparent colors (alpha = 0.1) plot(BinormCircle, col = rgb(0, 0, 0, 0.1)) ## set axes lmits plot(BinormCircle, xlim = c(1, 1), ylim = c(1, 1)) ## small symbols plot(BinormCircle, pch = ".") ## subset plot(BinormCircle[sample(nrow(BinormCircle), 1000), ]) ## 2D density estimation library(KernSmooth) fit = bkde2D(as.matrix(BinormCircle), dpik(as.matrix(BinormCircle))) # perspective plot by persp() persp(fit$x1, fit$x2, fit$fhat) if (interactive() && require("rgl")) { # perspective plot by OpenGL rgl.surface(fit$x1, fit$x2, fit$fhat) # animation M = par3d("userMatrix") play3d(par3dinterp(userMatrix = list(M, rotate3d(M, pi/2, 1, 0, 0), rotate3d(M, pi/2, 0, 1, 0), rotate3d(M, pi, 0, 0, 1))), duration = 20) } ## data generation x1 = rnorm(10000) y1 = rnorm(10000) x2 = rep(0.5 * cos(seq(0, 2 * pi, length = 500)), 20) y2 = rep(0.5 * sin(seq(0, 2 * pi, length = 500)), 20) x = cbind(c(x1, x2), c(y1, y2)) BinormCircle = as.data.frame(round(x[sample(20000), ], 3))
data(BinormCircle) ## original plot: cannot see anything plot(BinormCircle) ## transparent colors (alpha = 0.1) plot(BinormCircle, col = rgb(0, 0, 0, 0.1)) ## set axes lmits plot(BinormCircle, xlim = c(1, 1), ylim = c(1, 1)) ## small symbols plot(BinormCircle, pch = ".") ## subset plot(BinormCircle[sample(nrow(BinormCircle), 1000), ]) ## 2D density estimation library(KernSmooth) fit = bkde2D(as.matrix(BinormCircle), dpik(as.matrix(BinormCircle))) # perspective plot by persp() persp(fit$x1, fit$x2, fit$fhat) if (interactive() && require("rgl")) { # perspective plot by OpenGL rgl.surface(fit$x1, fit$x2, fit$fhat) # animation M = par3d("userMatrix") play3d(par3dinterp(userMatrix = list(M, rotate3d(M, pi/2, 1, 0, 0), rotate3d(M, pi/2, 0, 1, 0), rotate3d(M, pi, 0, 0, 1))), duration = 20) } ## data generation x1 = rnorm(10000) y1 = rnorm(10000) x2 = rep(0.5 * cos(seq(0, 2 * pi, length = 500)), 20) y2 = rep(0.5 * sin(seq(0, 2 * pi, length = 500)), 20) x = cbind(c(x1, x2), c(y1, y2)) BinormCircle = as.data.frame(round(x[sample(20000), ], 3))
The scores of the game Canabalt from Twitter
‘http://www.neilkodner.com/2011/02/visualizationsofcanabaltscoresscrapedfromtwitter/’ (the URL is not longer accessible)
library(ggplot2) data(canabalt) print(qplot(device, score, data = canabalt)) print(qplot(reorder(death, score, median), score, data = canabalt, geom = "boxplot") + coord_flip())
library(ggplot2) data(canabalt) print(qplot(device, score, data = canabalt)) print(qplot(reorder(death, score, median), score, data = canabalt, geom = "boxplot") + coord_flip())
This function prints a matrix of characters which are very similar to each other.
char_gen(x = c("V", "W"), n = 300, nrow = 10)
char_gen(x = c("V", "W"), n = 300, nrow = 10)
x 
a character vector of length 2 (usually two similar characters) 
n 
the total number of characters in the matrix 
nrow 
the number of rows 
a character matrix on the screen
Yihui Xie <https://yihui.org>
char_gen() char_gen(c("O", "Q"))
char_gen() char_gen(c("O", "Q"))
This data contains the life expectancy and number of people with higher education in the 31 provinces and districts in China (2005).
A data frame with 31 observations on the following 2 variables.
Life expectancy
Number of people with higher education
China Statistical Yearbook 2005. National Bureau of Statistics.
data(ChinaLifeEdu) x = ChinaLifeEdu plot(x, type = "n", xlim = range(x[, 1]), ylim = range(x[, 2])) u = par("usr") rect(u[1], u[3], u[2], u[4], col = "antiquewhite", border = "red") library(KernSmooth) est = bkde2D(x, apply(x, 2, dpik)) contour(est$x1, est$x2, est$fhat, nlevels = 15, col = "darkgreen", add = TRUE, vfont = c("sans serif", "plain"))
data(ChinaLifeEdu) x = ChinaLifeEdu plot(x, type = "n", xlim = range(x[, 1]), ylim = range(x[, 2])) u = par("usr") rect(u[1], u[3], u[2], u[4], col = "antiquewhite", border = "red") library(KernSmooth) est = bkde2D(x, apply(x, 2, dpik)) contour(est$x1, est$x2, est$fhat, nlevels = 15, col = "darkgreen", add = TRUE, vfont = c("sans serif", "plain"))
This function can categorize the variable on the xaxis into groups and plot the mean values of y. The purpose is to show the arbitrariness of the discretization of data.
cut_plot(x, y, breaks, ..., pch.cut = 20)
cut_plot(x, y, breaks, ..., pch.cut = 20)
x 
the x variable 
y 
the y variable 
breaks 
the breaks to cut the x variable 
... 
other arguments to be passed to

pch.cut 
the point symbol to denote the mean values of y 
Yihui Xie <https://yihui.org>
x = rnorm(100) y = rnorm(100) cut_plot(x, y, seq(min(x), max(x), length = 5))
x = rnorm(100) y = rnorm(100) cut_plot(x, y, seq(min(x), max(x), length = 5))
Export of US and China from 1999 to 2004 in US dollars
A data frame with 13 observations on the following 3 variables.
amount of export
year from 1999 to 2004
country: US or China
https://www.wto.org/english/res_e/statis_e/statis_e.htm
data(Export.USCN) par(mar = c(4, 4.5, 1, 4.5)) plot(1:13, Export.USCN$Export, xlab = "Year / Country", ylab = "US Dollars ($10^16)", axes = FALSE, type = "h", lwd = 10, col = c(rep(2, 6), NA, rep(4, 6)), lend = 1, panel.first = grid()) xlabel = paste(Export.USCN$Year, "\n", Export.USCN$Country) xlabel[7] = "" xlabel abline(v = 7, lty = 2) axis(1, at = 1:13, labels = xlabel, tick = FALSE, cex.axis = 0.75) axis(2) (ylabel = pretty(Export.USCN$Export * 8.27)) axis(4, at = ylabel/8.27, labels = ylabel) mtext("Chinese RMB", side = 4, line = 2) box()
data(Export.USCN) par(mar = c(4, 4.5, 1, 4.5)) plot(1:13, Export.USCN$Export, xlab = "Year / Country", ylab = "US Dollars ($10^16)", axes = FALSE, type = "h", lwd = 10, col = c(rep(2, 6), NA, rep(4, 6)), lend = 1, panel.first = grid()) xlabel = paste(Export.USCN$Year, "\n", Export.USCN$Country) xlabel[7] = "" xlabel abline(v = 7, lty = 2) axis(1, at = 1:13, labels = xlabel, tick = FALSE, cex.axis = 0.75) axis(2) (ylabel = pretty(Export.USCN$Export * 8.27)) axis(4, at = ylabel/8.27, labels = ylabel) mtext("Chinese RMB", side = 4, line = 2) box()
This data was collected from Google by searching for percentages in some goverment websites.
A data frame with 10000 observations on the following 4 variables.
a numeric vector: the percentages
a numeric vector: the number of webpages corresponding to a certain percentage
a logical vector: rounded to integers?
a logical vector: rounded to the 1st decimal place?
We can specify the domain when searching in Google. For this data, we used ‘site:gov.cn’, e.g. to search for ‘87.53% site:gov.cn’.
Google (date: 2009/12/17)
data(gov.cn.pct) pct.lowess = function(cond) { with(gov.cn.pct, { plot(count ~ percentage, pch = ifelse(cond, 4, 20), col = rgb(0:1, 0, 0, c(0.04, 0.5))[cond + 1], log = "y") lines(lowess(gov.cn.pct[cond, 1:2], f = 1/3), col = 2, lwd = 2) lines(lowess(gov.cn.pct[!cond, 1:2], f = 1/3), col = 1, lwd = 2) }) } par(mar = c(3.5, 3.5, 1, 0.2), mfrow = c(2, 2)) with(gov.cn.pct, { plot(percentage, count, type = "l", panel.first = grid()) plot(percentage, count, type = "l", xlim = c(10, 11), panel.first = grid()) pct.lowess(round0) pct.lowess(round1) }) if (interactive()) { devAskNewPage(ask = TRUE) with(gov.cn.pct, { plot(count ~ percentage, type = "l") grid() devAskNewPage(ask = FALSE) for (i in 0:99) { plot(count ~ percentage, type = "l", xlim = i + c(0, 1), panel.first = grid()) } devAskNewPage(ask = TRUE) plot(count ~ percentage, pch = 20, col = rgb(0:1, 0, 0, c(0.07, 1))[round0 + 1], log = "y") lines(lowess(gov.cn.pct[round0, 1:2], f = 1/3), col = "red", lwd = 2) lines(lowess(gov.cn.pct[!round0, 1:2], f = 1/3), col = "black", lwd = 2) plot(count ~ percentage, pch = 20, col = rgb(0:1, 0, 0, c(0.07, 1))[round1 + 1], log = "y") lines(lowess(gov.cn.pct[round1, 1:2], f = 1/3), col = "red", lwd = 2) lines(lowess(gov.cn.pct[!round1, 1:2], f = 1/3), col = "black", lwd = 2) }) }
data(gov.cn.pct) pct.lowess = function(cond) { with(gov.cn.pct, { plot(count ~ percentage, pch = ifelse(cond, 4, 20), col = rgb(0:1, 0, 0, c(0.04, 0.5))[cond + 1], log = "y") lines(lowess(gov.cn.pct[cond, 1:2], f = 1/3), col = 2, lwd = 2) lines(lowess(gov.cn.pct[!cond, 1:2], f = 1/3), col = 1, lwd = 2) }) } par(mar = c(3.5, 3.5, 1, 0.2), mfrow = c(2, 2)) with(gov.cn.pct, { plot(percentage, count, type = "l", panel.first = grid()) plot(percentage, count, type = "l", xlim = c(10, 11), panel.first = grid()) pct.lowess(round0) pct.lowess(round1) }) if (interactive()) { devAskNewPage(ask = TRUE) with(gov.cn.pct, { plot(count ~ percentage, type = "l") grid() devAskNewPage(ask = FALSE) for (i in 0:99) { plot(count ~ percentage, type = "l", xlim = i + c(0, 1), panel.first = grid()) } devAskNewPage(ask = TRUE) plot(count ~ percentage, pch = 20, col = rgb(0:1, 0, 0, c(0.07, 1))[round0 + 1], log = "y") lines(lowess(gov.cn.pct[round0, 1:2], f = 1/3), col = "red", lwd = 2) lines(lowess(gov.cn.pct[!round0, 1:2], f = 1/3), col = "black", lwd = 2) plot(count ~ percentage, pch = 20, col = rgb(0:1, 0, 0, c(0.07, 1))[round1 + 1], log = "y") lines(lowess(gov.cn.pct[round1, 1:2], f = 1/3), col = "red", lwd = 2) lines(lowess(gov.cn.pct[!round1, 1:2], f = 1/3), col = "black", lwd = 2) }) }
Calculate the coordinates of a heart shape and draw it with a polygon.
heart_curve(n = 101, ...)
heart_curve(n = 101, ...)
n 
the number of points to use when calculating the coordinates of the heart shape 
... 
other arguments to be passed to 
Yihui Xie <https://yihui.org>
heart_curve() heart_curve(col = "red") heart_curve(col = "pink", border = "red")
heart_curve() heart_curve(col = "red") heart_curve(col = "pink", border = "red")
Plot a graph with a preinstalled R script
msg(fig = "3.6", show_code = TRUE, print_plot = TRUE, filter = 0)
msg(fig = "3.6", show_code = TRUE, print_plot = TRUE, filter = 0)
fig 
Character. The figure number or the R script name, which is given in the book. 
show_code 
Logical. TRUE means the codes are shown in the console. 
print_plot 
Logical. TRUE means the graph is printed. 
filter 
Integer. The line numbers indicating which lines in the code are displayed (when positive) or hidden (when negative). 
A graph and the source code
# msg('3.6') msg('ChinaPop')
# msg('3.6') msg('ChinaPop')
The proportions of sand, silt and clay in soil samples are given for 8 contiguous sites. The sites extended over the crest and flank of a low rise in a valley underlain by marl near Albudeite in the province of Murcia, Spain. The sites were small areas of ground surface of uniform shape internally and delimited by relative discontinuities externally. Soil samples were obtained for each site at 11 random points within a 10m by 10m area centred on the midpoint of the site. All samples were taken from the same depth. The data give the sand, silt and clay content of each sample, expressed as a percentage of the total sand, silt and clay content.
http://www.statsci.org/data/general/murcia.html
data(murcia) boxplot(sand ~ site, data = murcia)
data(murcia) boxplot(sand ~ site, data = murcia)
Attributes of some music clips
Cook D, Swayne DF (2007). Interactive and Dynamic Graphics for Data Analysis With R and GGobi. Springer. ISBN 9780387717616.
data(music)
data(music)
For each altitude, the number of plants is recorded.
A data frame with 600 observations on the following 2 variables.
altitude of the area
number of plants
https://cosx.org/2008/11/lowesstoexplorebivariatecorrelationbyyihui
## different span for LOWESS data(PlantCounts) par(las = 1, mar = c(4, 4, 0.1, 0.1), mgp = c(2.2, 0.9, 0)) with(PlantCounts, { plot(altitude, counts, pch = 20, col = rgb(0, 0, 0, 0.5), panel.first = grid()) for (i in seq(0.01, 1, length = 70)) { lines(lowess(altitude, counts, f = i), col = rgb(0, i, 0), lwd = 1.5) } })
## different span for LOWESS data(PlantCounts) par(las = 1, mar = c(4, 4, 0.1, 0.1), mgp = c(2.2, 0.9, 0)) with(PlantCounts, { plot(altitude, counts, pch = 20, col = rgb(0, 0, 0, 0.5), panel.first = grid()) for (i in seq(0.01, 1, length = 70)) { lines(lowess(altitude, counts, f = i), col = rgb(0, i, 0), lwd = 1.5) } })
The time, location and magnitude of all the earth quakes with magnitude being greater than 6 since 1973.
data(quake6) library(ggplot2) qplot(year, month, data = quake6) + stat_sum(aes(size = ..n..)) + scale_size(range = c(1, 10))
data(quake6) library(ggplot2) qplot(year, month, data = quake6) + stat_sum(aes(size = ..n..)) + scale_size(range = c(1, 10))
Given that the variances of two groups are unequal, we compute the difference of Pvalues assuming equal or unequal variances respectively by simulation.
A data frame with 1000 rows and 99 columns.
See the Examples section for the generation of this data.
By simulation.
Welch B (1947). “The generalization of Student's problem when several different population variances are involved.” Biometrika, 34(1/2), 28–35.
data(t.diff) boxplot(t.diff, axes = FALSE, xlab = expression(n[1])) axis(1) axis(2) box() ## reproducing the data if (interactive()) { set.seed(123) t.diff = NULL for (n1 in 2:100) { t.diff = rbind(t.diff, replicate(1000, { x1 = rnorm(n1, mean = 0, sd = runif(1, 0.5, 1)) x2 = rnorm(30, mean = 1, sd = runif(1, 2, 5)) t.test(x1, x2, var.equal = TRUE)$p.value  t.test(x1, x2, var.equal = FALSE)$p.value })) } t.diff = as.data.frame(t(t.diff)) colnames(t.diff) = 2:100 }
data(t.diff) boxplot(t.diff, axes = FALSE, xlab = expression(n[1])) axis(1) axis(2) box() ## reproducing the data if (interactive()) { set.seed(123) t.diff = NULL for (n1 in 2:100) { t.diff = rbind(t.diff, replicate(1000, { x1 = rnorm(n1, mean = 0, sd = runif(1, 0.5, 1)) x2 = rnorm(30, mean = 1, sd = runif(1, 2, 5)) t.test(x1, x2, var.equal = TRUE)$p.value  t.test(x1, x2, var.equal = FALSE)$p.value })) } t.diff = as.data.frame(t(t.diff)) colnames(t.diff) = 2:100 }
For the test of means of two samples, we calculated the Pvalues and recorded the counts of Tukey's rule of thumb.
A data frame with 10000 observations on the following 3 variables.
Pvalues of t test
Pvalues of Wilcoxon test
Tukey's counts
See the reference for details.
Simulation; see the Examples section below.
D. Daryl Basler and Robert B. Smawley. Tukey's Compact versus Classic Tests. The Journal of Experimental Education, Vol. 36, No. 3 (Spring, 1968), pp. 8688
data(tukeyCount) ## does Tukey's rule of thumb agree with t test and Wilcoxon test? with(tukeyCount, { ucount = unique(count) stripchart(pvalue.t ~ count, method = "jitter", jitter = 0.2, pch = 19, cex = 0.7, vertical = TRUE, at = ucount  0.2, col = rgb(1, 0, 0, 0.2), xlim = c(min(count)  1, max(count) + 1), xaxt = "n", xlab = "Tukey Count", ylab = "Pvalues") stripchart(pvalue.w ~ count, method = "jitter", jitter = 0.2, pch = 21, cex = 0.7, vertical = TRUE, at = ucount + 0.2, add = TRUE, col = rgb(0, 0, 1, 0.2), xaxt = "n") axis(1, unique(count)) lines(sort(ucount), tapply(pvalue.t, count, median), type = "o", pch = 19, cex = 1.3, col = "red") lines(sort(ucount), tapply(pvalue.w, count, median), type = "o", pch = 21, cex = 1.3, col = "blue", lty = 2) legend("topright", c("t test", "Wilcoxon test"), col = c("red", "blue"), pch = c(19, 21), lty = 1:2, bty = "n", cex = 0.8) }) if (interactive()) { ## this is how the data was generated set.seed(402) n = 30 tukeyCount = data.frame(t(replicate(10000, { x1 = rweibull(n, runif(1, 0.5, 4)) x2 = rweibull(n, runif(1, 1, 5)) c(t.test(x1, x2)$p.value, wilcox.test(x1, x2)$p.value, with(rle(rep(0:1, each = n)[order(c(x1, x2))]), ifelse(head(values, 1) == tail(values, 1), 0, sum(lengths[c(1, length(lengths))])))) }))) colnames(tukeyCount) = c("pvalue.t", "pvalue.w", "count") }
data(tukeyCount) ## does Tukey's rule of thumb agree with t test and Wilcoxon test? with(tukeyCount, { ucount = unique(count) stripchart(pvalue.t ~ count, method = "jitter", jitter = 0.2, pch = 19, cex = 0.7, vertical = TRUE, at = ucount  0.2, col = rgb(1, 0, 0, 0.2), xlim = c(min(count)  1, max(count) + 1), xaxt = "n", xlab = "Tukey Count", ylab = "Pvalues") stripchart(pvalue.w ~ count, method = "jitter", jitter = 0.2, pch = 21, cex = 0.7, vertical = TRUE, at = ucount + 0.2, add = TRUE, col = rgb(0, 0, 1, 0.2), xaxt = "n") axis(1, unique(count)) lines(sort(ucount), tapply(pvalue.t, count, median), type = "o", pch = 19, cex = 1.3, col = "red") lines(sort(ucount), tapply(pvalue.w, count, median), type = "o", pch = 21, cex = 1.3, col = "blue", lty = 2) legend("topright", c("t test", "Wilcoxon test"), col = c("red", "blue"), pch = c(19, 21), lty = 1:2, bty = "n", cex = 0.8) }) if (interactive()) { ## this is how the data was generated set.seed(402) n = 30 tukeyCount = data.frame(t(replicate(10000, { x1 = rweibull(n, runif(1, 0.5, 4)) x2 = rweibull(n, runif(1, 1, 5)) c(t.test(x1, x2)$p.value, wilcox.test(x1, x2)$p.value, with(rle(rep(0:1, each = n)[order(c(x1, x2))]), ifelse(head(values, 1) == tail(values, 1), 0, sum(lengths[c(1, length(lengths))])))) }))) colnames(tukeyCount) = c("pvalue.t", "pvalue.w", "count") }
The pay per episode for actors as well as other information.
https://flowingdata.com/2011/02/15/visualizethistvstopearners/
data(tvearn) plot(pay ~ rating, data = tvearn) library(ggplot2) qplot(pay, data = tvearn, geom = "histogram", facets = gender ~ ., binwidth = 20000) qplot(rating, pay, data = tvearn, geom = c("jitter", "smooth"), color = type)
data(tvearn) plot(pay ~ rating, data = tvearn) library(ggplot2) qplot(pay, data = tvearn, geom = "histogram", facets = gender ~ ., binwidth = 20000) qplot(rating, pay, data = tvearn, geom = c("jitter", "smooth"), color = type)
This functions generates a color vector from an input vector, which can be of the class numeric or factor.
vec2col(vec, n, name) ## Default S3 method: vec2col(vec, n, name) ## S3 method for class 'factor' vec2col(vec, n, name)
vec2col(vec, n, name) ## Default S3 method: vec2col(vec, n, name) ## S3 method for class 'factor' vec2col(vec, n, name)
vec 
the numeric or factor vector 
n 
the number of colors to be generated from the palette 
name 
the name of the palette 
a vector of colors corresponding to the input vector
Yihui Xie <https://yihui.org>
## convert factor to colors with(iris, plot(Petal.Length, Petal.Width, col = vec2col(Species), pch = 19)) # another palette with(iris, plot(Petal.Length, Petal.Width, col = vec2col(Species, name = "Dark2"), pch = 19)) ## turn numeric values to colors with(iris, plot(Petal.Length, Petal.Width, col = vec2col(Petal.Width), pch = 19))
## convert factor to colors with(iris, plot(Petal.Length, Petal.Width, col = vec2col(Species), pch = 19)) # another palette with(iris, plot(Petal.Length, Petal.Width, col = vec2col(Species, name = "Dark2"), pch = 19)) ## turn numeric values to colors with(iris, plot(Petal.Length, Petal.Width, col = vec2col(Petal.Width), pch = 19))