Distributions
and
Models

Data 304: Visualizing Data and Models

Setup

library(vegabrite)
library(tidyverse)
library(palmerpenguins)
# Presumably loads the Vega example data sets via the altair package.
# NOTE(review): `vega_data` is not referenced by any code visible in these
# slides -- confirm it is still needed (e.g. for the exercises).
vega_data <- altair::import_vega_data()

Visualizing distributions

Histograms

Ingredients:

bin aggregation, bar mark

Code

# Faceted histogram of penguin body mass:
# columns are species, rows are sex, bars count observations per bin.
vl_chart(height = 60, width = 150) |>
  vl_add_data(penguins) |>
  vl_mark_bar() |>
  # x: body mass binned into at most 30 bins; y: number of rows in each bin
  vl_encode_x("body_mass_g:Q", bin = list(maxbins = 30)) |>
  vl_encode_y(aggregate = "count") |>
  # facets
  vl_encode_row("sex:N") |>
  vl_encode_column("species:N")

Improvements?

Improved Histogram

Code

# Improved histogram: rows with missing sex are dropped, axis and facet titles
# are removed, and one shared "body mass (g)" label is stacked underneath as a
# text-only chart.
vl_chart(height = 60, width = 150) |>
  vl_add_data(filter(penguins, !is.na(sex))) |>
  vl_mark_bar() |>
  # x: binned body mass with explicit tick values at 2000, 3000, ..., 6000
  vl_encode_x("body_mass_g:Q", bin = list(maxbins = 30)) |>
  vl_axis_x(title = NA, values = seq(2000, 6000, by = 1000)) |>
  vl_scale_x(nice = TRUE) |>
  # y: bin counts, no title
  vl_encode_y(aggregate = "count") |>
  vl_axis_y(title = NA) |>
  # facets, titles suppressed
  vl_encode_row("sex:N", title = NA) |>
  vl_encode_column("species:N", title = NA) |>
  vl_vconcat(
    # one-row dummy data set used only to place a single text label
    vl_chart(width = 500, height = 15) |>
      vl_add_data(tibble(x = 1, y = 1)) |>
      vl_mark_text(size = 14) |>
      vl_encode_text(value = "body mass (g)") |>
      vl_encode_x("x:N", axis = FALSE) |>
      vl_encode_y("y:N", axis = FALSE) |>
      vl_config_view(stroke = "transparent")
  )

Frequency polygons

Ingredients:

bin aggregation (just like histograms), line mark

Code

# Frequency polygons: the same binned counts as a histogram, drawn as lines.
# One line per species (color), one row of panels per sex.
vl_chart(height = 60, width = 400) |>
  vl_add_data(penguins) |>
  vl_encode_x("body_mass_g:Q", bin = list(maxbins = 30)) |>
  vl_encode_y(aggregate = "count") |>
  vl_axis_y(title = NA) |>
  vl_encode_color("species:N") |>
  vl_encode_row("sex:N") |>
  vl_mark_line(opacity = 0.5)

Density plots

Usual method: kernel density estimation (kde)

Code

# Small reproducible sample for the KDE demo: 10 gamma draws rounded to 1 digit.
set.seed(123)
S <- tibble(
  x = round(rgamma(n = 10, shape = 2, rate = 0.1), digits = 1)
)

# Build a layered chart illustrating kernel density estimation on the sample `S`.
#
# Layers (bottom to top):
#   * kernels: a density curve for each distinct value of x (groupby = "x")
#   * kde:     the density curve for the whole sample, in red
#   * ticks:   the observed x values, drawn at density 0
#
# Args:
#   bw: bandwidth passed to both density transforms; also shown in the title.
#
# Returns the layered chart `kernels + kde + ticks`.
kde_demo <- function(bw = 6) {
  # Shared settings: data, title, and a fixed y scale so panels with different
  # bandwidths can be compared directly.
  base <-
    vl_chart(width = 300, height = 100) |>
    vl_add_properties(title = paste0("bandwidth: ", bw)) |>
    vl_add_data(S) |>
    vl_encode_y("density:Q") |>
    vl_scale_y(domain = c(0, .5))

  # Overall density estimate.
  # NOTE(review): counts = TRUE is presumably used so that the per-point
  # kernels below add up to this curve -- confirm.
  kde <- base |>
    vl_density(
      "x",
      as = list("value", "density"),
      bandwidth = bw,
      counts = TRUE,
      extent = c(0, 60)
    ) |>
    # Opacity must lie in [0, 1]; use a fully opaque line (was 2, out of range).
    vl_mark_line(opacity = 1) |>
    vl_encode_color(value = "red") |>
    vl_encode_x("value:Q", title = NA)

  # One kernel per distinct x value; `detail` keeps the curves as separate lines.
  kernels <- base |>
    vl_density(
      "x",
      as = list("value", "density"),
      bandwidth = bw,
      groupby = list("x"),
      extent = c(-3, 60),
      counts = TRUE
    ) |>
    vl_mark_line(opacity = 0.5) |>
    vl_encode_detail("x:N") |>
    vl_encode_x("value:Q", title = NA)

  # Rug of the raw observations along the baseline.
  ticks <-
    base |>
    vl_mark_tick() |>
    vl_encode(x = "x") |>
    vl_encode_y(datum = 0)

  kernels + kde + ticks
}

# 2 x 2 grid of KDE demos with increasing bandwidth:
# `|` places charts side by side, `&` stacks the two rows.
(kde_demo(bw = 2) | kde_demo(bw = 4)) &
  (kde_demo(bw = 6) | kde_demo(bw = 9))

Density plots

Ingredients:

Use density transform to create x/y values for density curve
Use line, area, etc. to display
bandwidth determines amount of smoothing
groupby to compute separate densities for each group

Code

# Density curves of body mass, computed separately for each species/sex group.
# The density transform is used with its default output fields, which the
# encodings below read as `value` (x) and `density` (y).
vl_chart(height = 60, width = 400) |>
  vl_add_data(penguins) |>
  vl_density("body_mass_g", groupby = list("species", "sex")) |>
  vl_encode_x("value:Q") |>
  vl_encode_y("density:Q") |>
  vl_encode_color("species:N") |>
  vl_encode_row("sex:N") |>
  vl_mark_line(opacity = 0.5)

Boxplots

Code

# Horizontal boxplots of body mass, one per species.
# zero = FALSE lets the x scale start near the data instead of at 0.
vl_chart(width = 500) |>
  vl_add_data(penguins) |>
  vl_encode_x("body_mass_g:Q", scale = list(zero = FALSE)) |>
  vl_encode_y("species:N") |>
  vl_mark_boxplot()

More Boxplots

Code

# Boxplots of body mass by species, split by sex within each species
# (color + yOffset), with outlier points and the median mark turned off and
# the extent set to "min-max".
vl_chart(width = 500) |>
  vl_add_data(penguins) |>
  vl_encode_y("species:N") |>
  vl_encode_yOffset("sex:N") |>
  vl_encode_x("body_mass_g:Q", scale = list(zero = FALSE)) |>
  vl_encode_color("sex:N") |>
  vl_mark_boxplot(extent = "min-max", median = FALSE, outliers = FALSE)

Visualizing Models

Data 304: Visualizing Data and Models

We have focused mainly on visualizing data so far.

easier
foundational to visualizing models

(Group) means are “models”

Examples: Two-sample t procedures, ANOVA

Method: aggregate or precompute

Warning

Beware the tendency to summarize everything with means (and to use bars to display them). This is a common approach to visualization, but it hides variation, which is the other side of the coin.

Data value = model mean + residual

Regression models

Vega-Lite can handle some basic regression models with the regression transform.

Code

# Scatter plot of flipper length (y) against body mass (x) with a fitted
# regression line layered on top.

# Shared data and positional encodings for both layers.
base <-
  vl_chart() |>
  vl_add_data(penguins) |>
  vl_encode_x("body_mass_g:Q", scale = list(zero = FALSE)) |>
  vl_encode_y("flipper_length_mm:Q", scale = list(zero = FALSE))

points <- base |> vl_mark_circle(opacity = 0.3, size = 15)

# The regression transform predicts its first field (the response) from the
# `on` field (the predictor). Body mass is on the x axis and flipper length on
# the y axis, so flipper length is the response and body mass the predictor;
# naming them the other way round would draw the regression of x on y.
line <- base |>
  vl_regression("flipper_length_mm", on = "body_mass_g") |>
  vl_mark_line()

(points + line) |>
  vl_add_properties(width = 500)

LOESS

LOESS = LOcally Estimated Scatterplot Smoothing

Big ideas:

fit many (regression) models using “nearby” points
weight nearer points more than farther points

Code

# Shared data and positional encodings for the LOESS examples:
# body mass on x, flipper length on y, neither axis forced to include 0.
base <- vl_chart() |>
  vl_add_data(penguins) |>
  vl_encode_y("flipper_length_mm:Q", scale = list(zero = FALSE)) |>
  vl_encode_x("body_mass_g:Q", scale = list(zero = FALSE))

# Raw observations as small, semi-transparent circles.
points <- vl_mark_circle(base, size = 15, opacity = 0.3)

# Build a LOESS smoother layer on top of `base`.
#
# Args:
#   bw:      bandwidth passed to the loess transform; also shown in the title.
#   color:   line color.
#   opacity: line opacity.
#
# Returns a line-mark chart titled "bandwidth = <bw>".
line <- function(bw = 0.2, color = "steelblue", opacity = 0.7) {
  base |>
    # The loess transform smooths its first field (the response) as a function
    # of `on` (the predictor). `base` puts body mass on x and flipper length on
    # y, so flipper length is the field to smooth and body mass the predictor.
    vl_loess("flipper_length_mm", on = "body_mass_g", bandwidth = bw) |>
    vl_mark_line(color = color, opacity = opacity) |>
    vl_add_properties(title = paste0("bandwidth = ", bw))
}

# Compare four LOESS bandwidths, each layered over the same scatter plot.
vl_concat(
  points + line(bw = 0.1, color = "red"),
  points + line(bw = 0.2, color = "forestgreen"),
  points + line(bw = 0.5, color = "navy"),
  points + line(bw = 0.8, color = "black")
)

Principles for visualizing models

Consider showing both the data and the model.
- Can help us assess fit.
Include a representation of model uncertainty.
- Confidence bands, error bars, “spaghetti plots”, etc.
Perform model diagnostics by visualizing data related to the model.
- Examples: residuals plots, auto-correlation plots
- May need to first create the appropriate data set, then do the visualization
  - Model predictions/fits: data frame of x/etc/y tuples
Surprise and Scale

Visualization can surprise you, but it doesn’t scale well. Modeling scales well, but it can’t surprise you. (Hadley Wickham, paraphrased)
- We need both, and to go back and forth between the two.

broom::tidy()

library(broom)

# Linear model for flipper length with body mass, sex and species as
# additive predictors.
model <- lm(
  flipper_length_mm ~ body_mass_g + sex + species,
  data = penguins
)

# Coefficient table as a tibble (one row per model term).
tidy(model)

# A tibble: 5 × 5
  term              estimate std.error statistic   p.value
  <chr>                <dbl>     <dbl>     <dbl>     <dbl>
1 (Intercept)      165.       3.18         51.7  1.01e-159
2 body_mass_g        0.00655  0.000931      7.04 1.15e- 11
3 sexmale            2.48     0.854         2.90 3.97e-  3
4 speciesChinstrap   5.54     0.785         7.06 9.92e- 12
5 speciesGentoo     18.0      1.44         12.5  1.46e- 29

broom::augment()

# Preview the first two rows of the augmented data as a gt table.
augment(model) |>
  head(n = 2) |>
  gt::gt()

.rownames	flipper_length_mm	body_mass_g	sex	species	.fitted	.resid	.hat	.sigma	.cooksd	.std.resid
1	181	3750	male	Adelie	191.6287	-10.628721	0.01240207	5.320866	0.010054358	-2.0008066
2	186	3800	female	Adelie	189.4785	-3.478497	0.01539750	5.350125	0.001345147	-0.6558059

Code

# Data + model plot built from augment(model): observed points, the fitted
# values as a line, and a band of .fitted +/- .sigma, faceted by sex and species.

# Shared positional encodings; the data is attached to the layered chart below.
base <- vl_chart() |>
  vl_encode_x("body_mass_g:Q", scale = list(zero = FALSE), title = NA) |>
  vl_encode_y("flipper_length_mm:Q", scale = list(zero = FALSE))

# Observed values.
points <- vl_mark_point(base)

# Fitted values: replaces the y encoding inherited from `base`.
line <- base |>
  vl_mark_line() |>
  vl_encode_y(".fitted:Q", title = NA)

# Band between `lower` and `upper` (columns added to the data below).
area <- base |>
  vl_mark_area(opacity = 0.5) |>
  vl_encode_y("lower:Q") |>
  vl_encode_y2("upper:Q")

(points + area + line) |>
  vl_facet_row("sex:N", title = NA) |>
  vl_facet_column("species:N", title = NA) |>
  vl_add_data(
    augment(model) |>
      mutate(
        upper = .fitted + .sigma,
        lower = .fitted - .sigma
      )
  ) |>
  vl_add_properties(height = 40, width = 150)

Your turn

Exercise 1 The anscombe data set (in Vega Data Sets, also in R) contains 4 x-y pairs.

# List the column names of the anscombe data set (x1..x4 and y1..y4).
names(anscombe)

[1] "x1" "x2" "x3" "x4" "y1" "y2" "y3" "y4"

Make a graphic showing scatter plots with regression lines for all four pairs. What is the point of this (artificial) data set?

Exercise 2 Make a list of visualizations “of models” that you have seen (in other classes, in papers, etc.). Describe them using the grammar of graphics and the data required to make them. (You may find it handy to draw a sketch for each.)

Your turn

Exercise 3 Open one (or more) of the graphics in these slides that uses a transform and inspect the data in the data viewer to see what the data look like post transformation.

Exercise 4 Return to the penguins data and create a scatter plot with multiple regression or loess lines. Which groups of penguins should you use?

Exercise 5 Describe a qq-plot in terms of the grammar of graphics and the data you need to make one. (You might start by reminding yourself – or asking someone – what a qq-plot is.)

Create a qq-plot (you may choose the data; perhaps use one of the features of penguins) using the quantile transform.

Distributions and Models

Setup

Visualizing distributions

Histograms

Improved Histogram

Frequency polygons

Density plots

Density plots

Boxplots

More Boxplots

Visualizing Models

(Group) means are “models”

Regression models

LOESS

Principles for visualizing models

broom::tidy()

broom::augment()

Your turn

Your turn

Distributions
and
Models