Odds and Ends

Data 304: Visualizing Data and Models

Setup

library(vegabrite)
library(tidyverse)
vega_data <- altair::import_vega_data()

Translating JSON \(\leftrightarrow\) R

  • {vegabrite} and {altair} convert R code into a Vega-Lite JSON specification for a graphic.

  • If you read Vega-Lite documentation, you may need to do some translating.

Translating JSON \(\leftrightarrow\) R

For the most part,

  • JSON array/object \(\leftarrow\) R list.

Sometimes we can use

  • JSON array \(\leftarrow\) R vector.

Example

{"timeUnit": "year", "field": "date", "range": [2006, 2008] }}

becomes

list(timunit = "year", field = "date", range = c(2006, 2008)) 

Note

R vectors require that each element have the same, basic data type; R lists can hold any R objects, even if they have different data types.

See for yourself

You can always test to make sure the your {vebabrite} or {altair} code is correctly creating the JSON you expect by converting to JSON and inspecting.

Code
vl_chart() |>
  vl_mark_point(color = "red") |>
  vl_encode_x("some_variable:O", axis = list(title = "X variable")) |>
  format()
{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "mark": {
    "color": "red",
    "type": "point"
  },
  "encoding": {
    "x": {
      "field": "some_variable",
      "axis": {
        "title": "X variable"
      },
      "type": "ordinal"
    }
  }
} 

Debugging tip

You can copy and paste the JSON into the Vega Editor, which often provides better error messages.

field/value/datum

We have mostly used encoding to map variables to channels.

  • Example: region determines color: field = "region:N"

We can also use value to set the scaled value.

  • Example: set the color to red: value = "red"

Or we can provide a data value (pre-scale):

  • Example: Make it the same color as the Asia data: datum = "Asia"
  • This can be useful for adding annotation text to your graphic.

Using expressions

Expressions can be used anywhere the Vega-Lite documentation lists ExprRef as an option.

  • Note: There are a few places where you might expect this to work, but it does not! When in doubt, check the documentation.

Expressions can be used to

  • access parameters
  • make (simple) calculations (in the environment where our data and parameters live)

Violin plots

Exercise 1 What are the key elements to making this half-violin plot?

Code
vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_y("value:Q", title = "IMDB Rating", axis = list(grid = FALSE)) |>
  vl_encode_x("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_column("MPAA_Rating:N", spacing = 0) |>
  vl_mark_area(orient = "horizontal") |>
  vl_add_properties(width = 50, height = 300) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Full violin plot

We could compute a negative density using vl_calculate() to get the other side, but there is an easier way: vl_stack_x("center")

Code
vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_y("value:Q", title = "IMDB Rating", axis = list(grid = FALSE)) |>
  vl_encode_x("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_column("MPAA_Rating:N", spacing = 0) |>
  vl_mark_area(orient = "horizontal") |>
  vl_stack_x("center") |>
  vl_add_properties(width = 50, height = 300) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Sina plot

Here’s another variation on the theme.

Code
vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_calculate("random() * datum.density", as = "jdensity") |>
  vl_encode_y("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_x("jdensity:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_column("MPAA_Rating:N", spacing = 0) |>
  vl_mark_point() |>
  # vl_stack_x("center") |>
  vl_add_properties(width = 50, height = 300) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Sina plot

Here’s another variation on the theme.

Code
base <- 
  vl_chart() |>
  # vl_filter("datum.MPAA_Rating == 'G'") |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_calculate("random() * (2 * random() - 1) * datum.density + 0.12", 
               as = "jdensity") |>
  vl_encode_x("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE)

points <- 
  base |> vl_mark_point(size = 10) |>
  vl_encode_y("jdensity:Q", title = NA, axis = FALSE) 
  

violins <-
  base |> vl_mark_area(fillOpacity = 0.4, strokeOpacity = 0.8) |>
  vl_encode_y("density:Q", title = NA, axis = FALSE) |>
  vl_stack_y("center")

  
(points + violins) |>
  vl_add_properties(width = 500, height = 40) |>
  vl_facet_row("MPAA_Rating:N") |>
  vl_config_view(stroke = "transparent") |>
  vl_config_facet(spacing = 0) |>
  vl_add_data_url(vega_data$movies$url) 

Ridgeline plots (almost)

Code
vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_x("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_y("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_row("MPAA_Rating:N", spacing = -25) |>  # too bad it doesn't work!
  vl_mark_area() |>
  vl_add_properties(width = 500, height = 50) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Nominal vs Ordinal

Here is the same graphic using an ordinal type for color.

Code
vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_x("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_y("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:O", legend = FALSE) |>
  vl_encode_row("MPAA_Rating:O", spacing = -25) |>  # too bad it doesn't work!
  vl_mark_area() |>
  vl_add_properties(width = 500, height = 50) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Tree maps

There is a package in R called {treemap} that makes (ugly) tree maps.

Code
library(treemap)
data(GNI2014)
treemap(
  GNI2014, draw = TRUE,
  index=c("continent", "iso3"),
  vSize="population",
  vColor="GNI",
  type="value",
  format.legend = list(scientific = FALSE, big.mark = " "))

Tree maps

But we can grab just the data and make our own in {vegabrite}

Code
tm <-
  treemap(GNI2014, draw = FALSE,
       index=c("continent", "iso3"),
       vSize="population",
       vColor="GNI",
       type="value",
       format.legend = list(scientific = FALSE, big.mark = " "))

Tree maps

Code
base <-
  vl_chart() |>
  vl_calculate("datum.x0 + datum.w", as = "x1") |>
  vl_calculate("datum.y0 + datum.h", as = "y1") |>
  # vl_encode_fillOpacity("vColor:Q") |>
  vl_encode_fill("continent:N") |>
  vl_encode_x("x0:Q", axis = FALSE) |>
  vl_encode_y("y0:Q", axis = FALSE) |>
  vl_encode_x2("x1:Q") |>
  vl_encode_y2("y1:Q") |>
  vl_config_view(strokeOpacity = 0)

continents <- 
  base |>
  vl_add_data(tm$tm |> filter(level == 1)) |>
  vl_mark_rect(stroke = "navy", opacity = 0.5, strokeWidth = 3) |>
  vl_add_properties(width = 500, height = 500)

countries <- 
  base |>
  vl_add_data(tm$tm |> filter(level == 2)) |>
  vl_mark_rect(stroke = "navy", fillOpacity = 0.1, strokeWidth = 1) |>
  vl_encode_strokeWidth(value = 1) |>
  vl_add_properties(width = 500, height = 500)

labels <- 
  base |>
  vl_add_data(tm$tm |> filter(level == 2)) |>
  vl_mark_text(align = "left", baseline = "bottom", 
               dx = 2, dy = -2, size = 5, 
               fillOpacity = 1) |>
  vl_encode_fill(value = "black") |>
  vl_encode_size("vSize", legend = FALSE, scale = list(type = "sqrt")) |>
  vl_encode_text("iso3:N") |>
  vl_add_properties(width = 500, height = 500)


(continents + countries + labels)

Some data wrangling

Goals:

  • Create a date object from available pieces
  • Switch from wide to long format
  • Filter to remove some of the data
Code
library(tidyverse)
mailing <-
  jsonlite::fromJSON("../data/swd-lets-practice-ex-2-13.json") |>
  as_tibble()
mailing |> head(2) |> gt::gt()
Date Completion Rate Response Rate
Q1-2017 0.91 0.023
Q2-2017 0.93 0.018
Code
mailing_wide <-
  mailing |>
  separate(Date, into = c("quarter", "year"), sep = "-") |>
  mutate(
    quarter = parse_number(quarter),
    year = parse_number(year),
    date_str = paste0(year, "-", 3 * quarter - 2, "-01"),
    date = ymd(date_str),
    `Net Completion Rate` = `Response Rate` * `Completion Rate`
  )
mailing_long <-
  mailing_wide |>
  pivot_longer(matches("Rate"), names_to = "rate type", values_to = "rate") |>
  filter(`rate type` != "Completion Rate")

mailing_long |>
  # vl_filter("datum[rate type] != 'Completion Rate'") |>
  vl_chart() |>
  vl_mark_line() |>
  vl_encode_x("date:T") |>
  vl_encode_y("rate:Q") |>
  vl_encode_color("rate type:N") |>
  vl_add_properties(width = 600, height = 200)

Pie Charts

Some key features:

  • Ordinal data should be ordered correctly (and use the ordinal type);
  • Color scheme should reflect the order;
  • Non-reponse is an odd-ball category (things are easier if you filter it out, but you might like to include it).
  • It is probably easier to compare percents than counts if the number of surveys is different from one year to the next.
Code
pie_data <- 
  read.csv("https://calvin-data304.netlify.app/data/likert-survey.csv") |>
  group_by(year) |>
  mutate(percent = count / sum(count))

pie_layer <- vl_chart()|>
  vl_mark_arc(outerRadius = 80, opacity = 0.7, stroke = "black") |>
  vl_encode_color("response:O", sort = "number") |>
  vl_scale_color(
    range = c("steelblue", "lightsteelblue", "lightyellow", 
              "pink", "red", "lightgray"))

text_layer <- vl_chart()|>
  vl_mark_text(radius = 55, color = "black", strokeOpacity = 0) |>
  vl_encode_size(
    "percent:Q", legend = FALSE, 
    scale = list(range = c(5,20), type = "sqrt")) |>
  vl_encode_stroke("response:O", sort = "number", legend = FALSE) |>
  vl_encode_text("percent:Q", format = ".0%") 

vl_layer(pie_layer, text_layer) |>
  vl_encode_order("number:Q", sort = "descending") |>
  vl_encode_theta("percent:Q", stack = TRUE)|>
  vl_facet_column("year:N", title = NA, header = list(labelFontSize = 20))|>
  vl_add_data(pie_data)

Missing transformation in {vegabrite}

{vegabrite} seems not to include the extent transformation. We can add it using the following definition, which mimics how the other transform functions are built.

vl_extent <- 
  function(spec, extent = NULL, param = NULL ) {
     obj <- vegabrite:::.make_object(
       as.list(environment(), all.names = TRUE), NULL, NULL)
    vegabrite:::.add_transform(
      spec, obj, 
      "#/definitions/ExtentTransform", .trans = "extent")
  }

What does extent do?

  • Extent computes the minimum and maximum values of a variable.

  • Useful for placing annotation on a graphic.

Code
set.seed(123)
example_data <- tibble(letter = LETTERS[1:5], number = sample(100, 5))

base <-
  vl_chart(width = 300) |>
  vl_add_data(example_data)

bars <-
  base |>
  vl_mark_bar() |>
  vl_encode(x = "number:Q") |>
  vl_encode(y = "letter:O")

rule <-
  base |>
  vl_extent(extent = "number", param = "number_extent") |>
  vl_mark_rule(color = "red", size = 4) |>
  vl_encode_x(datum = list(expr = "number_extent[0]"), type = "quantitative")

bars + rule

Potential gotcha with extent

You must specify type = "quantitative" when using

  vl_encode_x(datum = list(expr = "number_extent[0]"), type = "quantitative")
  • try deleting that in the Vega editor or in the vegabrite code and see what happens.