Odds and Ends

Data 304: Visualizing Data and Models

Setup

library(vegabrite)
library(tidyverse)
vega_data <- altair::import_vega_data()

Translating JSON \(\leftrightarrow\) R

{vegabrite} and {altair} convert R code into a Vega-Lite JSON specification for a graphic.
If you read Vega-Lite documentation, you may need to do some translating.

Translating JSON \(\leftrightarrow\) R

For the most part,

JSON array/object \(\leftarrow\) R list.

Sometimes we can use

JSON array \(\leftarrow\) R vector.

Example

{"timeUnit": "year", "field": "date", "range": [2006, 2008] }}

becomes

list(timunit = "year", field = "date", range = c(2006, 2008))

Note

R vectors require that each element have the same, basic data type; R lists can hold any R objects, even if they have different data types.

See for yourself

You can always test to make sure the your {vebabrite} or {altair} code is correctly creating the JSON you expect by converting to JSON and inspecting.

Code

vl_chart() |>
  vl_mark_point(color = "red") |>
  vl_encode_x("some_variable:O", axis = list(title = "X variable")) |>
  format()

{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "mark": {
    "color": "red",
    "type": "point"
  },
  "encoding": {
    "x": {
      "field": "some_variable",
      "axis": {
        "title": "X variable"
      },
      "type": "ordinal"
    }
  }
}

Debugging tip

You can copy and paste the JSON into the Vega Editor, which often provides better error messages.

field/value/datum

We have mostly used encoding to map variables to channels.

Example: region determines color: field = "region:N"

We can also use value to set the scaled value.

Example: set the color to red: value = "red"

Or we can provide a data value (pre-scale):

Example: Make it the same color as the Asia data: datum = "Asia"
This can be useful for adding annotation text to your graphic.

Using expressions

Expressions can be used anywhere the Vega-Lite documentation lists ExprRef as an option.

Note: There are a few places where you might expect this to work, but it does not! When in doubt, check the documentation.

Expressions can be used to

access parameters
make (simple) calculations (in the environment where our data and parameters live)

Violin plots

Exercise 1 What are the key elements to making this half-violin plot?

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_y("value:Q", title = "IMDB Rating", axis = list(grid = FALSE)) |>
  vl_encode_x("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_column("MPAA_Rating:N", spacing = 0) |>
  vl_mark_area(orient = "horizontal") |>
  vl_add_properties(width = 50, height = 300) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Full violin plot

We could compute a negative density using vl_calculate() to get the other side, but there is an easier way: vl_stack_x("center")

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_y("value:Q", title = "IMDB Rating", axis = list(grid = FALSE)) |>
  vl_encode_x("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_column("MPAA_Rating:N", spacing = 0) |>
  vl_mark_area(orient = "horizontal") |>
  vl_stack_x("center") |>
  vl_add_properties(width = 50, height = 300) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Sina plot

Here’s another variation on the theme.

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_calculate("random() * datum.density", as = "jdensity") |>
  vl_encode_y("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_x("jdensity:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_column("MPAA_Rating:N", spacing = 0) |>
  vl_mark_point() |>
  # vl_stack_x("center") |>
  vl_add_properties(width = 50, height = 300) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Sina plot

Here’s another variation on the theme.

Code

base <- 
  vl_chart() |>
  # vl_filter("datum.MPAA_Rating == 'G'") |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_calculate("random() * (2 * random() - 1) * datum.density + 0.12", 
               as = "jdensity") |>
  vl_encode_x("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE)

points <- 
  base |> vl_mark_point(size = 10) |>
  vl_encode_y("jdensity:Q", title = NA, axis = FALSE) 
  

violins <-
  base |> vl_mark_area(fillOpacity = 0.4, strokeOpacity = 0.8) |>
  vl_encode_y("density:Q", title = NA, axis = FALSE) |>
  vl_stack_y("center")

  
(points + violins) |>
  vl_add_properties(width = 500, height = 40) |>
  vl_facet_row("MPAA_Rating:N") |>
  vl_config_view(stroke = "transparent") |>
  vl_config_facet(spacing = 0) |>
  vl_add_data_url(vega_data$movies$url)

Ridgeline plots (almost)

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_x("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_y("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_row("MPAA_Rating:N", spacing = -25) |>  # too bad it doesn't work!
  vl_mark_area() |>
  vl_add_properties(width = 500, height = 50) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Nominal vs Ordinal

Here is the same graphic using an ordinal type for color.

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_x("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_y("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:O", legend = FALSE) |>
  vl_encode_row("MPAA_Rating:O", spacing = -25) |>  # too bad it doesn't work!
  vl_mark_area() |>
  vl_add_properties(width = 500, height = 50) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Tree maps

There is a package in R called {treemap} that makes (ugly) tree maps.

Code

library(treemap)
data(GNI2014)
treemap(
  GNI2014, draw = TRUE,
  index=c("continent", "iso3"),
  vSize="population",
  vColor="GNI",
  type="value",
  format.legend = list(scientific = FALSE, big.mark = " "))

Tree maps

But we can grab just the data and make our own in {vegabrite}

Code

tm <-
  treemap(GNI2014, draw = FALSE,
       index=c("continent", "iso3"),
       vSize="population",
       vColor="GNI",
       type="value",
       format.legend = list(scientific = FALSE, big.mark = " "))

Tree maps

Code

base <-
  vl_chart() |>
  vl_calculate("datum.x0 + datum.w", as = "x1") |>
  vl_calculate("datum.y0 + datum.h", as = "y1") |>
  # vl_encode_fillOpacity("vColor:Q") |>
  vl_encode_fill("continent:N") |>
  vl_encode_x("x0:Q", axis = FALSE) |>
  vl_encode_y("y0:Q", axis = FALSE) |>
  vl_encode_x2("x1:Q") |>
  vl_encode_y2("y1:Q") |>
  vl_config_view(strokeOpacity = 0)

continents <- 
  base |>
  vl_add_data(tm$tm |> filter(level == 1)) |>
  vl_mark_rect(stroke = "navy", opacity = 0.5, strokeWidth = 3) |>
  vl_add_properties(width = 500, height = 500)

countries <- 
  base |>
  vl_add_data(tm$tm |> filter(level == 2)) |>
  vl_mark_rect(stroke = "navy", fillOpacity = 0.1, strokeWidth = 1) |>
  vl_encode_strokeWidth(value = 1) |>
  vl_add_properties(width = 500, height = 500)

labels <- 
  base |>
  vl_add_data(tm$tm |> filter(level == 2)) |>
  vl_mark_text(align = "left", baseline = "bottom", 
               dx = 2, dy = -2, size = 5, 
               fillOpacity = 1) |>
  vl_encode_fill(value = "black") |>
  vl_encode_size("vSize", legend = FALSE, scale = list(type = "sqrt")) |>
  vl_encode_text("iso3:N") |>
  vl_add_properties(width = 500, height = 500)


(continents + countries + labels)

Some data wrangling

Goals:

Create a date object from available pieces
Switch from wide to long format
Filter to remove some of the data

Code

library(tidyverse)
mailing <-
  jsonlite::fromJSON("../data/swd-lets-practice-ex-2-13.json") |>
  as_tibble()
mailing |> head(2) |> gt::gt()

Date	Completion Rate	Response Rate
Q1-2017	0.91	0.023
Q2-2017	0.93	0.018

Code

mailing_wide <-
  mailing |>
  separate(Date, into = c("quarter", "year"), sep = "-") |>
  mutate(
    quarter = parse_number(quarter),
    year = parse_number(year),
    date_str = paste0(year, "-", 3 * quarter - 2, "-01"),
    date = ymd(date_str),
    `Net Completion Rate` = `Response Rate` * `Completion Rate`
  )
mailing_long <-
  mailing_wide |>
  pivot_longer(matches("Rate"), names_to = "rate type", values_to = "rate") |>
  filter(`rate type` != "Completion Rate")

mailing_long |>
  # vl_filter("datum[rate type] != 'Completion Rate'") |>
  vl_chart() |>
  vl_mark_line() |>
  vl_encode_x("date:T") |>
  vl_encode_y("rate:Q") |>
  vl_encode_color("rate type:N") |>
  vl_add_properties(width = 600, height = 200)

Pie Charts

Some key features:

Ordinal data should be ordered correctly (and use the ordinal type);
Color scheme should reflect the order;
Non-reponse is an odd-ball category (things are easier if you filter it out, but you might like to include it).
It is probably easier to compare percents than counts if the number of surveys is different from one year to the next.

Code

pie_data <- 
  read.csv("https://calvin-data304.netlify.app/data/likert-survey.csv") |>
  group_by(year) |>
  mutate(percent = count / sum(count))

pie_layer <- vl_chart()|>
  vl_mark_arc(outerRadius = 80, opacity = 0.7, stroke = "black") |>
  vl_encode_color("response:O", sort = "number") |>
  vl_scale_color(
    range = c("steelblue", "lightsteelblue", "lightyellow", 
              "pink", "red", "lightgray"))

text_layer <- vl_chart()|>
  vl_mark_text(radius = 55, color = "black", strokeOpacity = 0) |>
  vl_encode_size(
    "percent:Q", legend = FALSE, 
    scale = list(range = c(5,20), type = "sqrt")) |>
  vl_encode_stroke("response:O", sort = "number", legend = FALSE) |>
  vl_encode_text("percent:Q", format = ".0%") 

vl_layer(pie_layer, text_layer) |>
  vl_encode_order("number:Q", sort = "descending") |>
  vl_encode_theta("percent:Q", stack = TRUE)|>
  vl_facet_column("year:N", title = NA, header = list(labelFontSize = 20))|>
  vl_add_data(pie_data)

Missing transformation in {vegabrite}

{vegabrite} seems not to include the extent transformation. We can add it using the following definition, which mimics how the other transform functions are built.

vl_extent <- 
  function(spec, extent = NULL, param = NULL ) {
     obj <- vegabrite:::.make_object(
       as.list(environment(), all.names = TRUE), NULL, NULL)
    vegabrite:::.add_transform(
      spec, obj, 
      "#/definitions/ExtentTransform", .trans = "extent")
  }

What does extent do?

Extent computes the minimum and maximum values of a variable.
Useful for placing annotation on a graphic.

Code

set.seed(123)
example_data <- tibble(letter = LETTERS[1:5], number = sample(100, 5))

base <-
  vl_chart(width = 300) |>
  vl_add_data(example_data)

bars <-
  base |>
  vl_mark_bar() |>
  vl_encode(x = "number:Q") |>
  vl_encode(y = "letter:O")

rule <-
  base |>
  vl_extent(extent = "number", param = "number_extent") |>
  vl_mark_rule(color = "red", size = 4) |>
  vl_encode_x(datum = list(expr = "number_extent[0]"), type = "quantitative")

bars + rule

Potential gotcha with extent

You must specify type = "quantitative" when using

  vl_encode_x(datum = list(expr = "number_extent[0]"), type = "quantitative")

try deleting that in the Vega editor or in the vegabrite code and see what happens.