Odds and Ends

Data 304: Visualizing Data and Models

Setup

library(vegabrite)
library(tidyverse)
vega_data <- altair::import_vega_data()

Translating JSON \(\leftrightarrow\) R

{vegabrite} and {altair} convert R code into a Vega-Lite JSON specification for a graphic.
If you read Vega-Lite documentation, you may need to do some translating.

Translating JSON \(\leftrightarrow\) R

For the most part,

JSON array/object \(\leftarrow\) R list.

Sometimes we can use

JSON array \(\leftarrow\) R vector.

Example

{"timeUnit": "year", "field": "date", "range": [2006, 2008] }}

becomes

list(timunit = "year", field = "date", range = c(2006, 2008))

Note

R vectors require that each element have the same, basic data type; R lists can hold any R objects, even if they have different data types.

See for yourself

You can always test to make sure the your {vebabrite} or {altair} code is correctly creating the JSON you expect by converting to JSON and inspecting.

Code

vl_chart() |>
  vl_mark_point(color = "red") |>
  vl_encode_x("some_variable:O", axis = list(title = "X variable")) |>
  format()

{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "mark": {
    "color": "red",
    "type": "point"
  },
  "encoding": {
    "x": {
      "field": "some_variable",
      "axis": {
        "title": "X variable"
      },
      "type": "ordinal"
    }
  }
}

Debugging tip

You can copy and paste the JSON into the Vega Editor, which often provides better error messages.

field/value/datum

We have mostly used encoding to map variables to channels.

Example: region determines color: field = "region:N"

We can also use value to set the scaled value.

Example: set the color to red: value = "red"

Or we can provide a data value (pre-scale):

Example: Make it the same color as the Asia data: datum = "Asia"
This can be useful for adding annotation text to your graphic.

Missing transformation in {vegabrite}

{vegabrite} seems not to include the extent transformation. We can add it using the following definition, which mimics how the other transform functions are built.

vl_extent <- 
  function(spec, extent = NULL, param = NULL ) {
     obj <- vegabrite:::.make_object(
       as.list(environment(), all.names = TRUE), NULL, NULL)
    vegabrite:::.add_transform(
      spec, obj, 
      "#/definitions/ExtentTransform", .trans = "extent")
  }

Your turn

Exercise 1 Copy the JSON that this produces into the Vega Editor and figure out (a) why it doesn’t work as desired and (b) how to fix it.

Code

set.seed(123)
example_data <- tibble(letter = LETTERS[1:5], number = sample(100, 5))

base <-
  vl_chart(width = 300) |>
  vl_add_data(example_data) 

bars <- 
  base |>
  vl_mark_bar() |>
  vl_encode(x = "number:Q") |>
  vl_encode(y = "letter:O") 

rule <- 
  base |>
  vl_extent( extent = "number", param = "n_extent" ) |>
  vl_mark_rule(color = "red") |>
  vl_encode_x(datum = list(expr = "n_extent[0]"))

bars + rule

Target:

Code

# need to set type for x encoding
 
rule <- 
  base |>
  vl_extent( extent = "number", param = "n_extent" ) |>
  vl_mark_rule(color = "red") |>
  vl_encode_x(datum = list(expr = "n_extent[0]"), type = "quantitative")

bars + rule

I think you can’t use vl_extent() to populate a select input because I think options doesn’t accept a parameter.

Using expressions

Expressions can be used anywhere the Vega-Lite documentation lists ExprRef as an option.

Note: There are a few places where you might expect this to work, but it does not! When in doubt, check the documentation.

Expressions can be used to

access parameters
make (simple) calculations (in the evironment where our data and parameters live)

Violin plots

Several of you figured out how to make violin plots, others of you wondered about them.

Exercise 2 What are the key elements to making this half-violin plot?

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_y("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_x("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_column("MPAA_Rating:N", spacing = 0) |>
  vl_mark_area(orient = "horizontal") |>
  vl_add_properties(width = 50, height = 300) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Full violin plot

We could compute a negative density using vl_calculate() to get the other side, but there is an easier way: vl_stack_x("center")

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_y("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_x("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_column("MPAA_Rating:N", spacing = 0) |>
  vl_mark_area(orient = "horizontal") |>
  vl_stack_x("center") |>
  vl_add_properties(width = 50, height = 300) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Sina plot

Here’s another variation on the theme.

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_calculate("random() * datum.density", as = "jdensity") |>
  vl_encode_y("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_x("jdensity:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_column("MPAA_Rating:N", spacing = 0) |>
  vl_mark_point() |>
  # vl_stack_x("center") |>
  vl_add_properties(width = 50, height = 300) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Sina plot

Here’s another variation on the theme.

Code

base <- 
  vl_chart() |>
  # vl_filter("datum.MPAA_Rating == 'G'") |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_calculate("random() * (2 * random() - 1) * datum.density + 0.12", 
               as = "jdensity") |>
  vl_encode_x("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE)

points <- 
  base |> vl_mark_point(size = 10) |>
  vl_encode_y("jdensity:Q", title = NA, axis = FALSE) 
  

violins <-
  base |> vl_mark_area(fillOpacity = 0.4, strokeOpacity = 0.8) |>
  vl_encode_y("density:Q", title = NA, axis = FALSE) |>
  vl_stack_y("center")

  
(points + violins) |>
  vl_add_properties(width = 500, height = 40) |>
  vl_facet_row("MPAA_Rating:N") |>
  vl_config_view(stroke = "transparent") |>
  vl_config_facet(spacing = 0) |>
  vl_add_data_url(vega_data$movies$url)

Ridgeline plots (almost)

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_x("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_y("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:N", legend = FALSE) |>
  vl_encode_row("MPAA_Rating:N", spacing = -25) |>  # too bad it doesn't work!
  vl_mark_area() |>
  vl_add_properties(width = 500, height = 50) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Nominal vs Ordinal

Here is the same graphic using an ordinal type for color.

Code

vl_chart() |>
  vl_density("IMDB_Rating", groupby = list("MPAA_Rating")) |>
  vl_encode_x("value:Q", title = "IMDB Rating",
              axis = list(grid = FALSE)) |>
  vl_encode_y("density:Q", title = NA, axis = FALSE) |>
  vl_encode_color("MPAA_Rating:O", legend = FALSE) |>
  vl_encode_row("MPAA_Rating:O", spacing = -25) |>  # too bad it doesn't work!
  vl_mark_area() |>
  vl_add_properties(width = 500, height = 50) |>
  vl_config_view(stroke = "transparent") |>
  vl_add_data_url(vega_data$movies$url)

Tree maps

There is a package in R called {treemap} that makes (ugly) tree maps.

Code

library(treemap)
data(GNI2014)
treemap(
  GNI2014, draw = TRUE,
  index=c("continent", "iso3"),
  vSize="population",
  vColor="GNI",
  type="value",
  format.legend = list(scientific = FALSE, big.mark = " "))

Tree maps

But we can grab just the data and make our own in {vegabrite}

Code

tm <-
  treemap(GNI2014, draw = FALSE,
       index=c("continent", "iso3"),
       vSize="population",
       vColor="GNI",
       type="value",
       format.legend = list(scientific = FALSE, big.mark = " "))

Code

base <-
  vl_chart() |>
  vl_calculate("datum.x0 + datum.w", as = "x1") |>
  vl_calculate("datum.y0 + datum.h", as = "y1") |>
  # vl_encode_fillOpacity("vColor:Q") |>
  vl_encode_fill("continent:N") |>
  vl_encode_x("x0:Q", axis = FALSE) |>
  vl_encode_y("y0:Q", axis = FALSE) |>
  vl_encode_x2("x1:Q") |>
  vl_encode_y2("y1:Q") |>
  vl_config_view(strokeOpacity = 0)

continents <- 
  base |>
  vl_add_data(tm$tm |> filter(level == 1)) |>
  vl_mark_rect(stroke = "navy", opacity = 0.5, strokeWidth = 3) |>
  vl_add_properties(width = 500, height = 500)

countries <- 
  base |>
  vl_add_data(tm$tm |> filter(level == 2)) |>
  vl_mark_rect(stroke = "navy", fillOpacity = 0.1, strokeWidth = 1) |>
  vl_encode_strokeWidth(value = 1) |>
  vl_add_properties(width = 500, height = 500)

labels <- 
  base |>
  vl_add_data(tm$tm |> filter(level == 2)) |>
  vl_mark_text(align = "left", baseline = "bottom", 
               dx = 2, dy = -2, size = 5, 
               fillOpacity = 1) |>
  vl_encode_fill(value = "black") |>
  vl_encode_size("vSize", legend = FALSE, scale = list(type = "sqrt")) |>
  vl_encode_text("iso3:N") |>
  vl_add_properties(width = 500, height = 500)


(continents + countries + labels)

Some dat wrangling

Goals:

Create a date object from available pieces
Switch from wide to long format
Filter to remove some of the data

Code

library(tidyverse)
mailing <- 
  jsonlite::fromJSON("../data/swd-lets-practice-ex-2-13.json") |>
  as_tibble()
head(mailing)

# A tibble: 6 × 3
  Date    `Completion Rate` `Response Rate`
  <chr>               <dbl>           <dbl>
1 Q1-2017              0.91           0.023
2 Q2-2017              0.93           0.018
3 Q3-2017              0.91           0.028
4 Q4-2017              0.89           0.023
5 Q1-2018              0.84           0.034
6 Q2-2018              0.88           0.027

Code

mailing_wide <-
  mailing |>
  separate(Date, into = c("quarter", "year"), sep = "-") |>
  mutate(
    quarter = parse_number(quarter),
    year = parse_number(year),
    date_str = paste0(year, "-", 3 * quarter - 2, "-01" ),
    date = ymd(date_str),
    `Net Completion Rate` = `Response Rate` * `Completion Rate`
    ) 
mailing_long <-
  mailing_wide |>
  pivot_longer(matches("Rate"), names_to = "rate type", values_to = "rate") |>
  filter(`rate type` != "Completion Rate")

mailing_long |>
  # vl_filter("datum[rate type] != 'Completion Rate'") |>
  vl_chart() |>
  vl_mark_line() |>
  vl_encode_x("date:T") |>
  vl_encode_y("rate:Q") |>
  vl_encode_color("rate type:N") |>
  vl_add_properties(width = 600, height = 200)

Pie Charts

Some key features:

Ordinal data should be ordered correctly (and use the ordinal type);
Color scheme should reflect the order;
Non-reponse is an odd-ball category (things are easier if you filter it out, but you might like to include it).
It is probably easier to compare percents than counts if the number of surveys is different from one year to the next.

Code

pie_data <- 
  read.csv("https://calvin-data304.netlify.app/data/likert-survey.csv") |>
  group_by(year) |>
  mutate(percent = count / sum(count))

pie_layer <- vl_chart()|>
  vl_mark_arc(outerRadius = 80, opacity = 0.7, stroke = "black") |>
  vl_encode_color("response:O", sort = "number") |>
  vl_scale_color(
    range = c("steelblue", "lightsteelblue", "lightyellow", 
              "pink", "red", "lightgray"))

text_layer <- vl_chart()|>
  vl_mark_text(radius = 55, color = "black", strokeOpacity = 0) |>
  vl_encode_size(
    "percent:Q", legend = FALSE, 
    scale = list(range = c(5,20), type = "sqrt")) |>
  vl_encode_stroke("response:O", sort = "number", legend = FALSE) |>
  vl_encode_text("percent:Q", format = ".0%") 

vl_layer(pie_layer, text_layer) |>
  vl_encode_order("number:Q", sort = "descending") |>
  vl_encode_theta("percent:Q", stack = TRUE)|>
  vl_facet_column("year:N", title = NA, header = list(labelFontSize = 20))|>
  vl_add_data(pie_data)