This started as a data visualization challenge - recreating and improving on the famine dataset visualization from Our World in Data. I wanted to try a different approach using ggplot and plotly that better captures both the duration and scale of each event in a single view.

The original graph, which also includes a dataset.

Scraping the data from the webpage:

library(rvest)

d <- read_html(x = "https://ourworldindata.org/famines#the-our-world-in-data-dataset-of-famines") |>
  html_element(".article-block__table--wide table") |>
  html_table() |>
  clean_names()

str(d)
## tibble [77 × 6] (S3: tbl_df/tbl/data.frame)
##  $ year                     : chr [1:77] "1846–52" "1860-1" "1863-67" "1866-7" ...
##  $ country                  : chr [1:77] "Ireland" "India" "Cape Verde" "India" ...
##  $ excess_mortality_midpoint: chr [1:77] "1,000,000" "2,000,000" "30,000" "961,043" ...
##  $ excess_mortality_lower   : chr [1:77] "1,000,000" "2,000,000" "30,000" "961,043" ...
##  $ excess_mortality_upper   : chr [1:77] "1,000,000" "2,000,000" "30,000" "961,043" ...
##  $ source                   : chr [1:77] "Ó Gráda (2007)" "Kumar and Raychaudhuri (1983)" "Ó Gráda (2010): 22" "Kumar and Raychaudhuri (1983)" ...

The year column is not "standardized" and needs to be broken down and reformatted. One small methodological note: the original dataset uses a simple midpoint to estimate mortality where only upper and lower bounds are available. Since the data is heavily skewed, the geometric mean is a better measure of central tendency and is used here instead.

d <- d |>
  # hmm two types of dashes
  separate(year, "[-,–]", into = c("year_start", "year_end"), remove = F, convert = T) |>
  mutate(
    # for indexing
    digits = str_length(year_end),
    # condition year_end with year_start and digits
    year_end = (year_start %/% 10^digits) * 10^digits + year_end,
    # handle NA
    year_end = coalesce(year_end, year_start)
  ) |>
  mutate(across(starts_with("excess"), ~ parse_number(.x))) |>
  # geometric mean
  mutate(
    mortality = sqrt(excess_mortality_lower * excess_mortality_upper) |> floor(),
    mortality = ifelse(is.na(mortality), excess_mortality_midpoint, mortality)
  ) |>
  mutate(duration = year_end - year_start) |>
  select(-digits)


# to get info on regions
library(countrycode)

d <- d |> mutate(continent = countrycode(country,
  origin = "country.name",
  destination = "region"
))

str(d)
## tibble [77 × 11] (S3: tbl_df/tbl/data.frame)
##  $ year                     : chr [1:77] "1846–52" "1860-1" "1863-67" "1866-7" ...
##  $ year_start               : int [1:77] 1846 1860 1863 1866 1868 1868 1870 1876 1876 1877 ...
##  $ year_end                 : num [1:77] 1852 1861 1867 1867 1868 ...
##  $ country                  : chr [1:77] "Ireland" "India" "Cape Verde" "India" ...
##  $ excess_mortality_midpoint: num [1:77] 1000000 2000000 30000 961043 100000 ...
##  $ excess_mortality_lower   : num [1:77] 1000000 2000000 30000 961043 100000 ...
##   ..- attr(*, "problems")= tibble [2 × 4] (S3: tbl_df/tbl/data.frame)
##   .. ..$ row     : int [1:2] 56 58
##   .. ..$ col     : int [1:2] NA NA
##   .. ..$ expected: chr [1:2] "a number" "a number"
##   .. ..$ actual  : chr [1:2] "-" "-"
##  $ excess_mortality_upper   : num [1:77] 1000000 2000000 30000 961043 100000 ...
##  $ source                   : chr [1:77] "Ó Gráda (2007)" "Kumar and Raychaudhuri (1983)" "Ó Gráda (2010): 22" "Kumar and Raychaudhuri (1983)" ...
##  $ mortality                : num [1:77] 1000000 2000000 30000 961043 100000 ...
##  $ duration                 : num [1:77] 6 1 4 1 0 2 1 3 3 2 ...
##  $ continent                : chr [1:77] "Europe & Central Asia" "South Asia" "Sub-Saharan Africa" "South Asia" ...

Since we have time-to-event data with potentially multiple event types, an event chart is an effective way to display the raw outcome data for a small sample like this.

library(ggforce)
library(plotly)

p_1 <- d |>
  ggplot(aes(y = duration, x = year_end, key = source, label = country, label2 = year, label3 = duration)) +
  geom_diagonal(aes(x = year_start, xend = year_end, y = 0, yend = duration, col = continent), alpha = 0.5, inherit.aes = FALSE) +
  geom_point(aes(size = mortality, col = continent), alpha = 0.9) +
  geom_rug(sides = "b", aes(col = continent), alpha = 0.5) +
  scale_y_continuous(
    expand = c(0.07, 0.07),
    breaks = scales::breaks_pretty(10),
    labels = scales::unit_format(unit = "Years", scale = 1, digits = 2),
    position = "top"
  ) +
  scale_x_continuous(breaks = scales::breaks_pretty(10)) +
  scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#A65628", "black")) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank(),
    axis.title.x = element_blank()
  ) +
  ggtitle("Duration of Famines Worldwide, from 1860 to 2016") +
  labs(col = NULL) +
  guides(
    size = "none",
    alpha = "none"
  )


p_1 |>
  ggplotly(tooltip = c("continent", "country", "year", "mortality"), width = 768, height = 480) |>
  layout(
    yaxis = list(
      title = "Duration per famine",
      titlefont = list(size = 16),
      textangle = 45,
      side = "right"
    ),
    legend = list(
      orientation = "h"
    ),
    images = list(
      list(
        source = base64enc::dataURI(file = "www/Our_World_in_Data_logo.png"),
        xref = "paper",
        yref = "paper",
        x = 1.01,
        y = 1.01,
        sizex = 0.1,
        sizey = 0.1,
        xanchor = "right",
        yanchor = "bottom"
      )
    )
  )