This started as a data visualization challenge - recreating and improving on the famine dataset visualization from Our World in Data. I wanted to try a different approach using ggplot and plotly that better captures both the duration and scale of each event in a single view.
The original graph, which also includes a dataset.
Scraping the data from the webpage:
library(rvest)
d <- read_html(x = "https://ourworldindata.org/famines#the-our-world-in-data-dataset-of-famines") |>
html_element(".article-block__table--wide table") |>
html_table() |>
clean_names()
str(d)
## tibble [77 × 6] (S3: tbl_df/tbl/data.frame)
## $ year : chr [1:77] "1846–52" "1860-1" "1863-67" "1866-7" ...
## $ country : chr [1:77] "Ireland" "India" "Cape Verde" "India" ...
## $ excess_mortality_midpoint: chr [1:77] "1,000,000" "2,000,000" "30,000" "961,043" ...
## $ excess_mortality_lower : chr [1:77] "1,000,000" "2,000,000" "30,000" "961,043" ...
## $ excess_mortality_upper : chr [1:77] "1,000,000" "2,000,000" "30,000" "961,043" ...
## $ source : chr [1:77] "Ó Gráda (2007)" "Kumar and Raychaudhuri (1983)" "Ó Gráda (2010): 22" "Kumar and Raychaudhuri (1983)" ...
The year column is not "standardized" and needs to be broken down and reformatted. One small methodological note: the original dataset uses a simple midpoint to estimate mortality where only upper and lower bounds are available. Since the data is heavily skewed, the geometric mean is a better measure of central tendency and is used here instead.
d <- d |>
# hmm two types of dashes
separate(year, "[-,–]", into = c("year_start", "year_end"), remove = F, convert = T) |>
mutate(
# for indexing
digits = str_length(year_end),
# condition year_end with year_start and digits
year_end = (year_start %/% 10^digits) * 10^digits + year_end,
# handle NA
year_end = coalesce(year_end, year_start)
) |>
mutate(across(starts_with("excess"), ~ parse_number(.x))) |>
# geometric mean
mutate(
mortality = sqrt(excess_mortality_lower * excess_mortality_upper) |> floor(),
mortality = ifelse(is.na(mortality), excess_mortality_midpoint, mortality)
) |>
mutate(duration = year_end - year_start) |>
select(-digits)
# to get info on regions
library(countrycode)
d <- d |> mutate(continent = countrycode(country,
origin = "country.name",
destination = "region"
))
str(d)
## tibble [77 × 11] (S3: tbl_df/tbl/data.frame)
## $ year : chr [1:77] "1846–52" "1860-1" "1863-67" "1866-7" ...
## $ year_start : int [1:77] 1846 1860 1863 1866 1868 1868 1870 1876 1876 1877 ...
## $ year_end : num [1:77] 1852 1861 1867 1867 1868 ...
## $ country : chr [1:77] "Ireland" "India" "Cape Verde" "India" ...
## $ excess_mortality_midpoint: num [1:77] 1000000 2000000 30000 961043 100000 ...
## $ excess_mortality_lower : num [1:77] 1000000 2000000 30000 961043 100000 ...
## ..- attr(*, "problems")= tibble [2 × 4] (S3: tbl_df/tbl/data.frame)
## .. ..$ row : int [1:2] 56 58
## .. ..$ col : int [1:2] NA NA
## .. ..$ expected: chr [1:2] "a number" "a number"
## .. ..$ actual : chr [1:2] "-" "-"
## $ excess_mortality_upper : num [1:77] 1000000 2000000 30000 961043 100000 ...
## $ source : chr [1:77] "Ó Gráda (2007)" "Kumar and Raychaudhuri (1983)" "Ó Gráda (2010): 22" "Kumar and Raychaudhuri (1983)" ...
## $ mortality : num [1:77] 1000000 2000000 30000 961043 100000 ...
## $ duration : num [1:77] 6 1 4 1 0 2 1 3 3 2 ...
## $ continent : chr [1:77] "Europe & Central Asia" "South Asia" "Sub-Saharan Africa" "South Asia" ...
Since we have time-to-event data with potentially multiple event types, an event chart is an effective way to display the raw outcome data for a small sample like this.
library(ggforce)
library(plotly)
p_1 <- d |>
ggplot(aes(y = duration, x = year_end, key = source, label = country, label2 = year, label3 = duration)) +
geom_diagonal(aes(x = year_start, xend = year_end, y = 0, yend = duration, col = continent), alpha = 0.5, inherit.aes = FALSE) +
geom_point(aes(size = mortality, col = continent), alpha = 0.9) +
geom_rug(sides = "b", aes(col = continent), alpha = 0.5) +
scale_y_continuous(
expand = c(0.07, 0.07),
breaks = scales::breaks_pretty(10),
labels = scales::unit_format(unit = "Years", scale = 1, digits = 2),
position = "top"
) +
scale_x_continuous(breaks = scales::breaks_pretty(10)) +
scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#A65628", "black")) +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = "bottom",
legend.title = element_blank(),
axis.title.x = element_blank()
) +
ggtitle("Duration of Famines Worldwide, from 1860 to 2016") +
labs(col = NULL) +
guides(
size = "none",
alpha = "none"
)
p_1 |>
ggplotly(tooltip = c("continent", "country", "year", "mortality"), width = 768, height = 480) |>
layout(
yaxis = list(
title = "Duration per famine",
titlefont = list(size = 16),
textangle = 45,
side = "right"
),
legend = list(
orientation = "h"
),
images = list(
list(
source = base64enc::dataURI(file = "www/Our_World_in_Data_logo.png"),
xref = "paper",
yref = "paper",
x = 1.01,
y = 1.01,
sizex = 0.1,
sizey = 0.1,
xanchor = "right",
yanchor = "bottom"
)
)
)