Commit d4f451e2 authored by Bearloga's avatar Bearloga
Browse files

Chart draft

parent 06f5aa61
# Data Visualization: Content Interactions
![Chart of content interactions on Wikimedia sites](figures/chart.png)
**Phab**: [T314545](https://phabricator.wikimedia.org/T314545)
**Note**: The scripts use the `|>` pipe operator available in R 4.1.0 and later. If using older versions of R you would need to change all instances of `|>` to `%>%` (from [magrittr](https://magrittr.tidyverse.org/) package, available as part of [tidyverse](https://www.tidyverse.org/)).
## Table of Contents
- [data-metrics.R](scripts/data-metrics.R) reads the data and creates a `metrics` tibble
- [data-annotations.R](scripts/data-annotations.R) creates an `annotations` tibble out of `metrics`
- [data-colors.R](scripts/data-colors.R) creates a `wmf_colors` based on colors in the [Wikimedia Design Visual Style Guide](https://design.wikimedia.org/style-guide/visual-style_colors.html)
- [viz-chart.R](scripts/viz-chart.R) creates [chart.png](figures/chart.png) out of `metrics` and `annotations`, using `wmf_colors`
- [viz-ext-ref.R](scripts/viz-ext-ref.R) is a possible extension of the chart that adds a reference line for the previous year
- [viz-ext-arrows.R](scripts/viz-ext-arrows.R) is a possible extension of the chart that adds red and green arrows for current and previous years to highlight positive and negative month-over-month changes, respectively
annotations <- metrics |>
select(month) |>
mutate(
curr_offset = (month %in% (current_month - years(0:4))),
prev_offset = (month %in% ((current_month - months(1)) - years(0:4)))
) |>
filter(curr_offset | prev_offset) |>
mutate(year = year(month)) |>
inner_join(metrics, by = "month") |>
group_by(year) |>
summarize(
from = interactions_corrected[prev_offset],
to = interactions_corrected[curr_offset],
# The change from 2021-06 to 2021-07 (for example):
delta = to - from,
direction = factor(delta > 0, c(TRUE, FALSE), c("up", "down"))
) |>
mutate(
month = ymd(sprintf("%i-%02.0f-01", year, month(current_month))),
prev_month = month - months(1)
)
wmf_colors <- list(
"Base100" = "#ffffff",
"Base90" = "#f8f9fa",
"Base80" = "#eaecf0",
"Base70" = "#c8ccd1",
"Base50" = "#a2a9b1",
"Base30" = "#72777d",
"Base20" = "#54595d",
"Base10" = "#222222",
"Base0" = "#000000",
"Accent50" = "#3366cc",
"Accent30" = "#2a4b8d",
"Accent90" = "#eaf3ff",
"Red90" = "#fee7e6",
"Red50" = "#dd3333",
"Red30" = "#b32424",
"Green90" = "#d5fdf4",
"Green50" = "#00af89",
"Green30" = "#14866d",
"Yellow90" = "#fef6e7",
"Yellow50" = "#ffcc33",
"Yellow30" = "#ac6600"
)
library(here)
library(tidyverse)
library(lubridate)
metrics <- here("data/metrics.csv") |>
read_csv(show_col_types = FALSE) |>
janitor::clean_names() |>
arrange(month) |>
filter(month >= "2018-05-01")
current_month <- max(metrics$month)
metrics <- metrics |>
mutate(
dataloss = (pageview_multiplier > 1.0) |
month %in% (
metrics |>
filter(pageview_multiplier > 1.0) |>
pull(month) |>
range() |>
(\(x) x + months(c(-1, 1)))()
),
period = case_when(
month < "2021-06-01" ~ "before dataloss",
month >= "2021-06-01" & month < "2022-02-01" ~ "during dataloss",
month >= "2022-02-01" ~ "after dataloss"
)
)
library(scales)
library(hrbrthemes)
y_tick_values <- seq(16e9, 23e9, 1e9)
y_tick_labels <- c(
label_number(scale = 1e-9, suffix = " B", accuracy = 1)(y_tick_values[-length(y_tick_values)]),
label_number(scale = 1e-9, suffix = " Billion Content Interactions", accuracy = 1)(y_tick_values[length(y_tick_values)])
)
y_tick_colors <- c(
rep_len(wmf_colors$Base70, length(y_tick_values) - 1),
wmf_colors$Base100
)
p <- metrics |>
filter(period != "during dataloss") |>
ggplot() +
geom_point(
aes(x = month, y = to),
data = annotations,
shape = 21, stroke = 1, size = 20,
color = wmf_colors$Yellow50, fill = wmf_colors$Yellow90,
alpha = 0.5
) +
## Undercounted due to data loss:
geom_line(
aes(x = month, y = interactions),
data = metrics |> filter(dataloss),
color = wmf_colors$Accent50,
linetype = "33",
) +
geom_line(
aes(x = month, y = interactions, group = period),
color = wmf_colors$Accent30
) +
## Corrected estimate:
geom_line(
aes(x = month, y = interactions_corrected),
data = metrics |> filter(dataloss),
color = wmf_colors$Accent30,
linetype = "92",
) +
geom_point(
aes(x = month, y = to),
size = 2, # default 1.5
color = wmf_colors$Accent30,
data = annotations
) +
scale_y_continuous(
name = NULL,
labels = y_tick_labels,
limits = range(y_tick_values),
breaks = y_tick_values
) +
scale_color_manual(
values = c("up" = wmf_colors$Green50, "down" = wmf_colors$Red50),
guide = "none"
) +
scale_x_date(
name = NULL,
breaks = annotations$month,
date_minor_breaks = "1 month",
date_labels = "%B\n%Y"
) +
theme_ipsum_rc(grid = "Yy", base_family = "Montserrat") +
theme(
plot.background = element_rect(fill = "white"),
panel.grid.major.y = element_line(color = y_tick_colors),
panel.grid.minor.y = element_line(color = wmf_colors$Base80),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(
hjust = 0, size = 14,
margin = margin(r = -6, unit = "cm")
),
plot.margin = margin(l = 1, t = 0.5, b = 0.5, r = 0.1, unit = "cm")
)
p
ggsave(
here("figures/chart.png"),
p, width = 12, height = 6, units = "in"
)
p <- p +
geom_segment(
aes(
x = prev_month, xend = month, y = from, yend = to,
color = direction
),
size = 1.0, # default 0.5
arrow = arrow(type = "closed", length = unit(0.1, "inches")),
data = annotations
)
# Previous year for reference:
p <- p +
geom_line(
aes(x = month, y = interactions),
color = wmf_colors$Base70,
data = metrics |>
filter(
between(month, current_month - years(2), current_month - years(1))
) |>
mutate(
month = (\(x) {
year(x) <- year(x) + 1
x
})(month)
)
)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment