TyT2019W28 - Sankey Diagram

By Johanie Fournier, agr. in rstats tidyverse tidytuesday

July 15, 2019

Get the data

wwc_outcomes <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/wwc_outcomes.csv")
## Rows: 568 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): team, round, win_status
## dbl (4): year, score, yearly_game_id, team_num
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
codes <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/codes.csv")
## Rows: 212 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): country, team
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore the data

summary(wwc_outcomes)
##       year          team               score           round          
##  Min.   :1991   Length:568         Min.   : 0.000   Length:568        
##  1st Qu.:1999   Class :character   1st Qu.: 0.000   Class :character  
##  Median :2007   Mode  :character   Median : 1.000   Mode  :character  
##  Mean   :2007                      Mean   : 1.614                     
##  3rd Qu.:2015                      3rd Qu.: 2.000                     
##  Max.   :2019                      Max.   :13.000                     
##  yearly_game_id     team_num    win_status       
##  Min.   : 1.00   Min.   :1.0   Length:568        
##  1st Qu.: 9.00   1st Qu.:1.0   Class :character  
##  Median :18.00   Median :1.5   Mode  :character  
##  Mean   :19.61   Mean   :1.5                     
##  3rd Qu.:27.00   3rd Qu.:2.0                     
##  Max.   :52.00   Max.   :2.0
summary(codes)
##    country              team          
##  Length:212         Length:212        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

Prepare the data

data<-wwc_outcomes%>%
  left_join(codes, by = "team") %>% 
  mutate(country=ifelse(country=="Brazil", "Brésil", country)) %>% 
  mutate(country=ifelse(country=="Germany", "Allemagne", country)) %>% 
  mutate(country=ifelse(country=="Japan", "Japon", country)) %>% 
  mutate(country=ifelse(country=="Norway", "Norvège", country)) %>% 
  mutate(country=ifelse(country=="Sweden", "Suède", country)) %>% 
  mutate(country=ifelse(country=="United States", "États-Unis", country)) %>% 
  mutate(country=ifelse(country=="North Korea", "Corée du Nord", country)) %>% 
  mutate(country=ifelse(country=="England", "Angleterre", country)) %>%
  arrange(year,yearly_game_id, team_num) %>% 
  mutate(equipe_gagante=country) %>% 
  mutate(equipe_perdante=lead(country)) %>% 
  filter(!is.na(equipe_perdante)) %>% 
  select(equipe_gagante, equipe_perdante) %>% 
  group_by(equipe_gagante, equipe_perdante) %>% 
  summarise(freq=n()) %>% 
  filter(freq>4) %>% 
  ungroup() %>%
  mutate(equipe_gagante = factor(equipe_gagante,
                        levels = c("Brésil", "Canada", "Allemagne",
                                   "Japon", "Norvège", "Suède",
                                   "États-Unis")))

Visualize the data

#Graphique 
data<-wwc_outcomes%>%
  left_join(codes, by = "team") %>% 
  arrange(year,yearly_game_id, team_num) %>% 
  mutate(equipe_gagante=country) %>% 
  mutate(equipe_perdante=lead(country)) %>% 
  filter(!is.na(equipe_perdante)) %>% 
  select(equipe_gagante, equipe_perdante) %>% 
  group_by(equipe_gagante, equipe_perdante) %>% 
  summarise(freq=n()) %>% 
  filter(freq>4) 

gg<-ggplot(data=data, aes(axis1 = equipe_gagante, axis2 = equipe_perdante, y=freq))
gg<-gg + geom_alluvium(aes(fill = equipe_gagante), width = 1/7)
gg<-gg + geom_stratum(width = 1/7, alpha=0.5, color = "black") 
gg<-gg + geom_text(stat = "stratum", label.strata = TRUE) 
gg<-gg + scale_fill_manual(values = c("#736F6E", "#736F6E", "#018E42","#736F6E", "#736F6E", "#736F6E", "#736F6E"))
#ajuster les axes 
gg<-gg + scale_x_discrete(limits = c("Winning Team", "Losing Team"), expand = c(.05, .05), position = "top")
#modifier la légende
gg<-gg + theme(legend.position="none")
#modifier le thème
gg<-gg +theme(panel.border = element_blank(),
              panel.background = element_blank(),
              plot.background = element_blank(),
              panel.grid.major.y= element_blank(),
              panel.grid.major.x= element_blank(),
              panel.grid.minor = element_blank(),
              axis.line = element_blank(),
              axis.ticks= element_blank())
#ajouter les titres
gg<-gg + labs(title="Germany wins often .... but not the right matches!\n",
              subtitle="The following graph shows the countries of the national women's teams that have won more than 4 victories against their opponents in all FIFA's history.\nThe German team has won 44 games in the last 28 years compared to 50 for the US team. Unfortunately, Germany did not win enough in the final\nbecause they won only 2 World Cups compared to 4 for the United States.\n\n",
             y=" ", 
              x=" ")
gg<-gg + theme(plot.title    = element_text(hjust=0,size=26, color="#018E42", face="bold"),
                plot.subtitle = element_text(hjust=0,size=12, color="#736F6E"),
               axis.title.y  = element_blank(),
               axis.title.x  = element_blank(),
               axis.text.y   = element_blank(), 
               axis.text.x   = element_text(hjust=0.5, size=12, color="#000000"))
Posted on:
July 15, 2019
Length:
3 minute read, 529 words
Categories:
rstats tidyverse tidytuesday
Tags:
rstats tidyverse tidytuesday
See Also:
Predicting MO with H2O Models from IRDA data
IRDA soil data
This is the begining of a cheat sheet!