TyT2019W15 - Left_join or Right_join?

By Johanie Fournier, agr. in rstats tidyverse tidytuesday

January 13, 2019

Get the data

# Player table: one row per player with name, grand_slam, date_of_birth,
# date_of_first_title and age (see the column specification printed below).
player_dob <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-09/player_dob.csv"
)
## Rows: 105 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, grand_slam
## dbl  (1): age
## date (2): date_of_birth, date_of_first_title
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Grand Slam results table: year, grand_slam, name, rolling_win_count,
# tournament_date and gender (see the column specification printed below).
grand_slams <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-09/grand_slams.csv"
)
## Rows: 416 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): grand_slam, name, gender
## dbl  (2): year, rolling_win_count
## date (1): tournament_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Timeline table: player, year, tournament, outcome and gender
# (see the column specification printed below).
grand_slam_timeline <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-09/grand_slam_timeline.csv"
)
## Rows: 12605 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): player, tournament, outcome, gender
## dbl (1): year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore the data

# Per-column overview of the player table (ranges, quartiles and NA counts).
summary(object = player_dob)
##      name            grand_slam        date_of_birth       
##  Length:105         Length:105         Min.   :1934-11-02  
##  Class :character   Class :character   1st Qu.:1956-03-19  
##  Mode  :character   Mode  :character   Median :1971-08-12  
##                                        Mean   :1968-10-21  
##                                        3rd Qu.:1981-08-08  
##                                        Max.   :1997-10-16  
##                                                            
##  date_of_first_title       age       
##  Min.   :1968-06-08   Min.   : 5961  
##  1st Qu.:1978-06-16   1st Qu.: 7512  
##  Median :1994-10-15   Median : 8286  
##  Mean   :1992-10-28   Mean   : 8531  
##  3rd Qu.:2004-06-06   3rd Qu.: 9502  
##  Max.   :2018-09-08   Max.   :12724  
##  NA's   :3            NA's   :3
# Per-column overview of the Grand Slam results table.
summary(object = grand_slams)
##       year       grand_slam            name           rolling_win_count
##  Min.   :1968   Length:416         Length:416         Min.   : 1.000   
##  1st Qu.:1980   Class :character   Class :character   1st Qu.: 1.000   
##  Median :1993   Mode  :character   Mode  :character   Median : 4.000   
##  Mean   :1993                                         Mean   : 5.507   
##  3rd Qu.:2006                                         3rd Qu.: 8.000   
##  Max.   :2019                                         Max.   :23.000   
##  tournament_date         gender         
##  Min.   :1968-01-10   Length:416        
##  1st Qu.:1979-12-10   Class :character  
##  Median :1993-03-26   Mode  :character  
##  Mean   :1993-04-09                     
##  3rd Qu.:2006-02-16                     
##  Max.   :2019-01-10
# Per-column overview of the timeline table.
summary(object = grand_slam_timeline)
##     player               year       tournament          outcome         
##  Length:12605       Min.   :1968   Length:12605       Length:12605      
##  Class :character   1st Qu.:1981   Class :character   Class :character  
##  Mode  :character   Median :1993   Mode  :character   Mode  :character  
##                     Mean   :1993                                        
##                     3rd Qu.:2005                                        
##                     Max.   :2019                                        
##     gender         
##  Length:12605      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Prepare the data

# Lookup table with one row per distinct (name, gender) pair found in
# `grand_slams`; only the gender is needed from that table.
gender <- distinct(grand_slams, name, gender)

# Build the table consumed by the plot: mean age (in years) at first title,
# per gender and per decade of that first title.
#
# Fixes compared with the previous version:
#   * The pipeline now starts from `player_dob`, the table loaded at the top
#     of the file; `data_age` was never defined, so the first line failed.
#   * The result now carries `decennie` and `age_moy`, the columns the plot
#     maps to x and y. Previously `annee` was computed and then dropped by
#     `select()`, and no aggregation was performed.
#   * The unused `tournament_date` copy of `date_of_first_title` is removed.
#   * The year is extracted with base R, so the block no longer depends on
#     `year()` from a package that is not visibly attached.
#   * Rows with a missing age, first-title date or gender are dropped before
#     averaging, so a single NA cannot turn a whole group mean into NA.
#
# NOTE(review): aggregating by decade of first title is reconstructed from the
# plot's mappings and subtitle -- confirm against the intended analysis.
data <- player_dob %>%
  # The original comment says this converts the age to years, so `age` is
  # presumably expressed in days -- TODO confirm against the data dictionary.
  mutate(age_y = round(age / 365, digits = 0)) %>%
  # Attach each player's gender; players absent from `gender` get NA.
  left_join(gender, by = "name") %>%
  mutate(
    annee = as.numeric(format(date_of_first_title, "%Y")),
    decennie = floor(annee / 10) * 10
  ) %>%
  filter(!is.na(gender), !is.na(age_y), !is.na(decennie)) %>%
  group_by(gender, decennie) %>%
  summarise(age_moy = mean(age_y)) %>%
  ungroup()

Visualize the data

# Line chart of the mean age at first title (`age_moy`) per decade
# (`decennie`), with one coloured line per `gender`. `data` must provide
# those three columns.
#
# Changes compared with the previous version:
#   * `scale_color_manual()` is added once instead of twice; the second call
#     only replaced the first and made ggplot2 emit a "Scale for colour is
#     already present" message.
#   * `geom_text()` inherits `data`, x and y from the plot instead of
#     repeating them.
#   * The three `theme()` calls are merged and the plot is built as a single
#     `+` chain instead of repeated reassignment.
#   * Typo in the title fixed ("happend" -> "happened").
#
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for line
# widths (geom_line, element_line). `size` is kept here because the installed
# ggplot2 version cannot be confirmed from this file.
gg <- ggplot(
  data = data,
  aes(x = decennie, y = age_moy, group = gender, color = gender)
) +
  geom_line(size = 3) +
  # A large coloured point with a slightly smaller white point on top leaves
  # a coloured ring in which the data label is drawn.
  geom_point(size = 6) +
  geom_point(size = 5, color = "#FFFFFF") +
  # Data labels: rounded mean age; position and colour come from the
  # plot-level mapping.
  geom_text(
    aes(label = round(age_moy, digits = 0)),
    size = 2.75, vjust = 0.5, family = "Calibri"
  ) +
  scale_color_manual(values = c("#931328", "#3E7BBC")) +
  # Fixed y range with a break every 5 years.
  scale_y_continuous(breaks = seq(15, 35, 5), limits = c(15, 35)) +
  # Titles and axis labels.
  labs(
    title = "Grand Chelem: What happened in the 80s?",
    subtitle = "The average age at which players won their first title went from 28 to 22 for men and 25 to 18 for women\nbetween the 1960s and the 1980s. The average age returned to 27 for men and 26 for women in 2010.",
    y = "Mean age at first win",
    x = "Years"
  ) +
  theme(
    # The legend is hidden.
    legend.position = "none",
    # White canvas with dotted vertical guides only.
    panel.border = element_blank(),
    panel.background = element_rect(fill = "#FFFFFF", colour = "#FFFFFF"),
    plot.background = element_rect(fill = "#FFFFFF", colour = "#FFFFFF"),
    panel.grid.major.x = element_line(linetype = "dotted", size = 0.5, color = "#9F9F9F"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    # Only the x axis is drawn; the y values are shown by the data labels.
    axis.line.y = element_blank(),
    axis.line.x = element_line(linetype = "solid", size = 1, color = "#9F9F9F"),
    axis.ticks.x = element_line(linetype = "solid", size = 1, color = "#9F9F9F"),
    axis.ticks.y = element_blank(),
    # Text styling.
    plot.title = element_text(hjust = 0, size = 20, color = "#5B5B5B"),
    plot.subtitle = element_text(hjust = 0, size = 12, color = "#5B5B5B"),
    axis.title.x = element_text(hjust = 0.5, size = 12, angle = 360, color = "#5B5B5B"),
    axis.title.y = element_text(hjust = 0.5, size = 12, angle = 90, color = "#5B5B5B"),
    axis.text.y = element_blank(),
    axis.text.x = element_text(hjust = 0.5, size = 8, color = "#5B5B5B")
  )
Posted on:
January 13, 2019
Length:
3 minute read, 627 words
Categories:
rstats tidyverse tidytuesday
Tags:
rstats tidyverse tidytuesday
See Also:
Predicting MO with H2O Models from IRDA data
IRDA soil data
This is the beginning of a cheat sheet!