Skip to content

Commit 935748a

Browse files
author
weste
committed
started births scripts. basketball counts and births don't work quite yet.
1 parent e5faef3 commit 935748a

File tree

5 files changed

+199
-0
lines changed

5 files changed

+199
-0
lines changed

data_scripts/births/baseball_births.R

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
library(tidyverse)
2+
3+
# Grabs data from Lahman database, which is available in R.
4+
baseball_births <- Lahman::People %>%
5+
select(nameGiven, playerID, birthMonth, birthDay, birthCountry, birthDate, debut, finalGame) %>%
6+
rename_all("str_to_lower") %>%
7+
rename_all(.funs = funs(str_remove_all(.,"birth"))) %>%
8+
mutate(from = year(ymd(debut)), to = year(ymd(finalgame)), yrs = to - from) %>%
9+
select(player = namegiven, birthday = date, yrs, from, to, everything(), -playerid, -debut, -finalgame) %>%
10+
filter(!is.na(birthday))
11+
12+
# Data details
13+
dpr_document(baseball_births, extension = ".md.R", export_folder = usethis::proj_get(),
14+
object_name = "baseball_births",
15+
title = "The birth dates of MLB players",
16+
description = "Data obtained from Lahman http://www.seanlahman.com/baseball-archive/statistics/",
17+
source = "https://github.com/cdalzell/Lahman",
18+
var_details = baseball_description)
+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
pacman::p_load(tidyverse, glue, rvest)
2+
3+
# Make month_days tibble for wrangling later
4+
month_days <- bind_rows(
5+
tibble(month = 1, days = 1:31),
6+
tibble(month = 2, days = 1:29),
7+
tibble(month = 3, days = 1:31),
8+
tibble(month = 4, days = 1:30),
9+
tibble(month = 5, days = 1:31),
10+
tibble(month = 6, days = 1:30),
11+
tibble(month = 7, days = 1:31),
12+
tibble(month = 8, days = 1:31),
13+
tibble(month = 9, days = 1:30),
14+
tibble(month = 10, days = 1:31),
15+
tibble(month = 11, days = 1:30),
16+
tibble(month = 12, days = 1:31),
17+
)
18+
19+
20+
basketball_date_rip <- function(m,d) {
21+
22+
urlpath <- glue("https://www.basketball-reference.com/friv/birthdays.fcgi?month={month}&day={day}",
23+
month = m, day = d)
24+
25+
out <- read_html(urlpath) %>%
26+
html_nodes("table") %>%
27+
html_table() %>%
28+
.[[1]]
29+
30+
col_names <- str_remove_all(out[1, 2:4], "\\(s\\)") %>% str_to_lower()
31+
32+
out <- out %>%
33+
.[-1, 2:4]
34+
35+
colnames(out) <- col_names
36+
37+
print(str_c(m,"/", d))
38+
39+
out %>%
40+
mutate(birthday = ymd(str_c(born,"-", m,"-", d))) %>%
41+
select(player, birthday, everything(),-born)
42+
43+
44+
}
45+
46+
47+
basketball_list <- map2(month_days$month, month_days$days, ~basketball_date_rip(.x, .y))
48+
49+
basketball_births <- basketball_list %>%
50+
bind_rows() %>%
51+
as_tibble() %>%
52+
mutate(yrs = as.integer(yrs))
53+
# write_csv(basketball_births, "basketball.csv")
54+
55+
# Data details
56+
dpr_document(basketball_births, extension = ".md.R", export_folder = usethis::proj_get(),
57+
object_name = "basketball_births",
58+
title = "The birth dates of NBA/ABA players",
59+
description = "Data obtained from https://www.basketball-reference.com",
60+
source = "https://www.basketball-reference.com/friv/birthdays.fcgi?month=1&day=1",
61+
var_details = basketball_description)

data_scripts/births/counts_baseball.R

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
library(tidyverse)
2+
3+
# Grabs data from Lahman database, which is available in R (from baseball_births)
4+
baseball_births <- Lahman::People %>%
5+
select(nameGiven, playerID, birthMonth, birthDay, birthCountry, birthDate, debut, finalGame) %>%
6+
rename_all("str_to_lower") %>%
7+
rename_all(.funs = funs(str_remove_all(.,"birth"))) %>%
8+
mutate(from = year(ymd(debut)), to = year(ymd(finalgame)), yrs = to - from) %>%
9+
select(player = namegiven, birthday = date, yrs, from, to, everything(), -playerid, -debut, -finalgame) %>%
10+
filter(!is.na(birthday))
11+
12+
# Wrangling baseball_births into counts_baseball
13+
counts_baseball <- baseball_births %>%
14+
filter(country == "USA", year(birthday) > 1925, year(birthday) < 2015) %>%
15+
count(month, day, name = "births") %>%
16+
mutate(day_of_year = 1:n()) %>%
17+
left_join(tibble(month = 1:12, month_name = month.name)) %>%
18+
select(month_number = month, month_name, day_of_month = day, day_of_year, births)
19+
20+
# Data details
21+
dpr_document(counts_baseball, extension = ".md.R", export_folder = usethis::proj_get(),
22+
object_name = "counts_baseball",
23+
title = "The count of births of MLB players for US born players from 1925 to 2015",
24+
description = "Data obtained from Lahman http://www.seanlahman.com/baseball-archive/statistics/",
25+
source = "https://github.com/cdalzell/Lahman",
26+
var_details = counts_description)
+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
pacman::p_load(tidyverse, glue, rvest)
2+
3+
# Make basketball_births first
4+
5+
# Make month_days tibble for wrangling later
6+
month_days <- bind_rows(
7+
tibble(month = 1, days = 1:31),
8+
tibble(month = 2, days = 1:29),
9+
tibble(month = 3, days = 1:31),
10+
tibble(month = 4, days = 1:30),
11+
tibble(month = 5, days = 1:31),
12+
tibble(month = 6, days = 1:30),
13+
tibble(month = 7, days = 1:31),
14+
tibble(month = 8, days = 1:31),
15+
tibble(month = 9, days = 1:30),
16+
tibble(month = 10, days = 1:31),
17+
tibble(month = 11, days = 1:30),
18+
tibble(month = 12, days = 1:31),
19+
)
20+
21+
22+
basketball_date_rip <- function(m,d) {
23+
24+
urlpath <- glue("https://www.basketball-reference.com/friv/birthdays.fcgi?month={month}&day={day}",
25+
month = m, day = d)
26+
27+
out <- read_html(urlpath) %>%
28+
html_nodes("table") %>%
29+
html_table() %>%
30+
.[[1]]
31+
32+
col_names <- str_remove_all(out[1, 2:4], "\\(s\\)") %>% str_to_lower()
33+
34+
out <- out %>%
35+
.[-1, 2:4]
36+
37+
colnames(out) <- col_names
38+
39+
print(str_c(m,"/", d))
40+
41+
out %>%
42+
mutate(birthday = ymd(str_c(born,"-", m,"-", d))) %>%
43+
select(player, birthday, everything(),-born)
44+
45+
46+
}
47+
48+
49+
basketball_list <- map2(month_days$month, month_days$days, ~basketball_date_rip(.x, .y))
50+
51+
basketball_births <- basketball_list %>%
52+
bind_rows() %>%
53+
as_tibble() %>%
54+
mutate(yrs = as.integer(yrs))
55+
56+
# Convert basketball_births into counts_basketball
57+
counts_basketball <- basketball_births %>%
58+
mutate(month = month(birthday), day = mday(birthday)) %>%
59+
count(month, day, name = "births") %>%
60+
mutate(day_of_year = 1:n()) %>%
61+
left_join(tibble(month = 1:12, month_name = month.name)) %>%
62+
select(month_number = month, month_name, day_of_month = day, day_of_year, births)
63+
64+
# Data details
65+
dpr_document(counts_basketball, extension = ".md.R", export_folder = usethis::proj_get(),
66+
object_name = "counts_basketball",
67+
title = "The count of births of NBA/ABA players",
68+
description = "Data obtained from https://www.basketball-reference.com",
69+
source = "https://www.basketball-reference.com/friv/birthdays.fcgi?month=1&day=1",
70+
var_details = counts_description)

data_scripts/births/counts_us.R

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
library(tidyverse)
2+
3+
# US population data
4+
dat_94_03 <- read_csv("https://github.com/fivethirtyeight/data/raw/master/births/US_births_1994-2003_CDC_NCHS.csv")
5+
dat_00_14 <- read_csv("https://github.com/fivethirtyeight/data/raw/master/births/US_births_2000-2014_SSA.csv")
6+
7+
8+
counts_us <- dat_94_03 %>%
9+
filter(year < 2000) %>%
10+
bind_rows(dat_00_14) %>%
11+
group_by(month, date_of_month) %>%
12+
summarise(births = sum(births)) %>%
13+
ungroup() %>%
14+
mutate(day_of_year = 1:n()) %>%
15+
left_join(tibble(month = 1:12, month_name = month.name)) %>%
16+
select(month_number = month, month_name, day_of_month = date_of_month, day_of_year, births)
17+
18+
19+
# Data details
20+
dpr_document(counts_us, extension = ".md.R", export_folder = usethis::proj_get(),
21+
object_name = "counts_us", title = "The count of births in the United States from 1994-2014",
22+
description = "Data obtained from the CDC and Census parsed by FiveThirtyEight ",
23+
source = "https://github.com/fivethirtyeight/data/tree/master/births",
24+
var_details = counts_description)

0 commit comments

Comments
 (0)