---
title: "USCOTS 2019 tidyverse workshop"
author: "Albert Y. Kim"
date: "5/15/2019"
output: html_document
---

# Chapter 7: Multiple regression

```{r}
library(tidyverse)
library(moderndive)

# Toy data set 1: five points, each labelled "hot" or "cold"
ex1 <- tibble(
  x = c(0, 0.5, 1, 0, 1),
  y = c(1, 0, 2, 2, 0),
  type = c("hot", "hot", "hot", "cold", "cold")
)

# Scatterplot colored by type; shown first on its own, then with one
# least-squares line per type layered on top
ex1_scatter <- ggplot(ex1, aes(x = x, y = y, col = type)) +
  geom_point()
ex1_scatter
ex1_scatter +
  geom_smooth(method = "lm", se = FALSE)

# 1. Fit regression
model_3 <- lm(y ~ x + type + x:type, data = ex1)

# 2.a)
get_regression_table(model_3)
```

```{r}
# Toy data set 2: same as ex1 except that the last y value is 3 instead of 0
ex2 <- tibble(
  x = c(0, 0.5, 1, 0, 1),
  y = c(1, 0, 2, 2, 3),
  type = c("hot", "hot", "hot", "cold", "cold")
)

# Same pair of plots as for ex1
ex2_scatter <- ggplot(ex2, aes(x = x, y = y, col = type)) +
  geom_point()
ex2_scatter
ex2_scatter +
  geom_smooth(method = "lm", se = FALSE)

# 1. Fit regression (x * type expands to x + type + x:type)
model_3 <- lm(y ~ x * type, data = ex2)

# 2.a)
get_regression_table(model_3)

# 1. Same data, but without the interaction term
model_4 <- lm(y ~ x + type, data = ex2)

# 2.a)
get_regression_table(model_4)
```

## Exercise

```{r}
# Published Google Sheet (CSV export) holding the workshop's Africa data
africa_url <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vSttoZ6MRXOTqw6yMUJY5rUxgFuoqLq-VQIQsccSsxONSwJ935evaIrBhZv3oR6CIo4wLuTbvdlErTT/pub?gid=0&single=true&output=csv"

# Rename the priming column and store it as a factor so that it can be used
# as a categorical variable in the plot and in the model
africa <- read_csv(africa_url) %>%
  rename(priming = num_countries_range_guess) %>%
  mutate(priming = factor(priming))

ggplot(africa, aes(x = height, y = num_countries_africa, col = priming)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    y = "Number of countries",
    col = "Priming number",
    title = "Africa"
  )
```

```{r}
# 1. Fit regression
model_africa <- lm(
  num_countries_africa ~ height + priming + height:priming,
  data = africa
)

# 2.a)
get_regression_table(model_africa)
```

```{r}
get_regression_points(model_africa)
```

***

# Chapter 8: Sampling

## Exercise

```{r}
library(tidyverse)
library(moderndive)

# Inspect sampling bowl. How many rows are there?
bowl
```

```{r}
# Use the shovel! How many balls are there?
# Draw 1000 virtual samples of 50 balls each (without replacement) and compute
# the proportion of red balls in every sample.
# rep_sample_n() is documented to return data already grouped by `replicate`;
# the explicit group_by() states the intent and matches the Chapter 9 pipelines.
prop_red_33 <- bowl %>%
  rep_sample_n(size = 50, reps = 1000, replace = FALSE) %>%
  group_by(replicate) %>%
  summarize(prop_red = sum(color == "red") / n())
```

```{r}
ggplot(data = prop_red_33, mapping = aes(x = prop_red)) +
  geom_histogram(binwidth = 0.05)
```

***

# Chapter 9: Confidence intervals

## Exercise

```{r}
library(tidyverse)
# rep_sample_n() is used below; load moderndive here as well so this chapter
# does not depend on the Chapter 8 chunks having been run first
library(moderndive)

# Read the pennies sample, number the rows and keep only the ID and year
pennies_sample <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRtLeHU6j9PRTAJ0bRcUF2uVc1TzYeXd9cC0lwCRfBREy8POx6MgfVeK2CJU6emRKFn_51H-Z8H5YlS/pub?gid=0&single=true&output=csv") %>%
  mutate(ID = row_number()) %>%
  select(ID, year)
```

```{r}
# Resample the pennies WITH replacement: 1000 resamples of size 50, then take
# the mean year of each resample
my_means <- pennies_sample %>%
  rep_sample_n(size = 50, reps = 1000, replace = TRUE) %>%
  group_by(replicate) %>%
  summarize(x_bar = mean(year))
```

```{r}
ggplot(data = my_means, mapping = aes(x = x_bar)) +
  geom_histogram() +
  labs(title = "Bootstrap dist = approximation to sampling dist")
```

```{r}
# Same computation in two steps so the raw resamples can be inspected
my_samples <- pennies_sample %>%
  rep_sample_n(size = 50, reps = 1000, replace = TRUE)

# View() opens a data viewer, so only call it in an interactive session and
# skip it when the document is knitted
if (interactive()) {
  View(my_samples)
}

my_means <- my_samples %>%
  group_by(replicate) %>%
  summarize(x_bar = mean(year))
```

***

# Chapter 10: Hypothesis testing

## Exercise

```{r}
promotions <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSjk7QmpO1bZXeq6tad7HLhqOoVlAEyznkHzmrNvLh4Vcg3XDJpOEam1z2JGBcUhdgsbu-AJ95Msinc/pub?gid=1241329819&single=true&output=csv")
```

***

# Chapter 11: Inference for regression

## Exercise

```{r}
library(infer)

# Null distribution of test statistic
null_distribution <- promotions %>%
  specify(formula = decision ~ gender, success = "promoted") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in props", order = c("male", "female"))

# Observed test statistic
obs_test_stat <- promotions %>%
  specify(formula = decision ~ gender, success = "promoted") %>%
  calculate(stat = "diff in props", order = c("male", "female"))
obs_test_stat

# Plot!
# Histogram of the null distribution with a red vertical line at the
# observed test statistic
ggplot(null_distribution, aes(x = stat)) +
  geom_histogram(bins = 10) +
  geom_vline(xintercept = obs_test_stat$stat, col = "red")
```

***