Part 1

Read in the infant mortality data from http://johnmuschelli.com/intro_to_r/data/indicatordeadkids35.csv:

library(tidyverse)  # read_csv(), the pipe, and the dplyr/tidyr verbs used below
library(broom)      # tidy(), used for the model output in Part 2

mort = read_csv("http://johnmuschelli.com/intro_to_r/data/indicatordeadkids35.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.

or, if you have the course's jhur helper package loaded (it provides read_mortality()), use

mort = read_mortality()
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.

Then change the first column name to country:

mort = mort %>% 
  rename(country = X1)

or

colnames(mort)[1] = "country"
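
Either way, a quick sanity check (not part of the original question) that the rename took effect:

# should print "country"; stop with an error otherwise
colnames(mort)[1]
stopifnot(colnames(mort)[1] == "country")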
  1. Compute the correlation between the 1980, 1990, 2000, and 2010 mortality data. No need to save this in an object. Just display the result to the screen.
sub = mort %>% 
  select(`1980`, `1990`, `2000`, `2010`)
cor(sub)
##           1980      1990      2000 2010
## 1980 1.0000000 0.9601540 0.8888411   NA
## 1990 0.9601540 1.0000000 0.9613842   NA
## 2000 0.8888411 0.9613842 1.0000000   NA
## 2010        NA        NA        NA    1
cor(sub, use = "complete.obs")
##           1980      1990      2000      2010
## 1980 1.0000000 0.9596846 0.8877433 0.8468284
## 1990 0.9596846 1.0000000 0.9610269 0.9247192
## 2000 0.8877433 0.9610269 1.0000000 0.9862345
## 2010 0.8468284 0.9247192 0.9862345 1.0000000
psych::corr.test(sub, use = "complete.obs")
## Call:psych::corr.test(x = sub, use = "complete.obs")
## Correlation matrix 
##      1980 1990 2000 2010
## 1980 1.00 0.96 0.89 0.85
## 1990 0.96 1.00 0.96 0.92
## 2000 0.89 0.96 1.00 0.99
## 2010 0.85 0.92 0.99 1.00
## Sample Size 
##      1980 1990 2000 2010
## 1980  197  197  197  193
## 1990  197  197  197  193
## 2000  197  197  197  193
## 2010  193  193  193  193
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##      1980 1990 2000 2010
## 1980    0    0    0    0
## 1990    0    0    0    0
## 2000    0    0    0    0
## 2010    0    0    0    0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
psych::corr.test(sub)
## Call:psych::corr.test(x = sub)
## Correlation matrix 
##      1980 1990 2000 2010
## 1980 1.00 0.96 0.89 0.85
## 1990 0.96 1.00 0.96 0.92
## 2000 0.89 0.96 1.00 0.99
## 2010 0.85 0.92 0.99 1.00
## Sample Size 
##      1980 1990 2000 2010
## 1980  197  197  197  193
## 1990  197  197  197  193
## 2000  197  197  197  193
## 2010  193  193  193  193
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##      1980 1990 2000 2010
## 1980    0    0    0    0
## 1990    0    0    0    0
## 2000    0    0    0    0
## 2010    0    0    0    0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
cor(mort[,c("1980", "1990", "2000", "2010")])
##           1980      1990      2000 2010
## 1980 1.0000000 0.9601540 0.8888411   NA
## 1990 0.9601540 1.0000000 0.9613842   NA
## 2000 0.8888411 0.9613842 1.0000000   NA
## 2010        NA        NA        NA    1

What’s going on?! It seems we have NAs in the 2010 column:

summary(mort[,c("1980", "1990", "2000", "2010")])
##       1980              1990              2000              2010        
##  Min.   :0.07049   Min.   :0.04922   Min.   :0.03232   Min.   :0.03173  
##  1st Qu.:0.18649   1st Qu.:0.14942   1st Qu.:0.09336   1st Qu.:0.08170  
##  Median :0.56665   Median :0.34315   Median :0.21037   Median :0.14339  
##  Mean   :0.94731   Mean   :0.70911   Mean   :0.55004   Mean   :0.39601  
##  3rd Qu.:1.47828   3rd Qu.:0.99579   3rd Qu.:0.71003   3rd Qu.:0.49919  
##  Max.   :3.49035   Max.   :3.28983   Max.   :2.81186   Max.   :2.02857  
##                                                        NA's   :4
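
Those NAs come from the four rows with no 2010 value. As a quick check (not part of the original question), you can list them:

# rows where the 2010 column is missing
mort %>% 
  filter(is.na(`2010`)) %>% 
  select(country, `2010`)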

Both of the following ways of selecting the columns are equivalent:

cor(mort[,c("1980", "1990", "2000", "2010")], use = "complete.obs")
##           1980      1990      2000      2010
## 1980 1.0000000 0.9596846 0.8877433 0.8468284
## 1990 0.9596846 1.0000000 0.9610269 0.9247192
## 2000 0.8877433 0.9610269 1.0000000 0.9862345
## 2010 0.8468284 0.9247192 0.9862345 1.0000000
cor(mort %>% select("1980", "1990", "2000", "2010"), use = "complete.obs")
##           1980      1990      2000      2010
## 1980 1.0000000 0.9596846 0.8877433 0.8468284
## 1990 0.9596846 1.0000000 0.9610269 0.9247192
## 2000 0.8877433 0.9610269 1.0000000 0.9862345
## 2010 0.8468284 0.9247192 0.9862345 1.0000000
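
Note that use = "complete.obs" drops a country from every correlation if it is missing any one of the four years. An alternative is to use all available pairs of years for each correlation, which is what the psych::corr.test() output above reflects (its sample-size table shows 197 for pairs among 1980/1990/2000 and 193 for pairs involving 2010). A minimal sketch of the pairwise version with cor():

# correlations computed pair-by-pair, so a country is only dropped from the
# pairs of years for which it is actually missing
cor(sub, use = "pairwise.complete.obs")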
  2. Compute the correlation between the Myanmar, China, and United States mortality data. Store this correlation matrix in an object called country_cor, then extract the Myanmar-US correlation from that matrix.
mort_subs = mort %>% 
  filter(country %in% c("China", "Myanmar", "United States")) %>% 
  arrange(country)
# reshape to long format (one row per country-year), drop missing values,
# then spread so that each country becomes its own column
long = mort_subs %>% 
  gather(key = "year", value = death, -country) %>% 
  filter(!is.na(death))
long = long %>% 
  spread(key = country, value = death)
sub = long %>% 
  select(Myanmar, China, `United States`)
cor(sub)
##                 Myanmar     China United States
## Myanmar       1.0000000 0.9743646     0.6292625
## China         0.9743646 1.0000000     0.6909263
## United States 0.6292625 0.6909263     1.0000000
# or: drop the country column and transpose, so that each country
# (in alphabetical order: China, Myanmar, United States) becomes a column
mort_subs = mort_subs %>% 
  select(-country)
mort_subs = t(mort_subs)
country_cor = cor(mort_subs, 
                  use = "complete.obs")

Run the following to see that the order is China, Myanmar, US:

mort %>% filter(country %in% c("Myanmar", "China", "United States"))
## # A tibble: 3 x 255
##   country `1760` `1761` `1762` `1763` `1764` `1765` `1766` `1767` `1768`
##   <chr>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 China       NA     NA     NA     NA     NA     NA     NA     NA     NA
## 2 Myanmar     NA     NA     NA     NA     NA     NA     NA     NA     NA
## 3 United~     NA     NA     NA     NA     NA     NA     NA     NA     NA
## # ... with 245 more variables: `1769` <dbl>, `1770` <dbl>, `1771` <dbl>,
## #   `1772` <dbl>, `1773` <dbl>, `1774` <dbl>, `1775` <dbl>, `1776` <dbl>,
## #   `1777` <dbl>, `1778` <dbl>, `1779` <dbl>, `1780` <dbl>, `1781` <dbl>,
## #   `1782` <dbl>, `1783` <dbl>, `1784` <dbl>, `1785` <dbl>, `1786` <dbl>,
## #   `1787` <dbl>, `1788` <dbl>, `1789` <dbl>, `1790` <dbl>, `1791` <dbl>,
## #   `1792` <dbl>, `1793` <dbl>, `1794` <dbl>, `1795` <dbl>, `1796` <dbl>,
## #   `1797` <dbl>, `1798` <dbl>, `1799` <dbl>, `1800` <dbl>, `1801` <dbl>,
## #   `1802` <dbl>, `1803` <dbl>, `1804` <dbl>, `1805` <dbl>, `1806` <dbl>,
## #   `1807` <dbl>, `1808` <dbl>, `1809` <dbl>, `1810` <dbl>, `1811` <dbl>,
## #   `1812` <dbl>, `1813` <dbl>, `1814` <dbl>, `1815` <dbl>, `1816` <dbl>,
## #   `1817` <dbl>, `1818` <dbl>, `1819` <dbl>, `1820` <dbl>, `1821` <dbl>,
## #   `1822` <dbl>, `1823` <dbl>, `1824` <dbl>, `1825` <dbl>, `1826` <dbl>,
## #   `1827` <dbl>, `1828` <dbl>, `1829` <dbl>, `1830` <dbl>, `1831` <dbl>,
## #   `1832` <dbl>, `1833` <dbl>, `1834` <dbl>, `1835` <dbl>, `1836` <dbl>,
## #   `1837` <dbl>, `1838` <dbl>, `1839` <dbl>, `1840` <dbl>, `1841` <dbl>,
## #   `1842` <dbl>, `1843` <dbl>, `1844` <dbl>, `1845` <dbl>, `1846` <dbl>,
## #   `1847` <dbl>, `1848` <dbl>, `1849` <dbl>, `1850` <dbl>, `1851` <dbl>,
## #   `1852` <dbl>, `1853` <dbl>, `1854` <dbl>, `1855` <dbl>, `1856` <dbl>,
## #   `1857` <dbl>, `1858` <dbl>, `1859` <dbl>, `1860` <dbl>, `1861` <dbl>,
## #   `1862` <dbl>, `1863` <dbl>, `1864` <dbl>, `1865` <dbl>, `1866` <dbl>,
## #   `1867` <dbl>, `1868` <dbl>, ...

Extract the Myanmar-US correlation. Since the rows and columns are ordered China, Myanmar, United States, Myanmar is position 2 and the United States is position 3:

country_cor[2,3]
## [1] 0.6292625
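
If you would rather not count row and column positions, here is a sketch of a name-based alternative (same filtering as above; year_mat and country_cor_named are illustrative names, not part of the original answer):

mort_countries = mort %>% 
  filter(country %in% c("China", "Myanmar", "United States")) %>% 
  arrange(country)
# transpose so each country becomes a column, then carry the names along
year_mat = t(mort_countries %>% select(-country))
colnames(year_mat) = mort_countries$country
country_cor_named = cor(year_mat, use = "complete.obs")
country_cor_named["Myanmar", "United States"]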
  3. Is there a difference between mortality information from 1990 and 2000? Run a t-test and a Wilcoxon rank sum test to assess this. Hint: to extract the column of information for 1990, use mort$"1990"
t.test(mort$"1990", mort$"2000")
## 
##  Welch Two Sample t-test
## 
## data:  mort$"1990" and mort$"2000"
## t = 2.146, df = 386.23, p-value = 0.03249
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.01333621 0.30479998
## sample estimates:
## mean of x mean of y 
## 0.7091080 0.5500399
t.test(mort$`1990`, mort$`2000`)
## 
##  Welch Two Sample t-test
## 
## data:  mort$`1990` and mort$`2000`
## t = 2.146, df = 386.23, p-value = 0.03249
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.01333621 0.30479998
## sample estimates:
## mean of x mean of y 
## 0.7091080 0.5500399
t.test(mort$`1990`, mort$`2000`, paired = TRUE)
## 
##  Paired t-test
## 
## data:  mort$`1990` and mort$`2000`
## t = 10.025, df = 196, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1277769 0.1903593
## sample estimates:
## mean of the differences 
##               0.1590681
wilcox.test(mort$"1990", mort$"2000")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  mort$"1990" and mort$"2000"
## W = 23006, p-value = 0.001445
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(mort$"1990", mort$"2000", paired = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  mort$"1990" and mort$"2000"
## V = 17997, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
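
One way to see why the paired versions are so much more significant: a paired t-test is equivalent to a one-sample t-test on the within-country differences, which removes the large country-to-country variability. For example:

# same t statistic and df as the paired t-test above
t.test(mort$`1990` - mort$`2000`)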

Part 2

Read in the Kaggle cars auction dataset from http://johnmuschelli.com/intro_to_r/data/kaggleCarAuction.csv with the argument col_types = cols(VehBCost = col_double()):

# col_types ensures VehBCost is read as a double rather than relying on type guessing
cars = read_csv("http://johnmuschelli.com/intro_to_r/data/kaggleCarAuction.csv",
                col_types = cols(VehBCost = col_double()))
  4. Fit a linear regression model with vehicle cost (VehBCost) as the outcome and vehicle age (VehicleAge) and size (Size) as predictors as well as their interaction. Save the model fit in an object called lmfit_cars and display the summary table.
# the two calls below are equivalent: VehicleAge*Size expands to
# VehicleAge + Size + VehicleAge:Size
lmfit_cars = lm(VehBCost ~ VehicleAge + Size + VehicleAge*Size, data = cars)
lmfit_cars = lm(VehBCost ~ VehicleAge*Size, data = cars)
summary(lmfit_cars)
## 
## Call:
## lm(formula = VehBCost ~ VehicleAge * Size, data = cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6377   -763    -47    673  34277 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                6702.1722    41.1090 163.034  < 2e-16 ***
## VehicleAge                 -359.2154     8.6234 -41.656  < 2e-16 ***
## SizeCROSSOVER              1707.9277    97.0448  17.599  < 2e-16 ***
## SizeLARGE                  1970.8006    52.0946  37.831  < 2e-16 ***
## SizeLARGE SUV              5300.3814   150.4731  35.225  < 2e-16 ***
## SizeLARGE TRUCK            3941.3630    87.6070  44.989  < 2e-16 ***
## SizeMEDIUM                 1356.2214    44.2506  30.649  < 2e-16 ***
## SizeMEDIUM SUV             3393.6482    63.9123  53.099  < 2e-16 ***
## SizeNULL                   3050.7369   927.9307   3.288 0.001011 ** 
## SizeSMALL SUV              1982.9486    81.5497  24.316  < 2e-16 ***
## SizeSMALL TRUCK            1220.0354   140.1903   8.703  < 2e-16 ***
## SizeSPECIALTY              3226.9361    87.7318  36.782  < 2e-16 ***
## SizeSPORTS                 4455.2034   156.9750  28.382  < 2e-16 ***
## SizeVAN                    1188.9626    60.5054  19.651  < 2e-16 ***
## VehicleAge:SizeCROSSOVER    257.0727    24.0669  10.682  < 2e-16 ***
## VehicleAge:SizeLARGE        -70.7684    11.7230  -6.037 1.58e-09 ***
## VehicleAge:SizeLARGE SUV    -46.0031    26.0801  -1.764 0.077751 .  
## VehicleAge:SizeLARGE TRUCK    0.8802    17.2386   0.051 0.959278    
## VehicleAge:SizeMEDIUM      -126.3283     9.4690 -13.341  < 2e-16 ***
## VehicleAge:SizeMEDIUM SUV   -76.7502    12.8391  -5.978 2.27e-09 ***
## VehicleAge:SizeNULL         -54.4209   253.3892  -0.215 0.829946    
## VehicleAge:SizeSMALL SUV     59.2823    16.3770   3.620 0.000295 ***
## VehicleAge:SizeSMALL TRUCK   47.4726    25.0526   1.895 0.058108 .  
## VehicleAge:SizeSPECIALTY    168.1692    24.0002   7.007 2.46e-12 ***
## VehicleAge:SizeSPORTS      -346.3364    28.7290 -12.055  < 2e-16 ***
## VehicleAge:SizeVAN          -72.9747    12.7391  -5.728 1.02e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1188 on 72957 degrees of freedom
## Multiple R-squared:  0.5487, Adjusted R-squared:  0.5486 
## F-statistic:  3548 on 25 and 72957 DF,  p-value: < 2.2e-16
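
If you prefer the coefficient table as a data frame (easier to filter, sort, or plot), broom's tidy() also works on lm fits. A small sketch (output not shown):

# one row per coefficient, with estimates, standard errors, p-values, and CIs
tidy(lmfit_cars, conf.int = TRUE) %>% 
  arrange(p.value) %>% 
  head()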
  5. Create a variable called expensive in the cars data that indicates if the vehicle cost is over $10,000. Use a chi-squared test to assess if there is a relationship between a car being expensive and it being labeled as a “bad buy” (IsBadBuy).
cars = cars %>%
  mutate(expensive = VehBCost > 10000)
tab = table(cars$expensive, cars$IsBadBuy)
chisq.test(tab)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tab
## X-squared = 1.8152, df = 1, p-value = 0.1779
fisher.test(tab)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tab
## p-value = 0.179
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.8156428 1.0362735
## sample estimates:
## odds ratio 
##  0.9205426
prop.test(tab)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tab
## X-squared = 1.8152, df = 1, p-value = 0.1779
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.020853549  0.003506105
## sample estimates:
##    prop 1    prop 2 
## 0.8766766 0.8853503
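
To see what these tests are actually comparing, it can help to look at the bad-buy rate within each price group (a descriptive check, not required by the question):

# proportions within each row of the table (expensive = FALSE / TRUE);
# the second column is the proportion of bad buys in each group
prop.table(tab, margin = 1)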
  6. Fit a logistic regression model where the outcome is “bad buy” status (IsBadBuy) and the predictors are the expensive status and vehicle age (VehicleAge). Save the model fit in an object called logfit_cars and display the summary table. Use summary(logfit_cars), or tidy(logfit_cars, conf.int = TRUE, exponentiate = TRUE) for odds ratios, or tidy(logfit_cars, conf.int = TRUE, exponentiate = FALSE) for log odds ratios.
# all three calls are equivalent ways to specify the binomial family
logfit_cars = glm(IsBadBuy ~ expensive + VehicleAge, data = cars, family = binomial)
logfit_cars = glm(IsBadBuy ~ expensive + VehicleAge, data = cars, family = binomial())
logfit_cars = glm(IsBadBuy ~ expensive + VehicleAge, data = cars, family = "binomial")
summary(logfit_cars)
## 
## Call:
## glm(formula = IsBadBuy ~ expensive + VehicleAge, family = "binomial", 
##     data = cars)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9091  -0.5489  -0.4800  -0.3648   2.4859  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -3.249490   0.033244 -97.745   <2e-16 ***
## expensiveTRUE -0.080387   0.060696  -1.324    0.185    
## VehicleAge     0.286605   0.006476  44.257   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54421  on 72982  degrees of freedom
## Residual deviance: 52442  on 72980  degrees of freedom
## AIC: 52448
## 
## Number of Fisher Scoring iterations: 5
tidy(logfit_cars, conf.int = TRUE, exponentiate = TRUE)
## # A tibble: 3 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)     0.0388   0.0332     -97.7    0       0.0363    0.0414
## 2 expensiveTRUE   0.923    0.0607      -1.32   0.185   0.818     1.04  
## 3 VehicleAge      1.33     0.00648     44.3    0       1.32      1.35
tidy(logfit_cars, conf.int = TRUE, exponentiate = FALSE)
## # A tibble: 3 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)    -3.25     0.0332     -97.7    0       -3.31    -3.18  
## 2 expensiveTRUE  -0.0804   0.0607      -1.32   0.185   -0.201    0.0370
## 3 VehicleAge      0.287    0.00648     44.3    0        0.274    0.299
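
The exponentiate = TRUE estimates are simply the exponential of the coefficients on the log-odds scale, so you can check them by hand:

# odds ratios computed directly from the fitted coefficients
exp(coef(logfit_cars))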
  7. Randomly sample 10,000 observations (rows) with replacement from the cars dataset and store this new dataset in an object called cars_subsample. Hint: sample_n function
set.seed(1)
cars_subsample = cars %>% 
  sample_n(size = 10000, replace = TRUE)

or another way

rows = sample(nrow(cars), size = 10000, replace = TRUE)
cars_subsample = cars[rows,]
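
Either way, you can confirm the size of the new dataset; because the sampling is done with replacement, some rows will typically appear more than once:

nrow(cars_subsample)
# count rows that are repeats of an earlier sampled row
sum(duplicated(cars_subsample))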
  8. Fit the same logistic regression model as in problem 6 above with this sample and display the summary table using the tidy function. How do the results compare? Call this model logfit_cars_sub:
logfit_cars_sub = glm(IsBadBuy ~ expensive + VehicleAge, data = cars_subsample, family = binomial)
summary(logfit_cars_sub)
tidy(logfit_cars_sub, conf.int = TRUE, exponentiate = TRUE)

Your exact numbers will depend on the rows that were randomly sampled, so the output is not reproduced here. Expect coefficient estimates close to those from the full-data fit in problem 6, but with larger standard errors and wider confidence intervals, since the model is now fit on 10,000 rows rather than roughly 73,000.
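
One convenient way to compare the two fits side by side is to stack their tidied coefficient tables (a sketch; the fit label column is just illustrative):

# combine the tidied output of the full-data and subsample fits
bind_rows(
  tidy(logfit_cars) %>% mutate(fit = "full data"),
  tidy(logfit_cars_sub) %>% mutate(fit = "10,000-row sample")
) %>% 
  select(fit, term, estimate, std.error, p.value) %>% 
  arrange(term, fit)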