Part 1

Read in the following infant mortality data from http://johnmuschelli.com/intro_to_r/data/indicatordeadkids35.csv

mort = read_csv("http://johnmuschelli.com/intro_to_r/data/indicatordeadkids35.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.

or use

mort = read_mortality()
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.

Then change the first column name to country:

mort = mort %>% 
  rename(country = X1)

or

colnames(mort)[1] = "country"
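
If you would rather not hard-code the old name X1, dplyr's rename() also accepts a column position (a small sketch, assuming a reasonably recent dplyr):

mort = mort %>% 
  rename(country = 1)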
  1. Compute the correlation between the 1980, 1990, 2000, and 2010 mortality data. No need to save this in an object. Just display the result to the screen.
sub = mort %>% 
  select(`1980`, `1990`, `2000`, `2010`)
cor(sub)
##                1980           1990           2000 2010
## 1980 1.000000000000 0.960153953723 0.888841117492   NA
## 1990 0.960153953723 1.000000000000 0.961384203917   NA
## 2000 0.888841117492 0.961384203917 1.000000000000   NA
## 2010             NA             NA             NA    1
cor(sub, use = "complete.obs")
##                1980           1990           2000           2010
## 1980 1.000000000000 0.959684628292 0.887743334542 0.846828397953
## 1990 0.959684628292 1.000000000000 0.961026893827 0.924719217724
## 2000 0.887743334542 0.961026893827 1.000000000000 0.986234456933
## 2010 0.846828397953 0.924719217724 0.986234456933 1.000000000000
psych::corr.test(sub, use = "complete.obs")
## Call:psych::corr.test(x = sub, use = "complete.obs")
## Correlation matrix 
##      1980 1990 2000 2010
## 1980 1.00 0.96 0.89 0.85
## 1990 0.96 1.00 0.96 0.92
## 2000 0.89 0.96 1.00 0.99
## 2010 0.85 0.92 0.99 1.00
## Sample Size 
##      1980 1990 2000 2010
## 1980  197  197  197  193
## 1990  197  197  197  193
## 2000  197  197  197  193
## 2010  193  193  193  193
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##      1980 1990 2000 2010
## 1980    0    0    0    0
## 1990    0    0    0    0
## 2000    0    0    0    0
## 2010    0    0    0    0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
psych::corr.test(sub)
## Call:psych::corr.test(x = sub)
## Correlation matrix 
##      1980 1990 2000 2010
## 1980 1.00 0.96 0.89 0.85
## 1990 0.96 1.00 0.96 0.92
## 2000 0.89 0.96 1.00 0.99
## 2010 0.85 0.92 0.99 1.00
## Sample Size 
##      1980 1990 2000 2010
## 1980  197  197  197  193
## 1990  197  197  197  193
## 2000  197  197  197  193
## 2010  193  193  193  193
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##      1980 1990 2000 2010
## 1980    0    0    0    0
## 1990    0    0    0    0
## 2000    0    0    0    0
## 2010    0    0    0    0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
cor(mort[,c("1980", "1990", "2000", "2010")])
##                1980           1990           2000 2010
## 1980 1.000000000000 0.960153953723 0.888841117492   NA
## 1990 0.960153953723 1.000000000000 0.961384203917   NA
## 2000 0.888841117492 0.961384203917 1.000000000000   NA
## 2010             NA             NA             NA    1

What’s going on?! It seems we have NAs in the 2010 column:

summary(mort[,c("1980", "1990", "2000", "2010")])
##       1980                  1990                  2000            
##  Min.   :0.070490887   Min.   :0.049215555   Min.   :0.032320019  
##  1st Qu.:0.186491646   1st Qu.:0.149422819   1st Qu.:0.093360836  
##  Median :0.566647329   Median :0.343152280   Median :0.210366768  
##  Mean   :0.947310323   Mean   :0.709108004   Mean   :0.550039910  
##  3rd Qu.:1.478281448   3rd Qu.:0.995794215   3rd Qu.:0.710028877  
##  Max.   :3.490352139   Max.   :3.289831218   Max.   :2.811858292  
##                                                                   
##       2010            
##  Min.   :0.031731875  
##  1st Qu.:0.081694736  
##  Median :0.143392163  
##  Mean   :0.396007581  
##  3rd Qu.:0.499190608  
##  Max.   :2.028572693  
##  NA's   :4
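
To see which countries are driving those NAs in 2010, a quick filter works (a sketch; output not shown):

mort %>% 
  filter(is.na(`2010`)) %>% 
  select(country, `2010`)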

Both of the following are equivalent:

cor(mort[,c("1980", "1990", "2000", "2010")], use = "complete.obs")
##                1980           1990           2000           2010
## 1980 1.000000000000 0.959684628292 0.887743334542 0.846828397953
## 1990 0.959684628292 1.000000000000 0.961026893827 0.924719217724
## 2000 0.887743334542 0.961026893827 1.000000000000 0.986234456933
## 2010 0.846828397953 0.924719217724 0.986234456933 1.000000000000
cor(mort %>% select("1980", "1990", "2000", "2010"), use = "complete.obs")
##                1980           1990           2000           2010
## 1980 1.000000000000 0.959684628292 0.887743334542 0.846828397953
## 1990 0.959684628292 1.000000000000 0.961026893827 0.924719217724
## 2000 0.887743334542 0.961026893827 1.000000000000 0.986234456933
## 2010 0.846828397953 0.924719217724 0.986234456933 1.000000000000
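
Note that use = "complete.obs" drops every country with an NA in any of the four columns. If you instead want each pairwise correlation to use all countries available for that particular pair of years, a pairwise option exists (sketch; output not shown):

cor(sub, use = "pairwise.complete.obs")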
  1. Compute the correlation between the Myanmar, China, and United States mortality data. Store this correlation matrix in an object called country_cor
  1. Extract the Myanmar-US correlation from the correlation matrix.
mort_subs = mort %>% 
  filter(country %in% c( "China", "Myanmar","United States")) %>% 
  arrange(country)
long = mort_subs %>% 
  gather(key = "year", value = death, -country) %>% 
  filter(!is.na(death))
long = long %>% 
  spread(key = country, value = death)
sub = long %>% 
  select(Myanmar, China, `United States`)
cor(sub)
##                      Myanmar          China  United States
## Myanmar       1.000000000000 0.974364554315 0.629262462521
## China         0.974364554315 1.000000000000 0.690926321344
## United States 0.629262462521 0.690926321344 1.000000000000
mort_subs = mort_subs %>% 
  select(-country)
mort_subs = t(mort_subs)
country_cor = cor(mort_subs, 
                  use = "complete.obs")

Run the following to see that the order is China, Myanmar, US:

mort %>% filter(country %in% c("Myanmar", "China", "United States"))
## # A tibble: 3 x 255
##   country `1760` `1761` `1762` `1763` `1764` `1765` `1766` `1767` `1768`
##   <chr>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 China       NA     NA     NA     NA     NA     NA     NA     NA     NA
## 2 Myanmar     NA     NA     NA     NA     NA     NA     NA     NA     NA
## 3 United…     NA     NA     NA     NA     NA     NA     NA     NA     NA
## # … with 245 more variables: `1769` <dbl>, `1770` <dbl>, `1771` <dbl>,
## #   `1772` <dbl>, `1773` <dbl>, `1774` <dbl>, `1775` <dbl>,
## #   `1776` <dbl>, `1777` <dbl>, `1778` <dbl>, `1779` <dbl>,
## #   `1780` <dbl>, `1781` <dbl>, `1782` <dbl>, `1783` <dbl>,
## #   `1784` <dbl>, `1785` <dbl>, `1786` <dbl>, `1787` <dbl>,
## #   `1788` <dbl>, `1789` <dbl>, `1790` <dbl>, `1791` <dbl>,
## #   `1792` <dbl>, `1793` <dbl>, `1794` <dbl>, `1795` <dbl>,
## #   `1796` <dbl>, `1797` <dbl>, `1798` <dbl>, `1799` <dbl>,
## #   `1800` <dbl>, `1801` <dbl>, `1802` <dbl>, `1803` <dbl>,
## #   `1804` <dbl>, `1805` <dbl>, `1806` <dbl>, `1807` <dbl>,
## #   `1808` <dbl>, `1809` <dbl>, `1810` <dbl>, `1811` <dbl>,
## #   `1812` <dbl>, `1813` <dbl>, `1814` <dbl>, `1815` <dbl>,
## #   `1816` <dbl>, `1817` <dbl>, `1818` <dbl>, `1819` <dbl>,
## #   `1820` <dbl>, `1821` <dbl>, `1822` <dbl>, `1823` <dbl>,
## #   `1824` <dbl>, `1825` <dbl>, `1826` <dbl>, `1827` <dbl>,
## #   `1828` <dbl>, `1829` <dbl>, `1830` <dbl>, `1831` <dbl>,
## #   `1832` <dbl>, `1833` <dbl>, `1834` <dbl>, `1835` <dbl>,
## #   `1836` <dbl>, `1837` <dbl>, `1838` <dbl>, `1839` <dbl>,
## #   `1840` <dbl>, `1841` <dbl>, `1842` <dbl>, `1843` <dbl>,
## #   `1844` <dbl>, `1845` <dbl>, `1846` <dbl>, `1847` <dbl>,
## #   `1848` <dbl>, `1849` <dbl>, `1850` <dbl>, `1851` <dbl>,
## #   `1852` <dbl>, `1853` <dbl>, `1854` <dbl>, `1855` <dbl>,
## #   `1856` <dbl>, `1857` <dbl>, `1858` <dbl>, `1859` <dbl>,
## #   `1860` <dbl>, `1861` <dbl>, `1862` <dbl>, `1863` <dbl>,
## #   `1864` <dbl>, `1865` <dbl>, `1866` <dbl>, `1867` <dbl>,
## #   `1868` <dbl>, …

Extract the Myanmar-US correlation. The rows and columns of country_cor are in the order China, Myanmar, United States (from arrange(country)), so the Myanmar-US entry is row 2, column 3:

country_cor[2, 3]
## [1] 0.629262462521
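
Because t() drops the country labels, indexing country_cor by position is easy to get wrong. The spread version built above (sub) keeps the country names as column names, so the same correlation can be pulled by name instead (sketch; the value matches the cor(sub) matrix shown earlier):

cor(sub)["Myanmar", "United States"]
## [1] 0.629262462521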
  1. Is there a difference between mortality information from 1990 and 2000? Run a t-test and a Wilcoxon rank sum test to assess this. Hint: to extract the column of information for 1990, use mort$"1990"
t.test(mort$"1990", mort$"2000")
## 
##  Welch Two Sample t-test
## 
## data:  mort$"1990" and mort$"2000"
## t = 2.146049034, df = 386.2310023, p-value = 0.0324916449
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.0133362118521 0.3047999761377
## sample estimates:
##      mean of x      mean of y 
## 0.709108003990 0.550039909995
t.test(mort$`1990`, mort$`2000`)
## 
##  Welch Two Sample t-test
## 
## data:  mort$`1990` and mort$`2000`
## t = 2.146049034, df = 386.2310023, p-value = 0.0324916449
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.0133362118521 0.3047999761377
## sample estimates:
##      mean of x      mean of y 
## 0.709108003990 0.550039909995
t.test(mort$`1990`, mort$`2000`, paired = TRUE)
## 
##  Paired t-test
## 
## data:  mort$`1990` and mort$`2000`
## t = 10.02534013, df = 196, p-value < 2.220446e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.127776911751 0.190359276239
## sample estimates:
## mean of the differences 
##          0.159068093995
wilcox.test(mort$"1990", mort$"2000")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  mort$"1990" and mort$"2000"
## W = 23005.5, p-value = 0.00144462911
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(mort$"1990", mort$"2000", paired = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  mort$"1990" and mort$"2000"
## V = 17997, p-value < 2.220446e-16
## alternative hypothesis: true location shift is not equal to 0
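
Since each row of mort is the same country measured in both years, the paired versions are arguably the more appropriate tests here. A paired t-test is equivalent to a one-sample t-test on the within-country differences (sketch; output not shown):

t.test(mort$`1990` - mort$`2000`)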

Part 2

Read in the Kaggle cars auction dataset from http://johnmuschelli.com/intro_to_r/data/kaggleCarAuction.csv with the argument col_types = cols(VehBCost = col_double()):

cars = read_csv("http://johnmuschelli.com/intro_to_r/data/kaggleCarAuction.csv",
                col_types = cols(VehBCost = col_double()))
  1. Fit a linear regression model with vehicle cost (VehBCost) as the outcome and vehicle age (VehicleAge) and whether it’s an online sale (IsOnlineSale) as predictors as well as their interaction. Save the model fit in an object called lmfit_cars and display the summary table.
lmfit_cars = lm(VehBCost ~ VehicleAge + IsOnlineSale + VehicleAge*IsOnlineSale, data = cars)

or, equivalently, since the * operator expands to both main effects plus their interaction:

lmfit_cars = lm(VehBCost ~ VehicleAge*IsOnlineSale, data = cars)
tidy(lmfit_cars)
## # A tibble: 4 x 5
##   term                    estimate std.error statistic    p.value
##   <chr>                      <dbl>     <dbl>     <dbl>      <dbl>
## 1 (Intercept)               8063.      16.6     486.   0         
## 2 VehicleAge                -321.       3.67    -87.4  0         
## 3 IsOnlineSale               514.     108.        4.77 0.00000180
## 4 VehicleAge:IsOnlineSale    -55.4     25.6      -2.17 0.0303
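
If you also want confidence intervals for the linear model coefficients, tidy() takes the same conf.int argument used for the logistic models below (sketch; output not shown):

tidy(lmfit_cars, conf.int = TRUE)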
  1. Create a variable called expensive in the cars data that indicates if the vehicle cost is over $10,000. Use a chi-squared test to assess if there is a relationship between a car being expensive and it being labeled as a “bad buy” (IsBadBuy).
cars = cars %>%
  mutate(expensive = VehBCost > 10000)
tab = table(cars$expensive, cars$IsBadBuy)
chisq.test(tab)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tab
## X-squared = 1.815247312, df = 1, p-value = 0.177880045
fisher.test(tab)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tab
## p-value = 0.178978818
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.815642760873 1.036273536773
## sample estimates:
##     odds ratio 
## 0.920542562973
prop.test(tab)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tab
## X-squared = 1.815247312, df = 1, p-value = 0.177880045
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.02085354859625  0.00350610520518
## sample estimates:
##         prop 1         prop 2 
## 0.876676596776 0.885350318471
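
To see the proportions the test is actually comparing (the distribution of IsBadBuy within each expense group), a margin table of proportions can help (sketch; output not shown):

prop.table(tab, margin = 1)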
  1. Fit a logistic regression model where the outcome is “bad buy” status and predictors are the expensive status and vehicle age (VehicleAge). Save the model fit in an object called logfit_cars and display the summary table. Use summary, or tidy(logfit_cars, conf.int = TRUE, exponentiate = TRUE) for odds ratios, or tidy(logfit_cars, conf.int = TRUE, exponentiate = FALSE) for log odds ratios.
logfit_cars = glm(IsBadBuy ~ expensive + VehicleAge, data = cars, family = binomial)

or, equivalently (binomial, binomial(), and "binomial" all specify the same family):

logfit_cars = glm(IsBadBuy ~ expensive + VehicleAge, data = cars, family = binomial())
logfit_cars = glm(IsBadBuy ~ expensive + VehicleAge, data = cars, family = "binomial")
summary(logfit_cars)
## 
## Call:
## glm(formula = IsBadBuy ~ expensive + VehicleAge, family = "binomial", 
##     data = cars)
## 
## Deviance Residuals: 
##          Min            1Q        Median            3Q           Max  
## -0.909091727  -0.548922874  -0.479968473  -0.364839574   2.485900048  
## 
## Coefficients:
##                     Estimate     Std. Error   z value Pr(>|z|)    
## (Intercept)   -3.24948976307  0.03324440927 -97.74545  < 2e-16 ***
## expensiveTRUE -0.08038741797  0.06069562592  -1.32444  0.18536    
## VehicleAge     0.28660451302  0.00647588104  44.25722  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54421.30728  on 72982  degrees of freedom
## Residual deviance: 52441.78751  on 72980  degrees of freedom
## AIC: 52447.78751
## 
## Number of Fisher Scoring iterations: 5
tidy(logfit_cars, conf.int = TRUE, exponentiate = TRUE)
## # A tibble: 3 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)     0.0388   0.0332     -97.7    0       0.0363    0.0414
## 2 expensiveTRUE   0.923    0.0607      -1.32   0.185   0.818     1.04  
## 3 VehicleAge      1.33     0.00648     44.3    0       1.32      1.35
tidy(logfit_cars, conf.int = TRUE, exponentiate = FALSE)
## # A tibble: 3 x 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)    -3.25     0.0332     -97.7    0       -3.31    -3.18  
## 2 expensiveTRUE  -0.0804   0.0607      -1.32   0.185   -0.201    0.0370
## 3 VehicleAge      0.287    0.00648     44.3    0        0.274    0.299
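
The exponentiated estimates reported by tidy() above can also be pulled straight from the fitted model object (sketch; output not shown):

exp(coef(logfit_cars))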
  1. Randomly sample 10,000 observations (rows) with replacement from the cars dataset and store this new dataset in an object called cars_subsample. Hint: sample_n function
set.seed(1)
cars_subsample = cars %>% 
  sample_n(size = 10000, replace = TRUE)

or another way

rows = sample(nrow(cars), size = 10000, replace = TRUE)
cars_subsample = cars[rows,]
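
In newer dplyr releases sample_n() is superseded by slice_sample(), which takes the same replace argument (an equivalent sketch, assuming dplyr >= 1.0):

cars_subsample = cars %>% 
  slice_sample(n = 10000, replace = TRUE)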
  1. Fit the same logistic regression model as in problem 6 above with this sample and display the summary table using the tidy function. How do the results compare? Call this model logfit_cars_sub:
logfit_cars_sub = glm(IsBadBuy ~ expensive + VehicleAge, data = cars_subsample, family = binomial)
tidy(logfit_cars_sub, conf.int = TRUE, exponentiate = TRUE)
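
How do the results compare? With only 10,000 of the roughly 73,000 rows, the point estimates should land in the same ballpark as the full-data fit, while the standard errors grow and the confidence intervals widen. A hypothetical side-by-side comparison using broom and dplyr is sketched below (output not shown):

bind_rows(
  full      = tidy(logfit_cars, conf.int = TRUE, exponentiate = TRUE),
  subsample = tidy(logfit_cars_sub, conf.int = TRUE, exponentiate = TRUE),
  .id = "fit"
) %>% 
  arrange(term, fit)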