Read in the following infant mortality data from http://johnmuschelli.com/intro_to_r/data/indicatordeadkids35.csv:
# packages used throughout these solutions (readr/dplyr/tidyr via tidyverse; broom for tidy())
library(tidyverse)
library(broom)
mort = read_csv("http://johnmuschelli.com/intro_to_r/data/indicatordeadkids35.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
or use the read_mortality() helper (from the jhur package used in this course):
mort = read_mortality()
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
Then change the first column name to country.
mort = mort %>%
  rename(country = X1)
or
colnames(mort)[1] = "country"
Compute the correlation between the 1980, 1990, 2000, and 2010 mortality data. No need to save this in an object; just display the result to the screen.
sub = mort %>%
  select(`1980`, `1990`, `2000`, `2010`)
cor(sub)
## 1980 1990 2000 2010
## 1980 1.000000000000 0.960153953723 0.888841117492 NA
## 1990 0.960153953723 1.000000000000 0.961384203917 NA
## 2000 0.888841117492 0.961384203917 1.000000000000 NA
## 2010 NA NA NA 1
cor(sub, use = "complete.obs")
## 1980 1990 2000 2010
## 1980 1.000000000000 0.959684628292 0.887743334542 0.846828397953
## 1990 0.959684628292 1.000000000000 0.961026893827 0.924719217724
## 2000 0.887743334542 0.961026893827 1.000000000000 0.986234456933
## 2010 0.846828397953 0.924719217724 0.986234456933 1.000000000000
psych::corr.test(sub, use = "complete.obs")
## Call:psych::corr.test(x = sub, use = "complete.obs")
## Correlation matrix
## 1980 1990 2000 2010
## 1980 1.00 0.96 0.89 0.85
## 1990 0.96 1.00 0.96 0.92
## 2000 0.89 0.96 1.00 0.99
## 2010 0.85 0.92 0.99 1.00
## Sample Size
## 1980 1990 2000 2010
## 1980 197 197 197 193
## 1990 197 197 197 193
## 2000 197 197 197 193
## 2010 193 193 193 193
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## 1980 1990 2000 2010
## 1980 0 0 0 0
## 1990 0 0 0 0
## 2000 0 0 0 0
## 2010 0 0 0 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
Without the use argument, corr.test defaults to pairwise complete observations, which gives the same results here:
psych::corr.test(sub)
## Call:psych::corr.test(x = sub)
## Correlation matrix
## 1980 1990 2000 2010
## 1980 1.00 0.96 0.89 0.85
## 1990 0.96 1.00 0.96 0.92
## 2000 0.89 0.96 1.00 0.99
## 2010 0.85 0.92 0.99 1.00
## Sample Size
## 1980 1990 2000 2010
## 1980 197 197 197 193
## 1990 197 197 197 193
## 2000 197 197 197 193
## 2010 193 193 193 193
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## 1980 1990 2000 2010
## 1980 0 0 0 0
## 1990 0 0 0 0
## 2000 0 0 0 0
## 2010 0 0 0 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
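The printout above points to the short = FALSE option; a minimal sketch of using it to see the confidence intervals of the correlations (assuming sub from above):
# print the full psych::corr.test output, including confidence intervals
print(psych::corr.test(sub, use = "complete.obs"), short = FALSE)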
cor(mort[,c("1980", "1990", "2000", "2010")])
## 1980 1990 2000 2010
## 1980 1.000000000000 0.960153953723 0.888841117492 NA
## 1990 0.960153953723 1.000000000000 0.961384203917 NA
## 2000 0.888841117492 0.961384203917 1.000000000000 NA
## 2010 NA NA NA 1
What’s going on?! Seems we have NAs in the 2010 column:
summary(mort[,c("1980", "1990", "2000", "2010")])
## 1980 1990 2000
## Min. :0.070490887 Min. :0.049215555 Min. :0.032320019
## 1st Qu.:0.186491646 1st Qu.:0.149422819 1st Qu.:0.093360836
## Median :0.566647329 Median :0.343152280 Median :0.210366768
## Mean :0.947310323 Mean :0.709108004 Mean :0.550039910
## 3rd Qu.:1.478281448 3rd Qu.:0.995794215 3rd Qu.:0.710028877
## Max. :3.490352139 Max. :3.289831218 Max. :2.811858292
##
## 2010
## Min. :0.031731875
## 1st Qu.:0.081694736
## Median :0.143392163
## Mean :0.396007581
## 3rd Qu.:0.499190608
## Max. :2.028572693
## NA's :4
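The use argument of cor() controls how those NAs are handled. A small toy sketch with made-up numbers contrasting the two common choices: "complete.obs" drops any row containing an NA before computing anything, while "pairwise.complete.obs" drops rows separately for each pair of columns.
# toy data: column c has one missing value
toy = cbind(a = c(1, 2, 3, 4, 5),
            b = c(2, 4, 5, 4, 7),
            c = c(1, NA, 3, 5, 4))
cor(toy, use = "complete.obs")           # row 2 dropped for every pair
cor(toy, use = "pairwise.complete.obs")  # row 2 still used for the a-b pair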
Both of the following are equivalent ways to get the complete-case correlations:
cor(mort[,c("1980", "1990", "2000", "2010")], use = "complete.obs")
## 1980 1990 2000 2010
## 1980 1.000000000000 0.959684628292 0.887743334542 0.846828397953
## 1990 0.959684628292 1.000000000000 0.961026893827 0.924719217724
## 2000 0.887743334542 0.961026893827 1.000000000000 0.986234456933
## 2010 0.846828397953 0.924719217724 0.986234456933 1.000000000000
cor(mort %>% select("1980", "1990", "2000", "2010"), use = "complete.obs")
## 1980 1990 2000 2010
## 1980 1.000000000000 0.959684628292 0.887743334542 0.846828397953
## 1990 0.959684628292 1.000000000000 0.961026893827 0.924719217724
## 2000 0.887743334542 0.961026893827 1.000000000000 0.986234456933
## 2010 0.846828397953 0.924719217724 0.986234456933 1.000000000000
Compute the correlation between the Myanmar, China, and United States mortality data. Store this correlation matrix in an object called country_cor.
mort_subs = mort %>%
  filter(country %in% c("China", "Myanmar", "United States")) %>%
  arrange(country)
# reshape to long format: one row per country-year, dropping missing years
long = mort_subs %>%
  gather(key = "year", value = death, -country) %>%
  filter(!is.na(death))
# then spread back out so each country becomes a column
long = long %>%
  spread(key = country, value = death)
sub = long %>%
  select(Myanmar, China, `United States`)
cor(sub)
## Myanmar China United States
## Myanmar 1.000000000000 0.974364554315 0.629262462521
## China 0.974364554315 1.000000000000 0.690926321344
## United States 0.629262462521 0.690926321344 1.000000000000
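gather() and spread() have since been superseded; if your tidyr is version 1.0 or newer, here is a sketch of the same reshape with the replacement verbs (long2 is just an illustrative name):
long2 = mort_subs %>%
  pivot_longer(-country, names_to = "year", values_to = "death") %>%  # wide to long
  filter(!is.na(death)) %>%                                          # drop missing years
  pivot_wider(names_from = country, values_from = death)             # one column per country
cor(long2 %>% select(Myanmar, China, `United States`))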
Alternatively, skip the reshaping: drop the country column, transpose so the countries become columns, and store the result in country_cor:
mort_subs = mort_subs %>%
  select(-country)
mort_subs = t(mort_subs)
country_cor = cor(mort_subs,
                  use = "complete.obs")
Run the following to see that the order is China, Myanmar, US:
mort %>% filter(country %in% c("Myanmar", "China", "United States"))
## # A tibble: 3 x 255
## country `1760` `1761` `1762` `1763` `1764` `1765` `1766` `1767` `1768`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 China NA NA NA NA NA NA NA NA NA
## 2 Myanmar NA NA NA NA NA NA NA NA NA
## 3 United… NA NA NA NA NA NA NA NA NA
## # … with 245 more variables: `1769` <dbl>, `1770` <dbl>, `1771` <dbl>,
## #   `1772` <dbl>, `1773` <dbl>, `1774` <dbl>, …
Extract the Myanmar-US correlation. Since the matrix is ordered China, Myanmar, US, the Myanmar-US entry is row 2, column 3:
country_cor[2,3]
## [1] 0.629262462521
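Positional indexing is fragile; a hedged alternative is to carry the country names onto the matrix so entries can be pulled by name (mort_subs2, mat, and country_cor2 are illustrative names):
mort_subs2 = mort %>%
  filter(country %in% c("China", "Myanmar", "United States")) %>%
  arrange(country)
mat = t(mort_subs2 %>% select(-country))   # years as rows, countries as columns
colnames(mat) = mort_subs2$country         # label the columns with the country names
country_cor2 = cor(mat, use = "complete.obs")
country_cor2["Myanmar", "United States"]   # same value as country_cor[2,3]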
Is there a difference between mortality in 1990 and in 2000? Run a t-test and a Wilcoxon rank sum test to assess this. Hint: to extract the column of information for 1990, use mort$"1990".
t.test(mort$"1990", mort$"2000")
##
## Welch Two Sample t-test
##
## data: mort$"1990" and mort$"2000"
## t = 2.146049034, df = 386.2310023, p-value = 0.0324916449
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.0133362118521 0.3047999761377
## sample estimates:
## mean of x mean of y
## 0.709108003990 0.550039909995
Backticks work the same as quotes for these nonstandard column names:
t.test(mort$`1990`, mort$`2000`)
##
## Welch Two Sample t-test
##
## data: mort$`1990` and mort$`2000`
## t = 2.146049034, df = 386.2310023, p-value = 0.0324916449
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.0133362118521 0.3047999761377
## sample estimates:
## mean of x mean of y
## 0.709108003990 0.550039909995
Because the same countries are measured in both years, a paired test is more appropriate:
t.test(mort$`1990`, mort$`2000`, paired = TRUE)
##
## Paired t-test
##
## data: mort$`1990` and mort$`2000`
## t = 10.02534013, df = 196, p-value < 2.220446e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.127776911751 0.190359276239
## sample estimates:
## mean of the differences
## 0.159068093995
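As a check, a paired t-test is the same as a one-sample t-test on the per-country differences; this call should reproduce the results above:
# one-sample t-test on the 1990 - 2000 differences
t.test(mort$`1990` - mort$`2000`)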
wilcox.test(mort$"1990", mort$"2000")
##
## Wilcoxon rank sum test with continuity correction
##
## data: mort$"1990" and mort$"2000"
## W = 23005.5, p-value = 0.00144462911
## alternative hypothesis: true location shift is not equal to 0
Again, the paired (signed rank) version makes sense here:
wilcox.test(mort$"1990", mort$"2000", paired = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: mort$"1990" and mort$"2000"
## V = 17997, p-value < 2.220446e-16
## alternative hypothesis: true location shift is not equal to 0
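One reason to run the rank-based Wilcoxon tests alongside the t-tests is that these mortality values are right-skewed (the means sit well above the medians in the summary output earlier); a quick sketch to eyeball this:
# histograms of the two years; the long right tails motivate a rank-based test
hist(mort$`1990`, breaks = 30, main = "1990 mortality", xlab = "mortality")
hist(mort$`2000`, breaks = 30, main = "2000 mortality", xlab = "mortality")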
Read in the Kaggle cars auction dataset from http://johnmuschelli.com/intro_to_r/data/kaggleCarAuction.csv with the argument col_types = cols(VehBCost = col_double()):
cars = read_csv("http://johnmuschelli.com/intro_to_r/data/kaggleCarAuction.csv",
                col_types = cols(VehBCost = col_double()))
Fit a linear regression model with vehicle cost (VehBCost) as the outcome, and vehicle age (VehicleAge) and whether it’s an online sale (IsOnlineSale) as predictors, as well as their interaction. Save the model fit in an object called lmfit_cars and display the summary table.
lmfit_cars = lm(VehBCost ~ VehicleAge + IsOnlineSale + VehicleAge*IsOnlineSale, data = cars)
Or, since * expands to the main effects plus their interaction, the shorter form is equivalent:
lmfit_cars = lm(VehBCost ~ VehicleAge*IsOnlineSale, data = cars)
tidy(lmfit_cars)
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 8063. 16.6 486. 0
## 2 VehicleAge -321. 3.67 -87.4 0
## 3 IsOnlineSale 514. 108. 4.77 0.00000180
## 4 VehicleAge:IsOnlineSale -55.4 25.6 -2.17 0.0303
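To see what the interaction term implies, one option is to predict cost over a small hypothetical grid of ages and sale types (newdata here is made up for illustration):
# predicted vehicle cost at ages 1 and 5, for offline (0) and online (1) sales
newdata = expand.grid(VehicleAge = c(1, 5), IsOnlineSale = c(0, 1))
cbind(newdata, predicted = predict(lmfit_cars, newdata = newdata))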
Create a variable called expensive in the cars data that indicates if the vehicle cost is over $10,000. Use a chi-squared test to assess if there is a relationship between a car being expensive and it being labeled as a “bad buy” (IsBadBuy).
cars = cars %>%
  mutate(expensive = VehBCost > 10000)
tab = table(cars$expensive, cars$IsBadBuy)
chisq.test(tab)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tab
## X-squared = 1.815247312, df = 1, p-value = 0.177880045
fisher.test(tab)
##
## Fisher's Exact Test for Count Data
##
## data: tab
## p-value = 0.178978818
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.815642760873 1.036273536773
## sample estimates:
## odds ratio
## 0.920542562973
prop.test(tab)
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: tab
## X-squared = 1.815247312, df = 1, p-value = 0.177880045
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.02085354859625 0.00350610520518
## sample estimates:
## prop 1 prop 2
## 0.876676596776 0.885350318471
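All three tests above start from the same 2x2 table; printing it and its row proportions makes the (small) difference between groups concrete:
tab
# proportion of bad buys within the not-expensive and expensive groups
prop.table(tab, margin = 1)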
Fit a logistic regression model where the outcome is “bad buy” status (IsBadBuy) and the predictors are expensive status and vehicle age (VehicleAge). Save the model fit in an object called logfit_cars and display the summary table. Use summary, or tidy(logfit_cars, conf.int = TRUE, exponentiate = TRUE) for odds ratios, or tidy(logfit_cars, conf.int = TRUE, exponentiate = FALSE) for log odds ratios.
All three of the following calls specify the same binomial model:
logfit_cars = glm(IsBadBuy ~ expensive + VehicleAge, data = cars, family = binomial)
logfit_cars = glm(IsBadBuy ~ expensive + VehicleAge, data = cars, family = binomial())
logfit_cars = glm(IsBadBuy ~ expensive + VehicleAge, data = cars, family = "binomial")
summary(logfit_cars)
##
## Call:
## glm(formula = IsBadBuy ~ expensive + VehicleAge, family = "binomial",
## data = cars)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.909091727 -0.548922874 -0.479968473 -0.364839574 2.485900048
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.24948976307 0.03324440927 -97.74545 < 2e-16 ***
## expensiveTRUE -0.08038741797 0.06069562592 -1.32444 0.18536
## VehicleAge 0.28660451302 0.00647588104 44.25722 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 54421.30728 on 72982 degrees of freedom
## Residual deviance: 52441.78751 on 72980 degrees of freedom
## AIC: 52447.78751
##
## Number of Fisher Scoring iterations: 5
tidy(logfit_cars, conf.int = TRUE, exponentiate = TRUE)
## # A tibble: 3 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.0388 0.0332 -97.7 0 0.0363 0.0414
## 2 expensiveTRUE 0.923 0.0607 -1.32 0.185 0.818 1.04
## 3 VehicleAge 1.33 0.00648 44.3 0 1.32 1.35
tidy(logfit_cars, conf.int = TRUE, exponentiate = FALSE)
## # A tibble: 3 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -3.25 0.0332 -97.7 0 -3.31 -3.18
## 2 expensiveTRUE -0.0804 0.0607 -1.32 0.185 -0.201 0.0370
## 3 VehicleAge 0.287 0.00648 44.3 0 0.274 0.299
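As a sanity check, exponentiating the log-odds-scale coefficients should reproduce the odds ratios in the first tidy table:
# exponentiate the raw coefficients to get odds ratios
exp(coef(logfit_cars))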
Randomly sample 10,000 observations (rows) with replacement from the cars dataset and store this new dataset in an object called cars_subsample. Hint: use the sample_n function.
set.seed(1)
cars_subsample = cars %>%
  sample_n(size = 10000, replace = TRUE)
or another way
rows = sample(nrow(cars), size = 10000, replace = TRUE)
cars_subsample = cars[rows,]
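If your dplyr is version 1.0 or newer, slice_sample() supersedes sample_n(); a sketch (cars_subsample2 is an illustrative name), re-seeding so the draw is reproducible:
set.seed(1)
cars_subsample2 = cars %>%
  slice_sample(n = 10000, replace = TRUE)  # sample rows with replacement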
Fit the same logistic regression model on the subsampled data and display the summary table using the tidy function. How do the results compare? Call this model logfit_cars_sub:
logfit_cars_sub = glm(IsBadBuy ~ expensive + VehicleAge, data = cars_subsample, family = binomial)
summary(logfit_cars_sub)
tidy(logfit_cars_sub, conf.int = TRUE, exponentiate = TRUE)
With only 10,000 resampled rows instead of the full 72,983, the coefficient estimates should land close to those from logfit_cars, but the standard errors and confidence intervals will be wider because of the smaller sample size.