In this assignment, we will be working with a dataset from the “Kaggle” website, which hosts competitions for prediction and machine learning. More details on this dataset are here: https://www.kaggle.com/c/DontGetKicked/overview/background.
## you can add more, or change...these are suggestions
library(tidyverse)
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
read_csv()
and name the dataset “cars”.cars = read_csv(
"http://johnmuschelli.com/intro_to_r/data/kaggleCarAuction.csv",
col_types = cols(VehBCost = col_double())
)
read_tsv()
function and name it “key”.key = read_tsv("http://johnmuschelli.com/intro_to_r/data/Carvana_Data_Dictionary_formatted.txt")
readLines()
function.dict = readLines("http://johnmuschelli.com/intro_to_r/data/Carvana_Data_Dictionary.txt")
dict = dict[ !dict %in% "" ]
dict = strsplit(dict, split = "\t")
dict = sapply(dict, function(x) {
if (length(x) == 1) {
# split on long spaces
x = strsplit(x, split = " ")[[1]]
}
x = trimws(x)
x = x[ !x %in% ""]
x
})
dict = t(dict)
colnames(dict) = dict[1,]
dict = dict[-1,]
dict = tibble::as_data_frame(dict)
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
save()
function.save(cars, key, file = "kaggle.rda")
dim(cars)
## [1] 72983 34
nrow(cars)
## [1] 72983
There are 72983 observations in the dataset.
range(cars$VehYear)
## [1] 2001 2010
table(cars$VehYear)
##
## 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
## 1481 3405 6227 10207 15489 17043 11423 6885 822 1
summarize()
and filter()
functions or sum()
.sum(cars$VehYear < 2004)
## [1] 11113
mean(cars$VehYear < 2004)
## [1] 0.1522683
table(cars$VehYear < 2004)
##
## FALSE TRUE
## 61870 11113
filter()
removes missing values.sum(cars$VehBCost <= 1500 | is.na(cars$VehBCost))
## [1] 3
cars = filter(cars, VehBCost > 1500)
nrow(cars)
## [1] 72980
There are 72980 after filtering by price.
table()
or group_by()
with tally()
or summarize()
.length(unique(cars$Make)) # a
## [1] 33
length(table(cars$Make))
## [1] 33
length(unique(cars$Model)) # b
## [1] 1063
length(unique(cars$Size)) # c
## [1] 13
group_by()
with summarize()
. Be sure to use the key to find the column that corresponds to the aquisition cost paid for the vehicle at time of purchase!cars %>% group_by(Make) %>%
summarize(mean = mean(VehBCost)) %>%
arrange(desc(mean))
## # A tibble: 33 x 2
## Make mean
## <chr> <dbl>
## 1 LEXUS 13023.
## 2 HUMMER 11920
## 3 CADILLAC 10958.
## 4 INFINITI 9844.
## 5 VOLVO 9756.
## 6 ACURA 9039.
## 7 SUBARU 8958.
## 8 MINI 8560.
## 9 HONDA 8350.
## 10 GMC 8342.
## # … with 23 more rows
cars %>% group_by(Model) %>%
summarize(mean = mean(VehBCost)) %>%
arrange(desc(mean))
## # A tibble: 1,063 x 2
## Model mean
## <chr> <dbl>
## 1 ESCALADE 2WD 45469
## 2 GX470 4WD 38785
## 3 RX400H AWD 33140
## 4 GS450H 28560
## 5 TOUAREG V8 28180
## 6 FJ CRUISER 4WD V6 20380
## 7 HIGHLANDER 4WD V6 16892.
## 8 M35 15419.
## 9 4 RUNNER 4WD V6 15300.
## 10 4 RUNNER 2WD V6 15148.
## # … with 1,053 more rows
cars %>% group_by(Color) %>%
summarize(mean = mean(VehBCost)) %>%
arrange(desc(mean))
## # A tibble: 17 x 2
## Color mean
## <chr> <dbl>
## 1 BLACK 7146.
## 2 GREY 7093.
## 3 BROWN 6886.
## 4 OTHER 6845.
## 5 BEIGE 6760.
## 6 BLUE 6719.
## 7 RED 6718.
## 8 ORANGE 6695.
## 9 WHITE 6650.
## 10 MAROON 6636.
## 11 SILVER 6610.
## 12 NOT AVAIL 6514.
## 13 GREEN 6487.
## 14 GOLD 6375.
## 15 PURPLE 6304.
## 16 YELLOW 6299.
## 17 NULL 4891.
cars %>% group_by(Make) %>%
summarize(var = var(VehBCost)) %>%
arrange(desc(var))
## # A tibble: 33 x 2
## Make var
## <chr> <dbl>
## 1 LEXUS 68614545.
## 2 CADILLAC 39008786.
## 3 INFINITI 8240801.
## 4 LINCOLN 4523891.
## 5 VOLKSWAGEN 4358504.
## 6 FORD 3824550.
## 7 DODGE 3650695.
## 8 TOYOTA 3646264.
## 9 MERCURY 3198687.
## 10 NISSAN 2803932.
## # … with 23 more rows
cars %>% group_by(Model) %>%
summarize(var = var(VehBCost)) %>%
arrange(desc(var))
## # A tibble: 1,063 x 2
## Model var
## <chr> <dbl>
## 1 HIGHLANDER 4WD V6 164749479.
## 2 M35 43817437.
## 3 RX400H AWD 22378050
## 4 2500 SILVERADO PICKU 19220000
## 5 4 RUNNER 2WD V6 19189012.
## 6 F150 PICKUP 4WD V8 16749639.
## 7 2500 RAM PICKUP 4WD 8224008.
## 8 G35 7127987.
## 9 GS300 3.0L I6 EFI 6275149.
## 10 COMMANDER 4WD V8 6030312.
## # … with 1,053 more rows
cars %>% group_by(Color) %>%
summarize(var = var(VehBCost)) %>%
arrange(desc(var))
## # A tibble: 17 x 2
## Color var
## <chr> <dbl>
## 1 MAROON 3388012.
## 2 BLACK 3302015.
## 3 OTHER 3257129.
## 4 GREY 3245465.
## 5 RED 3228092.
## 6 WHITE 3114782.
## 7 BROWN 3064809.
## 8 BEIGE 3032798.
## 9 SILVER 3025037.
## 10 GREEN 2984072.
## 11 YELLOW 2963724.
## 12 GOLD 2936305.
## 13 BLUE 2802224.
## 14 PURPLE 2645876.
## 15 NOT AVAIL 2637445.
## 16 ORANGE 2182411.
## 17 NULL 1027184.
filter()
or sum()
):
sum(cars$Color == "RED" & cars$VehOdo < 30000)
## [1] 28
# b. Are made by GMC and were purchased in Texas?
sum(cars$Make == "GMC" & cars$VNST == "TX")
## [1] 137
# c. Are blue or red?
sum(cars$Color == "BLUE" | cars$Color == "RED" )
## [1] 16603
sum(cars$Color %in% c("BLUE","RED"))
## [1] 16603
# d. Are made by Chrysler or Nissan and are white or silver?
sum((cars$Make == "CHRYSLER" | cars$Make=="NISSAN") & (cars$Color == "WHITE" | cars$Color == "SILVER" ))
## [1] 4335
sum(cars$Make %in% c("CHRYSLER","NISSAN") &
cars$Color %in% c("WHITE","SILVER" ))
## [1] 4335
# e. Are automatic, blue, Pontiac cars with under 40,000 miles?
sum(cars$Transmission == "AUTO" & cars$Color=="BLUE" &
cars$Make == "PONTIAC" & cars$VehOdo < 40000)
## [1] 1