Data used

Bike Lanes Dataset: BikeBaltimore is the Department of Transportation’s bike program. The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms

You can Download as a CSV in your current working directory. Note its also available at: http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv

library(readr)
library(tidyverse)
library(dplyr)
library(lubridate)
library(jhur)
library(tidyverse)
library(broom)

bike = read_csv(
  "http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv")
## Parsed with column specification:
## cols(
##   subType = col_character(),
##   name = col_character(),
##   block = col_character(),
##   type = col_character(),
##   numLanes = col_double(),
##   project = col_character(),
##   route = col_character(),
##   length = col_double(),
##   dateInstalled = col_double()
## )

Part 1

  1. Count the number of rows of the bike data and count the number of complete cases of the bike data. Use sum and complete.cases.
nrow(bike)
## [1] 1631
sum(complete.cases(bike))
## [1] 257
  1. Create a data set called namat which is equal to is.na(bike).
    What is the class of namat? Run rowSums and colSums on namat. These represent the number of missing values in the rows and columns of bike. Don’t print rowSums, but do a table of the rowSums
namat = is.na(bike)
colSums(namat)
##       subType          name         block          type      numLanes 
##             4            12           215             9             0 
##       project         route        length dateInstalled 
##            74          1269             0             0
table(rowSums(namat))
## 
##    0    1    2    3    4 
##  257 1181  182    6    5
  1. Filter rows of bike that are NOT missing the route variable, assign this to the object have_route. Do a table of the subType using table, including the missing subTypes Get the same frequency distribution using group_by(subType) and tally()
have_route = bike %>%  filter(!is.na(route))
dim(have_route)
## [1] 362   9
table(have_route$subType, useNA = "always")
## 
## STRALY STRPRD   <NA> 
##      3    358      1
have_route %>%  group_by(subType) %>%  tally()
## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
have_route %>%  count(subType)
## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
have_route %>% group_by(subType) %>% 
  summarize(n_obs = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
##   subType n_obs
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
tally(   group_by(have_route, subType) )
## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
have_route = group_by(have_route, subType)
tally(have_route)
## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
  1. Filter rows of bike that have the type SIDEPATH or BIKE LANE using %in%. Call it side_bike. Confirm this gives you the same number of results using the | and ==.
side_bike = bike %>% filter(type %in% c("SIDEPATH", "BIKE LANE"))
side_bike2 = bike %>% filter(type == "SIDEPATH" | type == "BIKE LANE")
identical(side_bike, side_bike2)
## [1] TRUE
nrow(side_bike)
## [1] 628
nrow(side_bike2)
## [1] 628
side_bike = filter(bike,type %in% c("SIDEPATH", "BIKE LANE"))
side_bike2 = filter(bike, type == "SIDEPATH" | type == "BIKE LANE")
identical(side_bike, side_bike2)
## [1] TRUE

Part 2

  1. Do a cross tabulation of the bike type and the number of lanes. Call it tab. Do a prop.table on the rows and columns margins. Try as.data.frame(tab) or broom::tidy(tab)
tab = table(type=bike$type, numLanes=bike$numLanes)
prop.table(tab, 1)
##                  numLanes
## type                            0               1               2
##   BIKE BOULEVARD  0.0000000000000 0.0204081632653 0.9795918367347
##   BIKE LANE       0.0322061191626 0.6618357487923 0.3059581320451
##   CONTRAFLOW      0.0000000000000 0.5384615384615 0.4615384615385
##   SHARED BUS BIKE 0.0000000000000 1.0000000000000 0.0000000000000
##   SHARROW         0.0000000000000 0.3684210526316 0.6315789473684
##   SIDEPATH        0.0000000000000 0.8571428571429 0.1428571428571
##   SIGNED ROUTE    0.0000000000000 0.6940789473684 0.3059210526316
prop.table(tab, 2)
##                  numLanes
## type                             0                1                2
##   BIKE BOULEVARD  0.00000000000000 0.00112107623318 0.06760563380282
##   BIKE LANE       1.00000000000000 0.46076233183857 0.26760563380282
##   CONTRAFLOW      0.00000000000000 0.00784753363229 0.00845070422535
##   SHARED BUS BIKE 0.00000000000000 0.04372197309417 0.00000000000000
##   SHARROW         0.00000000000000 0.24327354260090 0.52394366197183
##   SIDEPATH        0.00000000000000 0.00672645739910 0.00140845070423
##   SIGNED ROUTE    0.00000000000000 0.23654708520179 0.13098591549296
as.data.frame(tab)
##               type numLanes Freq
## 1   BIKE BOULEVARD        0    0
## 2        BIKE LANE        0   20
## 3       CONTRAFLOW        0    0
## 4  SHARED BUS BIKE        0    0
## 5          SHARROW        0    0
## 6         SIDEPATH        0    0
## 7     SIGNED ROUTE        0    0
## 8   BIKE BOULEVARD        1    1
## 9        BIKE LANE        1  411
## 10      CONTRAFLOW        1    7
## 11 SHARED BUS BIKE        1   39
## 12         SHARROW        1  217
## 13        SIDEPATH        1    6
## 14    SIGNED ROUTE        1  211
## 15  BIKE BOULEVARD        2   48
## 16       BIKE LANE        2  190
## 17      CONTRAFLOW        2    6
## 18 SHARED BUS BIKE        2    0
## 19         SHARROW        2  372
## 20        SIDEPATH        2    1
## 21    SIGNED ROUTE        2   93
tidy(tab)
## # A tibble: 21 x 3
##    type            numLanes     n
##    <chr>           <chr>    <int>
##  1 BIKE BOULEVARD  0            0
##  2 BIKE LANE       0           20
##  3 CONTRAFLOW      0            0
##  4 SHARED BUS BIKE 0            0
##  5 SHARROW         0            0
##  6 SIDEPATH        0            0
##  7 SIGNED ROUTE    0            0
##  8 BIKE BOULEVARD  1            1
##  9 BIKE LANE       1          411
## 10 CONTRAFLOW      1            7
## # … with 11 more rows

Part 3

New Data set

Download the “Real Property Taxes” Data from my website (via OpenBaltimore): http://johnmuschelli.com/intro_to_r/data/Real_Property_Taxes.csv.gz note you don’t need to unzip it to read it into R

  1. Read the Property Tax data into R and call it the variable tax
tax = read_csv( "http://johnmuschelli.com/intro_to_r/data/Real_Property_Taxes.csv.gz")
## Parsed with column specification:
## cols(
##   PropertyID = col_character(),
##   Block = col_character(),
##   Lot = col_character(),
##   Ward = col_character(),
##   Sect = col_character(),
##   PropertyAddress = col_character(),
##   LotSize = col_character(),
##   CityTax = col_character(),
##   StateTax = col_character(),
##   ResCode = col_character(),
##   AmountDue = col_character(),
##   AsOfDate = col_character(),
##   Neighborhood = col_character(),
##   PoliceDistrict = col_character(),
##   CouncilDistrict = col_double(),
##   Location = col_character()
## )
  1. How many addresses pay property taxes?
dim(tax)
## [1] 238298     16
nrow(tax)
## [1] 238298
length(tax$PropertyID)
## [1] 238298
sum(is.na(tax$CityTax))
## [1] 19052
sum(!is.na(tax$CityTax))
## [1] 219246
  1. What is the total city and state tax paid?
    You need to remove the $ from the CityTax variable then you need to make it numeric. Try str_replace, but remember $ is “special” and you need fixed() around it.
  1. city
head(tax$CityTax)
## [1] "$3929.50" "$2697.60" "$5604.26" "$1852.35" "$2268.97" "$3903.27"
tax = tax %>% 
  mutate(
    CityTax = str_replace(CityTax, 
      fixed("$"), "") ,
    CityTax = as.numeric(CityTax)
  )

# no piping
tax$CityTax = str_replace(tax$CityTax, fixed("$"), "")
tax$CityTax = as.numeric(tax$CityTax)
  1. state
## useing parse_numbers()
options(digits=12) # so no rounding
tax = tax %>% mutate(StateTax = parse_number(StateTax))
                     
sum(tax$CityTax)
## [1] NA
sum(tax$CityTax, na.rm = TRUE)
## [1] 859700558.94
sum(tax$CityTax, na.rm = TRUE)/1e6
## [1] 859.70055894
sum(tax$StateTax, na.rm = TRUE)
## [1] 43484914.8
sum(tax$StateTax, na.rm = TRUE)/1e6
## [1] 43.4849148
  1. Using table() or group_by and summarize(n()) or tally()
  1. how many observations/properties are in each ward?
table(tax$Ward)
## 
##    01    02    03    04    05    06    07    08    09    10    11    12 
##  6619  3547  2541  1731   465  4505  4903 11445 11833  1704  3592  8864 
##    13    14    15    16    17    18    19    20    21    22    23    24 
##  9459  3113 18655 11286  1454  2310  3833 11927  3181  1719  2567  6116 
##    25    26    27    28    50 
## 14841 25704 50158 10219     7
ward_table = tax %>% 
  group_by(Ward) %>% 
  tally()

ward_table = tax %>% 
  group_by(Ward) %>% 
  summarize(number_of_obs = n(),
            mean(StateTax, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
  1. what is the mean state tax per ward? use group_by and summarize
tax %>%   group_by(Ward) %>% 
  summarize(mean_state = mean(StateTax, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 29 x 2
##    Ward  mean_state
##    <chr>      <dbl>
##  1 01         301. 
##  2 02         369. 
##  3 03         879. 
##  4 04        2442. 
##  5 05         356. 
##  6 06         178. 
##  7 07          95.9
##  8 08          51.7
##  9 09          86.5
## 10 10          74.1
## # … with 19 more rows
  1. what is the maximum amount still due in each ward? different summarization (max)
tax$AmountDue = tax$AmountDue %>% 
  str_replace(fixed("$"), "") %>%
  as.numeric

tax = tax %>% mutate(
  AmountDue = as.numeric(str_replace(AmountDue, fixed("$"), ""))
)

tax %>% group_by(Ward) %>% 
  summarize(maxDue = max(AmountDue, na.rm = TRUE))
## Warning in max(AmountDue, na.rm = TRUE): no non-missing arguments to
## max; returning -Inf
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 29 x 2
##    Ward    maxDue
##    <chr>    <dbl>
##  1 01     627349.
##  2 02    1452842.
##  3 03    1362318.
##  4 04    3688217.
##  5 05     951087.
##  6 06     414725.
##  7 07    1239508.
##  8 08     396504.
##  9 09     155496.
## 10 10     169659.
## # … with 19 more rows
  1. What is the 75th percentile of city and state tax paid by Ward? (quantile)
tax %>% group_by(Ward) %>% 
  summarize(Percentile = quantile(StateTax, prob = 0.75,na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 29 x 2
##    Ward  Percentile
##    <chr>      <dbl>
##  1 01         320. 
##  2 02         329. 
##  3 03         311. 
##  4 04         582. 
##  5 05         128. 
##  6 06         198. 
##  7 07          43.3
##  8 08          73.7
##  9 09         132. 
## 10 10          82.9
## # … with 19 more rows
ward_table = tax %>% 
  group_by(Ward) %>% 
  summarize(
    number_of_obs = n(),
    mean_state_tax = mean(StateTax, na.rm = TRUE),
    max_amount_due = max(AmountDue, na.rm = TRUE),
    q75_city = quantile(CityTax, probs = 0.75, na.rm = TRUE),
    q75_state = quantile(StateTax, probs = 0.75, na.rm = TRUE)
  )
## Warning in max(AmountDue, na.rm = TRUE): no non-missing arguments to
## max; returning -Inf
## `summarise()` ungrouping output (override with `.groups` argument)
  1. Make boxplots using showing cityTax (y -variable) by whether the property is a principal residence (x) or not.
tax = tax %>% 
  mutate(ResCode = str_trim(ResCode))

qplot(y = log10(CityTax+1), x = ResCode, data = tax, geom = "boxplot")
## Warning: Removed 19052 rows containing non-finite values (stat_boxplot).

qplot(y = CityTax, x = ResCode, data = tax, geom = "boxplot")
## Warning: Removed 19052 rows containing non-finite values (stat_boxplot).

boxplot(log10(CityTax+1) ~ ResCode, data = tax)

boxplot(CityTax ~ ResCode, data = tax)

tax %>% filter(CityTax == max(CityTax, na.rm = TRUE))
## # A tibble: 1 x 16
##   PropertyID Block Lot   Ward  Sect  PropertyAddress LotSize CityTax
##   <chr>      <chr> <chr> <chr> <chr> <chr>           <chr>     <dbl>
## 1 0841 001   0841  001   21    090   1525 RUSSELL ST 3.627 …  6.31e6
## # … with 8 more variables: StateTax <dbl>, ResCode <chr>,
## #   AmountDue <dbl>, AsOfDate <chr>, Neighborhood <chr>,
## #   PoliceDistrict <chr>, CouncilDistrict <dbl>, Location <chr>
  1. Subset the data to only retain those houses that are principal residences. which command subsets rows? Filter or select?
  1. How many such houses are there?
pres = tax %>% filter( ResCode %in% "PRINCIPAL RESIDENCE")
pres = tax %>% filter( ResCode == "PRINCIPAL RESIDENCE")
dim(pres)
## [1] 113592     16
  1. Describe the distribution of property taxes on these residences. Use hist with certain breaks or plot(density(variable))
qplot(x = log10(CityTax+1),data = pres, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 525 rows containing non-finite values (stat_bin).

qplot(x = CityTax, data = pres, geom = "density")
## Warning: Removed 525 rows containing non-finite values (stat_density).