Data used

Bike Lanes Dataset: BikeBaltimore is the Department of Transportation’s bike program. The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms

You can Download as a CSV in your current working directory. Note its also available at: http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv

library(readr)
library(tidyverse)
## -- Attaching packages --- tidyverse 1.3.0 --
## <U+2713> ggplot2 3.2.1     <U+2713> dplyr   0.8.3
## <U+2713> tibble  2.1.3     <U+2713> stringr 1.4.0
## <U+2713> tidyr   1.0.0     <U+2713> forcats 0.4.0
## <U+2713> purrr   0.3.3
## -- Conflicts ------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(jhur)
library(tidyverse)
library(broom)

bike = read_csv(
  "http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv")
## Parsed with column specification:
## cols(
##   subType = col_character(),
##   name = col_character(),
##   block = col_character(),
##   type = col_character(),
##   numLanes = col_double(),
##   project = col_character(),
##   route = col_character(),
##   length = col_double(),
##   dateInstalled = col_double()
## )

Part 1

  1. Count the number of rows of the bike data and count the number of complete cases of the bike data. Use sum and complete.cases.
nrow(bike)
## [1] 1631
sum(complete.cases(bike))
## [1] 257
  1. Create a data set called namat which is equal to is.na(bike).
    What is the class of namat? Run rowSums and colSums on namat. These represent the number of missing values in the rows and columns of bike. Don’t print rowSums, but do a table of the rowSums
namat = is.na(bike)
colSums(namat)
##       subType          name         block          type      numLanes 
##             4            12           215             9             0 
##       project         route        length dateInstalled 
##            74          1269             0             0
table(rowSums(namat))
## 
##    0    1    2    3    4 
##  257 1181  182    6    5
  1. Filter rows of bike that are NOT missing the route variable, assign this to the object have_route. Do a table of the subType using table, including the missing subTypes Get the same frequency distribution using group_by(subType) and tally()
have_route = bike %>%  filter(!is.na(route))
dim(have_route)
## [1] 362   9
table(have_route$subType, useNA = "always")
## 
## STRALY STRPRD   <NA> 
##      3    358      1
have_route %>%  group_by(subType) %>%  tally()
## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
have_route %>%  count(subType)
## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
have_route %>% group_by(subType) %>% 
  summarize(n_obs = n())
## # A tibble: 3 x 2
##   subType n_obs
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
tally(   group_by(have_route, subType) )
## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
have_route = group_by(have_route, subType)
tally(have_route)
## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1
  1. Filter rows of bike that have the type SIDEPATH or BIKE LANE using %in%. Call it side_bike. Confirm this gives you the same number of results using the | and ==.
side_bike = bike %>% filter(type %in% c("SIDEPATH", "BIKE LANE"))
side_bike2 = bike %>% filter(type == "SIDEPATH" | type == "BIKE LANE")
identical(side_bike, side_bike2)
## [1] TRUE
nrow(side_bike)
## [1] 628
nrow(side_bike2)
## [1] 628
side_bike = filter(bike,type %in% c("SIDEPATH", "BIKE LANE"))
side_bike2 = filter(bike, type == "SIDEPATH" | type == "BIKE LANE")
identical(side_bike, side_bike2)
## [1] TRUE

Part 2

  1. Do a cross tabulation of the bike type and the number of lanes. Call it tab. Do a prop.table on the rows and columns margins. Try as.data.frame(tab) or broom::tidy(tab)
tab = table(type=bike$type, numLanes=bike$numLanes)
prop.table(tab, 1)
##                  numLanes
## type                       0          1          2
##   BIKE BOULEVARD  0.00000000 0.02040816 0.97959184
##   BIKE LANE       0.03220612 0.66183575 0.30595813
##   CONTRAFLOW      0.00000000 0.53846154 0.46153846
##   SHARED BUS BIKE 0.00000000 1.00000000 0.00000000
##   SHARROW         0.00000000 0.36842105 0.63157895
##   SIDEPATH        0.00000000 0.85714286 0.14285714
##   SIGNED ROUTE    0.00000000 0.69407895 0.30592105
prop.table(tab, 2)
##                  numLanes
## type                        0           1           2
##   BIKE BOULEVARD  0.000000000 0.001121076 0.067605634
##   BIKE LANE       1.000000000 0.460762332 0.267605634
##   CONTRAFLOW      0.000000000 0.007847534 0.008450704
##   SHARED BUS BIKE 0.000000000 0.043721973 0.000000000
##   SHARROW         0.000000000 0.243273543 0.523943662
##   SIDEPATH        0.000000000 0.006726457 0.001408451
##   SIGNED ROUTE    0.000000000 0.236547085 0.130985915
as.data.frame(tab)
##               type numLanes Freq
## 1   BIKE BOULEVARD        0    0
## 2        BIKE LANE        0   20
## 3       CONTRAFLOW        0    0
## 4  SHARED BUS BIKE        0    0
## 5          SHARROW        0    0
## 6         SIDEPATH        0    0
## 7     SIGNED ROUTE        0    0
## 8   BIKE BOULEVARD        1    1
## 9        BIKE LANE        1  411
## 10      CONTRAFLOW        1    7
## 11 SHARED BUS BIKE        1   39
## 12         SHARROW        1  217
## 13        SIDEPATH        1    6
## 14    SIGNED ROUTE        1  211
## 15  BIKE BOULEVARD        2   48
## 16       BIKE LANE        2  190
## 17      CONTRAFLOW        2    6
## 18 SHARED BUS BIKE        2    0
## 19         SHARROW        2  372
## 20        SIDEPATH        2    1
## 21    SIGNED ROUTE        2   93
tidy(tab)
## # A tibble: 21 x 3
##    type            numLanes     n
##    <chr>           <chr>    <int>
##  1 BIKE BOULEVARD  0            0
##  2 BIKE LANE       0           20
##  3 CONTRAFLOW      0            0
##  4 SHARED BUS BIKE 0            0
##  5 SHARROW         0            0
##  6 SIDEPATH        0            0
##  7 SIGNED ROUTE    0            0
##  8 BIKE BOULEVARD  1            1
##  9 BIKE LANE       1          411
## 10 CONTRAFLOW      1            7
## # … with 11 more rows

Part 3

New Data set

Download the “Real Property Taxes” Data from my website (via OpenBaltimore): http://johnmuschelli.com/intro_to_r/data/Real_Property_Taxes.csv.gz note you don’t need to unzip it to read it into R

  1. Read the Property Tax data into R and call it the variable tax
tax = read_csv( "http://johnmuschelli.com/intro_to_r/data/Real_Property_Taxes.csv.gz")
## Parsed with column specification:
## cols(
##   PropertyID = col_character(),
##   Block = col_character(),
##   Lot = col_character(),
##   Ward = col_character(),
##   Sect = col_character(),
##   PropertyAddress = col_character(),
##   LotSize = col_character(),
##   CityTax = col_character(),
##   StateTax = col_character(),
##   ResCode = col_character(),
##   AmountDue = col_character(),
##   AsOfDate = col_character(),
##   Neighborhood = col_character(),
##   PoliceDistrict = col_character(),
##   CouncilDistrict = col_double(),
##   Location = col_character()
## )
  1. How many addresses pay property taxes?
dim(tax)
## [1] 238298     16
nrow(tax)
## [1] 238298
length(tax$PropertyID)
## [1] 238298
sum(is.na(tax$CityTax))
## [1] 19052
sum(!is.na(tax$CityTax))
## [1] 219246
  1. What is the total city and state tax paid?
    You need to remove the $ from the CityTax variable then you need to make it numeric. Try str_replace, but remember $ is “special” and you need fixed() around it.
  1. city
head(tax$CityTax)
## [1] "$3929.50" "$2697.60" "$5604.26" "$1852.35" "$2268.97" "$3903.27"
tax = tax %>% 
  mutate(
    CityTax = str_replace(CityTax, 
      fixed("$"), "") ,
    CityTax = as.numeric(CityTax)
  )

# no piping
tax$CityTax = str_replace(tax$CityTax, fixed("$"), "")
tax$CityTax = as.numeric(tax$CityTax)
  1. state
## useing parse_numbers()
options(digits=12) # so no rounding
tax = tax %>% mutate(StateTax = parse_number(StateTax))
                     
sum(tax$CityTax)
## [1] NA
sum(tax$CityTax, na.rm = TRUE)
## [1] 859700558.94
sum(tax$CityTax, na.rm = TRUE)/1e6
## [1] 859.70055894
sum(tax$StateTax, na.rm = TRUE)
## [1] 43484914.8
sum(tax$StateTax, na.rm = TRUE)/1e6
## [1] 43.4849148
  1. Using table() or group_by and summarize(n()) or tally()
  1. how many observations/properties are in each ward?
table(tax$Ward)
## 
##    01    02    03    04    05    06    07    08    09    10    11    12    13 
##  6619  3547  2541  1731   465  4505  4903 11445 11833  1704  3592  8864  9459 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
##  3113 18655 11286  1454  2310  3833 11927  3181  1719  2567  6116 14841 25704 
##    27    28    50 
## 50158 10219     7
ward_table = tax %>% 
  group_by(Ward) %>% 
  tally()

ward_table = tax %>% 
  group_by(Ward) %>% 
  summarize(number_of_obs = n(),
            mean(StateTax, na.rm = TRUE))
  1. what is the mean state tax per ward? use group_by and summarize
tax %>%   group_by(Ward) %>% 
  summarize(mean_state = mean(StateTax, na.rm = TRUE))
## # A tibble: 29 x 2
##    Ward  mean_state
##    <chr>      <dbl>
##  1 01         301. 
##  2 02         369. 
##  3 03         879. 
##  4 04        2442. 
##  5 05         356. 
##  6 06         178. 
##  7 07          95.9
##  8 08          51.7
##  9 09          86.5
## 10 10          74.1
## # … with 19 more rows
  1. what is the maximum amount still due in each ward? different summarization (max)
tax$AmountDue = tax$AmountDue %>% 
  str_replace(fixed("$"), "") %>%
  as.numeric

tax = tax %>% mutate(
  AmountDue = as.numeric(str_replace(AmountDue, fixed("$"), ""))
)

tax %>% group_by(Ward) %>% 
  summarize(maxDue = max(AmountDue, na.rm = TRUE))
## # A tibble: 29 x 2
##    Ward    maxDue
##    <chr>    <dbl>
##  1 01     627349.
##  2 02    1452842.
##  3 03    1362318.
##  4 04    3688217.
##  5 05     951087.
##  6 06     414725.
##  7 07    1239508.
##  8 08     396504.
##  9 09     155496.
## 10 10     169659.
## # … with 19 more rows
  1. What is the 75th percentile of city and state tax paid by Ward? (quantile)
tax %>% group_by(Ward) %>% 
  summarize(Percentile = quantile(StateTax, prob = 0.75,na.rm = TRUE))
## # A tibble: 29 x 2
##    Ward  Percentile
##    <chr>      <dbl>
##  1 01         320. 
##  2 02         329. 
##  3 03         311. 
##  4 04         582. 
##  5 05         128. 
##  6 06         198. 
##  7 07          43.3
##  8 08          73.7
##  9 09         132. 
## 10 10          82.9
## # … with 19 more rows
ward_table = tax %>% 
  group_by(Ward) %>% 
  summarize(
    number_of_obs = n(),
    mean_state_tax = mean(StateTax, na.rm = TRUE),
    max_amount_due = max(AmountDue, na.rm = TRUE),
    q75_city = quantile(CityTax, probs = 0.75, na.rm = TRUE),
    q75_state = quantile(StateTax, probs = 0.75, na.rm = TRUE)
  )
  1. Make boxplots using showing cityTax (y -variable) by whether the property is a principal residence (x) or not.
tax = tax %>% 
  mutate(ResCode = str_trim(ResCode))

qplot(y = log10(CityTax+1), x = ResCode, data = tax, geom = "boxplot")
## Warning: Removed 19052 rows containing non-finite values (stat_boxplot).

qplot(y = CityTax, x = ResCode, data = tax, geom = "boxplot")
## Warning: Removed 19052 rows containing non-finite values (stat_boxplot).

boxplot(log10(CityTax+1) ~ ResCode, data = tax)

boxplot(CityTax ~ ResCode, data = tax)

tax %>% filter(CityTax == max(CityTax, na.rm = TRUE))
## # A tibble: 1 x 16
##   PropertyID Block Lot   Ward  Sect  PropertyAddress LotSize CityTax StateTax
##   <chr>      <chr> <chr> <chr> <chr> <chr>           <chr>     <dbl>    <dbl>
## 1 0841 001   0841  001   21    090   1525 RUSSELL ST 3.627 …  6.31e6  314619.
## # … with 7 more variables: ResCode <chr>, AmountDue <dbl>, AsOfDate <chr>,
## #   Neighborhood <chr>, PoliceDistrict <chr>, CouncilDistrict <dbl>,
## #   Location <chr>
  1. Subset the data to only retain those houses that are principal residences. which command subsets rows? Filter or select?
  1. How many such houses are there?
pres = tax %>% filter( ResCode %in% "PRINCIPAL RESIDENCE")
pres = tax %>% filter( ResCode == "PRINCIPAL RESIDENCE")
dim(pres)
## [1] 113592     16
  1. Describe the distribution of property taxes on these residences. Use hist with certain breaks or plot(density(variable))
qplot(x = log10(CityTax+1),data = pres, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 525 rows containing non-finite values (stat_bin).

qplot(x = CityTax, data = pres, geom = "density")
## Warning: Removed 525 rows containing non-finite values (stat_density).

hist(log2(pres$CityTax+1))

hist(pres$CityTax)

head(pres$CityTax)
## [1] 2268.97 3903.27 3163.68 2419.59  472.08 2991.35
plot(density(pres$CityTax,  na.rm = TRUE))

New Data Set

Read in the Salary FY2015 dataset

sal = read_csv("http://johnmuschelli.com/intro_to_r/data/Baltimore_City_Employee_Salaries_FY2015.csv")
## Parsed with column specification:
## cols(
##   name = col_character(),
##   JobTitle = col_character(),
##   AgencyID = col_character(),
##   Agency = col_character(),
##   HireDate = col_character(),
##   AnnualSalary = col_character(),
##   GrossPay = col_character()
## )
  1. Make an object called health.sal using the salaries data set, with only agencies of those with "fire" (or any forms), if any, in the name remember fixed( ignore_case = TRUE) will ignore cases
health.sal = sal %>% 
  filter(str_detect(JobTitle, 
                    fixed("fire", ignore_case = TRUE)))
  1. Make a data set called trans which contains only agencies that contain “TRANS”.
trans = sal %>% 
  filter(str_detect(JobTitle, "TRANS"))
  1. What is/are the profession(s) of people who have "abra" in their name for Baltimore’s Salaries? Case should be ignored
sal %>% 
  filter(str_detect(name, fixed("abra", ignore_case = TRUE)))
## # A tibble: 12 x 7
##    name      JobTitle       AgencyID Agency       HireDate AnnualSalary GrossPay
##    <chr>     <chr>          <chr>    <chr>        <chr>    <chr>        <chr>   
##  1 Abraham,… LABORER (Hour… B70357   DPW-Solid W… 10/16/2… $30721.00    $32793.…
##  2 Abraham,… ACCOUNTANT I   A50100   DPW-Water &… 12/31/2… $49573.00    $49104.…
##  3 Abraham,… HOUSING INSPE… A06043   Housing & C… 12/15/1… $50365.00    $50804.…
##  4 Abrahams… POLICE OFFICE… A99416   Police Depa… 01/20/2… $46199.00    $18387.…
##  5 Abrams,M… OFFICE SERVIC… A29008   States Atto… 09/19/2… $35691.00    $34970.…
##  6 Abrams,M… COLLECTIONS R… A14003   FIN-Collect… 08/26/2… $37359.00    $37768.…
##  7 Abrams,T… RECREATION AR… P04001   R&P-Recreat… 06/19/2… $20800.00    $1690.00
##  8 Bey,Abra… PROCUREMENT S… A17001   FIN-Purchas… 04/23/2… $69600.00    $69763.…
##  9 Elgamil,… AUDITOR SUPV   A24001   COMP-Audits… 01/31/1… $94400.00    $95791.…
## 10 Gatto,Ab… POLICE OFFICER A99200   Police Depa… 12/12/2… $66086.00    $73651.…
## 11 Schwartz… GENERAL COUNS… A54005   FPR Admin (… 07/26/1… $110400.00   $111408…
## 12 Velez,Ab… POLICE OFFICE… A99036   Police Depa… 10/24/2… $70282.00    $112957…
  1. What is the distribution of annual salaries look like? (use hist, 20 breaks) What is the IQR? Hint: first convert to numeric. Try str_replace, but remember$ is “special” and you need fixed() around it.
sal = sal %>% mutate(AnnualSalary = str_replace(AnnualSalary, fixed("$"), ""))
sal = sal %>% mutate(AnnualSalary = as.numeric(AnnualSalary))
qplot(x = AnnualSalary, data = sal, geom = "histogram", bins = 20)

hist(sal$AnnualSalary, breaks = 20)

quantile(sal$AnnualSalary)
##     0%    25%    50%    75%   100% 
##    900  33354  48126  68112 238772
  1. Convert HireDate to the Date class - plot Annual Salary vs Hire Date. Use AnnualSalary ~ HireDate with a data = sal argument in plot or use x, y notation in scatter.smooth Use lubridate package. Is it mdy(date) or dmy(date) for this data - look at HireDate
sal = sal %>% mutate(HireDate = lubridate::mdy(HireDate))

q = qplot(y = AnnualSalary, x = HireDate, 
      data = sal, geom = "point")
q + geom_smooth(colour = "red", se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 10 rows containing non-finite values (stat_smooth).
## Warning: Removed 10 rows containing missing values (geom_point).

q + geom_smooth(colour = "red", se = FALSE, method = "loess")
## Warning: Removed 10 rows containing non-finite values (stat_smooth).

## Warning: Removed 10 rows containing missing values (geom_point).

q + geom_smooth(colour = "red", se = FALSE, method = "loess", span = 2/3)
## Warning: Removed 10 rows containing non-finite values (stat_smooth).

## Warning: Removed 10 rows containing missing values (geom_point).

plot(AnnualSalary ~ HireDate, data = sal)