Data used

Bike Lanes Dataset: BikeBaltimore is the Department of Transportation’s bike program. The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms

You can Download as a CSV in your current working directory. Note its also available at: http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv

library(readr)
library(tidyverse)
library(dplyr)
library(lubridate)
library(jhur)
library(tidyverse)
library(broom)

bike = read_csv(
  "http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv")

## Parsed with column specification:
## cols(
##   subType = col_character(),
##   name = col_character(),
##   block = col_character(),
##   type = col_character(),
##   numLanes = col_double(),
##   project = col_character(),
##   route = col_character(),
##   length = col_double(),
##   dateInstalled = col_double()
## )

Part 1

Count the number of rows of the bike data and count the number of complete cases of the bike data. Use sum and complete.cases.

nrow(bike)

## [1] 1631

sum(complete.cases(bike))

## [1] 257

Create a data set called namat which is equal to is.na(bike).
What is the class of namat? Run rowSums and colSums on namat. These represent the number of missing values in the rows and columns of bike. Don’t print rowSums, but do a table of the rowSums

namat = is.na(bike)
colSums(namat)

##       subType          name         block          type      numLanes 
##             4            12           215             9             0 
##       project         route        length dateInstalled 
##            74          1269             0             0

table(rowSums(namat))

## 
##    0    1    2    3    4 
##  257 1181  182    6    5

Filter rows of bike that are NOT missing the route variable, assign this to the object have_route. Do a table of the subType using table, including the missing subTypes Get the same frequency distribution using group_by(subType) and tally()

have_route = bike %>%  filter(!is.na(route))
dim(have_route)

## [1] 362   9

table(have_route$subType, useNA = "always")

## 
## STRALY STRPRD   <NA> 
##      3    358      1

have_route %>%  group_by(subType) %>%  tally()

## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1

have_route %>%  count(subType)

## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1

have_route %>% group_by(subType) %>% 
  summarize(n_obs = n())

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 2
##   subType n_obs
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1

tally(   group_by(have_route, subType) )

## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1

have_route = group_by(have_route, subType)
tally(have_route)

## # A tibble: 3 x 2
##   subType     n
##   <chr>   <int>
## 1 STRALY      3
## 2 STRPRD    358
## 3 <NA>        1

Filter rows of bike that have the type SIDEPATH or BIKE LANE using %in%. Call it side_bike. Confirm this gives you the same number of results using the | and ==.

side_bike = bike %>% filter(type %in% c("SIDEPATH", "BIKE LANE"))
side_bike2 = bike %>% filter(type == "SIDEPATH" | type == "BIKE LANE")
identical(side_bike, side_bike2)

## [1] TRUE

nrow(side_bike)

## [1] 628

nrow(side_bike2)

## [1] 628

side_bike = filter(bike,type %in% c("SIDEPATH", "BIKE LANE"))
side_bike2 = filter(bike, type == "SIDEPATH" | type == "BIKE LANE")
identical(side_bike, side_bike2)

## [1] TRUE

Part 2

Do a cross tabulation of the bike type and the number of lanes. Call it tab. Do a prop.table on the rows and columns margins. Try as.data.frame(tab) or broom::tidy(tab)

tab = table(type=bike$type, numLanes=bike$numLanes)
prop.table(tab, 1)

##                  numLanes
## type                            0               1               2
##   BIKE BOULEVARD  0.0000000000000 0.0204081632653 0.9795918367347
##   BIKE LANE       0.0322061191626 0.6618357487923 0.3059581320451
##   CONTRAFLOW      0.0000000000000 0.5384615384615 0.4615384615385
##   SHARED BUS BIKE 0.0000000000000 1.0000000000000 0.0000000000000
##   SHARROW         0.0000000000000 0.3684210526316 0.6315789473684
##   SIDEPATH        0.0000000000000 0.8571428571429 0.1428571428571
##   SIGNED ROUTE    0.0000000000000 0.6940789473684 0.3059210526316

prop.table(tab, 2)

##                  numLanes
## type                             0                1                2
##   BIKE BOULEVARD  0.00000000000000 0.00112107623318 0.06760563380282
##   BIKE LANE       1.00000000000000 0.46076233183857 0.26760563380282
##   CONTRAFLOW      0.00000000000000 0.00784753363229 0.00845070422535
##   SHARED BUS BIKE 0.00000000000000 0.04372197309417 0.00000000000000
##   SHARROW         0.00000000000000 0.24327354260090 0.52394366197183
##   SIDEPATH        0.00000000000000 0.00672645739910 0.00140845070423
##   SIGNED ROUTE    0.00000000000000 0.23654708520179 0.13098591549296

as.data.frame(tab)

##               type numLanes Freq
## 1   BIKE BOULEVARD        0    0
## 2        BIKE LANE        0   20
## 3       CONTRAFLOW        0    0
## 4  SHARED BUS BIKE        0    0
## 5          SHARROW        0    0
## 6         SIDEPATH        0    0
## 7     SIGNED ROUTE        0    0
## 8   BIKE BOULEVARD        1    1
## 9        BIKE LANE        1  411
## 10      CONTRAFLOW        1    7
## 11 SHARED BUS BIKE        1   39
## 12         SHARROW        1  217
## 13        SIDEPATH        1    6
## 14    SIGNED ROUTE        1  211
## 15  BIKE BOULEVARD        2   48
## 16       BIKE LANE        2  190
## 17      CONTRAFLOW        2    6
## 18 SHARED BUS BIKE        2    0
## 19         SHARROW        2  372
## 20        SIDEPATH        2    1
## 21    SIGNED ROUTE        2   93

tidy(tab)

## # A tibble: 21 x 3
##    type            numLanes     n
##    <chr>           <chr>    <int>
##  1 BIKE BOULEVARD  0            0
##  2 BIKE LANE       0           20
##  3 CONTRAFLOW      0            0
##  4 SHARED BUS BIKE 0            0
##  5 SHARROW         0            0
##  6 SIDEPATH        0            0
##  7 SIGNED ROUTE    0            0
##  8 BIKE BOULEVARD  1            1
##  9 BIKE LANE       1          411
## 10 CONTRAFLOW      1            7
## # … with 11 more rows

Part 3

New Data set

Download the “Real Property Taxes” Data from my website (via OpenBaltimore): http://johnmuschelli.com/intro_to_r/data/Real_Property_Taxes.csv.gz note you don’t need to unzip it to read it into R

Read the Property Tax data into R and call it the variable tax

tax = read_csv( "http://johnmuschelli.com/intro_to_r/data/Real_Property_Taxes.csv.gz")

## Parsed with column specification:
## cols(
##   PropertyID = col_character(),
##   Block = col_character(),
##   Lot = col_character(),
##   Ward = col_character(),
##   Sect = col_character(),
##   PropertyAddress = col_character(),
##   LotSize = col_character(),
##   CityTax = col_character(),
##   StateTax = col_character(),
##   ResCode = col_character(),
##   AmountDue = col_character(),
##   AsOfDate = col_character(),
##   Neighborhood = col_character(),
##   PoliceDistrict = col_character(),
##   CouncilDistrict = col_double(),
##   Location = col_character()
## )

How many addresses pay property taxes?

dim(tax)

## [1] 238298     16

nrow(tax)

## [1] 238298

length(tax$PropertyID)

## [1] 238298

sum(is.na(tax$CityTax))

## [1] 19052

sum(!is.na(tax$CityTax))

## [1] 219246

What is the total city and state tax paid?
You need to remove the $ from the CityTax variable then you need to make it numeric. Try str_replace, but remember $ is “special” and you need fixed() around it.

city

head(tax$CityTax)

## [1] "$3929.50" "$2697.60" "$5604.26" "$1852.35" "$2268.97" "$3903.27"

tax = tax %>% 
  mutate(
    CityTax = str_replace(CityTax, 
      fixed("$"), "") ,
    CityTax = as.numeric(CityTax)
  )

# no piping
tax$CityTax = str_replace(tax$CityTax, fixed("$"), "")
tax$CityTax = as.numeric(tax$CityTax)

state

## useing parse_numbers()
options(digits=12) # so no rounding
tax = tax %>% mutate(StateTax = parse_number(StateTax))
                     
sum(tax$CityTax)

## [1] NA

sum(tax$CityTax, na.rm = TRUE)

## [1] 859700558.94

sum(tax$CityTax, na.rm = TRUE)/1e6

## [1] 859.70055894

sum(tax$StateTax, na.rm = TRUE)

## [1] 43484914.8

sum(tax$StateTax, na.rm = TRUE)/1e6

## [1] 43.4849148

Using table() or group_by and summarize(n()) or tally()

how many observations/properties are in each ward?

table(tax$Ward)

## 
##    01    02    03    04    05    06    07    08    09    10    11    12 
##  6619  3547  2541  1731   465  4505  4903 11445 11833  1704  3592  8864 
##    13    14    15    16    17    18    19    20    21    22    23    24 
##  9459  3113 18655 11286  1454  2310  3833 11927  3181  1719  2567  6116 
##    25    26    27    28    50 
## 14841 25704 50158 10219     7

ward_table = tax %>% 
  group_by(Ward) %>% 
  tally()

ward_table = tax %>% 
  group_by(Ward) %>% 
  summarize(number_of_obs = n(),
            mean(StateTax, na.rm = TRUE))

## `summarise()` ungrouping output (override with `.groups` argument)

what is the mean state tax per ward? use group_by and summarize

tax %>%   group_by(Ward) %>% 
  summarize(mean_state = mean(StateTax, na.rm = TRUE))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 29 x 2
##    Ward  mean_state
##    <chr>      <dbl>
##  1 01         301. 
##  2 02         369. 
##  3 03         879. 
##  4 04        2442. 
##  5 05         356. 
##  6 06         178. 
##  7 07          95.9
##  8 08          51.7
##  9 09          86.5
## 10 10          74.1
## # … with 19 more rows

what is the maximum amount still due in each ward? different summarization (max)

tax$AmountDue = tax$AmountDue %>% 
  str_replace(fixed("$"), "") %>%
  as.numeric

tax = tax %>% mutate(
  AmountDue = as.numeric(str_replace(AmountDue, fixed("$"), ""))
)

tax %>% group_by(Ward) %>% 
  summarize(maxDue = max(AmountDue, na.rm = TRUE))

## Warning in max(AmountDue, na.rm = TRUE): no non-missing arguments to
## max; returning -Inf

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 29 x 2
##    Ward    maxDue
##    <chr>    <dbl>
##  1 01     627349.
##  2 02    1452842.
##  3 03    1362318.
##  4 04    3688217.
##  5 05     951087.
##  6 06     414725.
##  7 07    1239508.
##  8 08     396504.
##  9 09     155496.
## 10 10     169659.
## # … with 19 more rows

What is the 75th percentile of city and state tax paid by Ward? (quantile)

tax %>% group_by(Ward) %>% 
  summarize(Percentile = quantile(StateTax, prob = 0.75,na.rm = TRUE))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 29 x 2
##    Ward  Percentile
##    <chr>      <dbl>
##  1 01         320. 
##  2 02         329. 
##  3 03         311. 
##  4 04         582. 
##  5 05         128. 
##  6 06         198. 
##  7 07          43.3
##  8 08          73.7
##  9 09         132. 
## 10 10          82.9
## # … with 19 more rows

ward_table = tax %>% 
  group_by(Ward) %>% 
  summarize(
    number_of_obs = n(),
    mean_state_tax = mean(StateTax, na.rm = TRUE),
    max_amount_due = max(AmountDue, na.rm = TRUE),
    q75_city = quantile(CityTax, probs = 0.75, na.rm = TRUE),
    q75_state = quantile(StateTax, probs = 0.75, na.rm = TRUE)
  )

## Warning in max(AmountDue, na.rm = TRUE): no non-missing arguments to
## max; returning -Inf

## `summarise()` ungrouping output (override with `.groups` argument)

Make boxplots using showing cityTax (y -variable) by whether the property is a principal residence (x) or not.

tax = tax %>% 
  mutate(ResCode = str_trim(ResCode))

qplot(y = log10(CityTax+1), x = ResCode, data = tax, geom = "boxplot")

## Warning: Removed 19052 rows containing non-finite values (stat_boxplot).

qplot(y = CityTax, x = ResCode, data = tax, geom = "boxplot")

## Warning: Removed 19052 rows containing non-finite values (stat_boxplot).

boxplot(log10(CityTax+1) ~ ResCode, data = tax)

boxplot(CityTax ~ ResCode, data = tax)

tax %>% filter(CityTax == max(CityTax, na.rm = TRUE))

## # A tibble: 1 x 16
##   PropertyID Block Lot   Ward  Sect  PropertyAddress LotSize CityTax
##   <chr>      <chr> <chr> <chr> <chr> <chr>           <chr>     <dbl>
## 1 0841 001   0841  001   21    090   1525 RUSSELL ST 3.627 …  6.31e6
## # … with 8 more variables: StateTax <dbl>, ResCode <chr>,
## #   AmountDue <dbl>, AsOfDate <chr>, Neighborhood <chr>,
## #   PoliceDistrict <chr>, CouncilDistrict <dbl>, Location <chr>

Subset the data to only retain those houses that are principal residences. which command subsets rows? Filter or select?

How many such houses are there?

pres = tax %>% filter( ResCode %in% "PRINCIPAL RESIDENCE")
pres = tax %>% filter( ResCode == "PRINCIPAL RESIDENCE")
dim(pres)

## [1] 113592     16

Describe the distribution of property taxes on these residences. Use hist with certain breaks or plot(density(variable))

qplot(x = log10(CityTax+1),data = pres, geom = "histogram")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 525 rows containing non-finite values (stat_bin).

qplot(x = CityTax, data = pres, geom = "density")

## Warning: Removed 525 rows containing non-finite values (stat_density).

hist(log2(pres$CityTax+1))

hist(pres$CityTax)

head(pres$CityTax)

## [1] 2268.97 3903.27 3163.68 2419.59  472.08 2991.35

plot(density(pres$CityTax,  na.rm = TRUE))

New Data Set

Read in the Salary FY2015 dataset

sal = read_csv("http://johnmuschelli.com/intro_to_r/data/Baltimore_City_Employee_Salaries_FY2015.csv")

## Parsed with column specification:
## cols(
##   name = col_character(),
##   JobTitle = col_character(),
##   AgencyID = col_character(),
##   Agency = col_character(),
##   HireDate = col_character(),
##   AnnualSalary = col_character(),
##   GrossPay = col_character()
## )

Make an object called health.sal using the salaries data set, with only agencies of those with "fire" (or any forms), if any, in the name remember fixed( ignore_case = TRUE) will ignore cases

health.sal = sal %>% 
  filter(str_detect(JobTitle, 
                    fixed("fire", ignore_case = TRUE)))

Make a data set called trans which contains only agencies that contain “TRANS”.

trans = sal %>% 
  filter(str_detect(JobTitle, "TRANS"))

What is/are the profession(s) of people who have "abra" in their name for Baltimore’s Salaries? Case should be ignored

sal %>% 
  filter(str_detect(name, fixed("abra", ignore_case = TRUE)))

## # A tibble: 12 x 7
##    name    JobTitle    AgencyID Agency    HireDate AnnualSalary GrossPay
##    <chr>   <chr>       <chr>    <chr>     <chr>    <chr>        <chr>   
##  1 Abraha… LABORER (H… B70357   DPW-Soli… 10/16/2… $30721.00    $32793.…
##  2 Abraha… ACCOUNTANT… A50100   DPW-Wate… 12/31/2… $49573.00    $49104.…
##  3 Abraha… HOUSING IN… A06043   Housing … 12/15/1… $50365.00    $50804.…
##  4 Abraha… POLICE OFF… A99416   Police D… 01/20/2… $46199.00    $18387.…
##  5 Abrams… OFFICE SER… A29008   States A… 09/19/2… $35691.00    $34970.…
##  6 Abrams… COLLECTION… A14003   FIN-Coll… 08/26/2… $37359.00    $37768.…
##  7 Abrams… RECREATION… P04001   R&P-Recr… 06/19/2… $20800.00    $1690.00
##  8 Bey,Ab… PROCUREMEN… A17001   FIN-Purc… 04/23/2… $69600.00    $69763.…
##  9 Elgami… AUDITOR SU… A24001   COMP-Aud… 01/31/1… $94400.00    $95791.…
## 10 Gatto,… POLICE OFF… A99200   Police D… 12/12/2… $66086.00    $73651.…
## 11 Schwar… GENERAL CO… A54005   FPR Admi… 07/26/1… $110400.00   $111408…
## 12 Velez,… POLICE OFF… A99036   Police D… 10/24/2… $70282.00    $112957…

What is the distribution of annual salaries look like? (use hist, 20 breaks) What is the IQR? Hint: first convert to numeric. Try str_replace, but remember$ is “special” and you need fixed() around it.

sal = sal %>% mutate(AnnualSalary = str_replace(AnnualSalary, fixed("$"), ""))
sal = sal %>% mutate(AnnualSalary = as.numeric(AnnualSalary))
qplot(x = AnnualSalary, data = sal, geom = "histogram", bins = 20)

hist(sal$AnnualSalary, breaks = 20)

quantile(sal$AnnualSalary)

##     0%    25%    50%    75%   100% 
##    900  33354  48126  68112 238772

Convert HireDate to the Date class - plot Annual Salary vs Hire Date. Use AnnualSalary ~ HireDate with a data = sal argument in plot or use x, y notation in scatter.smooth Use lubridate package. Is it mdy(date) or dmy(date) for this data - look at HireDate

sal = sal %>% mutate(HireDate = lubridate::mdy(HireDate))

q = qplot(y = AnnualSalary, x = HireDate, 
      data = sal, geom = "point")
q + geom_smooth(colour = "red", se = FALSE)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 10 rows containing non-finite values (stat_smooth).

## Warning: Removed 10 rows containing missing values (geom_point).

q + geom_smooth(colour = "red", se = FALSE, method = "loess")

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 10 rows containing non-finite values (stat_smooth).

## Warning: Removed 10 rows containing missing values (geom_point).

q + geom_smooth(colour = "red", se = FALSE, method = "loess", span = 2/3)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 10 rows containing non-finite values (stat_smooth).

## Warning: Removed 10 rows containing missing values (geom_point).

plot(AnnualSalary ~ HireDate, data = sal)

scatter.smooth(sal$AnnualSalary, x = sal$HireDate, col = "red")

Create a smaller dataset that only includes the Police Department, Fire Department and Sheriff’s Office. Use the Agency variable with string matching. Call this emer. How many employees are in this new dataset?

emer = sal %>% filter(
  str_detect(Agency, "Sheriff's Office|Police Department|Fire Department")
)

emer = sal %>% filter(
  str_detect(Agency, "Sheriff's Office") |
    str_detect(Agency, "Police Department") |
    str_detect(Agency, "Fire Department")
)

Create a variable called dept in the emer data set. dept = str_extract(Agency, ".*(ment|ice)"). E.g. want to extract all characters up until ment or ice (we can group in regex using parentheses) and then discard the rest. Replot annual salary versus hire date, color by dept (not yet - using ggplot)

emer = emer %>% 
  mutate(
    dept = str_extract(Agency, ".*(ment|ice)")
  )
# Replot annual salary versus hire date, color by dept (not yet - using ggplot)
ggplot(aes(x = HireDate, y = AnnualSalary, 
           colour = dept), data = emer) + 
  geom_point() + theme(legend.position = c(0.5, 0.8))

## Warning: Removed 10 rows containing missing values (geom_point).

19 (Bonus). Convert the ‘LotSize’ variable to a numeric square feet variable in the tax data set. Some tips: a) 1 acre = 43560 square feet b) The hyphens represent inches (not decimals)

tax$LotSize = str_trim(tax$LotSize) # trim to be safe
lot = tax$LotSize # for checking later

First lets take care of acres

aIndex= which(str_detect(tax$LotSize, "AC.*") | 
            str_detect(tax$LotSize, fixed(" %")))
head(aIndex)

## [1] 20 28 63 70 71 76

head(lot[aIndex])

## [1] "0.491 ACRES" "0.025 ACRES" "0.278 ACRES" "0.067 ACRES"
## [5] "0.024 ACRES" "0.271 ACRES"

acre = tax$LotSize[aIndex] # temporary variable
## find and replace character strings
acre = str_replace_all(acre, " AC.*","")
acre = str_replace_all(acre, " %","")
table(!is.na(as.numeric(acre)))

## Warning in table(!is.na(as.numeric(acre))): NAs introduced by coercion

## 
## FALSE  TRUE 
##   236 18559

head(acre[is.na(as.numeric(acre))],50)

## Warning in head(acre[is.na(as.numeric(acre))], 50): NAs introduced by
## coercion

##  [1] "1-1383"         "5-25"           "1.914ACRES"     "2-364"         
##  [5] "3-46"           "3-4"            "2-617"          "2-617"         
##  [9] "2-617"          "4.574ACRES"     "1.423ACRES"     "2.460ACRES"    
## [13] "3-94"           "2-456"          "2-24"           "1-36"          
## [17] "21-8X5.5"       "0.047ACRES"     "0.067ACRES"     "0.045ACRES"    
## [21] "16-509"         "2-657"          "1-379"          "2-158"         
## [25] "1-81"           "13-112"         "2-2"            "IMP ONLY 0.190"
## [29] "26,140"         "0.104ACRES"     "O.084"          "1-566"         
## [33] "28-90"          "O-602"          "3-67"           "4-40"          
## [37] "2-486"          "4-3165"         "O-0768"         "O-8577"        
## [41] "8-956"          "18-425"         "9-265"          "1-371"         
## [45] "3-436"          "2.00ACRES"      "8-974"          "2-36"          
## [49] "1-788"          "1-005"

## lets clean the rest
acre = str_replace_all(acre, "-",".") # hyphen instead of decimal
head(acre[is.na(as.numeric(acre))])

## Warning in head(acre[is.na(as.numeric(acre))]): NAs introduced by
## coercion

## [1] "1.914ACRES" "4.574ACRES" "1.423ACRES" "2.460ACRES" "21.8X5.5"  
## [6] "0.047ACRES"

table(!is.na(as.numeric(acre)))

## Warning in table(!is.na(as.numeric(acre))): NAs introduced by coercion

## 
## FALSE  TRUE 
##    96 18699

acre = str_replace_all(acre, "ACRES","")
head(acre[is.na(as.numeric(acre))])

## Warning in head(acre[is.na(as.numeric(acre))]): NAs introduced by
## coercion

## [1] "21.8X5.5"       "IMP ONLY 0.190" "26,140"         "O.084"         
## [5] "O.602"          "O.0768"

# take care of individual mistakes
acre = str_replace_all(acre, "O","0") # 0 vs O
acre = str_replace_all(acre, "Q","") # Q, oops
acre = str_replace_all(acre, ",.",".") # extra ,
acre = str_replace_all(acre, ",","") # extra ,
acre = str_replace_all(acre, "L","0") # leading L
acre = str_replace_all(acre, "-",".") # hyphen to period
acre[is.na(as.numeric(acre))]

## Warning: NAs introduced by coercion

## [1] "21.8X5.5"       "IMP 0N0Y 0.190" "IMP.0N0Y 3.615"

acre2 = as.numeric(acre)*43560

## Warning: NAs introduced by coercion

sum(is.na(acre2)) # all but 3

## [1] 3

Now let’s convert all of the square feet variables

library(purrr)
fIndex = which(str_detect(tax$LotSize, "X"))

ft = tax$LotSize[fIndex]

ft = str_replace_all(ft, fixed("&"), "-")
ft = str_replace_all(ft, "IMP ONLY ", "")
ft = str_replace_all(ft, "`","1")

ft= map_chr(str_split(ft, " "), first)

## now get the widths and lengths
width = map_chr(str_split(ft,"X"), first)
length = map_chr(str_split(ft,"X"), nth, 2) 

## width
widthFeet = as.numeric(map_chr(str_split(width, "-"), first))

## Warning: NAs introduced by coercion

widthInch = as.numeric(map_chr(str_split(width, "-"),nth,2))/12
widthInch[is.na(widthInch)] = 0 # when no inches present
totalWidth = widthFeet + widthInch # add together

# length
lengthFeet = as.numeric(map_chr(str_split(length, "-"),first))
lengthInch = as.numeric(map_chr(str_split(length, "-",2),nth,2))/12

## Warning: NAs introduced by coercion

lengthInch[is.na(lengthInch)] = 0 # when no inches present
totalLength = lengthFeet + lengthInch

# combine together for square feet
sqrtFt = totalWidth*totalLength 
ft[is.na(sqrtFt)] # what is left?

##  [1] "12-3XX4-2"         "161X"              "Q8X120"           
##  [4] "37-1X-60-10X57-11" "11XX0"             "M2X169-9"         
##  [7] "X134"              "POINT"             "22-11"            
## [10] "ASSESS"            "ASSESSCTY56-2X1-1" "ASSESSCTY53-2X1-1"
## [13] "{1-4X128-6"        "O-7X125"           "13XX5"            
## [16] "O-3X125"           "F7-10X80"          "N1-8X92-6"        
## [19] "POINTX100-5X10"    "]7X100"            "16-11XX5"         
## [22] "Q5X119"

And now we combine everything together:

tax$sqft = rep(NA)
tax$sqft[aIndex] = acre2
tax$sqft[fIndex] = sqrtFt
mean(!is.na(tax$sqft))

## [1] 0.934090928166

# already in square feet, easy!!
sIndex=which(str_detect(tax$LotSize, "FT") | str_detect(tax$LotSize, "S.*F."))
sf = tax$LotSize[sIndex] # subset temporary variable

sqft2 = map_chr(str_split(sf,"( |SQ|SF)"),first)
sqft2 = as.numeric(str_replace_all(sqft2, ",", "")) # remove , and convert

## Warning: NAs introduced by coercion

tax$sqft[sIndex] = sqft2
table(is.na(tax$sqft))

## 
##  FALSE   TRUE 
## 238105    193

## progress!

#what remains?
lot[is.na(tax$sqft)]

##   [1] NA                  "19520819"          "IMP ONLY"         
##   [4] "IMP ONLY"          "IMPROVEMENTS ONLY" "IMPROVEMENTS ONLY"
##   [7] "IMP ONLY"          "IMP ONLY"          NA                 
##  [10] "0.016"             "0.230"             "IMP ONLY"         
##  [13] "0.649"             "ASSESSMENT ONLY"   "IMPROVEMENT ONLY" 
##  [16] "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY"  "0.033"            
##  [19] "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY"  "IMPROVEMENTS ONLY"
##  [22] "IMPROVEMENT ONLY"  "IMP ONLY"          "IMP ONLY 0.190 AC"
##  [25] "IMP ONLY"          "AIR RIGHTS"        "IMP ONLY"         
##  [28] "IMP ONLY"          "IMP ONLY"          "IMP ONLY"         
##  [31] "IMP ONLY"          "13,846 SF"         NA                 
##  [34] NA                  "0.067"             "IMP ONLY"         
##  [37] "IMPROVEMENTS ONLY" "CO-57 (IMP ONLY)"  "IMP ONLY"         
##  [40] "0.858"             "20080211"          "IMPROVEMENT ONLY" 
##  [43] "12-3XX4-2"         "161X 137"          "IMP ONLY"         
##  [46] "0.040"             "0.028"             "IMP ONLY"         
##  [49] "IMP ONLY"          "IMP ONLY"          "IMP ONLY"         
##  [52] "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY" 
##  [55] "IMPROVEMENT ONLY"  "0.681 ARCES"       "IMP ONLY"         
##  [58] "IMP ONLY"          "IMP ONLY"          "IMPROVEMENT ONLY" 
##  [61] "24179D096"         "Q8X120"            "REAR PART 697 S.F"
##  [64] "IMP. ONLY"         "IMPROVEMENT ONLY"  "IMP ONLY"         
##  [67] "37-1X-60-10X57-11" "IMP ONLY"          "68I.0 SQ FT"      
##  [70] "0.024"             "01185"             "11979"            
##  [73] "IMPROVEMENT ONLY"  "196552-8"          "AIR RIGHTS"       
##  [76] "IMPROVEMENT ONLY"  "7,138 SF"          "9,745 SF"         
##  [79] "13,772 SF"         "17,294 SF"         "15,745 SF"        
##  [82] "0.030"             "18, 162 SF"        "0.036 ARES"       
##  [85] "11XX0"             "0.026"             "IMPROVEMENT ONLY" 
##  [88] "IMP ONLY"          "M2X169-9"          "IMPROVEMENT ONLY" 
##  [91] "0.5404%"           "IMPROVEMENTS ONLY" "AIR RIGHTS"       
##  [94] "AIR RIGHTS"        "AIR RIGHTS"        "2381437.2 CUBIC F"
##  [97] "IMPROVEMENT ONLY"  "IMP ONLY"          "AIR RIGHTS ONLY"  
## [100] "X134"              "IMPROVEMENT ONLY"  "AIR RIGHTS"       
## [103] "\\560 S.F."        "1233.04"           NA                 
## [106] NA                  "POINT X 57-9"      "IMPROVEMENTS ONLY"
## [109] "22-11 X57"         "0.013"             "IMP.ONLY 3.615 AC"
## [112] "IMP ONLY"          "IMP ONLY"          "0.129%"           
## [115] "0.129%"            "0.129%"            "ASSESS CTY 84 S.F"
## [118] "0.039"             "1159-0 S.F."       "03281969"         
## [121] "IMPROVEMENT ONLY"  "ASSESS CTY 50 S.F" "ASSESS CTY 57X1-1"
## [124] "IMP ONLY"          "ASSESSCTY56-2X1-1" "ASSESSCTY53-2X1-1"
## [127] "IMPROVEMENT ONLY"  "{1-4X128-6"        "IMPROVEMENT ONLY" 
## [130] "IMPROVEMENT ONLY"  "O-7X125"           NA                 
## [133] "13XX5"             "O-3X125"           "IMPROVEMENT ONLY" 
## [136] "IMP ONLY"          "AIR RIGHTS"        "0.026"            
## [139] "0.129%"            "0.129%"            "IMPROVEMENT ONLY" 
## [142] "F7-10X80"          "ASSESS CTY"        "IMP ONLY"         
## [145] "IMP ONLY"          "IMP ONLY"          "IMP ONLY"         
## [148] "IMPROVEMENTS ONLY" "IMPROVEMENT ONLY"  "0.032"            
## [151] "20050712"          "IMP ONLY"          "IMP ONLY"         
## [154] "0.033"             "IMP.ONLY"          "IMP ONLY"         
## [157] "IMP ONLY"          "IMPROVEMENT ONLY"  "N1-8X92-6"        
## [160] "POINTX100-5X10"    "IMPROVEMENTS ONLY" "]7X100"           
## [163] "07031966"          "13O4 SQ FT"        "05081978"         
## [166] "IMPROVEMENTS ONLY" "AIR RIGHTS"        "AIR RIGHTS"       
## [169] "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY" 
## [172] "IMPROVEMENT ONLY"  "0.191 ARES"        "19815-4"          
## [175] "11`79"             NA                  NA                 
## [178] "IMPROVEMENT ONLY"  "IMP ONLY"          "194415"           
## [181] "IMP ONLY"          "IMPROVEMENT ONLY"  "IMP ONLY"         
## [184] "120-103"           "IMP ONLY"          "IMP. ONLY"        
## [187] "196700"            "IMPROVEMENT ONLY"  "16-11XX5"         
## [190] "Q5X119"            "21210"             "IMPROVEMENT ONLY" 
## [193] "5306120"

Data Cleaning Lab Key

Data used

Part 1

Part 2

Part 3