Bike Lanes Dataset: BikeBaltimore is the Department of Transportation’s bike program. The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms
You can Download as a CSV in your current working directory. Note its also available at: http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv
library(readr)
library(tidyverse)
library(dplyr)
library(lubridate)
library(jhur)
library(tidyverse)
library(broom)
bike = read_csv(
"http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv")
## Parsed with column specification:
## cols(
## subType = col_character(),
## name = col_character(),
## block = col_character(),
## type = col_character(),
## numLanes = col_double(),
## project = col_character(),
## route = col_character(),
## length = col_double(),
## dateInstalled = col_double()
## )
sum
and complete.cases
.nrow(bike)
## [1] 1631
sum(complete.cases(bike))
## [1] 257
namat
which is equal to is.na(bike
).namat
? Run rowSums and colSums on namat.
These represent the number of missing values in the rows and columns of bike.
Don’t print rowSums
, but do a table of the rowSums
namat = is.na(bike)
colSums(namat)
## subType name block type numLanes
## 4 12 215 9 0
## project route length dateInstalled
## 74 1269 0 0
table(rowSums(namat))
##
## 0 1 2 3 4
## 257 1181 182 6 5
route
variable, assign this to the object have_route.
Do a table of the subType
using table
, including the missing subType
s Get the same frequency distribution using group_by(subType)
and tally()
have_route = bike %>% filter(!is.na(route))
dim(have_route)
## [1] 362 9
table(have_route$subType, useNA = "always")
##
## STRALY STRPRD <NA>
## 3 358 1
have_route %>% group_by(subType) %>% tally()
## # A tibble: 3 x 2
## subType n
## <chr> <int>
## 1 STRALY 3
## 2 STRPRD 358
## 3 <NA> 1
have_route %>% count(subType)
## # A tibble: 3 x 2
## subType n
## <chr> <int>
## 1 STRALY 3
## 2 STRPRD 358
## 3 <NA> 1
have_route %>% group_by(subType) %>%
summarize(n_obs = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## subType n_obs
## <chr> <int>
## 1 STRALY 3
## 2 STRPRD 358
## 3 <NA> 1
tally( group_by(have_route, subType) )
## # A tibble: 3 x 2
## subType n
## <chr> <int>
## 1 STRALY 3
## 2 STRPRD 358
## 3 <NA> 1
have_route = group_by(have_route, subType)
tally(have_route)
## # A tibble: 3 x 2
## subType n
## <chr> <int>
## 1 STRALY 3
## 2 STRPRD 358
## 3 <NA> 1
SIDEPATH
or BIKE LANE
using %in%
. Call it side_bike.
Confirm this gives you the same number of results using the |
and ==
.side_bike = bike %>% filter(type %in% c("SIDEPATH", "BIKE LANE"))
side_bike2 = bike %>% filter(type == "SIDEPATH" | type == "BIKE LANE")
identical(side_bike, side_bike2)
## [1] TRUE
nrow(side_bike)
## [1] 628
nrow(side_bike2)
## [1] 628
side_bike = filter(bike,type %in% c("SIDEPATH", "BIKE LANE"))
side_bike2 = filter(bike, type == "SIDEPATH" | type == "BIKE LANE")
identical(side_bike, side_bike2)
## [1] TRUE
type
and the number of lanes. Call it tab
. Do a prop.table
on the rows and columns margins. Try as.data.frame(tab)
or broom::tidy(tab)
tab = table(type=bike$type, numLanes=bike$numLanes)
prop.table(tab, 1)
## numLanes
## type 0 1 2
## BIKE BOULEVARD 0.0000000000000 0.0204081632653 0.9795918367347
## BIKE LANE 0.0322061191626 0.6618357487923 0.3059581320451
## CONTRAFLOW 0.0000000000000 0.5384615384615 0.4615384615385
## SHARED BUS BIKE 0.0000000000000 1.0000000000000 0.0000000000000
## SHARROW 0.0000000000000 0.3684210526316 0.6315789473684
## SIDEPATH 0.0000000000000 0.8571428571429 0.1428571428571
## SIGNED ROUTE 0.0000000000000 0.6940789473684 0.3059210526316
prop.table(tab, 2)
## numLanes
## type 0 1 2
## BIKE BOULEVARD 0.00000000000000 0.00112107623318 0.06760563380282
## BIKE LANE 1.00000000000000 0.46076233183857 0.26760563380282
## CONTRAFLOW 0.00000000000000 0.00784753363229 0.00845070422535
## SHARED BUS BIKE 0.00000000000000 0.04372197309417 0.00000000000000
## SHARROW 0.00000000000000 0.24327354260090 0.52394366197183
## SIDEPATH 0.00000000000000 0.00672645739910 0.00140845070423
## SIGNED ROUTE 0.00000000000000 0.23654708520179 0.13098591549296
as.data.frame(tab)
## type numLanes Freq
## 1 BIKE BOULEVARD 0 0
## 2 BIKE LANE 0 20
## 3 CONTRAFLOW 0 0
## 4 SHARED BUS BIKE 0 0
## 5 SHARROW 0 0
## 6 SIDEPATH 0 0
## 7 SIGNED ROUTE 0 0
## 8 BIKE BOULEVARD 1 1
## 9 BIKE LANE 1 411
## 10 CONTRAFLOW 1 7
## 11 SHARED BUS BIKE 1 39
## 12 SHARROW 1 217
## 13 SIDEPATH 1 6
## 14 SIGNED ROUTE 1 211
## 15 BIKE BOULEVARD 2 48
## 16 BIKE LANE 2 190
## 17 CONTRAFLOW 2 6
## 18 SHARED BUS BIKE 2 0
## 19 SHARROW 2 372
## 20 SIDEPATH 2 1
## 21 SIGNED ROUTE 2 93
tidy(tab)
## # A tibble: 21 x 3
## type numLanes n
## <chr> <chr> <int>
## 1 BIKE BOULEVARD 0 0
## 2 BIKE LANE 0 20
## 3 CONTRAFLOW 0 0
## 4 SHARED BUS BIKE 0 0
## 5 SHARROW 0 0
## 6 SIDEPATH 0 0
## 7 SIGNED ROUTE 0 0
## 8 BIKE BOULEVARD 1 1
## 9 BIKE LANE 1 411
## 10 CONTRAFLOW 1 7
## # … with 11 more rows
New Data set
Download the “Real Property Taxes” Data from my website (via OpenBaltimore): http://johnmuschelli.com/intro_to_r/data/Real_Property_Taxes.csv.gz note you don’t need to unzip it to read it into R
tax
tax = read_csv( "http://johnmuschelli.com/intro_to_r/data/Real_Property_Taxes.csv.gz")
## Parsed with column specification:
## cols(
## PropertyID = col_character(),
## Block = col_character(),
## Lot = col_character(),
## Ward = col_character(),
## Sect = col_character(),
## PropertyAddress = col_character(),
## LotSize = col_character(),
## CityTax = col_character(),
## StateTax = col_character(),
## ResCode = col_character(),
## AmountDue = col_character(),
## AsOfDate = col_character(),
## Neighborhood = col_character(),
## PoliceDistrict = col_character(),
## CouncilDistrict = col_double(),
## Location = col_character()
## )
dim(tax)
## [1] 238298 16
nrow(tax)
## [1] 238298
length(tax$PropertyID)
## [1] 238298
sum(is.na(tax$CityTax))
## [1] 19052
sum(!is.na(tax$CityTax))
## [1] 219246
$
from the CityTax
variable then you need to make it numeric. Try str_replace
, but remember $ is “special” and you need fixed()
around it.head(tax$CityTax)
## [1] "$3929.50" "$2697.60" "$5604.26" "$1852.35" "$2268.97" "$3903.27"
tax = tax %>%
mutate(
CityTax = str_replace(CityTax,
fixed("$"), "") ,
CityTax = as.numeric(CityTax)
)
# no piping
tax$CityTax = str_replace(tax$CityTax, fixed("$"), "")
tax$CityTax = as.numeric(tax$CityTax)
## useing parse_numbers()
options(digits=12) # so no rounding
tax = tax %>% mutate(StateTax = parse_number(StateTax))
sum(tax$CityTax)
## [1] NA
sum(tax$CityTax, na.rm = TRUE)
## [1] 859700558.94
sum(tax$CityTax, na.rm = TRUE)/1e6
## [1] 859.70055894
sum(tax$StateTax, na.rm = TRUE)
## [1] 43484914.8
sum(tax$StateTax, na.rm = TRUE)/1e6
## [1] 43.4849148
table()
or group_by
and summarize(n())
or tally()
table(tax$Ward)
##
## 01 02 03 04 05 06 07 08 09 10 11 12
## 6619 3547 2541 1731 465 4505 4903 11445 11833 1704 3592 8864
## 13 14 15 16 17 18 19 20 21 22 23 24
## 9459 3113 18655 11286 1454 2310 3833 11927 3181 1719 2567 6116
## 25 26 27 28 50
## 14841 25704 50158 10219 7
ward_table = tax %>%
group_by(Ward) %>%
tally()
ward_table = tax %>%
group_by(Ward) %>%
summarize(number_of_obs = n(),
mean(StateTax, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
group_by
and summarize
tax %>% group_by(Ward) %>%
summarize(mean_state = mean(StateTax, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 29 x 2
## Ward mean_state
## <chr> <dbl>
## 1 01 301.
## 2 02 369.
## 3 03 879.
## 4 04 2442.
## 5 05 356.
## 6 06 178.
## 7 07 95.9
## 8 08 51.7
## 9 09 86.5
## 10 10 74.1
## # … with 19 more rows
max
)tax$AmountDue = tax$AmountDue %>%
str_replace(fixed("$"), "") %>%
as.numeric
tax = tax %>% mutate(
AmountDue = as.numeric(str_replace(AmountDue, fixed("$"), ""))
)
tax %>% group_by(Ward) %>%
summarize(maxDue = max(AmountDue, na.rm = TRUE))
## Warning in max(AmountDue, na.rm = TRUE): no non-missing arguments to
## max; returning -Inf
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 29 x 2
## Ward maxDue
## <chr> <dbl>
## 1 01 627349.
## 2 02 1452842.
## 3 03 1362318.
## 4 04 3688217.
## 5 05 951087.
## 6 06 414725.
## 7 07 1239508.
## 8 08 396504.
## 9 09 155496.
## 10 10 169659.
## # … with 19 more rows
quantile
)tax %>% group_by(Ward) %>%
summarize(Percentile = quantile(StateTax, prob = 0.75,na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 29 x 2
## Ward Percentile
## <chr> <dbl>
## 1 01 320.
## 2 02 329.
## 3 03 311.
## 4 04 582.
## 5 05 128.
## 6 06 198.
## 7 07 43.3
## 8 08 73.7
## 9 09 132.
## 10 10 82.9
## # … with 19 more rows
ward_table = tax %>%
group_by(Ward) %>%
summarize(
number_of_obs = n(),
mean_state_tax = mean(StateTax, na.rm = TRUE),
max_amount_due = max(AmountDue, na.rm = TRUE),
q75_city = quantile(CityTax, probs = 0.75, na.rm = TRUE),
q75_state = quantile(StateTax, probs = 0.75, na.rm = TRUE)
)
## Warning in max(AmountDue, na.rm = TRUE): no non-missing arguments to
## max; returning -Inf
## `summarise()` ungrouping output (override with `.groups` argument)
cityTax
(y
-variable) by whether the property is a principal residence (x
) or not.tax = tax %>%
mutate(ResCode = str_trim(ResCode))
qplot(y = log10(CityTax+1), x = ResCode, data = tax, geom = "boxplot")
## Warning: Removed 19052 rows containing non-finite values (stat_boxplot).
qplot(y = CityTax, x = ResCode, data = tax, geom = "boxplot")
## Warning: Removed 19052 rows containing non-finite values (stat_boxplot).
boxplot(log10(CityTax+1) ~ ResCode, data = tax)
boxplot(CityTax ~ ResCode, data = tax)
tax %>% filter(CityTax == max(CityTax, na.rm = TRUE))
## # A tibble: 1 x 16
## PropertyID Block Lot Ward Sect PropertyAddress LotSize CityTax
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 0841 001 0841 001 21 090 1525 RUSSELL ST 3.627 … 6.31e6
## # … with 8 more variables: StateTax <dbl>, ResCode <chr>,
## # AmountDue <dbl>, AsOfDate <chr>, Neighborhood <chr>,
## # PoliceDistrict <chr>, CouncilDistrict <dbl>, Location <chr>
pres = tax %>% filter( ResCode %in% "PRINCIPAL RESIDENCE")
pres = tax %>% filter( ResCode == "PRINCIPAL RESIDENCE")
dim(pres)
## [1] 113592 16
plot(density(variable))
qplot(x = log10(CityTax+1),data = pres, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 525 rows containing non-finite values (stat_bin).
qplot(x = CityTax, data = pres, geom = "density")
## Warning: Removed 525 rows containing non-finite values (stat_density).
hist(log2(pres$CityTax+1))
hist(pres$CityTax)
head(pres$CityTax)
## [1] 2268.97 3903.27 3163.68 2419.59 472.08 2991.35
plot(density(pres$CityTax, na.rm = TRUE))
New Data Set
Read in the Salary FY2015 dataset
sal = read_csv("http://johnmuschelli.com/intro_to_r/data/Baltimore_City_Employee_Salaries_FY2015.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## JobTitle = col_character(),
## AgencyID = col_character(),
## Agency = col_character(),
## HireDate = col_character(),
## AnnualSalary = col_character(),
## GrossPay = col_character()
## )
"fire"
(or any forms), if any, in the name remember fixed( ignore_case = TRUE)
will ignore caseshealth.sal = sal %>%
filter(str_detect(JobTitle,
fixed("fire", ignore_case = TRUE)))
trans
which contains only agencies that contain “TRANS”.trans = sal %>%
filter(str_detect(JobTitle, "TRANS"))
"abra"
in their name for Baltimore’s Salaries? Case should be ignoredsal %>%
filter(str_detect(name, fixed("abra", ignore_case = TRUE)))
## # A tibble: 12 x 7
## name JobTitle AgencyID Agency HireDate AnnualSalary GrossPay
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Abraha… LABORER (H… B70357 DPW-Soli… 10/16/2… $30721.00 $32793.…
## 2 Abraha… ACCOUNTANT… A50100 DPW-Wate… 12/31/2… $49573.00 $49104.…
## 3 Abraha… HOUSING IN… A06043 Housing … 12/15/1… $50365.00 $50804.…
## 4 Abraha… POLICE OFF… A99416 Police D… 01/20/2… $46199.00 $18387.…
## 5 Abrams… OFFICE SER… A29008 States A… 09/19/2… $35691.00 $34970.…
## 6 Abrams… COLLECTION… A14003 FIN-Coll… 08/26/2… $37359.00 $37768.…
## 7 Abrams… RECREATION… P04001 R&P-Recr… 06/19/2… $20800.00 $1690.00
## 8 Bey,Ab… PROCUREMEN… A17001 FIN-Purc… 04/23/2… $69600.00 $69763.…
## 9 Elgami… AUDITOR SU… A24001 COMP-Aud… 01/31/1… $94400.00 $95791.…
## 10 Gatto,… POLICE OFF… A99200 Police D… 12/12/2… $66086.00 $73651.…
## 11 Schwar… GENERAL CO… A54005 FPR Admi… 07/26/1… $110400.00 $111408…
## 12 Velez,… POLICE OFF… A99036 Police D… 10/24/2… $70282.00 $112957…
hist
, 20 breaks) What is the IQR? Hint: first convert to numeric. Try str_replace
, but remember$
is “special” and you need fixed()
around it.sal = sal %>% mutate(AnnualSalary = str_replace(AnnualSalary, fixed("$"), ""))
sal = sal %>% mutate(AnnualSalary = as.numeric(AnnualSalary))
qplot(x = AnnualSalary, data = sal, geom = "histogram", bins = 20)
hist(sal$AnnualSalary, breaks = 20)
quantile(sal$AnnualSalary)
## 0% 25% 50% 75% 100%
## 900 33354 48126 68112 238772
HireDate
to the Date
class - plot Annual Salary vs Hire Date. Use AnnualSalary ~ HireDate
with a data = sal
argument in plot or use x, y notation in scatter.smooth
Use lubridate
package. Is it mdy(date)
or dmy(date)
for this data - look at HireDate
sal = sal %>% mutate(HireDate = lubridate::mdy(HireDate))
q = qplot(y = AnnualSalary, x = HireDate,
data = sal, geom = "point")
q + geom_smooth(colour = "red", se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 10 rows containing non-finite values (stat_smooth).
## Warning: Removed 10 rows containing missing values (geom_point).
q + geom_smooth(colour = "red", se = FALSE, method = "loess")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 10 rows containing non-finite values (stat_smooth).
## Warning: Removed 10 rows containing missing values (geom_point).
q + geom_smooth(colour = "red", se = FALSE, method = "loess", span = 2/3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 10 rows containing non-finite values (stat_smooth).
## Warning: Removed 10 rows containing missing values (geom_point).
plot(AnnualSalary ~ HireDate, data = sal)
scatter.smooth(sal$AnnualSalary, x = sal$HireDate, col = "red")
emer
. How many employees are in this new dataset?emer = sal %>% filter(
str_detect(Agency, "Sheriff's Office|Police Department|Fire Department")
)
emer = sal %>% filter(
str_detect(Agency, "Sheriff's Office") |
str_detect(Agency, "Police Department") |
str_detect(Agency, "Fire Department")
)
dept
in the emer data set. dept = str_extract(Agency, ".*(ment|ice)")
. E.g. want to extract all characters up until ment
or ice
(we can group in regex
using parentheses) and then discard the rest. Replot annual salary versus hire date, color by dept
(not yet - using ggplot)emer = emer %>%
mutate(
dept = str_extract(Agency, ".*(ment|ice)")
)
# Replot annual salary versus hire date, color by dept (not yet - using ggplot)
ggplot(aes(x = HireDate, y = AnnualSalary,
colour = dept), data = emer) +
geom_point() + theme(legend.position = c(0.5, 0.8))
## Warning: Removed 10 rows containing missing values (geom_point).
19 (Bonus). Convert the ‘LotSize’ variable to a numeric square feet variable in the tax data set. Some tips: a) 1 acre = 43560 square feet b) The hyphens represent inches (not decimals)
tax$LotSize = str_trim(tax$LotSize) # trim to be safe
lot = tax$LotSize # for checking later
First lets take care of acres
aIndex= which(str_detect(tax$LotSize, "AC.*") |
str_detect(tax$LotSize, fixed(" %")))
head(aIndex)
## [1] 20 28 63 70 71 76
head(lot[aIndex])
## [1] "0.491 ACRES" "0.025 ACRES" "0.278 ACRES" "0.067 ACRES"
## [5] "0.024 ACRES" "0.271 ACRES"
acre = tax$LotSize[aIndex] # temporary variable
## find and replace character strings
acre = str_replace_all(acre, " AC.*","")
acre = str_replace_all(acre, " %","")
table(!is.na(as.numeric(acre)))
## Warning in table(!is.na(as.numeric(acre))): NAs introduced by coercion
##
## FALSE TRUE
## 236 18559
head(acre[is.na(as.numeric(acre))],50)
## Warning in head(acre[is.na(as.numeric(acre))], 50): NAs introduced by
## coercion
## [1] "1-1383" "5-25" "1.914ACRES" "2-364"
## [5] "3-46" "3-4" "2-617" "2-617"
## [9] "2-617" "4.574ACRES" "1.423ACRES" "2.460ACRES"
## [13] "3-94" "2-456" "2-24" "1-36"
## [17] "21-8X5.5" "0.047ACRES" "0.067ACRES" "0.045ACRES"
## [21] "16-509" "2-657" "1-379" "2-158"
## [25] "1-81" "13-112" "2-2" "IMP ONLY 0.190"
## [29] "26,140" "0.104ACRES" "O.084" "1-566"
## [33] "28-90" "O-602" "3-67" "4-40"
## [37] "2-486" "4-3165" "O-0768" "O-8577"
## [41] "8-956" "18-425" "9-265" "1-371"
## [45] "3-436" "2.00ACRES" "8-974" "2-36"
## [49] "1-788" "1-005"
## lets clean the rest
acre = str_replace_all(acre, "-",".") # hyphen instead of decimal
head(acre[is.na(as.numeric(acre))])
## Warning in head(acre[is.na(as.numeric(acre))]): NAs introduced by
## coercion
## [1] "1.914ACRES" "4.574ACRES" "1.423ACRES" "2.460ACRES" "21.8X5.5"
## [6] "0.047ACRES"
table(!is.na(as.numeric(acre)))
## Warning in table(!is.na(as.numeric(acre))): NAs introduced by coercion
##
## FALSE TRUE
## 96 18699
acre = str_replace_all(acre, "ACRES","")
head(acre[is.na(as.numeric(acre))])
## Warning in head(acre[is.na(as.numeric(acre))]): NAs introduced by
## coercion
## [1] "21.8X5.5" "IMP ONLY 0.190" "26,140" "O.084"
## [5] "O.602" "O.0768"
# take care of individual mistakes
acre = str_replace_all(acre, "O","0") # 0 vs O
acre = str_replace_all(acre, "Q","") # Q, oops
acre = str_replace_all(acre, ",.",".") # extra ,
acre = str_replace_all(acre, ",","") # extra ,
acre = str_replace_all(acre, "L","0") # leading L
acre = str_replace_all(acre, "-",".") # hyphen to period
acre[is.na(as.numeric(acre))]
## Warning: NAs introduced by coercion
## [1] "21.8X5.5" "IMP 0N0Y 0.190" "IMP.0N0Y 3.615"
acre2 = as.numeric(acre)*43560
## Warning: NAs introduced by coercion
sum(is.na(acre2)) # all but 3
## [1] 3
Now let’s convert all of the square feet variables
library(purrr)
fIndex = which(str_detect(tax$LotSize, "X"))
ft = tax$LotSize[fIndex]
ft = str_replace_all(ft, fixed("&"), "-")
ft = str_replace_all(ft, "IMP ONLY ", "")
ft = str_replace_all(ft, "`","1")
ft= map_chr(str_split(ft, " "), first)
## now get the widths and lengths
width = map_chr(str_split(ft,"X"), first)
length = map_chr(str_split(ft,"X"), nth, 2)
## width
widthFeet = as.numeric(map_chr(str_split(width, "-"), first))
## Warning: NAs introduced by coercion
widthInch = as.numeric(map_chr(str_split(width, "-"),nth,2))/12
widthInch[is.na(widthInch)] = 0 # when no inches present
totalWidth = widthFeet + widthInch # add together
# length
lengthFeet = as.numeric(map_chr(str_split(length, "-"),first))
lengthInch = as.numeric(map_chr(str_split(length, "-",2),nth,2))/12
## Warning: NAs introduced by coercion
lengthInch[is.na(lengthInch)] = 0 # when no inches present
totalLength = lengthFeet + lengthInch
# combine together for square feet
sqrtFt = totalWidth*totalLength
ft[is.na(sqrtFt)] # what is left?
## [1] "12-3XX4-2" "161X" "Q8X120"
## [4] "37-1X-60-10X57-11" "11XX0" "M2X169-9"
## [7] "X134" "POINT" "22-11"
## [10] "ASSESS" "ASSESSCTY56-2X1-1" "ASSESSCTY53-2X1-1"
## [13] "{1-4X128-6" "O-7X125" "13XX5"
## [16] "O-3X125" "F7-10X80" "N1-8X92-6"
## [19] "POINTX100-5X10" "]7X100" "16-11XX5"
## [22] "Q5X119"
And now we combine everything together:
tax$sqft = rep(NA)
tax$sqft[aIndex] = acre2
tax$sqft[fIndex] = sqrtFt
mean(!is.na(tax$sqft))
## [1] 0.934090928166
# already in square feet, easy!!
sIndex=which(str_detect(tax$LotSize, "FT") | str_detect(tax$LotSize, "S.*F."))
sf = tax$LotSize[sIndex] # subset temporary variable
sqft2 = map_chr(str_split(sf,"( |SQ|SF)"),first)
sqft2 = as.numeric(str_replace_all(sqft2, ",", "")) # remove , and convert
## Warning: NAs introduced by coercion
tax$sqft[sIndex] = sqft2
table(is.na(tax$sqft))
##
## FALSE TRUE
## 238105 193
## progress!
#what remains?
lot[is.na(tax$sqft)]
## [1] NA "19520819" "IMP ONLY"
## [4] "IMP ONLY" "IMPROVEMENTS ONLY" "IMPROVEMENTS ONLY"
## [7] "IMP ONLY" "IMP ONLY" NA
## [10] "0.016" "0.230" "IMP ONLY"
## [13] "0.649" "ASSESSMENT ONLY" "IMPROVEMENT ONLY"
## [16] "IMPROVEMENT ONLY" "IMPROVEMENT ONLY" "0.033"
## [19] "IMPROVEMENT ONLY" "IMPROVEMENT ONLY" "IMPROVEMENTS ONLY"
## [22] "IMPROVEMENT ONLY" "IMP ONLY" "IMP ONLY 0.190 AC"
## [25] "IMP ONLY" "AIR RIGHTS" "IMP ONLY"
## [28] "IMP ONLY" "IMP ONLY" "IMP ONLY"
## [31] "IMP ONLY" "13,846 SF" NA
## [34] NA "0.067" "IMP ONLY"
## [37] "IMPROVEMENTS ONLY" "CO-57 (IMP ONLY)" "IMP ONLY"
## [40] "0.858" "20080211" "IMPROVEMENT ONLY"
## [43] "12-3XX4-2" "161X 137" "IMP ONLY"
## [46] "0.040" "0.028" "IMP ONLY"
## [49] "IMP ONLY" "IMP ONLY" "IMP ONLY"
## [52] "IMPROVEMENT ONLY" "IMPROVEMENT ONLY" "IMPROVEMENT ONLY"
## [55] "IMPROVEMENT ONLY" "0.681 ARCES" "IMP ONLY"
## [58] "IMP ONLY" "IMP ONLY" "IMPROVEMENT ONLY"
## [61] "24179D096" "Q8X120" "REAR PART 697 S.F"
## [64] "IMP. ONLY" "IMPROVEMENT ONLY" "IMP ONLY"
## [67] "37-1X-60-10X57-11" "IMP ONLY" "68I.0 SQ FT"
## [70] "0.024" "01185" "11979"
## [73] "IMPROVEMENT ONLY" "196552-8" "AIR RIGHTS"
## [76] "IMPROVEMENT ONLY" "7,138 SF" "9,745 SF"
## [79] "13,772 SF" "17,294 SF" "15,745 SF"
## [82] "0.030" "18, 162 SF" "0.036 ARES"
## [85] "11XX0" "0.026" "IMPROVEMENT ONLY"
## [88] "IMP ONLY" "M2X169-9" "IMPROVEMENT ONLY"
## [91] "0.5404%" "IMPROVEMENTS ONLY" "AIR RIGHTS"
## [94] "AIR RIGHTS" "AIR RIGHTS" "2381437.2 CUBIC F"
## [97] "IMPROVEMENT ONLY" "IMP ONLY" "AIR RIGHTS ONLY"
## [100] "X134" "IMPROVEMENT ONLY" "AIR RIGHTS"
## [103] "\\560 S.F." "1233.04" NA
## [106] NA "POINT X 57-9" "IMPROVEMENTS ONLY"
## [109] "22-11 X57" "0.013" "IMP.ONLY 3.615 AC"
## [112] "IMP ONLY" "IMP ONLY" "0.129%"
## [115] "0.129%" "0.129%" "ASSESS CTY 84 S.F"
## [118] "0.039" "1159-0 S.F." "03281969"
## [121] "IMPROVEMENT ONLY" "ASSESS CTY 50 S.F" "ASSESS CTY 57X1-1"
## [124] "IMP ONLY" "ASSESSCTY56-2X1-1" "ASSESSCTY53-2X1-1"
## [127] "IMPROVEMENT ONLY" "{1-4X128-6" "IMPROVEMENT ONLY"
## [130] "IMPROVEMENT ONLY" "O-7X125" NA
## [133] "13XX5" "O-3X125" "IMPROVEMENT ONLY"
## [136] "IMP ONLY" "AIR RIGHTS" "0.026"
## [139] "0.129%" "0.129%" "IMPROVEMENT ONLY"
## [142] "F7-10X80" "ASSESS CTY" "IMP ONLY"
## [145] "IMP ONLY" "IMP ONLY" "IMP ONLY"
## [148] "IMPROVEMENTS ONLY" "IMPROVEMENT ONLY" "0.032"
## [151] "20050712" "IMP ONLY" "IMP ONLY"
## [154] "0.033" "IMP.ONLY" "IMP ONLY"
## [157] "IMP ONLY" "IMPROVEMENT ONLY" "N1-8X92-6"
## [160] "POINTX100-5X10" "IMPROVEMENTS ONLY" "]7X100"
## [163] "07031966" "13O4 SQ FT" "05081978"
## [166] "IMPROVEMENTS ONLY" "AIR RIGHTS" "AIR RIGHTS"
## [169] "IMPROVEMENT ONLY" "IMPROVEMENT ONLY" "IMPROVEMENT ONLY"
## [172] "IMPROVEMENT ONLY" "0.191 ARES" "19815-4"
## [175] "11`79" NA NA
## [178] "IMPROVEMENT ONLY" "IMP ONLY" "194415"
## [181] "IMP ONLY" "IMPROVEMENT ONLY" "IMP ONLY"
## [184] "120-103" "IMP ONLY" "IMP. ONLY"
## [187] "196700" "IMPROVEMENT ONLY" "16-11XX5"
## [190] "Q5X119" "21210" "IMPROVEMENT ONLY"
## [193] "5306120"