library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)

Part 1

  1. Read in the Wide Bike lanes data: http://johnmuschelli.com/intro_to_r/data/Bike_Lanes_Wide.csv . Name the data set wide.
wide = read_csv("http://johnmuschelli.com/intro_to_r/data/Bike_Lanes_Wide.csv")
## Parsed with column specification:
## cols(
##   name = col_character(),
##   `BIKE BOULEVARD` = col_double(),
##   `BIKE LANE` = col_double(),
##   CONTRAFLOW = col_double(),
##   `SHARED BUS BIKE` = col_double(),
##   SHARROW = col_double(),
##   SIDEPATH = col_double(),
##   `SIGNED ROUTE` = col_double(),
##   `<NA>` = col_double()
## )
  1. Reshape wide using gather. Call this data long. Make the key lanetype, and the value the_length. Make sure we gather all columns but name, using -name. Note the NAs here:
long = wide %>% 
  gather(key = "lanetype", value = "the_length", -name)
head(long)
## # A tibble: 6 x 3
##   name              lanetype       the_length
##   <chr>             <chr>               <dbl>
## 1 ALBEMARLE ST      BIKE BOULEVARD         NA
## 2 ALICEANNA ST      BIKE BOULEVARD         NA
## 3 ARGONNE DR        BIKE BOULEVARD         NA
## 4 ART MUSEUM DR     BIKE BOULEVARD         NA
## 5 AUCHENTOROLY TERR BIKE BOULEVARD         NA
## 6 BANK ST           BIKE BOULEVARD         NA

Part 2

This exercise is motivated by https://anythingbutrbitrary.blogspot.com/2012/08/manipulating-data-frames-using-sqldf.html

  1. read in the roads and crashes CSVs. Call them crash and road crash: http://johnmuschelli.com/intro_to_r/Manipulating_Data_in_R/lab/crashes.csv road: http://johnmuschelli.com/intro_to_r/Manipulating_Data_in_R/lab/roads.csv
crash = read_csv("http://johnmuschelli.com/intro_to_r/Manipulating_Data_in_R/lab/crashes.csv")
## Parsed with column specification:
## cols(
##   Year = col_double(),
##   Road = col_character(),
##   N_Crashes = col_double(),
##   Volume = col_double()
## )
road = read_csv("http://johnmuschelli.com/intro_to_r/Manipulating_Data_in_R/lab/roads.csv")
## Parsed with column specification:
## cols(
##   Road = col_character(),
##   District = col_character(),
##   Length = col_double()
## )
head(crash)
## # A tibble: 6 x 4
##    Year Road          N_Crashes Volume
##   <dbl> <chr>             <dbl>  <dbl>
## 1  1991 Interstate 65        25  40000
## 2  1992 Interstate 65        37  41000
## 3  1993 Interstate 65        45  45000
## 4  1994 Interstate 65        46  45600
## 5  1995 Interstate 65        46  49000
## 6  1996 Interstate 65        59  51000
head(road)
## # A tibble: 5 x 3
##   Road          District       Length
##   <chr>         <chr>           <dbl>
## 1 Interstate 65 Greenfield        262
## 2 Interstate 70 Vincennes         156
## 3 US-36         Crawfordsville    139
## 4 US-40         Greenfield        150
## 5 US-52         Crawfordsville    172
  1. Replace (using str_replace) any hyphens (-) with a space in crash$Road. Call this data crash2. Table the Road variable.
crash2 = crash %>% mutate(Road = str_replace(Road, "-", " "))
table(crash2$Road)
## 
## Interstate 275  Interstate 65  Interstate 70          US 36 
##             22             22             22             22 
##          US 40 
##             22
  1. How many observations are in each dataset?
dim(crash)
## [1] 110   4
dim(road)
## [1] 5 3
  1. Separate the Road column (using separate) into (type and number) in crash2. Reassign this to crash2. Table crash2$type
crash2 = separate(crash2, col = "Road", into = c("type", "number"))
table( crash2$type)
## 
## Interstate         US 
##         66         44

Create a new variable calling it road_hyphen using the unite function. Unite the type and number columns using a hyphen (-) and then table road_hyphen

crash2 = unite(crash2, col = "road_hyphen", type, number ,sep = "-")
table( crash2$road_hyphen)
## 
## Interstate-275  Interstate-65  Interstate-70          US-36 
##             22             22             22             22 
##          US-40 
##             22
  1. Which and how many years were data collected?
unique(crash$Year)
##  [1] 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
## [14] 2004 2005 2006 2007 2008 2009 2010 2011 2012
length(unique(crash$Year))
## [1] 22

Part 3

  1. Read in the Bike lanes data: http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv or use jhur::read_bike(). Name the data set bike,
bike = read_csv("http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv")
## Parsed with column specification:
## cols(
##   subType = col_character(),
##   name = col_character(),
##   block = col_character(),
##   type = col_character(),
##   numLanes = col_double(),
##   project = col_character(),
##   route = col_character(),
##   length = col_double(),
##   dateInstalled = col_double()
## )
  1. Keep rows where the record is not missing type and not missing name and re-assign the output to bike.
bike = filter(bike, !is.na(type) & !is.na(name))
  1. Summarize and group the data by grouping name and type and take the sum of the length (reassign length variable). Call this data set sub
sub = bike %>% 
  group_by(name, type) %>% 
  summarize(length = sum(length))
## `summarise()` regrouping output by 'name' (override with `.groups` argument)
  1. Reshape sub using spread. Spread the data where the key is type and we want the value in the new columns to be length - the bike lane length.
    Call this wide.
wide = spread(sub, key = type, value = length)

Look at the column names of wide - what are they? (they also have spaces)

Part 4

  1. Join data to retain only complete data, (using an inner join) e.g. those observations with road lengths and districts. Merge without using by argument, then merge using by = "Road". call the output merged. How many observations are there?
merged = inner_join(crash, road)
## Joining, by = "Road"
merged = inner_join(crash, road, by = "Road")
dim(merged)
## [1] 88  6
  1. Join data using a full_join. Call the output full. How many observations are there?
full = full_join(crash, road)
## Joining, by = "Road"
nrow(full)
## [1] 111
  1. Do a left join of the road and crash. ORDER matters here! How many observations are there?
left = left_join(road, crash)
## Joining, by = "Road"
nrow(left)
## [1] 89
  1. Repeat above with a right_join with the same order of the arguments. How many observations are there?
right = right_join(road, crash)
## Joining, by = "Road"
nrow(right)
## [1] 110