library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)

Part 1

Read in the Wide Bike lanes data: http://johnmuschelli.com/intro_to_r/data/Bike_Lanes_Wide.csv . Name the data set wide.

wide = read_csv("http://johnmuschelli.com/intro_to_r/data/Bike_Lanes_Wide.csv")

## Parsed with column specification:
## cols(
##   name = col_character(),
##   `BIKE BOULEVARD` = col_double(),
##   `BIKE LANE` = col_double(),
##   CONTRAFLOW = col_double(),
##   `SHARED BUS BIKE` = col_double(),
##   SHARROW = col_double(),
##   SIDEPATH = col_double(),
##   `SIGNED ROUTE` = col_double(),
##   `<NA>` = col_double()
## )

Reshape wide using gather. Call this data long. Make the key lanetype, and the value the_length. Make sure we gather all columns but name, using -name. Note the NAs here:

long = wide %>% 
  gather(key = "lanetype", value = "the_length", -name)
head(long)

## # A tibble: 6 x 3
##   name              lanetype       the_length
##   <chr>             <chr>               <dbl>
## 1 ALBEMARLE ST      BIKE BOULEVARD         NA
## 2 ALICEANNA ST      BIKE BOULEVARD         NA
## 3 ARGONNE DR        BIKE BOULEVARD         NA
## 4 ART MUSEUM DR     BIKE BOULEVARD         NA
## 5 AUCHENTOROLY TERR BIKE BOULEVARD         NA
## 6 BANK ST           BIKE BOULEVARD         NA

Part 2

This exercise is motivated by https://anythingbutrbitrary.blogspot.com/2012/08/manipulating-data-frames-using-sqldf.html

read in the roads and crashes CSVs. Call them crash and road crash: http://johnmuschelli.com/intro_to_r/Manipulating_Data_in_R/lab/crashes.csv road: http://johnmuschelli.com/intro_to_r/Manipulating_Data_in_R/lab/roads.csv

crash = read_csv("http://johnmuschelli.com/intro_to_r/Manipulating_Data_in_R/lab/crashes.csv")

## Parsed with column specification:
## cols(
##   Year = col_double(),
##   Road = col_character(),
##   N_Crashes = col_double(),
##   Volume = col_double()
## )

road = read_csv("http://johnmuschelli.com/intro_to_r/Manipulating_Data_in_R/lab/roads.csv")

## Parsed with column specification:
## cols(
##   Road = col_character(),
##   District = col_character(),
##   Length = col_double()
## )

head(crash)

## # A tibble: 6 x 4
##    Year Road          N_Crashes Volume
##   <dbl> <chr>             <dbl>  <dbl>
## 1  1991 Interstate 65        25  40000
## 2  1992 Interstate 65        37  41000
## 3  1993 Interstate 65        45  45000
## 4  1994 Interstate 65        46  45600
## 5  1995 Interstate 65        46  49000
## 6  1996 Interstate 65        59  51000

head(road)

## # A tibble: 5 x 3
##   Road          District       Length
##   <chr>         <chr>           <dbl>
## 1 Interstate 65 Greenfield        262
## 2 Interstate 70 Vincennes         156
## 3 US-36         Crawfordsville    139
## 4 US-40         Greenfield        150
## 5 US-52         Crawfordsville    172

Replace (using str_replace) any hyphens (-) with a space in crash$Road. Call this data crash2. Table the Road variable.

crash2 = crash %>% mutate(Road = str_replace(Road, "-", " "))
table(crash2$Road)

## 
## Interstate 275  Interstate 65  Interstate 70          US 36 
##             22             22             22             22 
##          US 40 
##             22

How many observations are in each dataset?

dim(crash)

## [1] 110   4

dim(road)

## [1] 5 3

Separate the Road column (using separate) into (type and number) in crash2. Reassign this to crash2. Table crash2$type

crash2 = separate(crash2, col = "Road", into = c("type", "number"))
table( crash2$type)

## 
## Interstate         US 
##         66         44

Create a new variable calling it road_hyphen using the unite function. Unite the type and number columns using a hyphen (-) and then table road_hyphen

crash2 = unite(crash2, col = "road_hyphen", type, number ,sep = "-")
table( crash2$road_hyphen)

## 
## Interstate-275  Interstate-65  Interstate-70          US-36 
##             22             22             22             22 
##          US-40 
##             22

Which and how many years were data collected?

unique(crash$Year)

##  [1] 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
## [14] 2004 2005 2006 2007 2008 2009 2010 2011 2012

length(unique(crash$Year))

## [1] 22

Part 3

Read in the Bike lanes data: http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv or use jhur::read_bike(). Name the data set bike,

bike = read_csv("http://johnmuschelli.com/intro_to_r/data/Bike_Lanes.csv")

## Parsed with column specification:
## cols(
##   subType = col_character(),
##   name = col_character(),
##   block = col_character(),
##   type = col_character(),
##   numLanes = col_double(),
##   project = col_character(),
##   route = col_character(),
##   length = col_double(),
##   dateInstalled = col_double()
## )

Keep rows where the record is not missing type and not missing name and re-assign the output to bike.

bike = filter(bike, !is.na(type) & !is.na(name))

Summarize and group the data by grouping name and type and take the sum of the length (reassign length variable). Call this data set sub

sub = bike %>% 
  group_by(name, type) %>% 
  summarize(length = sum(length))

## `summarise()` regrouping output by 'name' (override with `.groups` argument)

Reshape sub using spread. Spread the data where the key is type and we want the value in the new columns to be length - the bike lane length.
Call this wide.

wide = spread(sub, key = type, value = length)

Look at the column names of wide - what are they? (they also have spaces)

Part 4

Join data to retain only complete data, (using an inner join) e.g. those observations with road lengths and districts. Merge without using by argument, then merge using by = "Road". call the output merged. How many observations are there?

merged = inner_join(crash, road)

## Joining, by = "Road"

merged = inner_join(crash, road, by = "Road")
dim(merged)

## [1] 88  6

Join data using a full_join. Call the output full. How many observations are there?

full = full_join(crash, road)

## Joining, by = "Road"

nrow(full)

## [1] 111

Do a left join of the road and crash. ORDER matters here! How many observations are there?

left = left_join(road, crash)

## Joining, by = "Road"

nrow(left)

## [1] 89

Repeat above with a right_join with the same order of the arguments. How many observations are there?

right = right_join(road, crash)

## Joining, by = "Road"

nrow(right)

## [1] 110

Manipulating Data in R Lab

Part 1

Part 2

Part 3

Part 4