Data Manipulation and Visualization with R and the nycflights13 Dataset
Classified in Computers
Written in English, with a size of 3.64 KB
# Attach the packages used by every example below: nycflights13 supplies the
# `flights` data set, and tidyverse supplies dplyr, ggplot2, and tibble.
library(nycflights13)
library(tidyverse)

# Data Manipulation with dplyr ----
# Ordering Rows with arrange() ----

# Sort by year, then month, then day (ascending); later columns break ties
# in the earlier ones.
arrange(flights, year, month, day)

# Wrap a column in desc() to sort it in descending order instead.
arrange(flights, desc(arr_delay))

# Handling NAs ----
# Small tibble with one missing value, used to show where arrange() puts NA.
df <- tibble(x = c(5, 2, NA))

# Per the dplyr documentation, arrange() sorts missing values to the end in
# both ascending and descending order.
arrange(df, x)
arrange(df, desc(x))

# Selecting Columns with select() ----
# Pick columns by name.
select(flights, year, month, day)

# Pick every column between year and day (inclusive).
select(flights, year:day)

# Keep everything except the columns from year to day (inclusive).
select(flights, -(year:day))

# rename() keeps all columns and renames month to mes (new = old).
rename(flights, mes = month)

# Move time_hour and air_time to the front; everything() keeps the rest.
select(flights, time_hour, air_time, everything())

# Creating New Variables with mutate() ----
# Narrower copy of flights so the newly created columns are easy to see.
flights_sml <- select(
  flights,
  year:day,
  ends_with("delay"),
  distance,
  air_time
)

# mutate() appends new columns computed from existing ones:
#   gain  = arrival delay minus departure delay
#   speed = distance per unit of air_time, scaled by 60 (assumes air_time is
#           in minutes, as documented for nycflights13::flights, giving a
#           per-hour rate)
mutate(
  flights_sml,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60
)

# Creating Functions with Vector Arguments ----
# transmute() keeps only the listed/created columns. Integer division (%/%)
# and remainder (%%) by 100 split dep_time into two parts (presumably hour
# and minute of an HHMM-encoded time -- confirm against the data set docs).
transmute(
  flights,
  dep_time,
  hour = dep_time %/% 100,
  minute = dep_time %% 100
)

# Wrapping an assignment in parentheses also prints the assigned value.
(x <- 1:10)

# Cumulative aggregates: running sum (base R) and running mean (dplyr).
cumsum(x)
cummean(x)

# Ranking a vector that contains ties and a missing value.
y <- c(1, 2, 2, NA, 3, 4)
rank(y)

# Summarizing Data with summarize() ----
# Ungrouped summarize() collapses the whole table to a single row; na.rm = TRUE
# drops missing delays so the mean is not NA.
summarize(flights, delay = mean(dep_delay, na.rm = TRUE))

# Group by calendar day, then inspect the class of the grouped object.
by_day <- group_by(flights, year, month, day)
class(by_day)

# The same summary on grouped data yields one row per (year, month, day).
summarize(by_day, delay = mean(dep_delay, na.rm = TRUE))

# Using the Pipe Operator %>% ----
# The pipe passes the left-hand result as the first argument of the next call,
# so this is equivalent to exp(mean(x)).
x <- 1:10
x %>%
  mean() %>%
  exp()

# Per-destination summary: number of flights, mean distance, and mean arrival
# delay. Destinations with 20 or fewer flights and "HNL" are then dropped.
delays <- flights %>%
  group_by(dest) %>%
  summarize(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")

# Mean delay against mean distance; point size encodes the flight count and a
# smoother (without its confidence band) is drawn on top.
ggplot(data = delays, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)

# Print a vector, then compare each element with its predecessor. With the
# tidyverse attached, lag() is dplyr::lag(), which shifts the vector by one
# position and pads the front with NA.
xx <- c(1:4, 5, 5, 5, 8, 9, 10)
xx
xx != lag(xx)

# Frequency table of flights per destination.
table(flights$dest)

# Counts ----
# Keep only flights that have both a departure delay and an arrival delay
# recorded.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))

# Mean arrival delay per aircraft (tail number).
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarize(delay = mean(arr_delay))

# Distribution of the per-aircraft mean delay, in bins of width 10.
ggplot(data = delays, mapping = aes(x = delay)) +
  geom_freqpoly(binwidth = 10)

# Recompute the summary, this time also recording how many flights (n) each
# mean is based on.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarize(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )

# Mean delay against number of flights; heavy transparency reduces
# overplotting.
ggplot(data = delays, mapping = aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)