Data Manipulation and Visualization with R and the nycflights13 Dataset

Classified in Computers

Written in English with a size of 3.64 KB.

library(nycflights13)
library(tidyverse)

Data Manipulation with dplyr

Ordering Rows with arrange()

# Sort the flights chronologically: by year, then month, then day.
flights %>%
  arrange(year, month, day)
# Sort by arrival delay in descending order (largest arr_delay first).
flights %>%
  arrange(desc(arr_delay))

Handling NAs

# One-column tibble that contains a missing value.
df <- tibble(x = c(5, 2, NA))
# Sort ascending, then descending, to compare where the NA row ends up.
df %>% arrange(x)
df %>% arrange(desc(x))

Selecting Columns with select()

# Pick columns by listing their names.
flights %>% select(year, month, day)
# Pick the contiguous range of columns from year through day.
flights %>% select(year:day)
# Keep everything except the columns from year through day.
flights %>% select(-(year:day))
# Rename month to mes; all other columns are kept as they are.
flights %>% rename(mes = month)
# Move time_hour and air_time to the front, followed by the remaining columns.
flights %>% select(time_hour, air_time, everything())

Creating New Variables with mutate()

# Narrower table: the date columns, every column whose name ends in "delay",
# plus distance and air_time.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)
# Append two derived columns to the narrowed table.
# NOTE(review): the * 60 presumably converts a per-minute rate to a per-hour
# rate -- confirm the units of distance and air_time in the dataset docs.
flights_sml %>%
  mutate(
    gain = arr_delay - dep_delay,
    speed = distance / air_time * 60
  )

Useful Vectorized Functions for Creating Variables

# Keep only dep_time and two columns derived from it by integer division and
# remainder by 100.
# NOTE(review): assumes dep_time is encoded as HHMM -- confirm in dataset docs.
flights %>%
  transmute(
    dep_time,
    hour = dep_time %/% 100,
    minute = dep_time %% 100
  )
# The surrounding parentheses make the assignment print its value.
(x <- seq_len(10))
# Running sum and running mean over x.
cumsum(x)
cummean(x)
# Rank a vector that contains a tie and a missing value.
y <- c(1, 2, 2, NA, 3, 4)
rank(y)

Summarizing Data with summarize()

# Mean departure delay over the whole table, ignoring missing values.
flights %>%
  summarize(delay = mean(dep_delay, na.rm = TRUE))
# Group by calendar day and inspect the class of the grouped object.
by_day <- flights %>%
  group_by(year, month, day)
class(by_day)
# Same summary as above, now computed once per (year, month, day) group.
by_day %>%
  summarize(delay = mean(dep_delay, na.rm = TRUE))

Using the Pipe Operator %>%

# Minimal pipe demo: equivalent to exp(mean(x)).
x <- 1:10
x %>%
  mean() %>%
  exp()

# Per-destination flight count, mean distance and mean arrival delay.
# Destinations with 20 or fewer flights, and "HNL", are then dropped.
delays <- flights %>%
  group_by(dest) %>%
  summarize(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")

# Mean delay against mean distance; point size reflects the flight count and a
# smoothed trend line is drawn without its confidence band.
ggplot(delays, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)

# Compare each element with its predecessor to flag where the value changes.
xx <- c(1:4, 5, 5, 5, 8, 9, 10)
xx
xx != lag(xx)

# Frequency table of the values in the dest column.
table(flights$dest)

Counts

# Keep only the flights that have both a departure and an arrival delay
# recorded (the variable name suggests missing delays mark cancelled flights).
not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))

# Mean arrival delay per tailnum, shown as a frequency polygon.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarize(delay = mean(arr_delay))
ggplot(delays, aes(x = delay)) +
  geom_freqpoly(binwidth = 10)

# Recompute the per-tailnum mean together with the number of flights behind it.
# na.rm = TRUE is redundant here because rows with a missing arr_delay were
# already filtered out above; it is kept as in the original.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarize(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )
# Mean delay against the number of flights, one point per tailnum.
ggplot(delays, aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)

Related entries: