── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Let's start with Fare Revenuelibrary(tidyverse)if(!file.exists("2022_fare_revenue.xlsx")){# This should work _in theory_ but in practice it's still a bit finicky# If it doesn't work for you, download this file 'by hand' in your# browser and save it as "2022_fare_revenue.xlsx" in your project# directory.download.file("http://www.transit.dot.gov/sites/fta.dot.gov/files/2024-04/2022%20Fare%20Revenue.xlsx", destfile="2022_fare_revenue.xlsx", quiet=FALSE, method="wget")}FARES <- readxl::read_xlsx("2022_fare_revenue.xlsx") |>select(-`State/Parent NTD ID`, -`Reporter Type`,-`Reporting Module`,-`TOS`,-`Passenger Paid Fares`,-`Organization Paid Fares`) |>filter(`Expense Type`=="Funds Earned During Period") |>select(-`Expense Type`) |>group_by(`NTD ID`, # Sum over different `TOS` for the same `Mode``Agency Name`, # These are direct operated and sub-contracted `Mode`) |># of the same transit modality# Not a big effect in most munis (significant DO# tends to get rid of sub-contractors), but we'll sum# to unify different passenger experiencessummarize(`Total Fares`=sum(`Total Fares`)) |>ungroup()
`summarise()` has grouped output by 'NTD ID', 'Agency Name'. You can override
using the `.groups` argument.
# Next, expensesif(!file.exists("2022_expenses.csv")){# This should work _in theory_ but in practice it's still a bit finicky# If it doesn't work for you, download this file 'by hand' in your# browser and save it as "2022_expenses.csv" in your project# directory.download.file("https://data.transportation.gov/api/views/dkxx-zjd6/rows.csv?date=20231102&accessType=DOWNLOAD&bom=true&format=true", destfile="2022_expenses.csv", quiet=FALSE) }EXPENSES <- readr::read_csv("2022_expenses.csv") |>select(`NTD ID`, `Agency`,`Total`, `Mode`) |>mutate(`NTD ID`=as.integer(`NTD ID`)) |>rename(Expenses = Total) |>group_by(`NTD ID`, `Mode`) |>summarize(Expenses =sum(Expenses)) |>ungroup()
Rows: 3744 Columns: 29
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Agency, City, State, NTD ID, Organization Type, Reporter Type, UZA...
dbl (2): Report Year, UACE Code
num (10): Primary UZA Population, Agency VOMS, Mode VOMS, Vehicle Operations...
lgl (7): Vehicle Operations Questionable, Vehicle Maintenance Questionable,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
`summarise()` has grouped output by 'NTD ID'. You can override using the `.groups` argument.
# Monthly Transit Numberslibrary(tidyverse)if(!file.exists("ridership.xlsx")){# This should work _in theory_ but in practice it's still a bit finicky# If it doesn't work for you, download this file 'by hand' in your# browser and save it as "ridership.xlsx" in your project# directory.download.file("https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-09/July%202024%20Complete%20Monthly%20Ridership%20%28with%20adjustments%20and%20estimates%29_240903.xlsx", destfile="ridership.xlsx", quiet=FALSE, method="wget")}TRIPS <- readxl::read_xlsx("ridership.xlsx", sheet="UPT") |>filter(`Mode/Type of Service Status`=="Active") |>select(-`Legacy NTD ID`, -`Reporter Type`, -`Mode/Type of Service Status`, -`UACE CD`, -`TOS`) |>pivot_longer(-c(`NTD ID`:`3 Mode`), names_to="month", values_to="UPT") |>drop_na() |>mutate(month=my(month)) # Parse _m_onth _y_ear date specsMILES <- readxl::read_xlsx("ridership.xlsx", sheet="VRM") |>filter(`Mode/Type of Service Status`=="Active") |>select(-`Legacy NTD ID`, -`Reporter Type`, -`Mode/Type of Service Status`, -`UACE CD`, -`TOS`) |>pivot_longer(-c(`NTD ID`:`3 Mode`), names_to="month", values_to="VRM") |>drop_na() |>rename(metro_area =`UZA Name`) |>group_by(`NTD ID`, `Agency`, `metro_area`, `Mode`, `3 Mode`, month) |>summarize(VRM =sum(VRM)) |>ungroup() |>mutate(month=my(month)) # Parse _m_onth _y_ear date specs
`summarise()` has grouped output by 'NTD ID', 'Agency', 'metro_area', 'Mode',
'3 Mode'. You can override using the `.groups` argument.
# a. What transit agency had the most total VRM in our data set?agency_vrm <- USAGE_cleaned |>group_by(Agency) |>summarize(Total_VRM =sum(Vehicle_Revenue_Miles, na.rm =TRUE)) |>arrange(desc(Total_VRM)) |>slice(1) print(agency_vrm)
# A tibble: 1 × 2
Agency Total_VRM
<chr> <dbl>
1 MTA New York City Transit 10832855350
# b. What transit mode had the most total VRM in our data set?mode_vrm <- USAGE_cleaned |>group_by(Mode) |>summarize(Total_VRM =sum(Vehicle_Revenue_Miles, na.rm =TRUE)) |>arrange(desc(Total_VRM)) |>slice(1) print(mode_vrm)
# c. How many trips were taken on the NYC Subway (Heavy Rail) in May 2024?# Ensure the `month` column is a date (in case it's not)USAGE_cleaned <- USAGE_cleaned |>mutate(month = lubridate::ymd(month)) # Convert to Date format# How many trips were taken on the NYC Subway (Heavy Rail) in May 2024?nyc_may_2024_trips <- USAGE_cleaned |>filter(Agency =="MTA New York City Transit", # Filter for NYC Subway Mode =="Heavy Rail", # Filter for Heavy Railyear(month) ==2024, # Filter for year 2024month(month) ==5) |># Filter for May (5th month)summarize(Total_UPT =sum(Unlinked_Passenger_Trips, na.rm =TRUE)) # Sum UPTprint(nyc_may_2024_trips)
# A tibble: 1 × 1
Total_UPT
<dbl>
1 180458819
# d. How much did NYC subway ridership fall between April 2019 and April 2020?nyc_april_2019_2020 <- USAGE_cleaned |>filter(Agency =="MTA New York City Transit", Mode =="Heavy Rail", month %in%c("2019-04", "2020-04")) |>group_by(month) |>summarize(Total_UPT =sum(Unlinked_Passenger_Trips, na.rm =TRUE)) # Use `reframe()` to calculate the ridership fallnyc_ridership_fall <- nyc_april_2019_2020 |>reframe(Ridership_Fall =diff(Total_UPT)) # Calculate difference between yearsprint(nyc_ridership_fall)
# Find the most popular transit mode for each quarter in 2024quarterly_mode_popularity <- USAGE_cleaned |>filter(year(month) ==2024) |># Filter for the year 2024mutate(Quarter = lubridate::quarter(month)) |># Add a 'Quarter' column based on the monthgroup_by(Quarter, Mode) |># Group by quarter and transit modesummarize(Total_UPT =sum(Unlinked_Passenger_Trips, na.rm =TRUE)) |>arrange(Quarter, desc(Total_UPT)) |># Sort by highest ridership in each quarterslice(1) # Select the top mode for each quarter
`summarise()` has grouped output by 'Quarter'. You can override using the
`.groups` argument.
# Find the month in 2024 with the highest total ridership across all agenciesmonth_ridership_2024 <- USAGE_cleaned |>filter(year(month) ==2024) |># Filter for the year 2024group_by(month(month)) |># Group by monthsummarize(Total_Ridership =sum(Unlinked_Passenger_Trips, na.rm =TRUE)) |>arrange(desc(Total_Ridership)) |># Sort by highest ridershipslice(1) # Select the month with the highest ridershipprint(month_ridership_2024)
# Find the transit agency with the highest total VRM in 2024top_vrm_agency <- USAGE_cleaned |>filter(year(month) ==2024) |># Filter for the year 2024group_by(Agency) |># Group by agencysummarize(Total_VRM =sum(Vehicle_Revenue_Miles, na.rm =TRUE)) |># Sum VRM per agencyarrange(desc(Total_VRM)) |># Sort by highest VRMslice(1) # Select the agency with the highest VRMprint(top_vrm_agency)
# A tibble: 1 × 2
Agency Total_VRM
<chr> <dbl>
1 MTA New York City Transit 273222702
# Install if not already installedif(!require("data.table")) install.packages("data.table")
Loading required package: data.table
Attaching package: 'data.table'
The following objects are masked from 'package:lubridate':
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from 'package:dplyr':
between, first, last
The following object is masked from 'package:purrr':
transpose
library(data.table)
# Assuming USAGE is already loaded in your environmentUSAGE_dt <-as.data.table(USAGE)
USAGE_2022_ANNUAL <- USAGE_dt[grepl("^2022", month), # Filter for months starting with "2022" .(UPT =sum(UPT, na.rm =TRUE), # Total UPT for 2022 (use actual name)VRM =sum(VRM, na.rm =TRUE)), # Total VRM for 2022 (use actual name) by = .(`NTD ID`, Agency, metro_area, Mode) # Group by these columns]# Ungroup the tableUSAGE_2022_ANNUAL <- USAGE_2022_ANNUAL[] # Ensures it is ungrouped# View the first few rows of the resulting tableprint(USAGE_2022_ANNUAL)
NTD ID Agency
<int> <char>
1: 1 King County
2: 1 King County
3: 1 King County
4: 1 King County
5: 1 King County
---
1137: 99423 City of Glendale
1138: 99423 City of Glendale
1139: 99424 City of Pasadena
1140: 99424 City of Pasadena
1141: 99425 Pomona Valley Transportation Authority
metro_area Mode UPT VRM
<char> <char> <num> <num>
1: Seattle--Tacoma, WA Demand Response 663009 12860448
2: Seattle--Tacoma, WA Ferryboat 400407 51236
3: Seattle--Tacoma, WA Motorbus 53983641 61632644
4: Seattle--Tacoma, WA Streetcar Rail 1117605 180369
5: Seattle--Tacoma, WA TB 9575043 2635705
---
1137: Los Angeles--Long Beach--Anaheim, CA Demand Response 19448 91018
1138: Los Angeles--Long Beach--Anaheim, CA Motorbus 624155 868128
1139: Los Angeles--Long Beach--Anaheim, CA Demand Response 38412 136655
1140: Los Angeles--Long Beach--Anaheim, CA Motorbus 1139100 701730
1141: Los Angeles--Long Beach--Anaheim, CA Demand Response 76187 725488
`summarise()` has grouped output by 'Agency'. You can override using the
`.groups` argument.
print(most_upt)
# A tibble: 31 × 3
# Groups: Agency [31]
Agency Mode Total_UPT
<chr> <chr> <dbl>
1 Alameda-Contra Costa Transit District RB 3756519
2 Alaska Railroad Corporation AR 219757
3 Cambria County Transit Authority IP 0
4 Capital Metropolitan Transportation Authority YR 466971
5 Central Florida Regional Transportation Authority RB 391742
6 Chattanooga Area Regional Transportation Authority IP 481957
7 City and County of San Francisco TB 33574391
8 City of Albuquerque RB 1829848
9 City of Fort Collins RB 403214
10 City of Portland TR 687131
# ℹ 21 more rows
`summarise()` has grouped output by 'Agency'. You can override using the
`.groups` argument.
print(lowest_expenses_per_upt)
# A tibble: 31 × 3
# Groups: Agency [31]
Agency Mode Expenses_Per_UPT
<chr> <chr> <dbl>
1 Alameda-Contra Costa Transit District RB 4.96
2 Alaska Railroad Corporation AR 258.
3 Cambria County Transit Authority IP Inf
4 Capital Metropolitan Transportation Authority YR 68.2
5 Central Florida Regional Transportation Authority RB 8.81
6 Chattanooga Area Regional Transportation Authority IP 4.75
7 City and County of San Francisco TB 5.42
8 City of Albuquerque RB 3.57
9 City of Fort Collins RB 5.66
10 City of Portland TR 4.61
# ℹ 21 more rows
`summarise()` has grouped output by 'Agency'. You can override using the
`.groups` argument.
print(lowest_expenses_per_vrm)
# A tibble: 31 × 3
# Groups: Agency [31]
Agency Mode Expenses_Per_VRM
<chr> <chr> <dbl>
1 Alameda-Contra Costa Transit District RB 28.8
2 Alaska Railroad Corporation AR 50.0
3 Cambria County Transit Authority IP Inf
4 Capital Metropolitan Transportation Authority YR 47.7
5 Central Florida Regional Transportation Authority RB 19.8
6 Chattanooga Area Regional Transportation Authority IP 114.
7 City and County of San Francisco TB 42.3
8 City of Albuquerque RB 8.25
9 City of Fort Collins RB 15.6
10 City of Portland TR 166.
# ℹ 21 more rows