library(tidyverse)
library(readr)
library(fixest)
library(broom)
library(GGally)
library(patchwork)
library(corrr)
library(ggplot2)
library(ggridges)
library(eurostat)
library(corrr)
library(corrplot)
library(glmnet)
library(car)
library(tinytex)
Annex I – Clustering FUAs
Download the EMTA Barometer open data, which provides annual statistics on public transport (infrastructure, ridership, etc.) in the largest European cities. Available from: https://sites.google.com/dataveyes.com/emta-barometer/download-data
<- locale(encoding = "UTF-8", decimal_mark = ".", grouping_mark = ",") #define decimal and thousand separators
loc_eu
<- "EMTA_Barometer_Opendata_2.txt" #The path towards the downloaded data
file_path
<- read_delim(file_path, delim = ";", show_col_types = FALSE, locale = loc_eu,
data_long na = c("", "NA", "n/a", "-", "NULL"), trim_ws = TRUE)
<- data_long %>%
data_wide pivot_wider(id_cols = c(PTA, PERIMETER, MODE, YEAR),
names_from = ITEM,
values_from = VALUE,
values_fn = list(VALUE = first)) %>%
mutate(across(c(POP, AREA_TOT, AREA_URB, GDP, PLACEKM, PKM, MS_PT, MS_WLK, MS_CYC,
NETLEN, NETSTOPS, BOARDINGS, COSTOP, FARE_MPASS, REV_TOTAL,~ parse_number(.x, locale = loc_eu)))
REV_FARES, MOTOR),
#Assign countries to cities
<- tibble::tibble(
pta_country PTA = c("AMS", "ATH", "BAR", "BEL", "BER", "BIL", "BIR", "BRU", "BUC", "BUD", "COP", "FRA", "HEL", "KRA", "LIS", "LON", "LYO", "MAD", "MAL", "MAN", "OSL", "PAR", "POR", "PRA", "ROT", "SOF", "STO", "STU", "THE", "TOU", "TUR", "VAL", "VIE", "VIL", "WAR"),
Country = c("Netherlands", "Greece", "Spain", "Serbia", "Germany", "Spain", "United Kingdom", "Belgium", "Romania", "Hungary", "Denmark", "Germany", "Finland", "Poland", "Portugal", "United Kingdom", "France", "Spain", "Spain", "United Kingdom", "Norway", "France", "Portugal", "Czechia", "Netherlands", "Bulgaria", "Sweden", "Germany", "Greece", "France", "Italy", "Spain", "Austria", "Lithuania", "Poland")
)
<- data_wide %>%
data_wide left_join(pta_country, by = "PTA")
Define two distinct variable sets:
(i) Urban-form explanatory variables (city_vars
) which include measures of the core compactness (density_city_total, density_city_builtup)
, as well as the overall metropolitan density (density_pta_total, density_pta_builtup
), and spatial structure (centrality
).
(ii) Public-transport coverage (network_vars, network_mode_vars
) metrics (e.g. route-kilometres per capita, stops per km²);
<- data_wide %>%
pta_vars filter(PERIMETER == "PTA", MODE == "ALL") %>%
::select(PTA, YEAR,
dplyrPOP_PTA = POP,
AREA_TOT = AREA_TOT,
AREA_URB_PTA = AREA_URB,
GDP,
BOARDINGS,
MS_PT, MS_WLK, MS_CYC)
<- data_wide %>%
city_vars_1 filter(PERIMETER == "CITY", MODE == "ALL") %>%
::select(PTA, YEAR,
dplyrPOP_CITY = POP,
AREA_TOT_CITY = AREA_TOT,
AREA_URB_CITY = AREA_URB)
<- pta_vars %>%
city_vars_2 left_join(city_vars_1, by = c("PTA", "YEAR")) %>%
mutate(
ridership_ms = MS_PT / 100,
ridership_pc = BOARDINGS / POP_PTA,
ridership_mstot = (MS_PT + MS_WLK + MS_CYC) / 100,
# Densidades
density_pta_total = POP_PTA / AREA_TOT,
density_pta_builtup = POP_PTA / AREA_URB_PTA,
density_city_total = POP_CITY / AREA_TOT_CITY,
density_city_builtup = POP_CITY / AREA_URB_CITY,
centrality = POP_CITY / POP_PTA
)
<- data_wide %>%
network_vars filter(PERIMETER == "PTA", MODE != "ALL") %>%
group_by(PTA, YEAR) %>%
summarise(
NETLEN = sum(NETLEN, na.rm = TRUE),
NETSTOPS = sum(NETSTOPS, na.rm = TRUE),
.groups = "drop"
)
<- data_wide %>%
network_mode_vars filter(PERIMETER == "PTA", MODE %in% c("BUS", "TRAM", "METRO", "LRAIL", "COMRAIL", "REGRAIL")) %>%
group_by(PTA, YEAR, MODE) %>%
summarise(NETSTOPS = sum(NETSTOPS, na.rm = TRUE),
.groups = "drop") %>%
pivot_wider(names_from = MODE, values_from = NETSTOPS, names_prefix = "NETSTOPS_")
<- city_vars_2 %>%
df_model left_join(network_vars, by = c("PTA", "YEAR")) %>%
left_join(network_mode_vars, by = c("PTA", "YEAR")) %>%
left_join(pta_country, by = "PTA") %>%
mutate(
supply_km_km2 = NETLEN / POP_PTA,
supply_stops_km2 = NETSTOPS / AREA_TOT,
# Mode-specific coverage densities
supply_stops_km2_BUS = NETSTOPS_BUS / AREA_TOT,
supply_stops_km2_TRAM = NETSTOPS_TRAM / AREA_TOT,
supply_stops_km2_METRO = NETSTOPS_METRO / AREA_TOT,
supply_stops_km2_LRAIL = NETSTOPS_LRAIL / AREA_TOT,
supply_stops_km2_COMRAIL = NETSTOPS_COMRAIL / AREA_TOT,
supply_stops_km2_REGRAIL = NETSTOPS_REGRAIL / AREA_TOT
%>%
) drop_na()
Finally, to the explanatory variables set, we add job-related density as a socio-economic variables in the FUA (job_density, city_fua_ratio
), in which the job_density
represents the overall density in the FUA, and city_fua_ratio
captures the centrality of jobs in the core of the FUA.
# 1. Load job market data from Eurostat directly through the eurostat package for R
<- get_eurostat("urb_llma", time_format = "num") llma_raw
indexed 0B in 0s, 0B/s
indexed 2.15GB in 0s, 2.15GB/s
<- get_eurostat("urb_clma", time_format = "num") clma_raw
indexed 0B in 0s, 0B/s
indexed 2.15GB in 0s, 2.15GB/s
<-get_eurostat("met_d3area", time_format = "num") met_d3area_raw
indexed 0B in 0s, 0B/s
indexed 2.15GB in 0s, 2.15GB/s
# 2. Filter for year 2022
<- llma_raw %>%
llma_bui filter(indic_ur == "EC2020V", TIME_PERIOD == 2022, grepl("F$", cities)) %>%
mutate(base_code = substr(cities, 1, 5)) %>%
select(base_code, jobs_fua = values)
<- clma_raw %>%
clma_bui filter(indic_ur == "EC2020V", TIME_PERIOD == 2022, grepl("C$", cities)) %>%
mutate(base_code = substr(cities, 1, 5)) %>%
select(base_code, jobs_city = values)
<- met_d3area_raw %>%
aera_bui filter(landuse == "L0008", TIME_PERIOD == 2022) %>%
mutate(metroreg = substr(metroreg, 1, 5)) %>%
select(metroreg, area = values)
# 3. Join job data to the base_code capturing the city identifier
<- full_join(clma_bui, llma_bui, by = "base_code") %>%
df_jobs filter(!is.na(jobs_city), !is.na(jobs_fua)) %>%
mutate(city_fua_ratio = jobs_city / jobs_fua)
<- full_join(df_jobs, aera_bui, by=c("base_code"="metroreg")) %>%
df_jobs mutate (job_density= jobs_fua/area)
<- get_eurostat_dic("cities")
dic_cities $code_name <- substr(dic_cities$code_name,1,5)
dic_cities
<- df_jobs %>%
df_jobs_named left_join(
%>%
dic_cities distinct(code_name, full_name), # Remove duplicates
by = c("base_code" = "code_name")
%>%
) rename(fua_name = full_name)
<- df_jobs_named %>%
df_jobs_named mutate(base_code = as.character(base_code))
#4. Join the data tothe original FUA list
<- tibble::tibble(
pta_basecode_map PTA = c("BAR", "BER", "BIL", "BRU", "BUD", "FRA", "HEL", "MAD", "MAL", "STU", "TUR", "VAL", "VIE", "VIL"),
base_code = c("ES002", "DE001", "ES019", "BE001", "HU001", "DE005", "FI001", "ES001", "ES006", "DE007", "FI003", "ES003", "AT001", "LT001")
)
<- df_model %>%
df_model_joined left_join(pta_basecode_map, by = "PTA") %>%
left_join(df_jobs_named, by = "base_code")
To gauge the combined influence of all four variable blocks, we fit a OLS specification in which public-transport stop density (supply_stops_km2
) is regressed on:
Core-compactness metrics (
density_city_total, density_city_builtup
)Overall metropolitan density (
density_pta_total, density_pta_builtup
)Spatial structure (
centrality
)Socio-economic variables (
job_density, city_fua_ratio
)Year and country fixed effects (
as.factor(YEAR), as.factor(Country)
)
<- lm(supply_stops_km2 ~ density_city_total + density_pta_total + density_pta_builtup + density_city_builtup + centrality + job_density + city_fua_ratio + as.factor(YEAR) + as.factor(Country), data = df_model_joined)
fit_pt_stop_density
summary(fit_pt_stop_density )
Call:
lm(formula = supply_stops_km2 ~ density_city_total + density_pta_total +
density_pta_builtup + density_city_builtup + centrality +
job_density + city_fua_ratio + as.factor(YEAR) + as.factor(Country),
data = df_model_joined)
Residuals:
Min 1Q Median 3Q Max
-1.41236 -0.36308 0.06878 0.31489 1.21663
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.004e+00 5.773e+00 0.867 0.39013
density_city_total -3.706e-03 2.409e-03 -1.538 0.13021
density_pta_total 1.034e-02 3.577e-03 2.892 0.00562 **
density_pta_builtup 2.726e-04 1.670e-04 1.632 0.10874
density_city_builtup -3.027e-04 1.415e-04 -2.139 0.03724 *
centrality -3.115e+00 7.624e+00 -0.409 0.68461
job_density 1.463e-02 1.847e-02 0.792 0.43185
city_fua_ratio 3.101e+01 1.182e+01 2.622 0.01149 *
as.factor(YEAR)2014 -2.511e-02 3.512e-01 -0.072 0.94327
as.factor(YEAR)2015 -9.517e-02 3.662e-01 -0.260 0.79600
as.factor(YEAR)2016 -5.850e-01 3.874e-01 -1.510 0.13721
as.factor(YEAR)2017 6.063e-01 4.494e-01 1.349 0.18325
as.factor(YEAR)2018 6.547e-01 5.085e-01 1.287 0.20375
as.factor(YEAR)2019 7.304e-01 5.801e-01 1.259 0.21372
as.factor(YEAR)2020 8.920e-02 5.970e-01 0.149 0.88182
as.factor(YEAR)2021 9.881e-01 7.177e-01 1.377 0.17461
as.factor(YEAR)2022 1.642e+00 7.045e-01 2.331 0.02373 *
as.factor(YEAR)2023 1.624e+00 7.854e-01 2.068 0.04375 *
as.factor(Country)Finland -1.342e+01 8.321e+00 -1.612 0.11308
as.factor(Country)Germany -1.612e+01 1.030e+01 -1.565 0.12369
as.factor(Country)Italy -2.596e+00 2.764e+00 -0.939 0.35196
as.factor(Country)Lithuania -3.940e+01 1.842e+01 -2.139 0.03727 *
as.factor(Country)Spain -2.205e+01 1.255e+01 -1.757 0.08491 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6542 on 51 degrees of freedom
(76 observations deleted due to missingness)
Multiple R-squared: 0.8664, Adjusted R-squared: 0.8087
F-statistic: 15.03 on 22 and 51 DF, p-value: 2.809e-15
The full-controls specification—which adds both country and year fixed effects to the urban-form block—explains nearly 87 % of the variation in surface-stop density (adj. R² = 0.81). Once national dummies absorb cross-country heterogeneity, only metropolitan population density (density_pta_total
) retains a positive, statistically significant effect ( t ≈ 2.9, p ≈ 0.006). All alternative morphology metrics either lose significance or flip sign and become collinear.
These patterns indicate that metropolitan-wide density is the only urban-form attribute whose influence survives once national context is held constant. To avoid multicollinearity, maximise sample size, and keep the model interpretable, we therefore drop the redundant covariates and re-estimate a specification that retains density_pta_total
alongside the country and year fixed effects.
<- lm(
fit_core ~ density_pta_total + factor(YEAR) + factor(Country),
supply_stops_km2 data = df_model_joined
)
summary(fit_core )
Call:
lm(formula = supply_stops_km2 ~ density_pta_total + factor(YEAR) +
factor(Country), data = df_model_joined)
Residuals:
Min 1Q Median 3Q Max
-2.7330 -0.3056 0.0121 0.2406 5.0121
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.1218533 0.7384921 -0.165 0.8692
density_pta_total 0.0007739 0.0001308 5.917 3.03e-08 ***
factor(YEAR)2014 -0.0349211 0.4527469 -0.077 0.9386
factor(YEAR)2015 -0.0588648 0.4451113 -0.132 0.8950
factor(YEAR)2016 -0.5804466 0.4328281 -1.341 0.1824
factor(YEAR)2017 -0.0806696 0.4262426 -0.189 0.8502
factor(YEAR)2018 -0.0720508 0.4277533 -0.168 0.8665
factor(YEAR)2019 0.0344709 0.4163001 0.083 0.9341
factor(YEAR)2020 -0.2652264 0.4225014 -0.628 0.5313
factor(YEAR)2021 -0.0544375 0.4942806 -0.110 0.9125
factor(YEAR)2022 0.3846243 0.4947104 0.777 0.4384
factor(YEAR)2023 0.1876878 0.4445351 0.422 0.6736
factor(Country)Denmark 1.4367098 0.8569274 1.677 0.0962 .
factor(Country)Finland 3.7180675 0.8332343 4.462 1.81e-05 ***
factor(Country)France 4.3427844 0.9256151 4.692 7.09e-06 ***
factor(Country)Germany 0.6669973 0.8169715 0.816 0.4158
factor(Country)Greece 0.7181130 1.3435460 0.534 0.5940
factor(Country)Italy 0.1980941 1.3442415 0.147 0.8831
factor(Country)Lithuania 0.5300075 0.9419118 0.563 0.5747
factor(Country)Netherlands 2.0507889 1.1075929 1.852 0.0665 .
factor(Country)Norway 1.5882530 0.9157731 1.734 0.0854 .
factor(Country)Poland 1.5431743 1.1263100 1.370 0.1731
factor(Country)Portugal -0.6846486 1.0223835 -0.670 0.5043
factor(Country)Serbia 1.0999278 0.8904302 1.235 0.2191
factor(Country)Spain 1.2042661 0.8290782 1.453 0.1489
factor(Country)Sweden 0.8483459 0.9032462 0.939 0.3495
factor(Country)United Kingdom 9.6353007 0.9161499 10.517 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.035 on 123 degrees of freedom
Multiple R-squared: 0.9571, Adjusted R-squared: 0.9481
F-statistic: 105.6 on 26 and 123 DF, p-value: < 2.2e-16
Metropolitan population density is the sole urban-form driver that survives once country and year fixed effects are added:
density_pta_total
remains positive and highly significant (β ≈ 7.7 × 10⁻⁴, p < 10⁻⁷).Year dummies are insignificant, indicating minor temporal variation once structure is controlled.
A handful of country dummies (e.g., FI, FR, UK) differ from the reference, highlighting national network standards.
This pared-down model still explains around 95 % of the variance, confirming that metropolitan density is the key morphology lever for stop supply across European FUAs. To identify meaningful peer groups, we cluster FUAs only on the two variables that explain public-transport provision:
Variable | Interpretation |
---|---|
density_pta_total |
Metropolitan population density (morphological driver) |
supply_stops_km2 |
Surface public-transport stop density (service intensity) |
library(dplyr)
library(ggplot2)
# 1. Build the clustering data set
<- df_model_joined %>%
df_cluster select(
PTA,# population
POP_PTA,
supply_stops_km2,
density_pta_total,
job_density
)
# 2. K-means on scaled variables
set.seed(123)
<- kmeans(
kmeans_result scale(df_cluster %>% select(supply_stops_km2, density_pta_total)),
centers = 3, nstart = 25
)
$cluster_raw <- kmeans_result$cluster # original labels 1–3
df_cluster
# 3. Rank clusters by *median* PT supply and relabel so highest = "1"
<- df_cluster %>%
cluster_ranking group_by(cluster_raw) %>%
summarise(med_supply = median(supply_stops_km2), .groups = "drop") %>%
arrange(desc(med_supply)) %>%
mutate(new_label = factor(row_number(), levels = 1:3))
<- df_cluster %>%
df_cluster left_join(cluster_ranking, by = "cluster_raw") %>%
mutate(cluster = new_label) # final ordered factor 1, 2, 3
# 4. Quick check of medians
%>%
df_cluster group_by(cluster) %>%
summarise(
n = n(),
med_stops_km2 = median(supply_stops_km2),
med_density_pta = median(density_pta_total),
.groups = "drop"
)
# A tibble: 3 × 4
cluster n med_stops_km2 med_density_pta
<fct> <int> <dbl> <dbl>
1 1 11 12.9 5573.
2 2 21 12.5 3095.
3 3 118 1.49 678.
# 5. R²
<- summary(lm(supply_stops_km2 ~ density_pta_total, data = df_cluster))$r.squared
r2 <- c("1" = "#04328C", # navy
clust_cols "2" = "#BF0B4D", # magenta
"3" = "#F29F05") # sunflower
ggplot(
df_cluster,aes(
x = density_pta_total,
y = supply_stops_km2,
colour = cluster,
size = POP_PTA
)+
) geom_point(alpha = 0.75) +
scale_colour_manual(values = clust_cols, name = "Cluster") + # <- here
scale_size_continuous(range = c(2, 14), name = "Population") +
labs(
x = "Metropolitan population density (inhab./km²)",
y = "PT stop density (stops/km²)",
caption = paste0("R² (simple fit) = ", round(r2, 2))
+
) theme_minimal(base_size = 11) +
theme(legend.position = "right")
Using k-means on z-scores of metropolitan population density (density_pta_total
) and surface PT-stop density (supply_stops_km2
), we obtain three clusters ( k = 3, 25 random starts). Clusters are ordered by their median stop density.
<- c(
vars supply_stops_km2 = "PT stops density (stops/km²)",
density_pta_total = "FUA pop. density (inhab./km²)",
job_density = "Job density (jobs/km²)",
POP_PTA = "Population (inhab.)"
)
<- df_cluster %>%
plot_df pivot_longer(cols = all_of(names(vars)),
names_to = "variable_raw",
values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(variable = vars[variable_raw])
ggplot(plot_df,
aes(x = value, y = cluster, fill = cluster)) +
geom_density_ridges(alpha = 0.6, scale = 1, colour = "grey40") +
scale_fill_manual(values = clust_cols) + # <- apply your palette
facet_wrap(~ variable, scales = "free_x", ncol = 1) +
labs(
title = "",
x = NULL,
y = "FUA Types (clusters)"
+
) theme_minimal(base_size = 11) +
theme(legend.position = "none")