Code setup:
First we import our data including the Perth Airport Station.
I have downloaded data for several weather stations in the greater Perth area. These were downloaded in two batches so they are in two seperate CSVs. Thankfully, both are in a tidy format by default which makes them quite easy to wrangle.
One batch is American units so we have to convert those to metric so they fit with the rest of the data:
suppressMessages(library(data.table))
suppressMessages(library(tidyverse))
suppressMessages(library(ggdark))
suppressMessages(library(patchwork))
suppressMessages(library(weathermetrics))
PER_airport <- fread("../data/3184544.csv")
#filter for usability
#turns out the data attributes are useless lol
PER_airport <- PER_airport %>% select(!ends_with("_ATTRIBUTES"))
NOAA_other_stations_1_freedomUnits <- fread("../data/3186152.csv")
NOAA_other_stations_1 <- NOAA_other_stations_1_freedomUnits %>%
#drop extra cols
select(!c("DAPR","DATN","DATX","DWPR","MDPR","MDTN","MDTX","TAVG")) %>%
mutate(
PRCP = PRCP*25.4,
TMAX = fahrenheit.to.celsius(TMAX),
TMIN = fahrenheit.to.celsius(TMIN)
)
NOAA_other_stations_2 <- fread("../data/3186197.csv") %>%
select(!c("DAPR","DATN","DATX","DWPR","MDPR","MDTN","MDTX"))
NOAA_all_stations <- rbind(NOAA_other_stations_1,NOAA_other_stations_2)
PER_airport <- PER_airport %>% select(!c("TAVG","SNWD"))
NOAA_all_stations <- rbind(NOAA_all_stations, PER_airport)Add some convenience columns to the dataset, and sanity check rainfall between stations; they all look similar which is what we are expecting here.
NOAA_all_stations <- NOAA_all_stations %>%
mutate(
Year = format(DATE, "%Y") %>% as.numeric(),
Month = month(DATE),
Day = format(DATE, "%d") %>% as.numeric(),
#bool for any rain
did_rain = PRCP > 0,
Month_name = month.name[Month] %>% fct_relevel(month.name)
)
NOAA_all_stations %>%
group_by(NAME) %>%
summarise(
mean_rainfall = mean(PRCP, na.rm = T),
mean_TMIN = mean(TMIN, na.rm = T),
mean_TMAX = mean(TMAX, na.rm = T),
)The data is incomplete in a number of ways: 1. While the full set of dates runs between 1901-2022. No station has data for the entire period 2. Stations lack some data for periods when they were active.
We can explore this a bit with R. First, we are interested in missing data in temperature and rainfall data.
There are only a few thousand missing rainfall records, but 84754 rows of missing data for TMIN and TMAX. This is 32.0240916% of the rows! This is because there are large gaps in temperature records while rainfall records have continued for the same period.
NOAA_all_stations %>% dim## [1] 264657 14
cbind(
NOAA_all_stations %>% count(is.na(TMAX)),
NOAA_all_stations %>% count(is.na(TMIN)),
NOAA_all_stations %>% count(is.na(PRCP))
)We can inspect the percent missing per station. Some stations are missing over 80% of records, while several stations have very complete TMAX records while rainfall was recorded.
NOAA_all_stations %>%
group_by(NAME) %>%
count(isna_TMAX=!is.na(TMAX)) %>%
pivot_wider(NAME, values_from = n, names_from = isna_TMAX) %>%
mutate(percentage_missing = (`FALSE`/(`FALSE`+`TRUE`))*100) %>%
arrange(desc(percentage_missing))We want to know coverage for rainfall and temperature for each station.
What about specific for each kind of data?
For where data is available, we have complete records of rainfall. But temperature has large gaps for several stations.
dates_covered_by_stations <- NOAA_all_stations %>%
group_by(NAME) %>%
summarise(
minDate = min(DATE),
maxDate = max(DATE),
)
dates_covered <- dates_covered_by_stations %>%
ggplot(aes(NAME, ymin=minDate, ymax=maxDate)) +
geom_linerange() +
dark_theme_bw() +
theme(axis.text.x=element_text(angle=45,hjust=1)) +
xlab("Station Name") + ylab("Date range (Years)") +
ggtitle("Date range available for each station")
dates_covered_by_variable <- NOAA_all_stations %>%
pivot_longer(c(TMAX,TMIN,PRCP), names_to = "measurement") %>%
# filter(NAME == "ROTTNEST ISLAND LIGHTHOUSE, AS") %>%
ggplot(aes(measurement, DATE, colour=!is.na(value))) +
geom_point() +
dark_theme_bw() +
facet_wrap(vars(NAME)) +
ggtitle("Dates where values are available for each station")
dates_covered + dates_covered_by_variableWe can do some quick checks using the non-filtered data. But we will need to use filtered data to ensure robust modelling later.
We can get some statistics for each year and then quickly graph to see how these differ between stations. We can see that rainfall totals trend downwards each year for most stations but there are some exceptions.
yearly_statistics <- NOAA_all_stations %>%
group_by(NAME, Year) %>%
summarise(
meanRainfall = mean(PRCP, na.rm = T),
totalRainfall = sum(PRCP, na.rm = T),
meanMaximumTemperature = mean(TMAX),
meanMinimumTemperature = mean(TMIN),
)
yearly_statistics %>%
ggplot(aes(Year, totalRainfall, colour = NAME)) +
geom_point(alpha=0.5) +
geom_smooth(method="lm", se = F) +
dark_theme_bw()We can see that the two stations on Rottnest are showing roughly similar trends but because of the way the data is split the regression trends predict opposite directions of effect for total rainfall. I think it would be best to include these as a single dataset and then use the station name as a factor variable to account for the difference between each station.
yearly_statistics %>%
filter(str_detect(NAME, "ROTTNEST")) %>%
ggplot(aes(Year, totalRainfall, colour = NAME)) +
geom_point(alpha=0.5) +
geom_smooth(method="lm", se = F) +
dark_theme_bw() +
ggtitle("total rainfall each year for Rottnest Island from both stations")Rotto_yearly_statistics <- NOAA_all_stations %>%
group_by(NAME, Year) %>%
summarise(
totalRainfall = sum(PRCP, na.rm = F),
naRainfallCount = sum(is.na(PRCP)),
) %>%
filter(str_detect(NAME, "ROTTNEST"), Year >= 1907)
Rotto_yearly_statistics_fewNa <- Rotto_yearly_statistics %>%
filter(naRainfallCount < 10)
Rotto_yearly_statistics %>%
ggplot(aes(Year, totalRainfall)) +
geom_point() +
geom_smooth(method="lm") +
dark_theme_bw()After accounting for the station name as an interaction term we can see that the resulting change in rainfall per year is negative (with nominal significance), despite the estimation of a positive slope in the more recent data.
Rotto_yearly_statistics %>%
ggplot(aes(Year, totalRainfall, colour = NAME)) +
geom_point(alpha=0.5) +
geom_smooth(method="lm", se = F) +
dark_theme_bw() +
ggtitle("total rainfall each year for Rottnest Island from both stations")model_rotto_rainfall_1 <- lm(totalRainfall ~ Year * NAME, data=Rotto_yearly_statistics_fewNa)
summary(model_rotto_rainfall_1)##
## Call:
## lm(formula = totalRainfall ~ Year * NAME, data = Rotto_yearly_statistics_fewNa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -360.93 -104.19 -7.58 120.92 434.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.735e+03 1.343e+03 2.781 0.00660 **
## Year -1.549e+00 6.893e-01 -2.248 0.02703 *
## NAMEROTTNEST ISLAND, AS -3.281e+04 9.739e+03 -3.369 0.00111 **
## Year:NAMEROTTNEST ISLAND, AS 1.634e+01 4.867e+00 3.357 0.00116 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 157 on 90 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.2667, Adjusted R-squared: 0.2422
## F-statistic: 10.91 on 3 and 90 DF, p-value: 3.517e-06
Since we have decent justification for both stations being similar, we can combine them into a single Rottnest station:
NOAA_all_stations_clean <- NOAA_all_stations %>%
mutate(
NAME = case_when(
NAME == "ROTTNEST ISLAND LIGHTHOUSE, AS" ~ "ROTTNEST",
NAME == "ROTTNEST ISLAND, AS" ~ "ROTTNEST",
.default = NAME
)
)We can account for Year and then hold it at a set value to estimate the differences in temperature and rainfall at different weather stations.
Initially let’s use the raw data:
model_allStations_temp_prcp <- lm(
TMAX ~ PRCP + NAME + Year + as.factor(Month),
data = NOAA_all_stations_clean
)
summary(model_allStations_temp_prcp)##
## Call:
## lm(formula = TMAX ~ PRCP + NAME + Year + as.factor(Month), data = NOAA_all_stations_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6217 -2.5105 -0.3804 2.0531 23.0994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.299e+01 1.034e+00 -12.561 < 2e-16 ***
## PRCP -1.234e-01 1.536e-03 -80.367 < 2e-16 ***
## NAMEBICKLEY, AS -1.381e+00 8.314e-02 -16.614 < 2e-16 ***
## NAMEGOSNELLS CITY, AS 1.528e+00 8.357e-02 18.283 < 2e-16 ***
## NAMEKWINANA BP REFINERY, AS -3.540e-01 7.867e-02 -4.500 6.81e-06 ***
## NAMEMEDINA RESEARCH CENTRE, AS 5.839e-01 8.116e-02 7.194 6.30e-13 ***
## NAMEPEARCE RAAF, AS 1.583e+00 7.859e-02 20.142 < 2e-16 ***
## NAMEPERTH AIRPORT, AS 1.053e+00 7.671e-02 13.723 < 2e-16 ***
## NAMEPERTH METRO, AS 7.562e-01 8.280e-02 9.134 < 2e-16 ***
## NAMEPERTH REGIONAL OFFICE, AS 6.172e-01 7.838e-02 7.874 3.44e-15 ***
## NAMEROTTNEST -1.851e+00 7.775e-02 -23.806 < 2e-16 ***
## NAMESTONEVILLE RESEARCH STATION, AS -3.750e-01 1.253e-01 -2.992 0.00277 **
## NAMESUBIACO TREATMENT PLANT, AS 9.200e-01 9.432e-02 9.754 < 2e-16 ***
## NAMESWANBOURNE, AS -1.248e-02 8.288e-02 -0.151 0.88031
## Year 2.173e-02 5.198e-04 41.803 < 2e-16 ***
## as.factor(Month)2 3.097e-01 4.375e-02 7.078 1.47e-12 ***
## as.factor(Month)3 -1.708e+00 4.276e-02 -39.950 < 2e-16 ***
## as.factor(Month)4 -5.202e+00 4.321e-02 -120.395 < 2e-16 ***
## as.factor(Month)5 -8.605e+00 4.311e-02 -199.586 < 2e-16 ***
## as.factor(Month)6 -1.102e+01 4.383e-02 -251.305 < 2e-16 ***
## as.factor(Month)7 -1.207e+01 4.347e-02 -277.585 < 2e-16 ***
## as.factor(Month)8 -1.173e+01 4.311e-02 -271.994 < 2e-16 ***
## as.factor(Month)9 -1.040e+01 4.323e-02 -240.588 < 2e-16 ***
## as.factor(Month)10 -8.102e+00 4.280e-02 -189.270 < 2e-16 ***
## as.factor(Month)11 -4.976e+00 4.310e-02 -115.471 < 2e-16 ***
## as.factor(Month)12 -2.319e+00 4.311e-02 -53.785 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.697 on 176441 degrees of freedom
## (88190 observations deleted due to missingness)
## Multiple R-squared: 0.6263, Adjusted R-squared: 0.6263
## F-statistic: 1.183e+04 on 25 and 176441 DF, p-value: < 2.2e-16
Use predict Year==1980?
stationNames = NOAA_all_stations_clean$NAME %>% unique
predict_TMAX <- function(station, model, Year, PRCP, Month) {
return(
predict(
model,
list(Year = Year,
PRCP = PRCP,
NAME = station,
Month= Month))
)
}
stationPredictions <- data.frame(station = stationNames)
stationPredictions$Year <- 1980
stationPredictions$PRCP <- 1
stationPredictions$Month <- 1
stationPredictions %>%
rowwise() %>%
mutate(Est_TMAX_Jul1980 = predict_TMAX(station, model_allStations_temp_prcp, Year, PRCP, Month))Turns out the weather between stations is just really similar! There are likely some differences, but it is difficult to measure the relationship between location and temperature/rainfall with only 13 stations.
weatherSummary_all_stations <- NOAA_all_stations_clean %>%
group_by(NAME) %>%
summarise(
meanRainfall = mean(PRCP, na.rm = TRUE),
meanTMAX = mean(TMAX, na.rm = TRUE),
meanTMIN = mean(TMIN, na.rm = TRUE),
maxTMAX = max(TMAX, na.rm = TRUE),
maxTMIN = max(TMIN, na.rm = TRUE),
) %>%
left_join(., NOAA_all_stations_clean %>% select(NAME, LATITUDE, LONGITUDE), by="NAME", multiple="first")
weatherSummary_all_stationsweatherSummary_all_stations %>%
ggplot(aes(LONGITUDE, maxTMAX)) + geom_point() +
geom_smooth(method = "lm") +
dark_theme_bw() +
ggtitle("relationship between record high temperature and location")weatherSummary_all_stations %>%
ggplot(aes(LONGITUDE, maxTMIN, colour = NAME == "ROTTNEST")) + geom_point() +
geom_smooth(method = "lm") +
dark_theme_bw() +
ggtitle("relationship between record low temperature and location")Risk of rain for each day of the year at each station.
prop_rained_all <- NOAA_all_stations_clean %>%
drop_na(PRCP) %>%
group_by(Month, NAME, Day) %>%
summarise(
number_of_rainy_days = sum(did_rain), days_observed = n()
) %>%
#change na to 0
mutate(number_of_rainy_days = ifelse(is.na(number_of_rainy_days),0,number_of_rainy_days)) %>%
#calculate proportion rainfall for a given day
mutate(proportion_rained = number_of_rainy_days/days_observed) %>%
#we add an index so we can count the days
group_by(NAME) %>%
mutate(dayIndex = row_number(NAME))
prop_rained_all %>%
ggplot(aes(dayIndex, proportion_rained, colour = NAME)) +
geom_point(alpha=0.1) +
geom_smooth(se=FALSE) +
ggtitle("Risk of any rain for each day in the year") +
xlab("day (0 is January 1st)") +
ylab("Risk of rain") +
ylim(c(0,1))prop_rained_all %>%
group_by(NAME) %>%
summarise(
max_proportion_rained = max(proportion_rained),
mean_proportion_rained = mean(proportion_rained)
) %>%
arrange(desc(max_proportion_rained)) %>%
left_join(., NOAA_all_stations_clean %>% select(NAME, LATITUDE, LONGITUDE), by="NAME", multiple="first") %>%
ggplot(aes(LATITUDE, LONGITUDE, colour = mean_proportion_rained)) +
geom_point()