Code setup:

import and clean data

First we import our data including the Perth Airport Station.

I have downloaded data for several weather stations in the greater Perth area. These were downloaded in two batches so they are in two seperate CSVs. Thankfully, both are in a tidy format by default which makes them quite easy to wrangle.

One batch is American units so we have to convert those to metric so they fit with the rest of the data:

suppressMessages(library(data.table))
suppressMessages(library(tidyverse))
suppressMessages(library(ggdark))
suppressMessages(library(patchwork))
suppressMessages(library(weathermetrics))
PER_airport <- fread("../data/3184544.csv")
#filter for usability
#turns out the data attributes are useless lol
PER_airport <- PER_airport %>% select(!ends_with("_ATTRIBUTES"))

NOAA_other_stations_1_freedomUnits <- fread("../data/3186152.csv")
NOAA_other_stations_1 <- NOAA_other_stations_1_freedomUnits %>% 
  #drop extra cols
  select(!c("DAPR","DATN","DATX","DWPR","MDPR","MDTN","MDTX","TAVG")) %>% 
  mutate(
    PRCP = PRCP*25.4,
    TMAX = fahrenheit.to.celsius(TMAX),
    TMIN = fahrenheit.to.celsius(TMIN)
  )
NOAA_other_stations_2 <- fread("../data/3186197.csv") %>% 
  select(!c("DAPR","DATN","DATX","DWPR","MDPR","MDTN","MDTX"))

NOAA_all_stations <- rbind(NOAA_other_stations_1,NOAA_other_stations_2)
PER_airport <- PER_airport %>% select(!c("TAVG","SNWD"))
NOAA_all_stations <- rbind(NOAA_all_stations, PER_airport)

Add some convenience columns to the dataset, and sanity check rainfall between stations; they all look similar which is what we are expecting here.

NOAA_all_stations <- NOAA_all_stations %>% 
  mutate(
    Year = format(DATE, "%Y") %>% as.numeric(),
    Month = month(DATE),
    Day = format(DATE, "%d") %>% as.numeric(),
    #bool for any rain
    did_rain = PRCP > 0,
    Month_name = month.name[Month] %>% fct_relevel(month.name)
  )

NOAA_all_stations %>% 
  group_by(NAME) %>% 
  summarise(
    mean_rainfall = mean(PRCP, na.rm = T),
    mean_TMIN = mean(TMIN, na.rm = T),
    mean_TMAX = mean(TMAX, na.rm = T),
  )

data completeness

The data is incomplete in a number of ways: 1. While the full set of dates runs between 1901-2022. No station has data for the entire period 2. Stations lack some data for periods when they were active.

We can explore this a bit with R. First, we are interested in missing data in temperature and rainfall data.

There are only a few thousand missing rainfall records, but 84754 rows of missing data for TMIN and TMAX. This is 32.0240916% of the rows! This is because there are large gaps in temperature records while rainfall records have continued for the same period.

NOAA_all_stations %>% dim

## [1] 264657     14

cbind(
  NOAA_all_stations %>% count(is.na(TMAX)),
  NOAA_all_stations %>% count(is.na(TMIN)),
  NOAA_all_stations %>% count(is.na(PRCP))
)

We can inspect the percent missing per station. Some stations are missing over 80% of records, while several stations have very complete TMAX records while rainfall was recorded.

NOAA_all_stations %>% 
  group_by(NAME) %>% 
  count(isna_TMAX=!is.na(TMAX)) %>% 
  pivot_wider(NAME, values_from = n, names_from = isna_TMAX) %>% 
  mutate(percentage_missing = (`FALSE`/(`FALSE`+`TRUE`))*100) %>% 
  arrange(desc(percentage_missing))

periods with recorded data for each station

We want to know coverage for rainfall and temperature for each station.

What about specific for each kind of data?

For where data is available, we have complete records of rainfall. But temperature has large gaps for several stations.

dates_covered_by_stations <- NOAA_all_stations %>% 
  group_by(NAME) %>% 
  summarise(
    minDate = min(DATE),
    maxDate = max(DATE),
  )
dates_covered <- dates_covered_by_stations %>% 
  ggplot(aes(NAME, ymin=minDate, ymax=maxDate)) +
  geom_linerange() +
  dark_theme_bw() +
  theme(axis.text.x=element_text(angle=45,hjust=1)) +
  xlab("Station Name") + ylab("Date range (Years)") +
  ggtitle("Date range available for each station")

dates_covered_by_variable <- NOAA_all_stations %>% 
  pivot_longer(c(TMAX,TMIN,PRCP), names_to = "measurement") %>%
  # filter(NAME == "ROTTNEST ISLAND LIGHTHOUSE, AS") %>% 
  ggplot(aes(measurement, DATE, colour=!is.na(value))) +
  geom_point() +
  dark_theme_bw() +
  facet_wrap(vars(NAME)) +
  ggtitle("Dates where values are available for each station")

dates_covered + dates_covered_by_variable

We can do some quick checks using the non-filtered data. But we will need to use filtered data to ensure robust modelling later.

We can get some statistics for each year and then quickly graph to see how these differ between stations. We can see that rainfall totals trend downwards each year for most stations but there are some exceptions.

yearly_statistics <- NOAA_all_stations %>% 
  group_by(NAME, Year) %>% 
  summarise(
    meanRainfall = mean(PRCP, na.rm = T),
    totalRainfall = sum(PRCP, na.rm = T),
    meanMaximumTemperature = mean(TMAX),
    meanMinimumTemperature = mean(TMIN),
  ) 

yearly_statistics %>% 
  ggplot(aes(Year, totalRainfall, colour = NAME)) +
  geom_point(alpha=0.5) +
  geom_smooth(method="lm", se = F) +
  dark_theme_bw()

We can see that the two stations on Rottnest are showing roughly similar trends but because of the way the data is split the regression trends predict opposite directions of effect for total rainfall. I think it would be best to include these as a single dataset and then use the station name as a factor variable to account for the difference between each station.

yearly_statistics %>% 
  filter(str_detect(NAME, "ROTTNEST")) %>% 
  ggplot(aes(Year, totalRainfall, colour = NAME)) +
  geom_point(alpha=0.5) +
  geom_smooth(method="lm", se = F) +
  dark_theme_bw() +
  ggtitle("total rainfall each year for Rottnest Island from both stations")

Rotto_yearly_statistics <- NOAA_all_stations %>% 
  group_by(NAME, Year) %>%
  summarise(
    totalRainfall = sum(PRCP, na.rm = F),
    naRainfallCount = sum(is.na(PRCP)),
  ) %>% 
  filter(str_detect(NAME, "ROTTNEST"), Year >= 1907)

Rotto_yearly_statistics_fewNa <- Rotto_yearly_statistics %>% 
  filter(naRainfallCount < 10)

Rotto_yearly_statistics %>% 
  ggplot(aes(Year, totalRainfall)) +
  geom_point() +
  geom_smooth(method="lm") +
  dark_theme_bw()

After accounting for the station name as an interaction term we can see that the resulting change in rainfall per year is negative (with nominal significance), despite the estimation of a positive slope in the more recent data.

Rotto_yearly_statistics %>% 
  ggplot(aes(Year, totalRainfall, colour = NAME)) +
  geom_point(alpha=0.5) +
  geom_smooth(method="lm", se = F) +
  dark_theme_bw() +
  ggtitle("total rainfall each year for Rottnest Island from both stations")

model_rotto_rainfall_1 <- lm(totalRainfall ~ Year * NAME, data=Rotto_yearly_statistics_fewNa)
summary(model_rotto_rainfall_1)

## 
## Call:
## lm(formula = totalRainfall ~ Year * NAME, data = Rotto_yearly_statistics_fewNa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -360.93 -104.19   -7.58  120.92  434.12 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                   3.735e+03  1.343e+03   2.781  0.00660 **
## Year                         -1.549e+00  6.893e-01  -2.248  0.02703 * 
## NAMEROTTNEST ISLAND, AS      -3.281e+04  9.739e+03  -3.369  0.00111 **
## Year:NAMEROTTNEST ISLAND, AS  1.634e+01  4.867e+00   3.357  0.00116 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 157 on 90 degrees of freedom
##   (23 observations deleted due to missingness)
## Multiple R-squared:  0.2667, Adjusted R-squared:  0.2422 
## F-statistic: 10.91 on 3 and 90 DF,  p-value: 3.517e-06

Since we have decent justification for both stations being similar, we can combine them into a single Rottnest station:

NOAA_all_stations_clean <- NOAA_all_stations %>% 
  mutate(
    NAME = case_when(
      NAME == "ROTTNEST ISLAND LIGHTHOUSE, AS" ~ "ROTTNEST",
      NAME == "ROTTNEST ISLAND, AS" ~ "ROTTNEST",
      .default = NAME
    )
  )

using models to identify similarity and difference between stations

We can account for Year and then hold it at a set value to estimate the differences in temperature and rainfall at different weather stations.

Initially let’s use the raw data:

model_allStations_temp_prcp <- lm(
  TMAX ~ PRCP + NAME + Year + as.factor(Month),
  data = NOAA_all_stations_clean
)
summary(model_allStations_temp_prcp)

## 
## Call:
## lm(formula = TMAX ~ PRCP + NAME + Year + as.factor(Month), data = NOAA_all_stations_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6217  -2.5105  -0.3804   2.0531  23.0994 
## 
## Coefficients:
##                                       Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                         -1.299e+01  1.034e+00  -12.561  < 2e-16 ***
## PRCP                                -1.234e-01  1.536e-03  -80.367  < 2e-16 ***
## NAMEBICKLEY, AS                     -1.381e+00  8.314e-02  -16.614  < 2e-16 ***
## NAMEGOSNELLS CITY, AS                1.528e+00  8.357e-02   18.283  < 2e-16 ***
## NAMEKWINANA BP REFINERY, AS         -3.540e-01  7.867e-02   -4.500 6.81e-06 ***
## NAMEMEDINA RESEARCH CENTRE, AS       5.839e-01  8.116e-02    7.194 6.30e-13 ***
## NAMEPEARCE RAAF, AS                  1.583e+00  7.859e-02   20.142  < 2e-16 ***
## NAMEPERTH AIRPORT, AS                1.053e+00  7.671e-02   13.723  < 2e-16 ***
## NAMEPERTH METRO, AS                  7.562e-01  8.280e-02    9.134  < 2e-16 ***
## NAMEPERTH REGIONAL OFFICE, AS        6.172e-01  7.838e-02    7.874 3.44e-15 ***
## NAMEROTTNEST                        -1.851e+00  7.775e-02  -23.806  < 2e-16 ***
## NAMESTONEVILLE RESEARCH STATION, AS -3.750e-01  1.253e-01   -2.992  0.00277 ** 
## NAMESUBIACO TREATMENT PLANT, AS      9.200e-01  9.432e-02    9.754  < 2e-16 ***
## NAMESWANBOURNE, AS                  -1.248e-02  8.288e-02   -0.151  0.88031    
## Year                                 2.173e-02  5.198e-04   41.803  < 2e-16 ***
## as.factor(Month)2                    3.097e-01  4.375e-02    7.078 1.47e-12 ***
## as.factor(Month)3                   -1.708e+00  4.276e-02  -39.950  < 2e-16 ***
## as.factor(Month)4                   -5.202e+00  4.321e-02 -120.395  < 2e-16 ***
## as.factor(Month)5                   -8.605e+00  4.311e-02 -199.586  < 2e-16 ***
## as.factor(Month)6                   -1.102e+01  4.383e-02 -251.305  < 2e-16 ***
## as.factor(Month)7                   -1.207e+01  4.347e-02 -277.585  < 2e-16 ***
## as.factor(Month)8                   -1.173e+01  4.311e-02 -271.994  < 2e-16 ***
## as.factor(Month)9                   -1.040e+01  4.323e-02 -240.588  < 2e-16 ***
## as.factor(Month)10                  -8.102e+00  4.280e-02 -189.270  < 2e-16 ***
## as.factor(Month)11                  -4.976e+00  4.310e-02 -115.471  < 2e-16 ***
## as.factor(Month)12                  -2.319e+00  4.311e-02  -53.785  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.697 on 176441 degrees of freedom
##   (88190 observations deleted due to missingness)
## Multiple R-squared:  0.6263, Adjusted R-squared:  0.6263 
## F-statistic: 1.183e+04 on 25 and 176441 DF,  p-value: < 2.2e-16

Use predict Year==1980?

stationNames = NOAA_all_stations_clean$NAME %>% unique

predict_TMAX <- function(station, model, Year, PRCP, Month) {
  return(
    predict(
      model,
      list(Year = Year,
           PRCP = PRCP,
           NAME = station,
           Month= Month))
    )
}

stationPredictions <- data.frame(station = stationNames)
stationPredictions$Year <- 1980
stationPredictions$PRCP <- 1
stationPredictions$Month <- 1

stationPredictions %>% 
  rowwise() %>% 
  mutate(Est_TMAX_Jul1980 = predict_TMAX(station, model_allStations_temp_prcp, Year, PRCP, Month))

differences between stations

Turns out the weather between stations is just really similar! There are likely some differences, but it is difficult to measure the relationship between location and temperature/rainfall with only 13 stations.

weatherSummary_all_stations <- NOAA_all_stations_clean %>% 
  group_by(NAME) %>% 
  summarise(
    meanRainfall = mean(PRCP, na.rm = TRUE),
    meanTMAX = mean(TMAX, na.rm = TRUE),
    meanTMIN = mean(TMIN, na.rm = TRUE),
    maxTMAX = max(TMAX, na.rm = TRUE),
    maxTMIN = max(TMIN, na.rm = TRUE),
  ) %>% 
  left_join(., NOAA_all_stations_clean %>% select(NAME, LATITUDE, LONGITUDE), by="NAME", multiple="first")

weatherSummary_all_stations

weatherSummary_all_stations %>% 
  ggplot(aes(LONGITUDE, maxTMAX)) + geom_point() +
  geom_smooth(method = "lm") +
  dark_theme_bw() +
  ggtitle("relationship between record high temperature and location")

weatherSummary_all_stations %>% 
  ggplot(aes(LONGITUDE, maxTMIN, colour = NAME == "ROTTNEST")) + geom_point() +
  geom_smooth(method = "lm") +
  dark_theme_bw() +
  ggtitle("relationship between record low temperature and location")

chance rainfall on a given day for each station?

Risk of rain for each day of the year at each station.

prop_rained_all <- NOAA_all_stations_clean %>% 
  drop_na(PRCP) %>% 
  group_by(Month, NAME, Day) %>% 
  summarise(
      number_of_rainy_days = sum(did_rain), days_observed = n()
    ) %>%
  #change na to 0
  mutate(number_of_rainy_days = ifelse(is.na(number_of_rainy_days),0,number_of_rainy_days)) %>% 
  #calculate proportion rainfall for a given day
    mutate(proportion_rained = number_of_rainy_days/days_observed) %>% 
  #we add an index so we can count the days
  group_by(NAME) %>% 
  mutate(dayIndex = row_number(NAME))

prop_rained_all %>% 
  ggplot(aes(dayIndex, proportion_rained, colour = NAME)) +
  geom_point(alpha=0.1) +
  geom_smooth(se=FALSE) +
  ggtitle("Risk of any rain for each day in the year") +
  xlab("day (0 is January 1st)") +
  ylab("Risk of rain") +
  ylim(c(0,1))

prop_rained_all %>% 
  group_by(NAME) %>% 
  summarise(
    max_proportion_rained = max(proportion_rained),
    mean_proportion_rained = mean(proportion_rained)
  ) %>% 
  arrange(desc(max_proportion_rained)) %>% 
  left_join(., NOAA_all_stations_clean %>% select(NAME, LATITUDE, LONGITUDE), by="NAME", multiple="first") %>% 
  ggplot(aes(LATITUDE, LONGITUDE, colour = mean_proportion_rained)) +
  geom_point()