Code setup:
First we import our data including the Perth Airport Station.
I have downloaded data for several weather stations in the greater Perth area. These were downloaded in two batches so they are in two seperate CSVs. Thankfully, both are in a tidy format by default which makes them quite easy to wrangle.
One batch is American units so we have to convert those to metric so they fit with the rest of the data:
suppressMessages(library(data.table))
suppressMessages(library(tidyverse))
suppressMessages(library(ggdark))
suppressMessages(library(patchwork))
suppressMessages(library(weathermetrics))
<- fread("../data/3184544.csv")
PER_airport #filter for usability
#turns out the data attributes are useless lol
<- PER_airport %>% select(!ends_with("_ATTRIBUTES"))
PER_airport
<- fread("../data/3186152.csv")
NOAA_other_stations_1_freedomUnits <- NOAA_other_stations_1_freedomUnits %>%
NOAA_other_stations_1 #drop extra cols
select(!c("DAPR","DATN","DATX","DWPR","MDPR","MDTN","MDTX","TAVG")) %>%
mutate(
PRCP = PRCP*25.4,
TMAX = fahrenheit.to.celsius(TMAX),
TMIN = fahrenheit.to.celsius(TMIN)
)<- fread("../data/3186197.csv") %>%
NOAA_other_stations_2 select(!c("DAPR","DATN","DATX","DWPR","MDPR","MDTN","MDTX"))
<- rbind(NOAA_other_stations_1,NOAA_other_stations_2)
NOAA_all_stations <- PER_airport %>% select(!c("TAVG","SNWD"))
PER_airport <- rbind(NOAA_all_stations, PER_airport) NOAA_all_stations
Add some convenience columns to the dataset, and sanity check rainfall between stations; they all look similar which is what we are expecting here.
<- NOAA_all_stations %>%
NOAA_all_stations mutate(
Year = format(DATE, "%Y") %>% as.numeric(),
Month = month(DATE),
Day = format(DATE, "%d") %>% as.numeric(),
#bool for any rain
did_rain = PRCP > 0,
Month_name = month.name[Month] %>% fct_relevel(month.name)
)
%>%
NOAA_all_stations group_by(NAME) %>%
summarise(
mean_rainfall = mean(PRCP, na.rm = T),
mean_TMIN = mean(TMIN, na.rm = T),
mean_TMAX = mean(TMAX, na.rm = T),
)
The data is incomplete in a number of ways: 1. While the full set of dates runs between 1901-2022. No station has data for the entire period 2. Stations lack some data for periods when they were active.
We can explore this a bit with R. First, we are interested in missing data in temperature and rainfall data.
There are only a few thousand missing rainfall records, but 84754 rows of missing data for TMIN and TMAX. This is 32.0240916% of the rows! This is because there are large gaps in temperature records while rainfall records have continued for the same period.
%>% dim NOAA_all_stations
## [1] 264657 14
cbind(
%>% count(is.na(TMAX)),
NOAA_all_stations %>% count(is.na(TMIN)),
NOAA_all_stations %>% count(is.na(PRCP))
NOAA_all_stations )
We can inspect the percent missing per station. Some stations are missing over 80% of records, while several stations have very complete TMAX records while rainfall was recorded.
%>%
NOAA_all_stations group_by(NAME) %>%
count(isna_TMAX=!is.na(TMAX)) %>%
pivot_wider(NAME, values_from = n, names_from = isna_TMAX) %>%
mutate(percentage_missing = (`FALSE`/(`FALSE`+`TRUE`))*100) %>%
arrange(desc(percentage_missing))
We want to know coverage for rainfall and temperature for each station.
What about specific for each kind of data?
For where data is available, we have complete records of rainfall. But temperature has large gaps for several stations.
<- NOAA_all_stations %>%
dates_covered_by_stations group_by(NAME) %>%
summarise(
minDate = min(DATE),
maxDate = max(DATE),
)<- dates_covered_by_stations %>%
dates_covered ggplot(aes(NAME, ymin=minDate, ymax=maxDate)) +
geom_linerange() +
dark_theme_bw() +
theme(axis.text.x=element_text(angle=45,hjust=1)) +
xlab("Station Name") + ylab("Date range (Years)") +
ggtitle("Date range available for each station")
<- NOAA_all_stations %>%
dates_covered_by_variable pivot_longer(c(TMAX,TMIN,PRCP), names_to = "measurement") %>%
# filter(NAME == "ROTTNEST ISLAND LIGHTHOUSE, AS") %>%
ggplot(aes(measurement, DATE, colour=!is.na(value))) +
geom_point() +
dark_theme_bw() +
facet_wrap(vars(NAME)) +
ggtitle("Dates where values are available for each station")
+ dates_covered_by_variable dates_covered
We can do some quick checks using the non-filtered data. But we will need to use filtered data to ensure robust modelling later.
We can get some statistics for each year and then quickly graph to see how these differ between stations. We can see that rainfall totals trend downwards each year for most stations but there are some exceptions.
<- NOAA_all_stations %>%
yearly_statistics group_by(NAME, Year) %>%
summarise(
meanRainfall = mean(PRCP, na.rm = T),
totalRainfall = sum(PRCP, na.rm = T),
meanMaximumTemperature = mean(TMAX),
meanMinimumTemperature = mean(TMIN),
)
%>%
yearly_statistics ggplot(aes(Year, totalRainfall, colour = NAME)) +
geom_point(alpha=0.5) +
geom_smooth(method="lm", se = F) +
dark_theme_bw()
We can see that the two stations on Rottnest are showing roughly similar trends but because of the way the data is split the regression trends predict opposite directions of effect for total rainfall. I think it would be best to include these as a single dataset and then use the station name as a factor variable to account for the difference between each station.
%>%
yearly_statistics filter(str_detect(NAME, "ROTTNEST")) %>%
ggplot(aes(Year, totalRainfall, colour = NAME)) +
geom_point(alpha=0.5) +
geom_smooth(method="lm", se = F) +
dark_theme_bw() +
ggtitle("total rainfall each year for Rottnest Island from both stations")
<- NOAA_all_stations %>%
Rotto_yearly_statistics group_by(NAME, Year) %>%
summarise(
totalRainfall = sum(PRCP, na.rm = F),
naRainfallCount = sum(is.na(PRCP)),
%>%
) filter(str_detect(NAME, "ROTTNEST"), Year >= 1907)
<- Rotto_yearly_statistics %>%
Rotto_yearly_statistics_fewNa filter(naRainfallCount < 10)
%>%
Rotto_yearly_statistics ggplot(aes(Year, totalRainfall)) +
geom_point() +
geom_smooth(method="lm") +
dark_theme_bw()
After accounting for the station name as an interaction term we can see that the resulting change in rainfall per year is negative (with nominal significance), despite the estimation of a positive slope in the more recent data.
%>%
Rotto_yearly_statistics ggplot(aes(Year, totalRainfall, colour = NAME)) +
geom_point(alpha=0.5) +
geom_smooth(method="lm", se = F) +
dark_theme_bw() +
ggtitle("total rainfall each year for Rottnest Island from both stations")
<- lm(totalRainfall ~ Year * NAME, data=Rotto_yearly_statistics_fewNa)
model_rotto_rainfall_1 summary(model_rotto_rainfall_1)
##
## Call:
## lm(formula = totalRainfall ~ Year * NAME, data = Rotto_yearly_statistics_fewNa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -360.93 -104.19 -7.58 120.92 434.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.735e+03 1.343e+03 2.781 0.00660 **
## Year -1.549e+00 6.893e-01 -2.248 0.02703 *
## NAMEROTTNEST ISLAND, AS -3.281e+04 9.739e+03 -3.369 0.00111 **
## Year:NAMEROTTNEST ISLAND, AS 1.634e+01 4.867e+00 3.357 0.00116 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 157 on 90 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.2667, Adjusted R-squared: 0.2422
## F-statistic: 10.91 on 3 and 90 DF, p-value: 3.517e-06
Since we have decent justification for both stations being similar, we can combine them into a single Rottnest station:
<- NOAA_all_stations %>%
NOAA_all_stations_clean mutate(
NAME = case_when(
== "ROTTNEST ISLAND LIGHTHOUSE, AS" ~ "ROTTNEST",
NAME == "ROTTNEST ISLAND, AS" ~ "ROTTNEST",
NAME .default = NAME
) )
We can account for Year and then hold it at a set value to estimate the differences in temperature and rainfall at different weather stations.
Initially let’s use the raw data:
<- lm(
model_allStations_temp_prcp ~ PRCP + NAME + Year + as.factor(Month),
TMAX data = NOAA_all_stations_clean
)summary(model_allStations_temp_prcp)
##
## Call:
## lm(formula = TMAX ~ PRCP + NAME + Year + as.factor(Month), data = NOAA_all_stations_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6217 -2.5105 -0.3804 2.0531 23.0994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.299e+01 1.034e+00 -12.561 < 2e-16 ***
## PRCP -1.234e-01 1.536e-03 -80.367 < 2e-16 ***
## NAMEBICKLEY, AS -1.381e+00 8.314e-02 -16.614 < 2e-16 ***
## NAMEGOSNELLS CITY, AS 1.528e+00 8.357e-02 18.283 < 2e-16 ***
## NAMEKWINANA BP REFINERY, AS -3.540e-01 7.867e-02 -4.500 6.81e-06 ***
## NAMEMEDINA RESEARCH CENTRE, AS 5.839e-01 8.116e-02 7.194 6.30e-13 ***
## NAMEPEARCE RAAF, AS 1.583e+00 7.859e-02 20.142 < 2e-16 ***
## NAMEPERTH AIRPORT, AS 1.053e+00 7.671e-02 13.723 < 2e-16 ***
## NAMEPERTH METRO, AS 7.562e-01 8.280e-02 9.134 < 2e-16 ***
## NAMEPERTH REGIONAL OFFICE, AS 6.172e-01 7.838e-02 7.874 3.44e-15 ***
## NAMEROTTNEST -1.851e+00 7.775e-02 -23.806 < 2e-16 ***
## NAMESTONEVILLE RESEARCH STATION, AS -3.750e-01 1.253e-01 -2.992 0.00277 **
## NAMESUBIACO TREATMENT PLANT, AS 9.200e-01 9.432e-02 9.754 < 2e-16 ***
## NAMESWANBOURNE, AS -1.248e-02 8.288e-02 -0.151 0.88031
## Year 2.173e-02 5.198e-04 41.803 < 2e-16 ***
## as.factor(Month)2 3.097e-01 4.375e-02 7.078 1.47e-12 ***
## as.factor(Month)3 -1.708e+00 4.276e-02 -39.950 < 2e-16 ***
## as.factor(Month)4 -5.202e+00 4.321e-02 -120.395 < 2e-16 ***
## as.factor(Month)5 -8.605e+00 4.311e-02 -199.586 < 2e-16 ***
## as.factor(Month)6 -1.102e+01 4.383e-02 -251.305 < 2e-16 ***
## as.factor(Month)7 -1.207e+01 4.347e-02 -277.585 < 2e-16 ***
## as.factor(Month)8 -1.173e+01 4.311e-02 -271.994 < 2e-16 ***
## as.factor(Month)9 -1.040e+01 4.323e-02 -240.588 < 2e-16 ***
## as.factor(Month)10 -8.102e+00 4.280e-02 -189.270 < 2e-16 ***
## as.factor(Month)11 -4.976e+00 4.310e-02 -115.471 < 2e-16 ***
## as.factor(Month)12 -2.319e+00 4.311e-02 -53.785 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.697 on 176441 degrees of freedom
## (88190 observations deleted due to missingness)
## Multiple R-squared: 0.6263, Adjusted R-squared: 0.6263
## F-statistic: 1.183e+04 on 25 and 176441 DF, p-value: < 2.2e-16
Use predict Year==1980?
= NOAA_all_stations_clean$NAME %>% unique
stationNames
<- function(station, model, Year, PRCP, Month) {
predict_TMAX return(
predict(
model,list(Year = Year,
PRCP = PRCP,
NAME = station,
Month= Month))
)
}
<- data.frame(station = stationNames)
stationPredictions $Year <- 1980
stationPredictions$PRCP <- 1
stationPredictions$Month <- 1
stationPredictions
%>%
stationPredictions rowwise() %>%
mutate(Est_TMAX_Jul1980 = predict_TMAX(station, model_allStations_temp_prcp, Year, PRCP, Month))
Turns out the weather between stations is just really similar! There are likely some differences, but it is difficult to measure the relationship between location and temperature/rainfall with only 13 stations.
<- NOAA_all_stations_clean %>%
weatherSummary_all_stations group_by(NAME) %>%
summarise(
meanRainfall = mean(PRCP, na.rm = TRUE),
meanTMAX = mean(TMAX, na.rm = TRUE),
meanTMIN = mean(TMIN, na.rm = TRUE),
maxTMAX = max(TMAX, na.rm = TRUE),
maxTMIN = max(TMIN, na.rm = TRUE),
%>%
) left_join(., NOAA_all_stations_clean %>% select(NAME, LATITUDE, LONGITUDE), by="NAME", multiple="first")
weatherSummary_all_stations
%>%
weatherSummary_all_stations ggplot(aes(LONGITUDE, maxTMAX)) + geom_point() +
geom_smooth(method = "lm") +
dark_theme_bw() +
ggtitle("relationship between record high temperature and location")
%>%
weatherSummary_all_stations ggplot(aes(LONGITUDE, maxTMIN, colour = NAME == "ROTTNEST")) + geom_point() +
geom_smooth(method = "lm") +
dark_theme_bw() +
ggtitle("relationship between record low temperature and location")
Risk of rain for each day of the year at each station.
<- NOAA_all_stations_clean %>%
prop_rained_all drop_na(PRCP) %>%
group_by(Month, NAME, Day) %>%
summarise(
number_of_rainy_days = sum(did_rain), days_observed = n()
%>%
) #change na to 0
mutate(number_of_rainy_days = ifelse(is.na(number_of_rainy_days),0,number_of_rainy_days)) %>%
#calculate proportion rainfall for a given day
mutate(proportion_rained = number_of_rainy_days/days_observed) %>%
#we add an index so we can count the days
group_by(NAME) %>%
mutate(dayIndex = row_number(NAME))
%>%
prop_rained_all ggplot(aes(dayIndex, proportion_rained, colour = NAME)) +
geom_point(alpha=0.1) +
geom_smooth(se=FALSE) +
ggtitle("Risk of any rain for each day in the year") +
xlab("day (0 is January 1st)") +
ylab("Risk of rain") +
ylim(c(0,1))
%>%
prop_rained_all group_by(NAME) %>%
summarise(
max_proportion_rained = max(proportion_rained),
mean_proportion_rained = mean(proportion_rained)
%>%
) arrange(desc(max_proportion_rained)) %>%
left_join(., NOAA_all_stations_clean %>% select(NAME, LATITUDE, LONGITUDE), by="NAME", multiple="first") %>%
ggplot(aes(LATITUDE, LONGITUDE, colour = mean_proportion_rained)) +
geom_point()