# This analysis examines the rainfall pattern in the Kerala (KL) subdivision of India and
# fits a suitable model for rainfall prediction. Data from 1901 to 2017 are downloaded from
# data.gov.in. The aim is to describe the rainfall pattern in Kerala over the last 117 years
# and to predict upcoming rainfall at monthly, quarterly and annual levels.
# Language used: R
# Load the necessary packages
# (reshape provides melt; forecast provides the modelling and accuracy functions;
#  tseries provides the stationarity tests; rmarkdown/knitr render the report)
library(reshape)
library(forecast)
library(tseries)
library(rmarkdown)
library(knitr)
# Load the data
#setwd('D:\\share\\Hackathon')
setwd('~/R Programming Class')
getwd()
## [1] "C:/Users/PC/Documents/R Programming Class"
rain_india = read.csv("data/Sub_Division_IMD_2017.csv")
# View structure of data
str(rain_india)
## 'data.frame': 4188 obs. of 19 variables:
## $ SUBDIVISION: Factor w/ 36 levels "Andaman & Nicobar Islands",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ YEAR : int 1901 1902 1903 1904 1905 1906 1907 1908 1910 1911 ...
## $ JAN : num 49.2 0 12.7 9.4 1.3 ...
## $ FEB : num 87.1 159.8 144 14.7 0 ...
## $ MAR : num 29.2 12.2 0 0 3.3 ...
## $ APR : num 2.3 0 1 202.4 26.9 ...
## $ MAY : num 529 446 235 304 280 ...
## $ JUN : num 518 537 480 495 629 ...
## $ JUL : num 365 229 728 502 369 ...
## $ AUG : num 481 754 327 160 330 ...
## $ SEP : num 333 666 339 820 297 ...
## $ OCT : num 388 197 181 222 261 ...
## $ NOV : num 558.2 359 284.4 308.7 25.4 ...
## $ DEC : num 33.6 160.5 225 40.1 344.7 ...
## $ ANNUAL : num 3373 3521 2957 3080 2567 ...
## $ JF : num 136.3 159.8 156.7 24.1 1.3 ...
## $ MAM : num 560 458 236 507 310 ...
## $ JJAS : num 1696 2186 1874 1978 1625 ...
## $ OND : num 980 717 691 571 631 ...
# Data has 4188 obs of 19 variables
# Kerala rain data from Jan 1901 to Dec 2017
KL_months = rain_india[rain_india$SUBDIVISION == "Kerala",
c("YEAR","JAN","FEB","MAR","APR","MAY","JUN",
"JUL","AUG","SEP" ,"OCT","NOV","DEC")]
# Set the row names to the year
rownames(KL_months) = KL_months$YEAR
head(KL_months,n=3)
## YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV
## 1901 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5 197.7 266.9 350.8
## 1902 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8 491.6 358.4 158.3
## 1903 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2 341.8 354.1 157.0
## DEC
## 1901 48.4
## 1902 121.5
## 1903 59.0
# View structure of data
str(KL_months)
## 'data.frame': 117 obs. of 13 variables:
## $ YEAR: int 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 ...
## $ JAN : num 28.7 6.7 3.2 23.7 1.2 26.7 18.8 8 54.1 2.7 ...
## $ FEB : num 44.7 2.6 18.6 3 22.3 7.4 4.8 20.8 11.8 25.7 ...
## $ MAR : num 51.6 57.3 3.1 32.2 9.4 9.9 55.7 38.2 61.3 23.3 ...
## $ APR : num 160 83.9 83.6 71.5 105.9 ...
## $ MAY : num 175 134 250 236 263 ...
## $ JUN : num 825 391 559 1098 850 ...
## $ JUL : num 743 1205 1022 726 520 ...
## $ AUG : num 358 316 420 352 294 ...
## $ SEP : num 198 492 342 223 217 ...
## $ OCT : num 267 358 354 328 384 ...
## $ NOV : num 350.8 158.3 157 33.9 74.4 ...
## $ DEC : num 48.4 121.5 59 3.3 0.2 ...
# Data has 117 obs of 13 variables (YEAR plus 12 monthly columns)
# Are there any unusual or missing values?
summary(KL_months)
## YEAR JAN FEB MAR
## Min. :1901 Min. : 0.00 Min. : 0.00 Min. : 0.10
## 1st Qu.:1930 1st Qu.: 2.40 1st Qu.: 4.70 1st Qu.: 18.10
## Median :1959 Median : 6.00 Median : 8.40 Median : 28.30
## Mean :1959 Mean :12.17 Mean :15.37 Mean : 37.13
## 3rd Qu.:1988 3rd Qu.:16.90 3rd Qu.:21.40 3rd Qu.: 50.10
## Max. :2017 Max. :83.50 Max. :79.00 Max. :217.20
## APR MAY JUN JUL
## Min. : 13.1 Min. : 53.4 Min. : 196.8 Min. : 167.5
## 1st Qu.: 71.5 1st Qu.:124.5 1st Qu.: 541.7 1st Qu.: 532.0
## Median :108.4 Median :190.6 Median : 625.8 Median : 687.3
## Mean :109.4 Mean :230.0 Mean : 653.2 Mean : 696.0
## 3rd Qu.:135.1 3rd Qu.:265.4 3rd Qu.: 788.5 3rd Qu.: 831.6
## Max. :238.0 Max. :738.8 Max. :1098.2 Max. :1526.5
## AUG SEP OCT NOV
## Min. : 178.6 Min. : 41.3 Min. : 68.5 Min. : 31.5
## 1st Qu.: 315.3 1st Qu.:150.1 1st Qu.:221.6 1st Qu.: 92.9
## Median : 385.2 Median :223.9 Median :282.6 Median :153.0
## Mean : 420.7 Mean :245.9 Mean :291.9 Mean :162.6
## 3rd Qu.: 495.0 3rd Qu.:335.6 3rd Qu.:354.1 3rd Qu.:219.1
## Max. :1199.2 Max. :526.7 Max. :567.9 Max. :365.6
## DEC
## Min. : 0.10
## 1st Qu.: 10.20
## Median : 31.10
## Mean : 39.98
## 3rd Qu.: 54.10
## Max. :202.30
# Converting the multivariate (wide, one row per year) data to univariate (long) form;
# transposing first makes the month vary fastest, so the melted values come out in
# chronological order
KL_months_univ = melt(t(KL_months[,c(2:13)]),
varnames = c("MONTH","YEAR"))
head(KL_months_univ)
## MONTH YEAR value
## 1 JAN 1901 28.7
## 2 FEB 1901 44.7
## 3 MAR 1901 51.6
## 4 APR 1901 160.0
## 5 MAY 1901 174.7
## 6 JUN 1901 824.6
class(KL_months_univ)
## [1] "data.frame"
# The object is a data.frame
# Converting data to Time Series object
KL_months_ts = ts(KL_months_univ$value,start= min(KL_months_univ$YEAR),frequency=12)
# Monthly data, so the frequency is set to 12
class(KL_months_ts)
## [1] "ts"
# The object should now be a ts (time series)
head(KL_months_ts, n = 36)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct
## 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5 197.7 266.9
## 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8 491.6 358.4
## 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2 341.8 354.1
## Nov Dec
## 1901 350.8 48.4
## 1902 158.3 121.5
## 1903 157.0 59.0
tail(KL_months_ts, n = 36)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## 2015 3.1 5.8 50.1 214.1 201.8 563.6 406.0 252.2 292.9 308.1 223.6
## 2016 3.0 16.4 22.4 33.3 258.4 595.7 441.5 231.0 84.1 105.1 57.9
## 2017 12.7 0.3 87.8 52.8 213.3 579.8 378.5 462.6 435.5 228.0 152.1
## Dec
## 2015 79.4
## 2016 22.0
## 2017 61.4
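# A quick structural sanity check (suggested): the series should hold
# 117 years x 12 months = 1404 observations and end in December 2017.
length(KL_months_ts)
frequency(KL_months_ts)
end(KL_months_ts)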
# Visualization
# Plot the Time Series
plot(KL_months_ts,
main = "KL monthly Rain from 1901 to 2017",
ylab = "Rainfall in mm")
abline(reg = lm(KL_months_ts ~ time(KL_months_ts)))

# By visual inspection the data look stationary:
# the mean and variance appear roughly constant over time
boxplot(KL_months_ts ~ cycle(KL_months_ts),col = "yellow",names = month.abb,
xlab ="Month",ylab ="Rainfall in mm", main = "KL monthly Rain summary")

# Considered on a monthly basis, the data contain some outliers.
# July recorded the highest amount of rainfall, followed closely by June.
# January, February and March recorded the least amount of rainfall.
# KL_months_ts = tsclean(KL_months_ts)
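# Optional cross-check of the monthly pattern noted above: long-run mean rainfall by
# calendar month, computed directly from the series with base tapply.
round(tapply(KL_months_ts, cycle(KL_months_ts), mean), 1)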
# Decomposition
decomp_KL_months_ts = decompose(KL_months_ts)
plot(decomp_KL_months_ts)

# The data show a clear seasonal effect, with the same rise-and-fall pattern repeating
# every year over the period. This implies that the monthly rainfall figures recorded
# each year are driven largely by the seasonal component.
# The trend, however, appears fairly flat over time, apart from a few ups and downs
# in some periods.
# The random (remainder) component is fairly stable over the whole period.
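# As an optional cross-check of the classical decomposition above, an STL (loess-based)
# decomposition of the same series can be plotted in the same way.
plot(stl(KL_months_ts, s.window = "periodic"))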
# Is Stationary?
# Augmented Dickey Fuller Test
tseries::adf.test(KL_months_ts)
## Warning in tseries::adf.test(KL_months_ts): p-value smaller than printed p-
## value
##
## Augmented Dickey-Fuller Test
##
## data: KL_months_ts
## Dickey-Fuller = -8.2612, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
# Dickey-Fuller = -8.2612, p-value = 0.01
# p-value < 0.05, so the data are stationary
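# A complementary check is the KPSS test (also in tseries). Its null hypothesis is the
# reverse of the ADF test's (null = stationarity), so a large p-value here would support
# the same conclusion.
tseries::kpss.test(KL_months_ts)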
# Build the model on the training period
# Training Set
KL_months_tr = window(KL_months_ts,start = c(1901,1),end = c(2014,12))
# 114 years for training
# Test Set
KL_months_te = window(KL_months_ts,start = c(2015,1),end =c(2017,12))
# Remaining 3 years (2015-2017) for testing
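# Quick check (suggested) that the train/test split covers the full series with no overlap
length(KL_months_tr) + length(KL_months_te) == length(KL_months_ts)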
# Plot 3-year forecasts from 4 simple benchmark methods
plot(KL_months_tr,main ="KL Rainfall",ylab ="Rainfall in mm",
xlim = c(2013,2018),ylim=c(0,2000))
#Mean Method
lines(meanf(KL_months_tr,h=36)$mean,col=5)
# Naive Method
lines(rwf(KL_months_tr,h=36)$mean,col=2)
# Drift Method
lines(rwf(KL_months_tr,drift = TRUE,h=36)$mean,col=3)
# Seasonal Naive Method
lines(snaive(KL_months_tr,h=36)$mean,col=6)
# test set
lines(KL_months_te,col="darkblue")
#Legend
legend("topleft",lty=1,col = c(5,2,3,6,"darkblue"),bty='n', cex = 0.8,
legend = c("Mean Method","Naive Method","Drift Method","Seasonal Naive Method",
"Actual Test"))

# Accuracy
accuracy(meanf(KL_months_tr,h=36),KL_months_te)
## ME RMSE MAE MPE MAPE MASE
## Training set -8.310238e-15 257.9332 205.9044 -Inf Inf 2.014211
## Test set -4.579846e+01 185.9648 159.4902 -3021.489 3045.45 1.560176
## ACF1 Theil's U
## Training set 0.5779024 NA
## Test set 0.6434968 1.827923
accuracy(rwf(KL_months_tr,h=36),KL_months_te)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01353328 236.9446 158.7688 -Inf Inf 1.553118
## Test set 151.03055556 235.1502 168.0194 -503.7572 605.6595 1.643611
## ACF1 Theil's U
## Training set -0.04801825 NA
## Test set 0.64349684 0.4684641
accuracy(rwf(KL_months_tr,drift = TRUE,h=36),KL_months_te)
## ME RMSE MAE MPE MAPE MASE
## Training set 6.011742e-15 236.9446 158.7688 -Inf Inf 1.553119
## Test set 1.507802e+02 234.9791 167.8713 -507.5336 609.1558 1.642162
## ACF1 Theil's U
## Training set -0.04801825 NA
## Test set 0.64345563 0.4643569
accuracy(snaive(KL_months_tr,h=36),KL_months_te)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1489676 161.2041 102.2258 -Inf Inf 1.00000
## Test set -55.6527778 168.5781 110.2250 -129.7359 161.5687 1.07825
## ACF1 Theil's U
## Training set -0.02145137 NA
## Test set 0.23549073 0.7986112
# The Mean, Naive and Drift methods capture neither the trend nor the seasonality.
# The Seasonal Naive method captures the seasonality; its prediction simply repeats the
# most recently observed season.
# Caution: auto.arima below takes a while to compute
KL_months_tr_arima = auto.arima(KL_months_tr)
# This returns the best ARIMA model found by the automatic search (selected by AICc)
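# By default auto.arima keeps run time down with a stepwise search (and, for long series,
# approximated information criteria). A more exhaustive but much slower search could be
# tried with the options below; this is only a suggested alternative, not run here.
# KL_months_tr_arima = auto.arima(KL_months_tr, stepwise = FALSE, approximation = FALSE)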
# Forecast for Test time duration
KL_months_te_forecast_arima = forecast(KL_months_tr_arima, h = 36)
head(KL_months_te_forecast_arima)
## $method
## [1] "ARIMA(0,0,0)(2,1,0)[12]"
##
## $model
## Series: KL_months_tr
## ARIMA(0,0,0)(2,1,0)[12]
##
## Coefficients:
## sar1 sar2
## -0.6334 -0.2965
## s.e. 0.0261 0.0262
##
## sigma^2 estimated as 18100: log likelihood=-8572.72
## AIC=17151.44 AICc=17151.45 BIC=17167.07
##
## $level
## [1] 80 95
##
## $mean
## Jan Feb Mar Apr May Jun
## 2015 5.194208 20.549496 29.602358 102.417396 160.460773 645.499981
## 2016 4.610289 22.891609 31.676385 84.406497 178.767989 698.857940
## 2017 4.804007 18.369435 26.893310 93.823883 194.012874 608.404994
## Jul Aug Sep Oct Nov Dec
## 2015 635.710835 542.304197 288.366338 273.479233 122.141221 25.816980
## 2016 707.552603 555.697399 300.845388 297.092780 124.223333 30.408710
## 2017 674.522930 604.014433 296.033831 306.451021 116.192182 33.839370
##
## $lower
## 80% 95%
## Jan 2015 -167.222371 -258.4941940
## Feb 2015 -151.867083 -243.1389060
## Mar 2015 -142.814221 -234.0860441
## Apr 2015 -69.999182 -161.2710057
## May 2015 -11.955805 -103.2276285
## Jun 2015 473.083403 381.8115798
## Jul 2015 463.294257 372.0224333
## Aug 2015 369.887618 278.6157949
## Sep 2015 115.949759 24.6779362
## Oct 2015 101.062655 9.7908313
## Nov 2015 -50.275357 -141.5471805
## Dec 2015 -146.599598 -237.8714213
## Jan 2016 -179.024756 -276.2352764
## Feb 2016 -160.743436 -257.9539566
## Mar 2016 -151.958660 -249.1691807
## Apr 2016 -99.228548 -196.4390687
## May 2016 -4.867055 -102.0775761
## Jun 2016 515.222895 418.0123743
## Jul 2016 523.917558 426.7070372
## Aug 2016 372.062354 274.8518332
## Sep 2016 117.210343 19.9998222
## Oct 2016 113.457736 16.2472149
## Nov 2016 -59.411712 -156.6222329
## Dec 2016 -153.226335 -250.4368556
## Jan 2017 -196.009980 -302.3144834
## Feb 2017 -182.444552 -288.7490552
## Mar 2017 -173.920677 -280.2251808
## Apr 2017 -106.990104 -213.2946078
## May 2017 -6.801113 -113.1056163
## Jun 2017 407.591007 301.2865032
## Jul 2017 473.708943 367.4044397
## Aug 2017 403.200446 296.8959430
## Sep 2017 95.219844 -11.0846590
## Oct 2017 105.637034 -0.6674695
## Nov 2017 -84.621806 -190.9263089
## Dec 2017 -166.974617 -273.2791208
##
## $upper
## 80% 95%
## Jan 2015 177.6108 268.8826
## Feb 2015 192.9661 284.2379
## Mar 2015 202.0189 293.2908
## Apr 2015 274.8340 366.1058
## May 2015 332.8774 424.1492
## Jun 2015 817.9166 909.1884
## Jul 2015 808.1274 899.3992
## Aug 2015 714.7208 805.9926
## Sep 2015 460.7829 552.0547
## Oct 2015 445.8958 537.1676
## Nov 2015 294.5578 385.8296
## Dec 2015 198.2336 289.5054
## Jan 2016 188.2453 285.4559
## Feb 2016 206.5267 303.7372
## Mar 2016 215.3114 312.5220
## Apr 2016 268.0415 365.2521
## May 2016 362.4030 459.6136
## Jun 2016 882.4930 979.7035
## Jul 2016 891.1876 988.3982
## Aug 2016 739.3324 836.5430
## Sep 2016 484.4804 581.6910
## Oct 2016 480.7278 577.9383
## Nov 2016 307.8584 405.0689
## Dec 2016 214.0438 311.2543
## Jan 2017 205.6180 311.9225
## Feb 2017 219.1834 325.4879
## Mar 2017 227.7073 334.0118
## Apr 2017 294.6379 400.9424
## May 2017 394.8269 501.1314
## Jun 2017 809.2190 915.5235
## Jul 2017 875.3369 981.6414
## Aug 2017 804.8284 911.1329
## Sep 2017 496.8478 603.1523
## Oct 2017 507.2650 613.5695
## Nov 2017 317.0062 423.3107
## Dec 2017 234.6534 340.9579
# Visualization
plot(KL_months_te_forecast_arima,
xlim = c(2013,2018),
xlab = "Time",
ylab = "rainfall(mm)",
main = "Forecast for Test Time duration")
lines(KL_months_te,col= 6,lty=2)
legend("topright",lty = 1,bty = "n",col = c(6,"darkblue","lightblue","lightgray"),
c("Testdata","ArimaPred","80% Confidence Interval","95% Confidence Interval")
,cex = 0.5)

KL_months_te_pointforecast_arima = KL_months_te_forecast_arima$mean
KL_months_te_pointforecast_arima = round(KL_months_te_pointforecast_arima,digits = 1)
print(KL_months_te_pointforecast_arima)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## 2015 5.2 20.5 29.6 102.4 160.5 645.5 635.7 542.3 288.4 273.5 122.1
## 2016 4.6 22.9 31.7 84.4 178.8 698.9 707.6 555.7 300.8 297.1 124.2
## 2017 4.8 18.4 26.9 93.8 194.0 608.4 674.5 604.0 296.0 306.5 116.2
## Dec
## 2015 25.8
## 2016 30.4
## 2017 33.8
print(round(KL_months_te, digits =1))
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## 2015 3.1 5.8 50.1 214.1 201.8 563.6 406.0 252.2 292.9 308.1 223.6
## 2016 3.0 16.4 22.4 33.3 258.4 595.7 441.5 231.0 84.1 105.1 57.9
## 2017 12.7 0.3 87.8 52.8 213.3 579.8 378.5 462.6 435.5 228.0 152.1
## Dec
## 2015 79.4
## 2016 22.0
## 2017 61.4
# Test data vs ARIMA Point Forecast
plot( KL_months_te,
ylim = c(-500, 1500),
xlim = c(2016,2018),
main = "Monthly : Testdata vs ARIMA PF",
ylab = 'Test data/ARIMA PF',col= 6,lty=2)
lines(KL_months_te_pointforecast_arima,col="4")
legend("topleft",legend = c("Test data","ARIMA PF"),
col = c(6,4),lty = c(2,1),cex = 0.8)

# Evaluate the forecast/prediction accuracy (MAPE)
MAPE = function(actual, predicted) {
  abs_percent_diff = abs((actual - predicted) / actual) * 100
  # percentage difference could be infinite if actual has value 0
  abs_percent_diff = abs_percent_diff[is.finite(abs_percent_diff)]
  mape = c(mean(abs_percent_diff), median(abs_percent_diff))
  names(mape) = c("Mean_APE", "Median_APE")
  return(mape)
}
MAPE(KL_months_te, KL_months_te_pointforecast_arima)
## Mean_APE Median_APE
## 232.68920 48.78272
# 232.69 Mean_APE
# 48.78 Median_APE
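# For comparison with the benchmark methods above, forecast's accuracy() can also be
# applied to the ARIMA forecast against the test set (it reports RMSE, MAE, MAPE, MASE).
accuracy(KL_months_te_forecast_arima, KL_months_te)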
# Forecast for future times
KL_months_arima = auto.arima(KL_months_ts)
KL_months_forecast_arima = forecast(KL_months_arima, h= 24)
head(KL_months_forecast_arima)
## $method
## [1] "ARIMA(2,0,2)(2,1,0)[12] with drift"
##
## $model
## Series: KL_months_ts
## ARIMA(2,0,2)(2,1,0)[12] with drift
##
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs produced
## ar1 ar2 ma1 ma2 sar1 sar2 drift
## 0.0149 -0.5456 0.0161 0.5289 -0.6370 -0.3022 -0.0533
## s.e. NaN 0.1879 NaN 0.2839 0.0231 0.0257 0.1564
##
## sigma^2 estimated as 17986: log likelihood=-8793.4
## AIC=17602.8 AICc=17602.9 BIC=17644.71
##
## $level
## [1] 80 95
##
## $mean
## Jan Feb Mar Apr May Jun
## 2018 8.464241 7.403198 51.572192 93.053648 224.599399 579.398620
## 2019 7.068231 6.546110 53.601847 60.256067 229.815288 583.233204
## Jul Aug Sep Oct Nov Dec
## 2018 406.167441 320.008014 273.793798 209.958299 140.792775 52.337839
## 2019 406.332777 339.593634 269.360857 183.070445 118.281786 44.960466
##
## $lower
## 80% 95%
## Jan 2018 -163.40776067 -254.39130
## Feb 2018 -164.55146272 -255.57876
## Mar 2018 -120.40520881 -211.44455
## Apr 2018 -78.94905807 -170.00179
## May 2018 52.59030971 -38.46580
## Jun 2018 407.38179062 316.32158
## Jul 2018 234.14882398 143.08767
## Aug 2018 147.98703171 56.92462
## Sep 2018 101.77231712 10.70965
## Oct 2018 37.93609608 -53.12696
## Nov 2018 -31.22956704 -122.29269
## Dec 2018 -119.68472313 -210.74797
## Jan 2019 -175.95837302 -272.84680
## Feb 2019 -176.49239356 -273.38712
## Mar 2019 -129.43981736 -226.33622
## Apr 2019 -122.78923997 -219.68757
## May 2019 46.76909461 -50.12971
## Jun 2019 400.18589704 303.28651
## Jul 2019 223.28522183 126.38570
## Aug 2019 156.54573893 59.64604
## Sep 2019 86.31289227 -10.58685
## Oct 2019 0.02237678 -96.87742
## Nov 2019 -64.76630123 -161.66611
## Dec 2019 -138.08765362 -234.98747
##
## $upper
## 80% 95%
## Jan 2018 180.3362 271.3198
## Feb 2018 179.3579 270.3852
## Mar 2018 223.5496 314.5889
## Apr 2018 265.0564 356.1091
## May 2018 396.6085 487.6646
## Jun 2018 751.4154 842.4757
## Jul 2018 578.1861 669.2472
## Aug 2018 492.0290 583.0914
## Sep 2018 445.8153 536.8780
## Oct 2018 381.9805 473.0436
## Nov 2018 312.8151 403.8782
## Dec 2018 224.3604 315.4236
## Jan 2019 190.0948 286.9833
## Feb 2019 189.5846 286.4793
## Mar 2019 236.6435 333.5399
## Apr 2019 243.3014 340.1997
## May 2019 412.8615 509.7603
## Jun 2019 766.2805 863.1799
## Jul 2019 589.3803 686.2799
## Aug 2019 522.6415 619.5412
## Sep 2019 452.4088 549.3086
## Oct 2019 366.1185 463.0183
## Nov 2019 301.3299 398.2297
## Dec 2019 228.0086 324.9084
KL_months_pointforecast_arima = round(KL_months_forecast_arima$mean,digits = 1)
print(KL_months_pointforecast_arima)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## 2018 8.5 7.4 51.6 93.1 224.6 579.4 406.2 320.0 273.8 210.0 140.8
## 2019 7.1 6.5 53.6 60.3 229.8 583.2 406.3 339.6 269.4 183.1 118.3
## Dec
## 2018 52.3
## 2019 45.0
# Visualization
plot(KL_months_forecast_arima,xlab = "Time",ylab = "rainfall(mm)")

plot(KL_months_forecast_arima, xlim = c(2011,2020),ylim = c(-500,3000),
ylab = "rainfall(mm)", main = "Forecast from Arima for 2018 and 2019")
lines(KL_months_te_pointforecast_arima,col="red")
legend("topright",legend = c("Actual","Test Time Forecast","Future Forecast"
,"80% Confidence Interval","95% Confidence Interval"),
col = c("black","red","blue","lightblue","lightgrey"),lty = 1,cex = 0.8)

boxplot(KL_months_pointforecast_arima ~ cycle(KL_months_pointforecast_arima),
col = "yellow",names = month.abb,xlab ="Month",
ylab ="Rainfall in mm", main = "KL monthly Rain summary")

# The results reveal that the region experiences the most rainfall in June and July,
# and the least rainfall in January, February and March.
# auto.arima has been identified as an appropriate approach for predicting monthly
# rainfall figures for the Kerala subdivision of India.
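# Before relying on the model, a standard diagnostic is to check that its residuals
# resemble white noise, for example with a Ljung-Box test and the residual ACF.
Box.test(residuals(KL_months_tr_arima), lag = 24, type = "Ljung-Box")
acf(residuals(KL_months_tr_arima), main = "ACF of ARIMA residuals")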
###################################>>>> Seasonal <<<<<###################################
# The aim here is to examine the quarterly rainfall pattern in Kerala over the last
# 117 years and predict rainfall for upcoming quarters
KL_seasonal = rain_india[rain_india$SUBDIVISION == "Kerala", 16:19]
# Filter the Kerala subdivision and keep columns 16 to 19 (JF, MAM, JJAS, OND)
rownames(KL_seasonal) = c(1901:2017)
colnames(KL_seasonal) = c("Qtr1", "Qtr2", "Qtr3", "Qtr4")  # JF, MAM, JJAS, OND
head(KL_seasonal)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1901 73.4 386.2 2122.8 666.1
## 1902 9.3 275.7 2403.4 638.2
## 1903 21.7 336.3 2343.0 570.1
## 1904 26.7 339.4 2398.2 365.3
## 1905 23.4 378.5 1881.5 458.1
## 1906 34.1 230.0 1943.1 500.8
# View structure of data
str(KL_seasonal)
## 'data.frame': 117 obs. of 4 variables:
## $ Qtr1: num 73.4 9.3 21.7 26.7 23.4 34.1 23.7 28.8 65.9 28.4 ...
## $ Qtr2: num 386 276 336 339 378 ...
## $ Qtr3: num 2123 2403 2343 2398 1882 ...
## $ Qtr4: num 666 638 570 365 458 ...
# Data has 117 obs of 4 variables
# Are there any unusual or missing values?
summary(KL_seasonal)
## Qtr1 Qtr2 Qtr3 Qtr4
## Min. : 0.30 Min. : 89.9 Min. :1104 Min. :166.6
## 1st Qu.:10.30 1st Qu.:277.8 1st Qu.:1749 1st Qu.:406.5
## Median :20.50 Median :342.0 Median :1948 Median :500.8
## Mean :27.54 Mean :376.5 Mean :2016 Mean :494.5
## 3rd Qu.:40.60 3rd Qu.:436.9 3rd Qu.:2231 3rd Qu.:584.0
## Max. :98.10 Max. :915.2 Max. :3451 Max. :823.3
# Converting Multivariate to Univariate data
KL_seasonal_univ = melt(t(KL_seasonal),varnames = c("QUATERS","YEAR"))
head(KL_seasonal_univ)
## QUATERS YEAR value
## 1 Qtr1 1901 73.4
## 2 Qtr2 1901 386.2
## 3 Qtr3 1901 2122.8
## 4 Qtr4 1901 666.1
## 5 Qtr1 1902 9.3
## 6 Qtr2 1902 275.7
class(KL_seasonal_univ)
## [1] "data.frame"
# The object is a data.frame
# Converting data to Time Series object
KL_seasonal_ts = ts(KL_seasonal_univ$value,start= min(KL_seasonal_univ$YEAR),frequency=4)
# Quarterly data, so the frequency is set to 4
class(KL_seasonal_ts)
## [1] "ts"
# The object should now be a ts (time series)
head(KL_seasonal_ts, n=12)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1901 73.4 386.2 2122.8 666.1
## 1902 9.3 275.7 2403.4 638.2
## 1903 21.7 336.3 2343.0 570.1
tail(KL_seasonal_ts, n=12)
## Qtr1 Qtr2 Qtr3 Qtr4
## 2015 8.9 465.9 1514.7 611.1
## 2016 19.3 314.2 1352.3 185.0
## 2017 13.0 353.9 1856.5 441.5
# Visualization
# 1. PLOT
plot(KL_seasonal_ts, main = "KL Seasonal rainfall from 1901 to 2017",
ylab ="rainfall(mm)")
abline(reg = lm(KL_seasonal_ts ~ time(KL_seasonal_ts)))

# 2. BOXPLOT
boxplot(split(KL_seasonal_ts,cycle(KL_seasonal_ts)),col = "gold",
xlab ="Quarterly",ylab ="Rainfall in mm", main = "KL Seasonal Rainfall summary")

# Considered on a quarterly basis, the data contain some outliers.
# Q3 recorded the highest amount of rainfall, followed by Q4.
# Q1 recorded the least amount of rainfall.
# Is Stationary?
# Augmented Dickey Fuller Test
tseries::adf.test(KL_seasonal_ts)
## Warning in tseries::adf.test(KL_seasonal_ts): p-value smaller than printed
## p-value
##
## Augmented Dickey-Fuller Test
##
## data: KL_seasonal_ts
## Dickey-Fuller = -6.104, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Dickey-Fuller = -6.104
# p-value = 0.01
# Data is stationary
# Build the model on the training period
# First 113 years (1901-2013) for training
KL_seasonal_tr = window(KL_seasonal_ts,start = min(KL_seasonal_univ$YEAR), end = c(2013,4))
# Remaining 4 years (2014-2017) for testing
KL_seasonal_te = window(KL_seasonal_ts,start = c(2014,1), end = c(2017,4))
KL_seasonal_tr_arima = auto.arima(KL_seasonal_tr)
arimaorder(KL_seasonal_tr_arima)
## p d q P D Q Frequency
## 1 0 0 2 1 0 4
# Forecast for Test time duration
KL_seasonal_te_forecast_arima = forecast(KL_seasonal_tr_arima, h = 16)
head(KL_seasonal_te_forecast_arima)
## $method
## [1] "ARIMA(1,0,0)(2,1,0)[4] with drift"
##
## $model
## Series: KL_seasonal_tr
## ARIMA(1,0,0)(2,1,0)[4] with drift
##
## Coefficients:
## ar1 sar1 sar2 drift
## -0.0769 -0.6148 -0.3248 -0.2281
## s.e. 0.0473 0.0451 0.0450 1.3686
##
## sigma^2 estimated as 58395: log likelihood=-3093.01
## AIC=6196.02 AICc=6196.15 BIC=6216.54
##
## $level
## [1] 80 95
##
## $mean
## Qtr1 Qtr2 Qtr3 Qtr4
## 2014 48.29709 267.08073 2147.65216 399.35304
## 2015 35.50982 257.82068 2067.03777 377.90846
## 2016 40.17410 245.96602 2249.14728 399.86196
## 2017 39.69008 254.49259 2161.59559 391.55995
##
## $lower
## 80% 95%
## 2014 Q1 -261.39170 -425.33105
## 2014 Q2 -43.52148 -207.94436
## 2014 Q3 1837.04456 1672.61883
## 2014 Q4 88.74541 -75.68034
## 2015 Q1 -297.21936 -473.35555
## 2015 Q2 -75.03482 -251.23788
## 2015 Q3 1734.18153 1557.97807
## 2015 Q4 45.05221 -131.15124
## 2016 Q1 -319.30729 -509.60524
## 2016 Q2 -113.66681 -304.04493
## 2016 Q3 1889.51356 1699.13497
## 2016 Q4 40.22823 -150.15036
## 2017 Q1 -365.87718 -580.57152
## 2017 Q2 -151.33058 -366.16039
## 2017 Q3 1755.77090 1540.94029
## 2017 Q4 -14.26475 -229.09536
##
## $upper
## 80% 95%
## 2014 Q1 357.9859 521.9252
## 2014 Q2 577.6829 742.1058
## 2014 Q3 2458.2598 2622.6855
## 2014 Q4 709.9607 874.3864
## 2015 Q1 368.2390 544.3752
## 2015 Q2 590.6762 766.8792
## 2015 Q3 2399.8940 2576.0975
## 2015 Q4 710.7647 886.9682
## 2016 Q1 399.6555 589.9534
## 2016 Q2 605.5988 795.9770
## 2016 Q3 2608.7810 2799.1596
## 2016 Q4 759.4957 949.8743
## 2017 Q1 445.2573 659.9517
## 2017 Q2 660.3158 875.1456
## 2017 Q3 2567.4203 2782.2509
## 2017 Q4 797.3846 1012.2153
plot(KL_seasonal_te_forecast_arima, xlab = "Time", ylab = "rainfall(mm)",
main = "Seasonal : Forecast for Test Time Duration")

plot(KL_seasonal_te_forecast_arima, xlim= c(2014,2018),
xlab = "Time", ylab = "rainfall(mm)", main = "Seasonal : Forecast for Test Time Duration")

KL_seasonal_te_pointforecast_arima = KL_seasonal_te_forecast_arima$mean
print(round(KL_seasonal_te_pointforecast_arima,digits = 1))
## Qtr1 Qtr2 Qtr3 Qtr4
## 2014 48.3 267.1 2147.7 399.4
## 2015 35.5 257.8 2067.0 377.9
## 2016 40.2 246.0 2249.1 399.9
## 2017 39.7 254.5 2161.6 391.6
print(KL_seasonal_te)
## Qtr1 Qtr2 Qtr3 Qtr4
## 2014 14.9 364.5 2164.8 502.1
## 2015 8.9 465.9 1514.7 611.1
## 2016 19.3 314.2 1352.3 185.0
## 2017 13.0 353.9 1856.5 441.5
plot(
KL_seasonal_te,
ylim = c(-500, 3500),
ylab = "Actual/Forecast",
main = "Seasonal: Actual Vs ARIMA PF")
lines(KL_seasonal_te_pointforecast_arima, col = "blue")
legend("topright", legend= c("Actual","ARIMA PF"),
col= c("black","blue"),lty = 1, cex=0.8)

# Evaluate the forecast/prediction accuracy (MAPE)
MAPE = function(actual, predicted) {
  abs_percent_diff = abs((actual - predicted) / actual) * 100
  # percentage difference could be infinite if actual has value 0
  abs_percent_diff = abs_percent_diff[is.finite(abs_percent_diff)]
  mape = c(mean(abs_percent_diff), median(abs_percent_diff))
  names(mape) = c("Mean_APE", "Median_APE")
  return(mape)
}
MAPE(KL_seasonal_te, KL_seasonal_te_pointforecast_arima)
## Mean_APE Median_APE
## 78.99214 37.31223
# Mean_APE - 78.99
# Median_APE - 37.31
# Forecast for Future duration
KL_seasonal_ts_arima = auto.arima(KL_seasonal_ts)
KL_seasonal_ts_forecast_arima = forecast(KL_seasonal_ts_arima, h = 8)
# Predicting seasonally for future years 2018 and 2019
print(KL_seasonal_ts_forecast_arima)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2018 Q1 6.719950 -303.52118 316.9611 -467.75292 481.1928
## 2018 Q2 376.694313 65.60932 687.7793 -99.06913 852.4578
## 2018 Q3 1603.672804 1292.58322 1914.7624 1127.90234 2079.4433
## 2018 Q4 424.465015 113.37540 735.5546 -51.30549 900.2355
## 2019 Q1 8.912556 -325.96489 343.7900 -503.23830 521.0634
## 2019 Q2 346.245674 11.24327 681.2481 -166.09629 858.5876
## 2019 Q3 1585.409912 1250.40683 1920.4130 1073.06691 2097.7529
## 2019 Q4 346.374005 11.37092 681.3771 -165.96900 858.7170
KL_seasonal_pointforecast_arima = KL_seasonal_ts_forecast_arima$mean
print(round(KL_seasonal_pointforecast_arima,digits = 1))
## Qtr1 Qtr2 Qtr3 Qtr4
## 2018 6.7 376.7 1603.7 424.5
## 2019 8.9 346.2 1585.4 346.4
print(KL_seasonal_te)
## Qtr1 Qtr2 Qtr3 Qtr4
## 2014 14.9 364.5 2164.8 502.1
## 2015 8.9 465.9 1514.7 611.1
## 2016 19.3 314.2 1352.3 185.0
## 2017 13.0 353.9 1856.5 441.5
#Visualization
plot(KL_seasonal_ts_forecast_arima,xlab = "Time", ylab = "rainfall(mm)")

plot(KL_seasonal_ts_forecast_arima, xlim = c(2011,2020),xlab = "Time",
ylab = "rainfall(mm)",main = "Seasonal: Forecast for 2018 and 2019")
lines(KL_seasonal_te_pointforecast_arima, col = "red")
legend("topright", legend= c("Actual","Test Time forecast","Future forecast"),
col= c("black","red","blue"),lty = 1, cex=0.8)

boxplot(KL_seasonal_pointforecast_arima ~ cycle(KL_seasonal_pointforecast_arima),
col = "yellow",xlab ="Quarterly",
ylab ="Rainfall in mm", main = "KL Seasonal Rain summary")

# The results show that Kerala experiences the heaviest rainfall in Q3 (JJAS),
# followed by Q4 (OND), while Q1 (JF) experiences the lowest rainfall.
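# Optional cross-check of the quarterly pattern noted above: long-run mean rainfall per
# quarter, computed directly from the series with base tapply.
round(tapply(KL_seasonal_ts, cycle(KL_seasonal_ts), mean), 1)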
###################################>>>> Annual <<<<<###################################
# Kerala rain data from 1901 to 2017
KL_annual = rain_india[rain_india$SUBDIVISION == "Kerala", c("YEAR","ANNUAL")]
# Set the row names to the year
rownames(KL_annual) = KL_annual$YEAR
head(KL_annual,n=3)
## YEAR ANNUAL
## 1901 1901 3248.6
## 1902 1902 3326.6
## 1903 1903 3271.2
# View structure of data
str(KL_annual)
## 'data.frame': 117 obs. of 2 variables:
## $ YEAR : int 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 ...
## $ ANNUAL: num 3249 3327 3271 3130 2742 ...
# Data has 117 obs of 2 variables
# Are there any unusual or missing values?
summary(KL_annual)
## YEAR ANNUAL
## Min. :1901 Min. :1871
## 1st Qu.:1930 1st Qu.:2622
## Median :1959 Median :2931
## Mean :1959 Mean :2914
## 3rd Qu.:1988 3rd Qu.:3152
## Max. :2017 Max. :4258
# Converting data to Time Series object
KL_annual_ts = ts(KL_annual$ANNUAL,start= min(KL_annual$YEAR),frequency=1)
# Annual data, so the frequency is set to 1
class(KL_annual_ts)
## [1] "ts"
# The object should now be a ts (time series)
head(KL_annual_ts)
## Time Series:
## Start = 1901
## End = 1906
## Frequency = 1
## [1] 3248.6 3326.6 3271.2 3129.7 2741.6 2708.0
tail(KL_annual_ts)
## Time Series:
## Start = 2012
## End = 2017
## Frequency = 1
## [1] 2151.1 3255.4 3046.4 2600.6 1870.9 2664.9
# Visualization
# Plot the Time Series
plot(KL_annual_ts,
main = "KL yearly Rain from 1901 to 2017",
ylab = "Rainfall in mm")
abline(reg = lm(KL_annual_ts ~ time(KL_annual_ts)))

# The mean and variance do not appear constant here.
# The year-on-year trend line suggests that rainfall has been slowly decreasing.
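# One quick way to quantify the apparent decline is the slope of the fitted linear trend
# (in mm per year); a negative slope estimate would be consistent with the comment above.
coef(lm(KL_annual_ts ~ time(KL_annual_ts)))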
boxplot(KL_annual_ts ~ cycle(KL_annual_ts),col = "yellow",
xlab ="Year",ylab ="Rainfall in mm", main = "KL yearly Rain summary")

# Considered on a yearly basis, the data contain some outliers
# (note: with frequency 1 the cycle is constant, so this boxplot shows a single box)
# Is Stationary?
# Augmented Dickey Fuller Test
tseries::adf.test(KL_annual_ts)
## Warning in tseries::adf.test(KL_annual_ts): p-value smaller than printed p-
## value
##
## Augmented Dickey-Fuller Test
##
## data: KL_annual_ts
## Dickey-Fuller = -5.0367, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
# Dickey-Fuller = -5.0367
# p-value = 0.01
# Data is stationary
# Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test: its null hypothesis is the reverse of
# the ADF test's (null = stationarity), so a large p-value would support the same conclusion
# kpss.test(KL_annual_ts)
# Build the model on the training period
# Training Set
KL_annual_tr = window(KL_annual_ts,start = min(KL_annual$YEAR),end = 2010)
# 110 years for training
# Test Set
KL_annual_te = window(KL_annual_ts,start = 2011,end =2017)
# Remaining 7 years (2011-2017) for testing
# Caution: auto.arima takes a while to compute
KL_annual_tr_arima = auto.arima(KL_annual_tr)
# This returns the best ARIMA model found by the automatic search
# Forecast for Test time duration
KL_annual_te_forecast_arima = forecast(KL_annual_tr_arima, h = 7)
head(KL_annual_te_forecast_arima)
## $method
## [1] "ARIMA(1,1,2)"
##
## $model
## Series: KL_annual_tr
## ARIMA(1,1,2)
##
## Coefficients:
## ar1 ma1 ma2
## -0.7550 -0.0177 -0.8673
## s.e. 0.0915 0.0680 0.0599
##
## sigma^2 estimated as 170165: log likelihood=-810.74
## AIC=1629.48 AICc=1629.87 BIC=1640.25
##
## $level
## [1] 80 95
##
## $mean
## Time Series:
## Start = 2011
## End = 2017
## Frequency = 1
## [1] 2818.030 2896.636 2837.286 2882.097 2848.263 2873.809 2854.521
##
## $lower
## Time Series:
## Start = 2011
## End = 2017
## Frequency = 1
## 80% 95%
## 2011 2289.376 2009.523
## 2012 2354.502 2067.514
## 2013 2294.326 2006.901
## 2014 2332.773 2041.979
## 2015 2298.935 2008.138
## 2016 2320.946 2028.278
## 2017 2301.490 2008.732
##
## $upper
## Time Series:
## Start = 2011
## End = 2017
## Frequency = 1
## 80% 95%
## 2011 3346.684 3626.537
## 2012 3438.770 3725.758
## 2013 3380.245 3667.671
## 2014 3431.421 3722.216
## 2015 3397.592 3688.388
## 2016 3426.672 3719.340
## 2017 3407.553 3700.310
# Visualization
plot(KL_annual_te_forecast_arima,
xlim = c(2011,2018),
ylim = c(1800,6000),
xlab = "Time",
ylab = "rainfall(mm)",
main = "Forecast for Test Time duration")
lines(KL_annual_te,col= 6,lty=2)
legend("topright",lty = 1,bty = "n",col = c(6,"darkblue","lightblue","lightgray"),
c("Testdata","ArimaPred","80% Confidence Interval","95% Confidence Interval")
,cex = 0.8)

KL_annual_te_pointforecast_arima = KL_annual_te_forecast_arima$mean
KL_annual_te_pointforecast_arima=(round(KL_annual_te_pointforecast_arima, digits = 1))
print(KL_annual_te_pointforecast_arima)
## Time Series:
## Start = 2011
## End = 2017
## Frequency = 1
## [1] 2818.0 2896.6 2837.3 2882.1 2848.3 2873.8 2854.5
print(KL_annual_te)
## Time Series:
## Start = 2011
## End = 2017
## Frequency = 1
## [1] 3035.1 2151.1 3255.4 3046.4 2600.6 1870.9 2664.9
# Test data vs ARIMA Point Forecast
plot( KL_annual_te,
ylim = c(1500, 4000),
xlim = c(2010,2018),
main = "Annual: Testdata vs ARIMA PF",
ylab = 'Test data/ARIMA PF',col= 6,lty=2)
lines(KL_annual_te_pointforecast_arima,col="4")
legend("topleft",legend = c("Test data","ARIMA PF"),
col = c(6,4),lty = c(2,1),cex = 0.8)

# Evaluate the forecast/prediction accuracy (MAPE)
MAPE(KL_annual_te, KL_annual_te_pointforecast_arima)
## Mean_APE Median_APE
## 18.612978 9.524725
# 18.61 Mean_APE
# 9.52 Median_APE
# Forecast for future times
KL_annual_arima = auto.arima(KL_annual_ts)
KL_annual_forecast_arima = forecast(KL_annual_arima, h= 2)
head(KL_annual_forecast_arima)
## $method
## [1] "ARIMA(0,1,2)"
##
## $model
## Series: KL_annual_ts
## ARIMA(0,1,2)
##
## Coefficients:
## ma1 ma2
## -0.7613 -0.1857
## s.e. 0.0943 0.0936
##
## sigma^2 estimated as 178574: log likelihood=-866.04
## AIC=1738.08 AICc=1738.29 BIC=1746.34
##
## $level
## [1] 80 95
##
## $mean
## Time Series:
## Start = 2018
## End = 2019
## Frequency = 1
## [1] 2808.666 2801.776
##
## $lower
## Time Series:
## Start = 2018
## End = 2019
## Frequency = 1
## 80% 95%
## 2018 2267.107 1980.423
## 2019 2245.002 1950.264
##
## $upper
## Time Series:
## Start = 2018
## End = 2019
## Frequency = 1
## 80% 95%
## 2018 3350.225 3636.908
## 2019 3358.549 3653.288
KL_annual_pointforecast_arima = KL_annual_forecast_arima$mean
KL_annual_pointforecast_arima = round(KL_annual_pointforecast_arima, digits = 1)
print(KL_annual_pointforecast_arima)
## Time Series:
## Start = 2018
## End = 2019
## Frequency = 1
## [1] 2808.7 2801.8
# Visualization
plot(KL_annual_forecast_arima,xlab = "Time",ylab = "rainfall(mm)")

plot(KL_annual_forecast_arima, xlim = c(2011,2020),ylim = c(1500,4000),
ylab = "rainfall(mm)", main = "Forecast from Arima for 2018 and 2019")
lines(KL_annual_pointforecast_arima,col="blue")
lines(KL_annual_te_pointforecast_arima,col="red")
legend("topleft",lty = 1,cex = 0.5,
legend = c("Actual","Test Time Forecast","Future Forecast",
"80% Confidence Interval","95% Confidence Interval"),
col = c("black","red","blue","lightblue","lightgrey"))

# Conclusion:
# This analysis examined the rainfall pattern in the Kerala (KL) subdivision of India and
# fitted a suitable model for rainfall prediction. Data from 1901 to 2017, published by the
# India Meteorological Department (IMD) on data.gov.in, were used to predict rainfall for
# 2018 and 2019.
# The results revealed that the region experiences the most rainfall in June and July and
# the least rainfall in January, February and March.
# auto.arima has been identified as an appropriate approach for predicting monthly
# rainfall figures for the Kerala subdivision of India. If adopted by the state
# meteorological authorities and other relevant organisations such as the National
# Disaster Management Authority (NDMA), it could in the long run help improve forecasting
# and public awareness of expected rainfall.
##################################>>>> End Of Forecasting <<<<<##############################