# This analysis examines the rainfall pattern and fits a suitable model for rainfall 
# prediction in the Kerala (KL) subdivision of India. Data from 1901 to 2017 were downloaded 
# from data.gov.in. Our aim is to find out the rainfall pattern in Kerala over the last 
# 117 years and to predict rainfall for upcoming months, quarters and years.

# Language used: R

# Load the necessary packages
library(reshape)
library(forecast)
library(tseries)
library(rmarkdown)
library(knitr)

# Load the data
#setwd('D:\\share\\Hackathon')
setwd('~/R Programming Class')
getwd()
## [1] "C:/Users/PC/Documents/R Programming Class"
rain_india = read.csv("data/Sub_Division_IMD_2017.csv")

# View structure of data
str(rain_india)
## 'data.frame':    4188 obs. of  19 variables:
##  $ SUBDIVISION: Factor w/ 36 levels "Andaman & Nicobar Islands",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ YEAR       : int  1901 1902 1903 1904 1905 1906 1907 1908 1910 1911 ...
##  $ JAN        : num  49.2 0 12.7 9.4 1.3 ...
##  $ FEB        : num  87.1 159.8 144 14.7 0 ...
##  $ MAR        : num  29.2 12.2 0 0 3.3 ...
##  $ APR        : num  2.3 0 1 202.4 26.9 ...
##  $ MAY        : num  529 446 235 304 280 ...
##  $ JUN        : num  518 537 480 495 629 ...
##  $ JUL        : num  365 229 728 502 369 ...
##  $ AUG        : num  481 754 327 160 330 ...
##  $ SEP        : num  333 666 339 820 297 ...
##  $ OCT        : num  388 197 181 222 261 ...
##  $ NOV        : num  558.2 359 284.4 308.7 25.4 ...
##  $ DEC        : num  33.6 160.5 225 40.1 344.7 ...
##  $ ANNUAL     : num  3373 3521 2957 3080 2567 ...
##  $ JF         : num  136.3 159.8 156.7 24.1 1.3 ...
##  $ MAM        : num  560 458 236 507 310 ...
##  $ JJAS       : num  1696 2186 1874 1978 1625 ...
##  $ OND        : num  980 717 691 571 631 ...
# Data has 4188 obs of 19 variables

# Kerala rain data from Jan 1901 to Dec 2017
KL_months = rain_india[rain_india$SUBDIVISION == "Kerala", 
                       c("YEAR","JAN","FEB","MAR","APR","MAY","JUN",
                         "JUL","AUG","SEP" ,"OCT","NOV","DEC")]
# Set row names to the year
rownames(KL_months) = KL_months$YEAR
head(KL_months,n=3)
##      YEAR  JAN  FEB  MAR   APR   MAY   JUN    JUL   AUG   SEP   OCT   NOV
## 1901 1901 28.7 44.7 51.6 160.0 174.7 824.6  743.0 357.5 197.7 266.9 350.8
## 1902 1902  6.7  2.6 57.3  83.9 134.5 390.9 1205.0 315.8 491.6 358.4 158.3
## 1903 1903  3.2 18.6  3.1  83.6 249.7 558.6 1022.5 420.2 341.8 354.1 157.0
##        DEC
## 1901  48.4
## 1902 121.5
## 1903  59.0
# View structure of data
str(KL_months)
## 'data.frame':    117 obs. of  13 variables:
##  $ YEAR: int  1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 ...
##  $ JAN : num  28.7 6.7 3.2 23.7 1.2 26.7 18.8 8 54.1 2.7 ...
##  $ FEB : num  44.7 2.6 18.6 3 22.3 7.4 4.8 20.8 11.8 25.7 ...
##  $ MAR : num  51.6 57.3 3.1 32.2 9.4 9.9 55.7 38.2 61.3 23.3 ...
##  $ APR : num  160 83.9 83.6 71.5 105.9 ...
##  $ MAY : num  175 134 250 236 263 ...
##  $ JUN : num  825 391 559 1098 850 ...
##  $ JUL : num  743 1205 1022 726 520 ...
##  $ AUG : num  358 316 420 352 294 ...
##  $ SEP : num  198 492 342 223 217 ...
##  $ OCT : num  267 358 354 328 384 ...
##  $ NOV : num  350.8 158.3 157 33.9 74.4 ...
##  $ DEC : num  48.4 121.5 59 3.3 0.2 ...
# Data has 117 obs of 13 variables (YEAR plus 12 monthly columns)

# Are there any unusual or missing values?
summary(KL_months)
##       YEAR           JAN             FEB             MAR        
##  Min.   :1901   Min.   : 0.00   Min.   : 0.00   Min.   :  0.10  
##  1st Qu.:1930   1st Qu.: 2.40   1st Qu.: 4.70   1st Qu.: 18.10  
##  Median :1959   Median : 6.00   Median : 8.40   Median : 28.30  
##  Mean   :1959   Mean   :12.17   Mean   :15.37   Mean   : 37.13  
##  3rd Qu.:1988   3rd Qu.:16.90   3rd Qu.:21.40   3rd Qu.: 50.10  
##  Max.   :2017   Max.   :83.50   Max.   :79.00   Max.   :217.20  
##       APR             MAY             JUN              JUL        
##  Min.   : 13.1   Min.   : 53.4   Min.   : 196.8   Min.   : 167.5  
##  1st Qu.: 71.5   1st Qu.:124.5   1st Qu.: 541.7   1st Qu.: 532.0  
##  Median :108.4   Median :190.6   Median : 625.8   Median : 687.3  
##  Mean   :109.4   Mean   :230.0   Mean   : 653.2   Mean   : 696.0  
##  3rd Qu.:135.1   3rd Qu.:265.4   3rd Qu.: 788.5   3rd Qu.: 831.6  
##  Max.   :238.0   Max.   :738.8   Max.   :1098.2   Max.   :1526.5  
##       AUG              SEP             OCT             NOV       
##  Min.   : 178.6   Min.   : 41.3   Min.   : 68.5   Min.   : 31.5  
##  1st Qu.: 315.3   1st Qu.:150.1   1st Qu.:221.6   1st Qu.: 92.9  
##  Median : 385.2   Median :223.9   Median :282.6   Median :153.0  
##  Mean   : 420.7   Mean   :245.9   Mean   :291.9   Mean   :162.6  
##  3rd Qu.: 495.0   3rd Qu.:335.6   3rd Qu.:354.1   3rd Qu.:219.1  
##  Max.   :1199.2   Max.   :526.7   Max.   :567.9   Max.   :365.6  
##       DEC        
##  Min.   :  0.10  
##  1st Qu.: 10.20  
##  Median : 31.10  
##  Mean   : 39.98  
##  3rd Qu.: 54.10  
##  Max.   :202.30
# Converting Multivariate to Univariate data
KL_months_univ = melt(t(KL_months[,c(2:13)]),
                      varnames = c("MONTH","YEAR"))

head(KL_months_univ)
##   MONTH YEAR value
## 1   JAN 1901  28.7
## 2   FEB 1901  44.7
## 3   MAR 1901  51.6
## 4   APR 1901 160.0
## 5   MAY 1901 174.7
## 6   JUN 1901 824.6
class(KL_months_univ)
## [1] "data.frame"
# Type of the class is data.frame

# Converting data to Time Series object
KL_months_ts = ts(KL_months_univ$value,start= min(KL_months_univ$YEAR),frequency=12)
# Monthly data, so the frequency is set to 12
class(KL_months_ts)
## [1] "ts"
# The class is now "ts" (time series)
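# Quick sanity check (sketch): the series should have frequency 12 and
# run from January 1901 to December 2017.
frequency(KL_months_ts)
end(KL_months_ts)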
head(KL_months_ts, n = 36)
##         Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct
## 1901   28.7   44.7   51.6  160.0  174.7  824.6  743.0  357.5  197.7  266.9
## 1902    6.7    2.6   57.3   83.9  134.5  390.9 1205.0  315.8  491.6  358.4
## 1903    3.2   18.6    3.1   83.6  249.7  558.6 1022.5  420.2  341.8  354.1
##         Nov    Dec
## 1901  350.8   48.4
## 1902  158.3  121.5
## 1903  157.0   59.0
tail(KL_months_ts, n = 36)
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 2015   3.1   5.8  50.1 214.1 201.8 563.6 406.0 252.2 292.9 308.1 223.6
## 2016   3.0  16.4  22.4  33.3 258.4 595.7 441.5 231.0  84.1 105.1  57.9
## 2017  12.7   0.3  87.8  52.8 213.3 579.8 378.5 462.6 435.5 228.0 152.1
##        Dec
## 2015  79.4
## 2016  22.0
## 2017  61.4
# Visualization
# Plot the Time Series
plot(KL_months_ts,
     main = "KL monthly Rain from 1901 to 2017",
     ylab = "Rainfall in mm")
abline(reg = lm(KL_months_ts ~ time(KL_months_ts)))

# By visual inspection the data looks stationary;
# the mean and variance appear roughly constant over time

boxplot(KL_months_ts ~ cycle(KL_months_ts),col = "yellow",names = month.abb,
        xlab ="Month",ylab ="Rainfall in mm", main = "KL monthly Rain summary")

# The monthly data contains some outliers
# The month of July recorded the highest rainfall, followed closely by June
# The months of January, February and March recorded the least rainfall.
# KL_months_ts = tsclean(KL_months_ts)  # optional: replace outliers and missing values
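# Optional sketch (assumes the forecast package loaded above): inspect which
# observations tsclean() would flag as outliers before deciding whether to
# replace them; the positions and replacement values depend on the data.
KL_outliers = tsoutliers(KL_months_ts)
KL_outliers$index         # positions of the suspected outliers
KL_outliers$replacements  # values tsclean() would substitute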

# Decomposition
decomp_KL_months_ts = decompose(KL_months_ts)
plot(decomp_KL_months_ts)

# The data has a seasonal effect, with the usual rise-and-fall pattern repeating 
# every year over the period. This implies that the monthly rainfall figures recorded 
# each year were influenced by the rise and fall of the seasonal component. 
# The trend, however, appears fairly constant over time, although there are 
# a few ups and downs over some periods. 
# The random (remainder) component is stable over the time period.
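# Alternative sketch (not part of the original analysis): an STL (loess-based)
# decomposition is more robust to outliers than decompose() and lets the
# seasonal component evolve over time.
KL_months_stl = stl(KL_months_ts, s.window = "periodic")
plot(KL_months_stl)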

# Is Stationary?
# Augmented Dickey Fuller Test
tseries::adf.test(KL_months_ts)
## Warning in tseries::adf.test(KL_months_ts): p-value smaller than printed p-
## value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  KL_months_ts
## Dickey-Fuller = -8.2612, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
# Dickey-Fuller = -8.2612, p-value = 0.01
# p-value < 0.05, so the series is stationary
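# Optional cross-check (sketch): the KPSS test reverses the hypotheses relative
# to the ADF test, so a p-value above 0.05 here would also support stationarity.
tseries::kpss.test(KL_months_ts)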

# Build model on Training time duration

# Training Set
KL_months_tr = window(KL_months_ts,start = c(1901,1),end = c(2014,12))
# 114 years for training

# Test Set
KL_months_te = window(KL_months_ts,start = c(2015,1),end =c(2017,12))
# Remaining 3 years (2015-2017) for testing

# Plot 3-year (h = 36) forecasts from 4 benchmark methods
plot(KL_months_tr,main ="KL Rainfall",ylab ="Rainfall in mm",
     xlim = c(2013,2018),ylim=c(0,2000))
#Mean Method
lines(meanf(KL_months_tr,h=36)$mean,col=5)
# Naive Method
lines(rwf(KL_months_tr,h=36)$mean,col=2)
# Drift Method
lines(rwf(KL_months_tr,drift = TRUE,h=36)$mean,col=3)
# Seasonal Naive Method
lines(snaive(KL_months_tr,h=36)$mean,col=6)

# test set
lines(KL_months_te,col="darkblue")

#Legend
legend("topleft",lty=1,col = c(5,2,3,6,"darkblue"),bty='n', cex = 0.8,
legend = c("Mean Method","Naive Method","Drift Method","Seasonal Naive Method",
           "Actual Test"))

# Accuracy
accuracy(meanf(KL_months_tr,h=36),KL_months_te)
##                         ME     RMSE      MAE       MPE    MAPE     MASE
## Training set -8.310238e-15 257.9332 205.9044      -Inf     Inf 2.014211
## Test set     -4.579846e+01 185.9648 159.4902 -3021.489 3045.45 1.560176
##                   ACF1 Theil's U
## Training set 0.5779024        NA
## Test set     0.6434968  1.827923
accuracy(rwf(KL_months_tr,h=36),KL_months_te)
##                        ME     RMSE      MAE       MPE     MAPE     MASE
## Training set   0.01353328 236.9446 158.7688      -Inf      Inf 1.553118
## Test set     151.03055556 235.1502 168.0194 -503.7572 605.6595 1.643611
##                     ACF1 Theil's U
## Training set -0.04801825        NA
## Test set      0.64349684 0.4684641
accuracy(rwf(KL_months_tr,drift = TRUE,h=36),KL_months_te)
##                        ME     RMSE      MAE       MPE     MAPE     MASE
## Training set 6.011742e-15 236.9446 158.7688      -Inf      Inf 1.553119
## Test set     1.507802e+02 234.9791 167.8713 -507.5336 609.1558 1.642162
##                     ACF1 Theil's U
## Training set -0.04801825        NA
## Test set      0.64345563 0.4643569
accuracy(snaive(KL_months_tr,h=36),KL_months_te)
##                       ME     RMSE      MAE       MPE     MAPE    MASE
## Training set  -0.1489676 161.2041 102.2258      -Inf      Inf 1.00000
## Test set     -55.6527778 168.5781 110.2250 -129.7359 161.5687 1.07825
##                     ACF1 Theil's U
## Training set -0.02145137        NA
## Test set      0.23549073 0.7986112
# The Mean, Naive and Drift methods capture neither the trend nor the seasonality
# The Seasonal Naive method captures the seasonality, as each prediction repeats the
# most recent observation from the same month
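# Sketch (uses only calls shown above): collect the test-set errors of the four
# benchmark methods into one table for easier comparison.
bench_cols = c("RMSE", "MAE", "MASE")
bench_acc = rbind(
  Mean          = accuracy(meanf(KL_months_tr, h = 36), KL_months_te)["Test set", bench_cols],
  Naive         = accuracy(rwf(KL_months_tr, h = 36), KL_months_te)["Test set", bench_cols],
  Drift         = accuracy(rwf(KL_months_tr, drift = TRUE, h = 36), KL_months_te)["Test set", bench_cols],
  SeasonalNaive = accuracy(snaive(KL_months_tr, h = 36), KL_months_te)["Test set", bench_cols]
)
round(bench_acc, 2)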


# Caution: this takes a while to compute
KL_months_tr_arima = auto.arima(KL_months_tr)
# auto.arima() searches over ARIMA orders and returns the best-fitting model (by AICc)
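# Optional diagnostic sketch (assumes a recent version of the forecast package):
# residuals of a well-fitted ARIMA model should resemble white noise; this plots
# the residuals, their ACF and a histogram, and runs a Ljung-Box test.
checkresiduals(KL_months_tr_arima)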

# Forecast for Test time duration
KL_months_te_forecast_arima = forecast(KL_months_tr_arima, h = 36)
head(KL_months_te_forecast_arima)
## $method
## [1] "ARIMA(0,0,0)(2,1,0)[12]"
## 
## $model
## Series: KL_months_tr 
## ARIMA(0,0,0)(2,1,0)[12] 
## 
## Coefficients:
##          sar1     sar2
##       -0.6334  -0.2965
## s.e.   0.0261   0.0262
## 
## sigma^2 estimated as 18100:  log likelihood=-8572.72
## AIC=17151.44   AICc=17151.45   BIC=17167.07
## 
## $level
## [1] 80 95
## 
## $mean
##             Jan        Feb        Mar        Apr        May        Jun
## 2015   5.194208  20.549496  29.602358 102.417396 160.460773 645.499981
## 2016   4.610289  22.891609  31.676385  84.406497 178.767989 698.857940
## 2017   4.804007  18.369435  26.893310  93.823883 194.012874 608.404994
##             Jul        Aug        Sep        Oct        Nov        Dec
## 2015 635.710835 542.304197 288.366338 273.479233 122.141221  25.816980
## 2016 707.552603 555.697399 300.845388 297.092780 124.223333  30.408710
## 2017 674.522930 604.014433 296.033831 306.451021 116.192182  33.839370
## 
## $lower
##                  80%          95%
## Jan 2015 -167.222371 -258.4941940
## Feb 2015 -151.867083 -243.1389060
## Mar 2015 -142.814221 -234.0860441
## Apr 2015  -69.999182 -161.2710057
## May 2015  -11.955805 -103.2276285
## Jun 2015  473.083403  381.8115798
## Jul 2015  463.294257  372.0224333
## Aug 2015  369.887618  278.6157949
## Sep 2015  115.949759   24.6779362
## Oct 2015  101.062655    9.7908313
## Nov 2015  -50.275357 -141.5471805
## Dec 2015 -146.599598 -237.8714213
## Jan 2016 -179.024756 -276.2352764
## Feb 2016 -160.743436 -257.9539566
## Mar 2016 -151.958660 -249.1691807
## Apr 2016  -99.228548 -196.4390687
## May 2016   -4.867055 -102.0775761
## Jun 2016  515.222895  418.0123743
## Jul 2016  523.917558  426.7070372
## Aug 2016  372.062354  274.8518332
## Sep 2016  117.210343   19.9998222
## Oct 2016  113.457736   16.2472149
## Nov 2016  -59.411712 -156.6222329
## Dec 2016 -153.226335 -250.4368556
## Jan 2017 -196.009980 -302.3144834
## Feb 2017 -182.444552 -288.7490552
## Mar 2017 -173.920677 -280.2251808
## Apr 2017 -106.990104 -213.2946078
## May 2017   -6.801113 -113.1056163
## Jun 2017  407.591007  301.2865032
## Jul 2017  473.708943  367.4044397
## Aug 2017  403.200446  296.8959430
## Sep 2017   95.219844  -11.0846590
## Oct 2017  105.637034   -0.6674695
## Nov 2017  -84.621806 -190.9263089
## Dec 2017 -166.974617 -273.2791208
## 
## $upper
##               80%      95%
## Jan 2015 177.6108 268.8826
## Feb 2015 192.9661 284.2379
## Mar 2015 202.0189 293.2908
## Apr 2015 274.8340 366.1058
## May 2015 332.8774 424.1492
## Jun 2015 817.9166 909.1884
## Jul 2015 808.1274 899.3992
## Aug 2015 714.7208 805.9926
## Sep 2015 460.7829 552.0547
## Oct 2015 445.8958 537.1676
## Nov 2015 294.5578 385.8296
## Dec 2015 198.2336 289.5054
## Jan 2016 188.2453 285.4559
## Feb 2016 206.5267 303.7372
## Mar 2016 215.3114 312.5220
## Apr 2016 268.0415 365.2521
## May 2016 362.4030 459.6136
## Jun 2016 882.4930 979.7035
## Jul 2016 891.1876 988.3982
## Aug 2016 739.3324 836.5430
## Sep 2016 484.4804 581.6910
## Oct 2016 480.7278 577.9383
## Nov 2016 307.8584 405.0689
## Dec 2016 214.0438 311.2543
## Jan 2017 205.6180 311.9225
## Feb 2017 219.1834 325.4879
## Mar 2017 227.7073 334.0118
## Apr 2017 294.6379 400.9424
## May 2017 394.8269 501.1314
## Jun 2017 809.2190 915.5235
## Jul 2017 875.3369 981.6414
## Aug 2017 804.8284 911.1329
## Sep 2017 496.8478 603.1523
## Oct 2017 507.2650 613.5695
## Nov 2017 317.0062 423.3107
## Dec 2017 234.6534 340.9579
# Visualization
plot(KL_months_te_forecast_arima,
     xlim = c(2013,2018),
     xlab = "Time",
     ylab = "rainfall(mm)",
     main = "Forecast for Test Time duration")

lines(KL_months_te,col= 6,lty=2)

legend("topright",lty = 1,bty = "n",col = c(6,"darkblue","lightblue","lightgray"),
       c("Testdata","ArimaPred","80% Confidence Interval","95% Confidence Interval")
       ,cex = 0.5)

KL_months_te_pointforecast_arima = KL_months_te_forecast_arima$mean
KL_months_te_pointforecast_arima = round(KL_months_te_pointforecast_arima,digits = 1)
print(KL_months_te_pointforecast_arima)
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 2015   5.2  20.5  29.6 102.4 160.5 645.5 635.7 542.3 288.4 273.5 122.1
## 2016   4.6  22.9  31.7  84.4 178.8 698.9 707.6 555.7 300.8 297.1 124.2
## 2017   4.8  18.4  26.9  93.8 194.0 608.4 674.5 604.0 296.0 306.5 116.2
##        Dec
## 2015  25.8
## 2016  30.4
## 2017  33.8
print(round(KL_months_te, digits =1))
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 2015   3.1   5.8  50.1 214.1 201.8 563.6 406.0 252.2 292.9 308.1 223.6
## 2016   3.0  16.4  22.4  33.3 258.4 595.7 441.5 231.0  84.1 105.1  57.9
## 2017  12.7   0.3  87.8  52.8 213.3 579.8 378.5 462.6 435.5 228.0 152.1
##        Dec
## 2015  79.4
## 2016  22.0
## 2017  61.4
# Test data vs ARIMA Point Forecast
plot( KL_months_te,
      ylim = c(-500, 1500),
      xlim = c(2016,2018),
      main = "Monthly : Testdata vs ARIMA PF",
      ylab = 'Test data/ARIMA PF',col= 6,lty=2)

lines(KL_months_te_pointforecast_arima,col="4")

legend("topleft",legend = c("Test data","ARIMA PF"),
       col = c(6,4),lty = c(2,1),cex = 0.8)

# Evaluate the forecast/prediction accuracy (MAPE)
MAPE = function(actual, predicted) {
  abs_percent_diff = abs((actual - predicted) / actual)*100
  # percentage difference could be infinite if actual has value 0
  abs_percent_diff = abs_percent_diff[is.finite(abs_percent_diff)]
  mape = c(mean(abs_percent_diff), median(abs_percent_diff))
  names(mape) = c("Mean_APE", "Median_APE")
  return (mape)
}

MAPE(KL_months_te, KL_months_te_pointforecast_arima)
##   Mean_APE Median_APE 
##  232.68920   48.78272
# Mean_APE = 232.69
# Median_APE = 48.78
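# Note: the Mean_APE above is inflated by near-zero actuals in the dry months.
# A sketch of a scale-symmetric companion metric (sMAPE); this helper is
# illustrative and not part of the original analysis.
sMAPE = function(actual, predicted) {
  mean(200 * abs(actual - predicted) / (abs(actual) + abs(predicted)), na.rm = TRUE)
}
sMAPE(KL_months_te, KL_months_te_pointforecast_arima)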

# Forecast for future times
KL_months_arima = auto.arima(KL_months_ts)
KL_months_forecast_arima = forecast(KL_months_arima, h= 24)
head(KL_months_forecast_arima)
## $method
## [1] "ARIMA(2,0,2)(2,1,0)[12] with drift"
## 
## $model
## Series: KL_months_ts 
## ARIMA(2,0,2)(2,1,0)[12] with drift 
## 
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs produced
##          ar1      ar2     ma1     ma2     sar1     sar2    drift
##       0.0149  -0.5456  0.0161  0.5289  -0.6370  -0.3022  -0.0533
## s.e.     NaN   0.1879     NaN  0.2839   0.0231   0.0257   0.1564
## 
## sigma^2 estimated as 17986:  log likelihood=-8793.4
## AIC=17602.8   AICc=17602.9   BIC=17644.71
## 
## $level
## [1] 80 95
## 
## $mean
##             Jan        Feb        Mar        Apr        May        Jun
## 2018   8.464241   7.403198  51.572192  93.053648 224.599399 579.398620
## 2019   7.068231   6.546110  53.601847  60.256067 229.815288 583.233204
##             Jul        Aug        Sep        Oct        Nov        Dec
## 2018 406.167441 320.008014 273.793798 209.958299 140.792775  52.337839
## 2019 406.332777 339.593634 269.360857 183.070445 118.281786  44.960466
## 
## $lower
##                    80%        95%
## Jan 2018 -163.40776067 -254.39130
## Feb 2018 -164.55146272 -255.57876
## Mar 2018 -120.40520881 -211.44455
## Apr 2018  -78.94905807 -170.00179
## May 2018   52.59030971  -38.46580
## Jun 2018  407.38179062  316.32158
## Jul 2018  234.14882398  143.08767
## Aug 2018  147.98703171   56.92462
## Sep 2018  101.77231712   10.70965
## Oct 2018   37.93609608  -53.12696
## Nov 2018  -31.22956704 -122.29269
## Dec 2018 -119.68472313 -210.74797
## Jan 2019 -175.95837302 -272.84680
## Feb 2019 -176.49239356 -273.38712
## Mar 2019 -129.43981736 -226.33622
## Apr 2019 -122.78923997 -219.68757
## May 2019   46.76909461  -50.12971
## Jun 2019  400.18589704  303.28651
## Jul 2019  223.28522183  126.38570
## Aug 2019  156.54573893   59.64604
## Sep 2019   86.31289227  -10.58685
## Oct 2019    0.02237678  -96.87742
## Nov 2019  -64.76630123 -161.66611
## Dec 2019 -138.08765362 -234.98747
## 
## $upper
##               80%      95%
## Jan 2018 180.3362 271.3198
## Feb 2018 179.3579 270.3852
## Mar 2018 223.5496 314.5889
## Apr 2018 265.0564 356.1091
## May 2018 396.6085 487.6646
## Jun 2018 751.4154 842.4757
## Jul 2018 578.1861 669.2472
## Aug 2018 492.0290 583.0914
## Sep 2018 445.8153 536.8780
## Oct 2018 381.9805 473.0436
## Nov 2018 312.8151 403.8782
## Dec 2018 224.3604 315.4236
## Jan 2019 190.0948 286.9833
## Feb 2019 189.5846 286.4793
## Mar 2019 236.6435 333.5399
## Apr 2019 243.3014 340.1997
## May 2019 412.8615 509.7603
## Jun 2019 766.2805 863.1799
## Jul 2019 589.3803 686.2799
## Aug 2019 522.6415 619.5412
## Sep 2019 452.4088 549.3086
## Oct 2019 366.1185 463.0183
## Nov 2019 301.3299 398.2297
## Dec 2019 228.0086 324.9084
KL_months_pointforecast_arima = round(KL_months_forecast_arima$mean,digits = 1)
print(KL_months_pointforecast_arima)
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 2018   8.5   7.4  51.6  93.1 224.6 579.4 406.2 320.0 273.8 210.0 140.8
## 2019   7.1   6.5  53.6  60.3 229.8 583.2 406.3 339.6 269.4 183.1 118.3
##        Dec
## 2018  52.3
## 2019  45.0
# Visualization
plot(KL_months_forecast_arima,xlab = "Time",ylab = "rainfall(mm)")

plot(KL_months_forecast_arima, xlim = c(2011,2020),ylim = c(-500,3000),
     ylab = "rainfall(mm)", main = "Forecast from Arima for 2018 and 2019")

lines(KL_months_te_pointforecast_arima,col="red")

legend("topright",legend = c("Actual","Test Time Forecast","Future Forecast"
                            ,"80% Confidence Interval","95% Confidence Interval"),
       col = c("black","red","blue","lightblue","lightgrey"),lty = 1,cex = 0.8)

boxplot(KL_months_pointforecast_arima ~ cycle(KL_months_pointforecast_arima),
        col = "yellow",names = month.abb,xlab ="Month",
        ylab ="Rainfall in mm", main = "KL monthly Rain summary")

# The results reveal that the region experiences the most rainfall in the months of 
# June and July, and the least rainfall in the months of January, February and March. 
# Auto.ARIMA has been identified as an appropriate model for predicting monthly average 
# rainfall figures for the Kerala subdivision of India.

###################################>>>> Seasonal <<<<<###################################

# Our aim is to find out the rainfall pattern in Kerala over the last 117 years 
# and predict the rainfall for upcoming quarters

KL_seasonal = rain_india[rain_india$SUBDIVISION == "Kerala", 16:19]
# This will filter Kerala subdivision and slice out columns 16 to 19

rownames(KL_seasonal) = c(1901:2017)
# Rename the seasonal columns: JF -> Qtr1, MAM -> Qtr2, JJAS -> Qtr3, OND -> Qtr4
colnames(KL_seasonal) = c("Qtr1", "Qtr2", "Qtr3", "Qtr4")
head(KL_seasonal)
##      Qtr1  Qtr2   Qtr3  Qtr4
## 1901 73.4 386.2 2122.8 666.1
## 1902  9.3 275.7 2403.4 638.2
## 1903 21.7 336.3 2343.0 570.1
## 1904 26.7 339.4 2398.2 365.3
## 1905 23.4 378.5 1881.5 458.1
## 1906 34.1 230.0 1943.1 500.8
# View structure of data
str(KL_seasonal)
## 'data.frame':    117 obs. of  4 variables:
##  $ Qtr1: num  73.4 9.3 21.7 26.7 23.4 34.1 23.7 28.8 65.9 28.4 ...
##  $ Qtr2: num  386 276 336 339 378 ...
##  $ Qtr3: num  2123 2403 2343 2398 1882 ...
##  $ Qtr4: num  666 638 570 365 458 ...
# Data has 117 obs of 4 variables

# Are there any unusual or missing values?
summary(KL_seasonal)
##       Qtr1            Qtr2            Qtr3           Qtr4      
##  Min.   : 0.30   Min.   : 89.9   Min.   :1104   Min.   :166.6  
##  1st Qu.:10.30   1st Qu.:277.8   1st Qu.:1749   1st Qu.:406.5  
##  Median :20.50   Median :342.0   Median :1948   Median :500.8  
##  Mean   :27.54   Mean   :376.5   Mean   :2016   Mean   :494.5  
##  3rd Qu.:40.60   3rd Qu.:436.9   3rd Qu.:2231   3rd Qu.:584.0  
##  Max.   :98.10   Max.   :915.2   Max.   :3451   Max.   :823.3
# Converting Multivariate to Univariate data
KL_seasonal_univ = melt(t(KL_seasonal),varnames = c("QUATERS","YEAR"))
head(KL_seasonal_univ)
##   QUATERS YEAR  value
## 1    Qtr1 1901   73.4
## 2    Qtr2 1901  386.2
## 3    Qtr3 1901 2122.8
## 4    Qtr4 1901  666.1
## 5    Qtr1 1902    9.3
## 6    Qtr2 1902  275.7
class(KL_seasonal_univ)
## [1] "data.frame"
# Type of the class is data.frame

# Converting data to Time Series object
KL_seasonal_ts = ts(KL_seasonal_univ$value,start= min(KL_seasonal_univ$YEAR),frequency=4)
# Quarterly data, so the frequency is set to 4
class(KL_seasonal_ts)
## [1] "ts"
# The class is now "ts" (time series)
head(KL_seasonal_ts, n=12)
##        Qtr1   Qtr2   Qtr3   Qtr4
## 1901   73.4  386.2 2122.8  666.1
## 1902    9.3  275.7 2403.4  638.2
## 1903   21.7  336.3 2343.0  570.1
tail(KL_seasonal_ts, n=12)
##        Qtr1   Qtr2   Qtr3   Qtr4
## 2015    8.9  465.9 1514.7  611.1
## 2016   19.3  314.2 1352.3  185.0
## 2017   13.0  353.9 1856.5  441.5
# Visualization 
# 1. PLOT
plot(KL_seasonal_ts, main = "KL Seasonal rainfall from 1901 to 2017",
     ylab ="rainfall(mm)")
abline(reg = lm(KL_seasonal_ts ~ time(KL_seasonal_ts)))

# 2. BOXPLOT
boxplot(split(KL_seasonal_ts,cycle(KL_seasonal_ts)),col = "gold",
        xlab ="Quarterly",ylab ="Rainfall in mm", main = "KL Seasonal Rainfall summary")

# The quarterly data contains some outliers
# Q3 recorded the highest amount of rainfall, followed by Q4
# Q1 recorded the least amount of rainfall.

# Is Stationary?
# Augmented Dickey Fuller Test
tseries::adf.test(KL_seasonal_ts)
## Warning in tseries::adf.test(KL_seasonal_ts): p-value smaller than printed
## p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  KL_seasonal_ts
## Dickey-Fuller = -6.104, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Dickey-Fuller = -6.104
# p-value = 0.01
# Data is stationary

# Build model on Training time duration

# Training Set: first 113 years (1901-2013)
KL_seasonal_tr = window(KL_seasonal_ts,start = min(KL_seasonal_univ$YEAR), end = c(2013,4))

# Test Set: remaining 4 years (2014-2017)
KL_seasonal_te = window(KL_seasonal_ts,start = c(2014,1), end = c(2017,4))

KL_seasonal_tr_arima = auto.arima(KL_seasonal_tr)
arimaorder(KL_seasonal_tr_arima)
##         p         d         q         P         D         Q Frequency 
##         1         0         0         2         1         0         4
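# Optional sketch: a Ljung-Box test on the residuals of the seasonal model;
# a large p-value would suggest the residuals are uncorrelated.
Box.test(residuals(KL_seasonal_tr_arima), lag = 8, type = "Ljung-Box")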
# Forecast for Test time duration
KL_seasonal_te_forecast_arima = forecast(KL_seasonal_tr_arima, h = 16)
head(KL_seasonal_te_forecast_arima)
## $method
## [1] "ARIMA(1,0,0)(2,1,0)[4] with drift"
## 
## $model
## Series: KL_seasonal_tr 
## ARIMA(1,0,0)(2,1,0)[4] with drift 
## 
## Coefficients:
##           ar1     sar1     sar2    drift
##       -0.0769  -0.6148  -0.3248  -0.2281
## s.e.   0.0473   0.0451   0.0450   1.3686
## 
## sigma^2 estimated as 58395:  log likelihood=-3093.01
## AIC=6196.02   AICc=6196.15   BIC=6216.54
## 
## $level
## [1] 80 95
## 
## $mean
##            Qtr1       Qtr2       Qtr3       Qtr4
## 2014   48.29709  267.08073 2147.65216  399.35304
## 2015   35.50982  257.82068 2067.03777  377.90846
## 2016   40.17410  245.96602 2249.14728  399.86196
## 2017   39.69008  254.49259 2161.59559  391.55995
## 
## $lower
##                80%        95%
## 2014 Q1 -261.39170 -425.33105
## 2014 Q2  -43.52148 -207.94436
## 2014 Q3 1837.04456 1672.61883
## 2014 Q4   88.74541  -75.68034
## 2015 Q1 -297.21936 -473.35555
## 2015 Q2  -75.03482 -251.23788
## 2015 Q3 1734.18153 1557.97807
## 2015 Q4   45.05221 -131.15124
## 2016 Q1 -319.30729 -509.60524
## 2016 Q2 -113.66681 -304.04493
## 2016 Q3 1889.51356 1699.13497
## 2016 Q4   40.22823 -150.15036
## 2017 Q1 -365.87718 -580.57152
## 2017 Q2 -151.33058 -366.16039
## 2017 Q3 1755.77090 1540.94029
## 2017 Q4  -14.26475 -229.09536
## 
## $upper
##               80%       95%
## 2014 Q1  357.9859  521.9252
## 2014 Q2  577.6829  742.1058
## 2014 Q3 2458.2598 2622.6855
## 2014 Q4  709.9607  874.3864
## 2015 Q1  368.2390  544.3752
## 2015 Q2  590.6762  766.8792
## 2015 Q3 2399.8940 2576.0975
## 2015 Q4  710.7647  886.9682
## 2016 Q1  399.6555  589.9534
## 2016 Q2  605.5988  795.9770
## 2016 Q3 2608.7810 2799.1596
## 2016 Q4  759.4957  949.8743
## 2017 Q1  445.2573  659.9517
## 2017 Q2  660.3158  875.1456
## 2017 Q3 2567.4203 2782.2509
## 2017 Q4  797.3846 1012.2153
plot(KL_seasonal_te_forecast_arima, xlab = "Time", ylab = "rainfall(mm)", 
     main = "Seasonal : Forecast for Test Time Duration")

plot(KL_seasonal_te_forecast_arima, xlim= c(2014,2018), 
     xlab = "Time", ylab = "rainfall(mm)", main = "Seasonal : Forecast for Test Time Duration")

KL_seasonal_te_pointforecast_arima = KL_seasonal_te_forecast_arima$mean
print(round(KL_seasonal_te_pointforecast_arima,digits = 1))
##        Qtr1   Qtr2   Qtr3   Qtr4
## 2014   48.3  267.1 2147.7  399.4
## 2015   35.5  257.8 2067.0  377.9
## 2016   40.2  246.0 2249.1  399.9
## 2017   39.7  254.5 2161.6  391.6
print(KL_seasonal_te)
##        Qtr1   Qtr2   Qtr3   Qtr4
## 2014   14.9  364.5 2164.8  502.1
## 2015    8.9  465.9 1514.7  611.1
## 2016   19.3  314.2 1352.3  185.0
## 2017   13.0  353.9 1856.5  441.5
plot(
  KL_seasonal_te,
  ylim = c(-500, 3500),
  ylab = "Actual/Forecast",
  main = "Seasonal: Actual Vs ARIMA PF")
lines(KL_seasonal_te_pointforecast_arima, col = "blue")
legend("topright", legend= c("Actual","ARIMA PF"),
       col= c("black","blue"),lty = 1, cex=0.8)

# Evaluate the forecast/prediction accuracy using the MAPE() helper defined above
MAPE(KL_seasonal_te, KL_seasonal_te_pointforecast_arima)
##   Mean_APE Median_APE 
##   78.99214   37.31223
# Mean_APE - 78.99
# Median_APE - 37.31 

# Forecast for Future duration
KL_seasonal_ts_arima = auto.arima(KL_seasonal_ts)
KL_seasonal_ts_forecast_arima = forecast(KL_seasonal_ts_arima, h = 8)
# Predicting seasonally for future years 2018 and 2019
print(KL_seasonal_ts_forecast_arima)
##         Point Forecast      Lo 80     Hi 80      Lo 95     Hi 95
## 2018 Q1       6.719950 -303.52118  316.9611 -467.75292  481.1928
## 2018 Q2     376.694313   65.60932  687.7793  -99.06913  852.4578
## 2018 Q3    1603.672804 1292.58322 1914.7624 1127.90234 2079.4433
## 2018 Q4     424.465015  113.37540  735.5546  -51.30549  900.2355
## 2019 Q1       8.912556 -325.96489  343.7900 -503.23830  521.0634
## 2019 Q2     346.245674   11.24327  681.2481 -166.09629  858.5876
## 2019 Q3    1585.409912 1250.40683 1920.4130 1073.06691 2097.7529
## 2019 Q4     346.374005   11.37092  681.3771 -165.96900  858.7170
KL_seasonal_pointforecast_arima = KL_seasonal_ts_forecast_arima$mean
print(round(KL_seasonal_pointforecast_arima,digits = 1))
##        Qtr1   Qtr2   Qtr3   Qtr4
## 2018    6.7  376.7 1603.7  424.5
## 2019    8.9  346.2 1585.4  346.4
print(KL_seasonal_te)
##        Qtr1   Qtr2   Qtr3   Qtr4
## 2014   14.9  364.5 2164.8  502.1
## 2015    8.9  465.9 1514.7  611.1
## 2016   19.3  314.2 1352.3  185.0
## 2017   13.0  353.9 1856.5  441.5
#Visualization
plot(KL_seasonal_ts_forecast_arima,xlab = "Time", ylab = "rainfall(mm)")

plot(KL_seasonal_ts_forecast_arima, xlim = c(2011,2020),xlab = "Time",
     ylab = "rainfall(mm)",main = "Seasonal: Forecast for 2018 and 2019")
lines(KL_seasonal_te_pointforecast_arima, col = "red")
legend("topright", legend= c("Actual","Test Time forecast","Future forecast"),
       col= c("black","red","blue"),lty = 1, cex=0.8)

boxplot(KL_seasonal_pointforecast_arima ~ cycle(KL_seasonal_pointforecast_arima),
        col = "yellow",xlab ="Quarterly",
        ylab ="Rainfall in mm", main = "KL Seasonal Rain summary")

# The results show that Kerala experiences heavy rainfall in Q3 (JJAS),
# followed by Q4 (OND), while Q1 (JF, i.e. Jan-Feb) experiences the least rainfall.

###################################>>>> Annual <<<<<###################################

# Kerala rain data from 1901 to 2017
KL_annual = rain_india[rain_india$SUBDIVISION == "Kerala", c("YEAR","ANNUAL")]

# Set row names to the year
rownames(KL_annual) = KL_annual$YEAR
head(KL_annual,n=3)
##      YEAR ANNUAL
## 1901 1901 3248.6
## 1902 1902 3326.6
## 1903 1903 3271.2
# View structure of data
str(KL_annual)
## 'data.frame':    117 obs. of  2 variables:
##  $ YEAR  : int  1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 ...
##  $ ANNUAL: num  3249 3327 3271 3130 2742 ...
# Data has 117 obs of 2 variables

# Are there any unusual or missing values?
summary(KL_annual)
##       YEAR          ANNUAL    
##  Min.   :1901   Min.   :1871  
##  1st Qu.:1930   1st Qu.:2622  
##  Median :1959   Median :2931  
##  Mean   :1959   Mean   :2914  
##  3rd Qu.:1988   3rd Qu.:3152  
##  Max.   :2017   Max.   :4258
# Converting data to Time Series object
KL_annual_ts = ts(KL_annual$ANNUAL,start= min(KL_annual$YEAR),frequency=1)
# Annual data, so the frequency is set to 1
class(KL_annual_ts)
## [1] "ts"
# The class is now "ts" (time series)
head(KL_annual_ts)
## Time Series:
## Start = 1901 
## End = 1906 
## Frequency = 1 
## [1] 3248.6 3326.6 3271.2 3129.7 2741.6 2708.0
tail(KL_annual_ts)
## Time Series:
## Start = 2012 
## End = 2017 
## Frequency = 1 
## [1] 2151.1 3255.4 3046.4 2600.6 1870.9 2664.9
# Visualization
# Plot the Time Series
plot(KL_annual_ts,
     main = "KL yearly Rain from 1901 to 2017",
     ylab = "Rainfall in mm")
abline(reg = lm(KL_annual_ts ~ time(KL_annual_ts)))

# The mean and variance are not constant over time
# The year-on-year trend shows that rainfall has been decreasing.
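# Sketch: quantify the fitted linear trend already drawn with abline(); the
# slope is in mm per year and should be negative if rainfall is decreasing.
KL_annual_trend = lm(KL_annual_ts ~ time(KL_annual_ts))
coef(KL_annual_trend)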

boxplot(KL_annual_ts ~ cycle(KL_annual_ts),col = "yellow",
        xlab ="Year",ylab ="Rainfall in mm", main = "KL yearly Rain summary")

# The annual data contains some outliers
# (with frequency 1, cycle() is constant, so this boxplot shows a single box)

# Is Stationary?
# Augmented Dickey Fuller Test
tseries::adf.test(KL_annual_ts)
## Warning in tseries::adf.test(KL_annual_ts): p-value smaller than printed p-
## value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  KL_annual_ts
## Dickey-Fuller = -5.0367, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
# Dickey-Fuller = -5.0367
# p-value = 0.01
# Data is stationary

# Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test
# kpss.test(KL_annual_ts)


# Build model on Training time duration

# Training Set
KL_annual_tr = window(KL_annual_ts,start = min(KL_annual$YEAR),end = 2010)
# 110 years for training

# Test Set
KL_annual_te = window(KL_annual_ts,start = 2011,end =2017)
# Remaining 7 years (2011-2017) for testing


# Caution: this takes a while to compute
KL_annual_tr_arima = auto.arima(KL_annual_tr)
# auto.arima() returns the best-fitting ARIMA model for the annual series
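# Optional sketch: inspect the order selected for the annual series
# (arimaorder() is also used in the seasonal section above).
arimaorder(KL_annual_tr_arima)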

# Forecast for Test time duration
KL_annual_te_forecast_arima = forecast(KL_annual_tr_arima, h = 7)
head(KL_annual_te_forecast_arima)
## $method
## [1] "ARIMA(1,1,2)"
## 
## $model
## Series: KL_annual_tr 
## ARIMA(1,1,2) 
## 
## Coefficients:
##           ar1      ma1      ma2
##       -0.7550  -0.0177  -0.8673
## s.e.   0.0915   0.0680   0.0599
## 
## sigma^2 estimated as 170165:  log likelihood=-810.74
## AIC=1629.48   AICc=1629.87   BIC=1640.25
## 
## $level
## [1] 80 95
## 
## $mean
## Time Series:
## Start = 2011 
## End = 2017 
## Frequency = 1 
## [1] 2818.030 2896.636 2837.286 2882.097 2848.263 2873.809 2854.521
## 
## $lower
## Time Series:
## Start = 2011 
## End = 2017 
## Frequency = 1 
##           80%      95%
## 2011 2289.376 2009.523
## 2012 2354.502 2067.514
## 2013 2294.326 2006.901
## 2014 2332.773 2041.979
## 2015 2298.935 2008.138
## 2016 2320.946 2028.278
## 2017 2301.490 2008.732
## 
## $upper
## Time Series:
## Start = 2011 
## End = 2017 
## Frequency = 1 
##           80%      95%
## 2011 3346.684 3626.537
## 2012 3438.770 3725.758
## 2013 3380.245 3667.671
## 2014 3431.421 3722.216
## 2015 3397.592 3688.388
## 2016 3426.672 3719.340
## 2017 3407.553 3700.310
# Visualization
plot(KL_annual_te_forecast_arima,
     xlim = c(2011,2018),
     ylim = c(1800,6000),
     xlab = "Time",
     ylab = "rainfall(mm)",
     main = "Forecast for Test Time duration")

lines(KL_annual_te,col= 6,lty=2)

legend("topright",lty = 1,bty = "n",col = c(6,"darkblue","lightblue","lightgray"),
       c("Testdata","ArimaPred","80% Confidence Interval","95% Confidence Interval")
       ,cex = 0.8)

KL_annual_te_pointforecast_arima = KL_annual_te_forecast_arima$mean
KL_annual_te_pointforecast_arima=(round(KL_annual_te_pointforecast_arima, digits = 1))
print(KL_annual_te_pointforecast_arima)
## Time Series:
## Start = 2011 
## End = 2017 
## Frequency = 1 
## [1] 2818.0 2896.6 2837.3 2882.1 2848.3 2873.8 2854.5
print(KL_annual_te)
## Time Series:
## Start = 2011 
## End = 2017 
## Frequency = 1 
## [1] 3035.1 2151.1 3255.4 3046.4 2600.6 1870.9 2664.9
# Test data vs ARIMA Point Forecast
plot( KL_annual_te,
      ylim = c(1500, 4000),
      xlim = c(2010,2018),
      main = "Annual: Testdata vs ARIMA PF",
      ylab = 'Test data/ARIMA PF',col= 6,lty=2)

lines(KL_annual_te_pointforecast_arima,col="4")

legend("topleft",legend = c("Test data","ARIMA PF"),
       col = c(6,4),lty = c(2,1),cex = 0.8)

# Evaluate the forecast/prediction accuracy (MAPE)

MAPE(KL_annual_te, KL_annual_te_pointforecast_arima)
##   Mean_APE Median_APE 
##  18.612978   9.524725
# Mean_APE = 18.61
# Median_APE = 9.52

# Forecast for future times
KL_annual_arima = auto.arima(KL_annual_ts)
KL_annual_forecast_arima = forecast(KL_annual_arima, h= 2)
head(KL_annual_forecast_arima)
## $method
## [1] "ARIMA(0,1,2)"
## 
## $model
## Series: KL_annual_ts 
## ARIMA(0,1,2) 
## 
## Coefficients:
##           ma1      ma2
##       -0.7613  -0.1857
## s.e.   0.0943   0.0936
## 
## sigma^2 estimated as 178574:  log likelihood=-866.04
## AIC=1738.08   AICc=1738.29   BIC=1746.34
## 
## $level
## [1] 80 95
## 
## $mean
## Time Series:
## Start = 2018 
## End = 2019 
## Frequency = 1 
## [1] 2808.666 2801.776
## 
## $lower
## Time Series:
## Start = 2018 
## End = 2019 
## Frequency = 1 
##           80%      95%
## 2018 2267.107 1980.423
## 2019 2245.002 1950.264
## 
## $upper
## Time Series:
## Start = 2018 
## End = 2019 
## Frequency = 1 
##           80%      95%
## 2018 3350.225 3636.908
## 2019 3358.549 3653.288
KL_annual_pointforecast_arima = KL_annual_forecast_arima$mean
KL_annual_pointforecast_arima = round(KL_annual_pointforecast_arima, digits = 1)
print(KL_annual_pointforecast_arima)
## Time Series:
## Start = 2018 
## End = 2019 
## Frequency = 1 
## [1] 2808.7 2801.8
# Visualization
plot(KL_annual_forecast_arima,xlab = "Time",ylab = "rainfall(mm)")

plot(KL_annual_forecast_arima, xlim = c(2011,2020),ylim = c(1500,4000),
     ylab = "rainfall(mm)", main = "Forecast from Arima for 2018 and 2019")

lines(KL_annual_pointforecast_arima,col="blue")
lines(KL_annual_te_pointforecast_arima,col="red")

legend("topleft",lty = 1,cex = 0.5,
       legend = c("Actual","Test Time Forecast","Future Forecast",
                  "80% Confidence Interval","95% Confidence Interval"),
       col = c("black","red","blue","lightblue","lightgrey"))

# Conclusion:
# This analysis examined the rainfall pattern and fitted a suitable model for rainfall 
# prediction in the Kerala (KL) subdivision of India. Data from 1901 to 2017 were 
# downloaded from data.gov.in and forecasts were produced for 2018 and 2019. 
# The results revealed that the region experiences the most rainfall in the months of 
# June and July, and the least rainfall in the months of January, February and March. 
# Auto.ARIMA has been identified as an appropriate model for predicting monthly average 
# rainfall figures for the Kerala subdivision of India. It is hoped that, when adopted by 
# the India Meteorological Department (IMD) and other relevant governmental organisations 
# such as the national disaster management authorities, it will in the long run help in 
# accurate forecasting and in educating the populace on rainfall expectancies.

##################################>>>> End Of Forecasting <<<<<##############################