Libraries

Libraries used throughout the report are displayed below.

library(docxtractr) #Extracting tables from EPDK Reports
library(data.table) #Data manipulation
library(lubridate)  #Setting Date for Consumption
library(dplyr)      #Data manipulation
library(dendextend) #Dendrogram visualization
library(rpart)      #Classification
library(rpart.plot) #Visualizatin of Classification
library(tidyverse)  #Data manipulation
library(sp)         #Manipulation for Cities' names on Turkey Map
library(stringr)    #String Manipulation
library(stringi)    #String Manipulation
library(mapproj)    #Visualization of Turkey Map

Changing Local Language

Because the city names contain Turkish characters, the locale is set to Turkish.

# Switch every locale category to Turkish so that city names containing
# Turkish characters are handled correctly.
# Locale names are platform-specific: "Turkish" is the Windows name, while
# Unix-alikes normally expose the same locale as "tr_TR.UTF-8".
Sys.setlocale(
  category = "LC_ALL",
  locale = if (.Platform$OS.type == "windows") "Turkish" else "tr_TR.UTF-8"
)
## [1] "LC_COLLATE=Turkish_Turkey.1254;LC_CTYPE=Turkish_Turkey.1254;LC_MONETARY=Turkish_Turkey.1254;LC_NUMERIC=C;LC_TIME=Turkish_Turkey.1254"

Extracting Tables From EPDK

Extracting and manipulating the tables by hand for 28 months (from January 2017 to April 2019) would take considerable effort, so writing functions is the best way to do this. The functions are displayed below. The target table is the 21st table in the reports of some months and the 22nd table in the others.

If you do not already have the data, the EPDK reports would have to be downloaded first. Instead, the table covering all months can be downloaded directly with the code below.

# Download the pre-built table of monthly electricity consumption per city.
# The connection is created explicitly so that it can be closed once the
# object has been read (or the read has failed); readRDS() does not close a
# connection that was handed to it already open.
data_connection <- gzcon(url("https://alkimcelik.github.io/Monthly_Cities_Electricity_Consumptions.rds"))
all_data_avg <- tryCatch(
  readRDS(data_connection),
  finally = close(data_connection)
)

Clustering

Cities are clustered with 6 different approaches.

# Build `group_numbers`: one row per city, the city name in the first column
# and one cluster-label column per clustering approach (5 k-means runs plus
# 1 hierarchical clustering, each with 7 groups).
# NOTE(review): columns 2:49 of all_data_avg are presumably 12 months x 4
# consumer types, so every fourth column belongs to the same consumer type
# -- confirm against the code that builds all_data_avg.
# Each k-means run is preceded by its own seed so that it is reproducible.

# Approach 1: k-means on all value columns (2:49).

set.seed(50)

group_number_all_data <- kmeans(all_data_avg[,c(2:49)],7)

# Keep only the cluster labels; the resulting column is named after the
# expression passed to as.data.frame().
group_number_all_data <- as.data.frame(group_number_all_data$cluster)

group_numbers <- group_number_all_data

# Put the city names in front of the labels.
group_numbers <- cbind(all_data_avg$Cities,group_numbers)

# Approach 2: k-means on the agricultural share columns (every 4th column
# starting at column 4).

set.seed(5)


group_number_agricultural <- kmeans(all_data_avg[,c(seq(4,49,4))],7)

group_number_agricultural <- as.data.frame(group_number_agricultural$cluster)

group_numbers <- cbind(group_numbers,group_number_agricultural)

# Approach 3: k-means on the dwelling share columns (every 4th column
# starting at column 2).


set.seed(1)


group_number_Dwelling <- kmeans(all_data_avg[,c(seq(2,49,4))],7)

group_number_Dwelling <- as.data.frame(group_number_Dwelling$cluster)

group_numbers <- cbind(group_numbers,group_number_Dwelling)

# Approach 4: k-means on the industry share columns (every 4th column
# starting at column 3).


set.seed(2)

group_number_Industry <- kmeans(all_data_avg[,c(seq(3,49,4))],7)

group_number_Industry <- as.data.frame(group_number_Industry$cluster)

group_numbers <- cbind(group_numbers,group_number_Industry)


# Approach 5: k-means on the businesses share columns (every 4th column
# starting at column 5).

set.seed(3)

group_number_Businesses <- kmeans(all_data_avg[,c(seq(5,49,4))],7)

group_number_Businesses <- as.data.frame(group_number_Businesses$cluster)

group_numbers <- cbind(group_numbers,group_number_Businesses)

# Use the city names as row names of the data.
row.names(all_data_avg) <- all_data_avg$Cities

# Approach 6: complete-linkage hierarchical clustering on all value columns,
# using dist() with its default settings.

# NOTE(review): hclust() does not appear to use random numbers, so this seed
# looks unnecessary -- confirm before removing.
set.seed(90)

cluster <- hclust(dist(all_data_avg[,c(2:49)]), method = "complete")

# Cut the tree into 7 groups, matching the k-means runs above.
cluster_cut <- cutree(cluster,7)

# A finer cut into 40 groups, stored next to the city names. It is not used
# again in the visible part of this report.
cluster_cut1 <- cutree(cluster,40)

cluster_cut1 <- as.data.frame(cluster_cut1)

cluster_cut1 <- cbind(all_data_avg$Cities,cluster_cut1)

# Append the 7-group hierarchical labels as the last column of group_numbers.
group_numbers <- cbind(group_numbers,as.data.frame(cluster_cut))

# cbind() named the first column after the expression that produced it;
# rename it to "Cities".
colnames(group_numbers)[colnames(group_numbers)=="all_data_avg$Cities"] <- "Cities"

Visualization of Hierarchical Clustering

# Turn the hierarchical clustering into a dendrogram, colour the leaf labels
# by the 7-group cut and shrink the label text to 70% of the default size.
dend <- cluster %>%
  as.dendrogram() %>%
  color_labels(
    k = 7,
    col = c(
      "black", "red", "blue", "brown", "darkolivegreen", "gold4", "gray27",
      "mediumvioletred", "lightsteelblue4", "seagreen", "slateblue4",
      "darkmagenta", "sienna4", "orangered3", "darkorchid4"
    )
  ) %>%
  set("labels_cex", .7)

# Replace the current leaf labels by looking them up in the city column.
labels(dend) <- all_data_avg$Cities[labels(dend)]

plot(dend, xlab = "Cities", main = "Cluster Dendrogram")

Constructing Similarity Matrix

The logic behind creating the similarity matrix is explained in detail elsewhere. It can be reached via this link.

# Build the city-by-city similarity matrix and reshape it to long format.
#
# Cell [a, b] in the upper triangle (a before b) counts how many of the 6
# clustering methods put cities a and b in the same cluster. The lower
# triangle stays 0 so that every pair is counted only once, and the diagonal
# is -1 to mark "same city".

# One row/column per city; the size is taken from the data instead of being
# hard-coded.
city_count <- nrow(group_numbers)

# Columns 2 to 7 of group_numbers hold the labels of the 6 clustering methods
# (column 1 is the city name).
method_columns <- 2:7

# For each method, outer() marks every pair of cities with equal labels;
# adding the six logical matrices gives the number of agreeing methods.
same_cluster_counts <- Reduce(
  `+`,
  lapply(method_columns, function(method) {
    memberships <- group_numbers[[method]]
    outer(memberships, memberships, "==")
  })
)

similarity_matrix <- matrix(0, nrow = city_count, ncol = city_count)

rownames(similarity_matrix) <- all_data_avg$Cities

colnames(similarity_matrix) <- all_data_avg$Cities

# Keep the counts for the upper triangle only.
upper_cells <- upper.tri(similarity_matrix)
similarity_matrix[upper_cells] <- same_cluster_counts[upper_cells]

# Cells equal -1 when the row and the column belong to the same city.
diag(similarity_matrix) <- -1

# Long format with columns Var1, Var2 and value (one row per matrix cell).
# Base R is used here because melt() on a matrix depends on the deprecated
# reshape2 redirection in data.table.
similarity_matrix <- as.data.frame(
  as.table(similarity_matrix),
  responseName = "value"
)

Choosing The Most Similar Cities

After the 6 clustering approaches are applied, the pairs of cities that fall in the same group under every approach are matched and displayed below.

# City pairs that share a cluster under all 6 clustering methods are the
# closest cities.
closest_cities <- similarity_matrix %>% filter(value == 6)

# The first two columns hold the two city names of each pair; join them as
# "CITY1-CITY2". The columns are read by position, so the three-column frame
# is never given only two column names.
closest_cities <- paste0(closest_cities[[1]], "-", closest_cities[[2]])

closest_cities
##  [1] "ARTVİN-BALIKESİR"    "ELAZIĞ-ERZİNCAN"     "BAYBURT-HAKKARİ"    
##  [4] "BAYBURT-IĞDIR"       "HAKKARİ-IĞDIR"       "DENİZLİ-KAYSERİ"    
##  [7] "ARTVİN-KIRIKKALE"    "BALIKESİR-KIRIKKALE" "BİLECİK-KIRKLARELİ" 
## [10] "BİLECİK-KOCAELİ"     "KIRKLARELİ-KOCAELİ"  "BURSA-KÜTAHYA"      
## [13] "ISPARTA-MALATYA"     "ESKİŞEHİR-MANİSA"    "AKSARAY-NEVŞEHİR"   
## [16] "MUŞ-ORDU"            "BİLECİK-OSMANİYE"    "KIRKLARELİ-OSMANİYE"
## [19] "KOCAELİ-OSMANİYE"    "MUŞ-RİZE"            "ORDU-RİZE"          
## [22] "BURSA-SAKARYA"       "KÜTAHYA-SAKARYA"     "BARTIN-SAMSUN"      
## [25] "ADANA-SİVAS"         "ARTVİN-SİİRT"        "BALIKESİR-SİİRT"    
## [28] "KIRIKKALE-SİİRT"     "BİLECİK-TEKİRDAĞ"    "KIRKLARELİ-TEKİRDAĞ"
## [31] "KOCAELİ-TEKİRDAĞ"    "OSMANİYE-TEKİRDAĞ"   "MUŞ-TOKAT"          
## [34] "ORDU-TOKAT"          "RİZE-TOKAT"          "GİRESUN-TRABZON"    
## [37] "UŞAK-ÇANAKKALE"      "AMASYA-ÇORUM"        "ADANA-İZMİR"        
## [40] "SİVAS-İZMİR"

Changing Date Index from Months to Seasons

Since a month-by-month classification cannot show clear and sufficient results, and taking the average of all months cannot give accurate results either, the classification is made on the average of the months in the same season. The time periods are therefore seasons. Finally, the tables arranged by season are merged.

# Average the three months of one season into one value per consumer type.
#
# season_table: the city column followed by the 12 value columns of the
#   season's three months (4 columns per month, same column order in each
#   month).
# season_name: label stored in the report_period column.
# Returns the table reduced to Cities, Dwelling, Businesses, Industry,
#   Agricultural_Irrigation and report_period.
average_season <- function(season_table, season_name) {
  # Columns 2-5, 6-9 and 10-13 are the three months; the same position within
  # each month is the same consumer type.
  season_table$Dwelling <- rowMeans(season_table[, c(2, 6, 10)])
  season_table$Businesses <- rowMeans(season_table[, c(5, 9, 13)])
  season_table$Industry <- rowMeans(season_table[, c(3, 7, 11)])
  season_table$Agricultural_Irrigation <- rowMeans(season_table[, c(4, 8, 12)])

  # Drop the monthly columns now that the seasonal means are stored.
  season_table[, c(2:13)] <- NULL

  season_table$report_period <- season_name
  season_table
}

# NOTE(review): the column ranges below presumably follow an alphabetical
# ordering of the month columns in all_data_avg -- confirm against the code
# that builds the table.
Summer <- average_season(all_data_avg[, c(1, 6:9, 22:29)], "Summer")

Fall <- average_season(all_data_avg[, c(1, 38:49)], "Fall")

Winter <- average_season(all_data_avg[, c(1, 10:21)], "Winter")

Spring <- average_season(all_data_avg[, c(1:5, 30:37)], "Spring")

# Stack the four seasons, then reshape to one row per city with one column
# per season/consumer-type combination.
all_data_terms <- rbind(Fall, Winter, Spring, Summer)

all_data_terms <- melt(all_data_terms, id.vars = c("Cities", "report_period"))

all_data_terms <- dcast(all_data_terms, Cities ~ report_period + variable)

Clustering and Classification by Terms and Features

Since classification needs labels, k-means clustering is applied first to produce them. After that, the classification is performed.

# Cities arranged in seasonal format are clustered with the k-means
# algorithm (7 groups) on the 16 season/consumer-type columns.

set.seed(7)

all_data_terms_kmeans <- kmeans(all_data_terms[, c(2:17)], 7)

all_data_terms_groups_number <- all_data_terms_kmeans$cluster

# Seasonal values plus the cluster label as the last (18th) column.
all_data_terms_groups <- cbind(all_data_terms, as.data.frame(all_data_terms_groups_number))

cities_of_classification <- all_data_terms_groups[, c(1, 18)]

# The city name must not be used as a predictor in the tree.
all_data_terms_groups$Cities <- NULL

# The clusters in seasonal format are classified: a tree is fitted that
# predicts the cluster label from the seasonal values.

set.seed(10)

classification_all_data_terms <- rpart(
  all_data_terms_groups_number ~ .,
  all_data_terms_groups,
  method = "class",
  control = rpart.control(minbucket = 5, cp = 0)
)

# Pair every label with the city of the row it was computed from. The labels
# follow the row order of all_data_terms, so the names are taken from that
# table and not from all_data_avg, whose row order is not guaranteed to match.
classification_summary <- cbind(all_data_terms$Cities, as.data.frame(all_data_terms_groups_number))

colnames(classification_summary) <- c("Cities", "Group_Number")

classification_summary$Cities <- as.character(classification_summary$Cities)

rpart.plot(
  classification_all_data_terms,
  type = 5,
  cex = .8,
  box.palette = list(
    "firebrick4", "dodgerblue4", "sienna4",
    "darkslategray4", "gray27", "orangered3", "rosybrown4"
  )
)

Demonstration of Clusters in Turkey Map

Since the spelling of the city names in the Turkey map does not match the city names in our data, character transformations are applied. Cities in the same cluster are displayed with the same color.

# Download the Turkey province map and rewrite its province names so that
# they match the spelling of the city names used in the consumption data.

# The connection is created explicitly so that it can be closed once the map
# has been read (or the read has failed).
map_connection <- gzcon(url("https://biogeo.ucdavis.edu/data/gadm3.6/Rsp/gadm36_TUR_1_sp.rds"))
turkey <- tryCatch(
  readRDS(map_connection),
  finally = close(map_connection)
)

colnames(turkey@data)[colnames(turkey@data) == "NAME_1"] <- "Cities"

# Step 1: replace every lower-case "g" with "ğ", then upper-case the names
# with Turkish casing rules.
map_city_names <- gsub("g", "ğ", turkey@data$Cities)
map_city_names <- stri_trans_toupper(map_city_names, locale = "tr")

# Step 2: correct the names that are still wrong after step 1. Each row is
# (pattern, replacement). The patterns are regular expressions and are
# applied in the order listed.
city_name_fixes <- matrix(
  c(
    "AFYON", "AFYONKARAHİSAR",
    "AĞRİ", "AĞRI",
    "ADİYAMAN", "ADIYAMAN",
    "BALİKESİR", "BALIKESİR",
    "BİNĞÖL", "BİNGÖL",
    "ÇANKİRİ", "ÇANKIRI",
    "DİYARBAKİR", "DİYARBAKIR",
    "ESKİSEHİR", "ESKİŞEHİR",
    "GÜMÜSHANE", "GÜMÜŞHANE",
    "K. MARAS", "KAHRAMANMARAŞ",
    "AYDİN", "AYDIN",
    "KİNKKALE", "KIRIKKALE",
    "KİRKLARELİ", "KIRKLARELİ",
    "KİRSEHİR", "KIRŞEHİR",
    "MUS", "MUŞ",
    "NEVSEHİR", "NEVŞEHİR",
    "ISTANBUL", "İSTANBUL",
    "IZMİR", "İZMİR",
    "USAK", "UŞAK",
    "ZİNĞULDAK", "ZONGULDAK",
    "SANLİURFA", "ŞANLIURFA",
    "SİRNAK", "ŞIRNAK",
    "YOZĞAT", "YOZGAT"
  ),
  ncol = 2,
  byrow = TRUE
)

for (fix_row in seq_len(nrow(city_name_fixes))) {
  map_city_names <- gsub(
    city_name_fixes[fix_row, 1],
    city_name_fixes[fix_row, 2],
    map_city_names
  )
}

turkey@data$Cities <- map_city_names

# Convert the map object into a plain data frame of polygon coordinates; the
# later steps rely on its id, long, lat and group columns.
# NOTE(review): fortify() on sp objects may be deprecated in newer ggplot2
# releases -- confirm before upgrading.
turkey_for <- fortify(turkey)

# Cluster numbers obtained from the classification are matched to the map's
# polygon ids through the city name. tibble() replaces the deprecated
# data_frame().
cities_and_id <- tibble(
  id = rownames(turkey@data),
  Cities = turkey@data$Cities
) %>%
  left_join(classification_summary, by = "Cities")

# Attach the cluster number to every polygon coordinate.
final_map <- left_join(turkey_for, cities_and_id, by = "id")

# Draw the provinces, filling each one with the colour of its cluster.
ggplot(final_map) +
  geom_polygon(
    aes(x = long, y = lat, group = group, fill = as.factor(Group_Number)),
    color = "white"
  ) +
  coord_map() +
  theme_void() +
  labs(title = "Clusters Shown in Turkey Map", fill = "Cluster Numbers") +
  scale_fill_manual(
    values = c(
      "firebrick4", "dodgerblue4", "sienna4",
      "darkslategray4", "gray27", "orangered3", "rosybrown4"
    )
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = .5),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 16)
  )

For any questions, you can send an e-mail to alkimcancelik33@gmail.com