Libraries used throughout the report are displayed below.
library(docxtractr) #Extracting tables from EPDK Reports
library(data.table) #Data manipulation
library(lubridate) #Setting Date for Consumption
library(dplyr) #Data manipulation
library(dendextend) #Dendrogram visualization
library(rpart) #Classification
library(rpart.plot) #Visualizatin of Classification
library(tidyverse) #Data manipulation
library(sp) #Manipulation for Cities' names on Turkey Map
library(stringr) #String Manipulation
library(stringi) #String Manipulation
library(mapproj) #Visualization of Turkey Map
Because the city names contain Turkish characters, the locale is set to Turkish.
# Switch every locale category to Turkish. "Turkish" is the Windows locale
# name; "tr_TR.UTF-8" is the usual name on Linux/macOS. Sys.setlocale()
# returns "" (with a warning) when a requested locale is unavailable, so the
# candidates are tried in order until one is accepted.
turkish_locale <- ""
for (locale_name in c("Turkish", "tr_TR.UTF-8")) {
  turkish_locale <- suppressWarnings(
    Sys.setlocale(category = "LC_ALL", locale = locale_name)
  )
  if (nzchar(turkish_locale)) {
    break
  }
}
if (!nzchar(turkish_locale)) {
  # Report the failure once, explicitly, instead of one warning per candidate.
  warning("No Turkish locale is available on this system.", call. = FALSE)
}
# Show the locale string that is now in effect ("" if nothing could be set).
turkish_locale
## [1] "LC_COLLATE=Turkish_Turkey.1254;LC_CTYPE=Turkish_Turkey.1254;LC_MONETARY=Turkish_Turkey.1254;LC_NUMERIC=C;LC_TIME=Turkish_Turkey.1254"
Since extracting and manipulating the tables by hand for 28 months (from January 2017 to April 2019) would take considerable effort, writing functions is the best way to do this. The functions are displayed below. The target table is the 21st table in the reports of some months; in the other months, it is the 22nd table.
Since the EPDK reports would otherwise have to be downloaded first, the table covering all months can be loaded directly with the code below if you do not have the data.
# Download the pre-built table of monthly electricity consumption per city.
# The URL connection is wrapped in gzcon() because the .rds file is compressed.
# The connection is closed explicitly once reading finishes (or fails) rather
# than being left for the garbage collector to clean up.
monthly_data_con <- gzcon(url("https://alkimcelik.github.io/Monthly_Cities_Electricity_Consumptions.rds"))
all_data_avg <- tryCatch(
  readRDS(monthly_data_con),
  finally = close(monthly_data_con)
)
Cities are clustered with 6 different approaches.
# ---- k-means clusterings (7 clusters each) ----
# Every run is seeded separately so that each partition is reproducible on
# its own. Column selections keep the c(...) wrapper of the original code
# (presumably so they also work if all_data_avg is a data.table - TODO confirm).

# Approach 1: all share columns (2 to 49)
set.seed(50)
group_number_all_data <- kmeans(all_data_avg[, c(2:49)], centers = 7)
group_number_all_data <- as.data.frame(group_number_all_data$cluster)

# Approach 2: agricultural share columns (every 4th column starting at 4)
set.seed(5)
group_number_agricultural <- kmeans(all_data_avg[, c(seq(4, 49, by = 4))], centers = 7)
group_number_agricultural <- as.data.frame(group_number_agricultural$cluster)

# Approach 3: dwelling share columns (every 4th column starting at 2)
set.seed(1)
group_number_Dwelling <- kmeans(all_data_avg[, c(seq(2, 49, by = 4))], centers = 7)
group_number_Dwelling <- as.data.frame(group_number_Dwelling$cluster)

# Approach 4: industry share columns (every 4th column starting at 3)
set.seed(2)
group_number_Industry <- kmeans(all_data_avg[, c(seq(3, 49, by = 4))], centers = 7)
group_number_Industry <- as.data.frame(group_number_Industry$cluster)

# Approach 5: businesses share columns (every 4th column starting at 5)
set.seed(3)
group_number_Businesses <- kmeans(all_data_avg[, c(seq(5, 49, by = 4))], centers = 7)
group_number_Businesses <- as.data.frame(group_number_Businesses$cluster)

# One row per city: the city name followed by one cluster-label column per
# approach, in the order the clusterings were run.
group_numbers <- cbind(
  all_data_avg$Cities,
  group_number_all_data,
  group_number_agricultural,
  group_number_Dwelling,
  group_number_Industry,
  group_number_Businesses
)
# Use the city names as row names of the consumption table.
row.names(all_data_avg) <- all_data_avg$Cities
# Hierarchical clustering (complete linkage on the default dist() metric)
# over all share columns. hclust() itself is deterministic; the seed is kept
# so the RNG state seen by later code is unchanged.
set.seed(90)
cluster <- hclust(dist(all_data_avg[, c(2:49)]), method = "complete")
# Cut the tree into 7 groups (used as the sixth clustering approach) and,
# separately, into 40 groups.
cluster_cut <- cutree(cluster, k = 7)
cluster_cut1 <- cutree(cluster, k = 40)
cluster_cut1 <- as.data.frame(cluster_cut1)
cluster_cut1 <- cbind(all_data_avg$Cities, cluster_cut1)
# Append the 7-group labels to the per-approach table and give the city
# column (named after the expression it was built from) a proper name.
group_numbers <- cbind(group_numbers, as.data.frame(cluster_cut))
colnames(group_numbers)[colnames(group_numbers) == "all_data_avg$Cities"] <- "Cities"
# Dendrogram with leaf labels coloured by the 7-group cut.
dend <- as.dendrogram(cluster)
dend <- color_labels(
  dend,
  k = 7,
  col = c(
    "black", "red", "blue", "brown", "darkolivegreen", "gold4", "gray27",
    "mediumvioletred", "lightsteelblue4", "seagreen", "slateblue4",
    "darkmagenta", "sienna4", "orangered3", "darkorchid4"
  )
)
dend <- set(dend, "labels_cex", .7)
# Label the leaves with city names. order.dendrogram() returns the original
# row index of every leaf, so the lookup works whether the leaves currently
# carry integer indices or character row names (indexing the city vector by
# character labels would yield NA).
labels(dend) <- as.character(all_data_avg$Cities)[order.dendrogram(dend)]
plot(dend, main = "Cluster Dendrogram", xlab = "Cities")
The logic behind creating the similarity matrix is explained in detail. It can be reached via this link.
# Build a city-by-city similarity matrix: cell [a, b] (a before b) counts in
# how many clustering approaches cities a and b share a cluster.
city_count <- nrow(group_numbers)
similarity_matrix <- matrix(0, nrow = city_count, ncol = city_count)
rownames(similarity_matrix) <- all_data_avg$Cities
colnames(similarity_matrix) <- all_data_avg$Cities

# Columns 2 to 7 of group_numbers hold the labels of the 6 clustering methods.
method_columns <- 2:7
# Only the upper triangle is filled, so every pair of cities is counted once.
pair_mask <- upper.tri(similarity_matrix)
for (method_col in method_columns) {
  method_labels <- group_numbers[[method_col]]
  # TRUE where the row city and the column city received the same label.
  same_cluster <- outer(method_labels, method_labels, "==")
  similarity_matrix[pair_mask] <- similarity_matrix[pair_mask] + same_cluster[pair_mask]
}
# Diagonal cells compare a city with itself; they are marked with -1.
diag(similarity_matrix) <- -1

# Reshape to long format (one row per cell, column by column) with columns
# Var1, Var2 and value. This is done in base R because melt() on a plain
# matrix relies on data.table forwarding the call to the reshape2 package,
# which this report does not load.
similarity_matrix <- data.frame(
  Var1 = rownames(similarity_matrix)[row(similarity_matrix)],
  Var2 = colnames(similarity_matrix)[col(similarity_matrix)],
  value = as.vector(similarity_matrix)
)
After applying the 6 clustering approaches, pairs of cities that fall in the same group under every approach are matched and displayed below.
# Closest cities: pairs that share a cluster under all 6 clustering methods,
# i.e. whose similarity count equals 6.
closest_pairs <- similarity_matrix %>% filter(value == 6)
# The first two columns of the long table hold the two city names. They are
# read by position, which avoids renaming a three-column table with only two
# names (that would leave the third column with an NA name).
closest_cities <- paste0(closest_pairs[[1]], "-", closest_pairs[[2]])
closest_cities
## [1] "ARTVİN-BALIKESİR" "ELAZIĞ-ERZİNCAN" "BAYBURT-HAKKARİ"
## [4] "BAYBURT-IĞDIR" "HAKKARİ-IĞDIR" "DENİZLİ-KAYSERİ"
## [7] "ARTVİN-KIRIKKALE" "BALIKESİR-KIRIKKALE" "BİLECİK-KIRKLARELİ"
## [10] "BİLECİK-KOCAELİ" "KIRKLARELİ-KOCAELİ" "BURSA-KÜTAHYA"
## [13] "ISPARTA-MALATYA" "ESKİŞEHİR-MANİSA" "AKSARAY-NEVŞEHİR"
## [16] "MUŞ-ORDU" "BİLECİK-OSMANİYE" "KIRKLARELİ-OSMANİYE"
## [19] "KOCAELİ-OSMANİYE" "MUŞ-RİZE" "ORDU-RİZE"
## [22] "BURSA-SAKARYA" "KÜTAHYA-SAKARYA" "BARTIN-SAMSUN"
## [25] "ADANA-SİVAS" "ARTVİN-SİİRT" "BALIKESİR-SİİRT"
## [28] "KIRIKKALE-SİİRT" "BİLECİK-TEKİRDAĞ" "KIRKLARELİ-TEKİRDAĞ"
## [31] "KOCAELİ-TEKİRDAĞ" "OSMANİYE-TEKİRDAĞ" "MUŞ-TOKAT"
## [34] "ORDU-TOKAT" "RİZE-TOKAT" "GİRESUN-TRABZON"
## [37] "UŞAK-ÇANAKKALE" "AMASYA-ÇORUM" "ADANA-İZMİR"
## [40] "SİVAS-İZMİR"
Since a month-by-month classification cannot show clear and sufficient results, and averaging over all months cannot give accurate results either, classification is made by averaging the months that fall in the same season. The time periods are therefore seasons. Finally, the tables arranged by season are merged.
# ---- Seasonal averages ----
# For each season, take the city column plus the 12 columns of its three
# months, average each consumer type across the months, drop the monthly
# columns and tag the rows with the season. Within a seasonal table the
# averaged positions are: Dwelling 2/6/10, Industry 3/7/11,
# Agricultural_Irrigation 4/8/12 and Businesses 5/9/13.

# Fall: columns 38 to 49
Fall <- all_data_avg[, c(1, 38:49)]
Fall$Dwelling <- rowMeans(Fall[, c(2, 6, 10)])
Fall$Businesses <- rowMeans(Fall[, c(5, 9, 13)])
Fall$Industry <- rowMeans(Fall[, c(3, 7, 11)])
Fall$Agricultural_Irrigation <- rowMeans(Fall[, c(4, 8, 12)])
Fall[, c(2:13)] <- NULL
Fall$report_period <- "Fall"

# Winter: columns 10 to 21
Winter <- all_data_avg[, c(1, 10:21)]
Winter$Dwelling <- rowMeans(Winter[, c(2, 6, 10)])
Winter$Businesses <- rowMeans(Winter[, c(5, 9, 13)])
Winter$Industry <- rowMeans(Winter[, c(3, 7, 11)])
Winter$Agricultural_Irrigation <- rowMeans(Winter[, c(4, 8, 12)])
Winter[, c(2:13)] <- NULL
Winter$report_period <- "Winter"

# Spring: columns 2 to 5 and 30 to 37
Spring <- all_data_avg[, c(1:5, 30:37)]
Spring$Dwelling <- rowMeans(Spring[, c(2, 6, 10)])
Spring$Businesses <- rowMeans(Spring[, c(5, 9, 13)])
Spring$Industry <- rowMeans(Spring[, c(3, 7, 11)])
Spring$Agricultural_Irrigation <- rowMeans(Spring[, c(4, 8, 12)])
Spring[, c(2:13)] <- NULL
Spring$report_period <- "Spring"

# Summer: columns 6 to 9 and 22 to 29
Summer <- all_data_avg[, c(1, 6:9, 22:29)]
Summer$Dwelling <- rowMeans(Summer[, c(2, 6, 10)])
Summer$Businesses <- rowMeans(Summer[, c(5, 9, 13)])
Summer$Industry <- rowMeans(Summer[, c(3, 7, 11)])
Summer$Agricultural_Irrigation <- rowMeans(Summer[, c(4, 8, 12)])
Summer[, c(2:13)] <- NULL
Summer$report_period <- "Summer"

# Stack the four seasonal tables, then reshape so that every city has one
# row and one column per season/consumer-type combination.
all_data_terms <- rbind(Fall, Winter, Spring, Summer)
all_data_terms <- melt(all_data_terms, id.vars = c("Cities", "report_period"))
all_data_terms <- dcast(all_data_terms, Cities ~ report_period + variable)
Since classification needs labels, k-means clustering is applied. After that, classification is made.
# Cluster the cities on the seasonal table (columns 2 to 17) with k-means,
# 7 clusters; the resulting labels serve as classes for the decision tree.
set.seed(7)
all_data_terms_kmeans <- kmeans(all_data_terms[, c(2:17)], centers = 7)
all_data_terms_groups_number <- all_data_terms_kmeans$cluster
# Attach the labels as an extra (18th) column of the seasonal table.
all_data_terms_groups <- cbind(
  all_data_terms,
  as.data.frame(all_data_terms_groups_number)
)
# Keep the city / cluster-label pairing, then drop the city names so that
# only the seasonal values and the label remain as model inputs.
cities_of_classification <- all_data_terms_groups[, c(1, 18)]
all_data_terms_groups$Cities <- NULL
# Fit a classification tree that predicts the k-means cluster label from the
# seasonal consumption columns (cp = 0, at least 5 observations per leaf).
set.seed(10)
classification_all_data_terms <- rpart(
  all_data_terms_groups_number ~ .,
  data = all_data_terms_groups,
  method = "class",
  control = rpart.control(minbucket = 5, cp = 0)
)
# Pair every cluster label with its city. The labels follow the row order of
# all_data_terms (the table that was clustered), so the city names are taken
# from that same table instead of assuming all_data_avg has identical order.
classification_summary <- cbind(
  all_data_terms$Cities,
  as.data.frame(all_data_terms_groups_number)
)
colnames(classification_summary) <- c("Cities", "Group_Number")
classification_summary$Cities <- as.character(classification_summary$Cities)
# Draw the tree; one box colour is supplied per cluster.
rpart.plot(
  classification_all_data_terms,
  type = 5,
  cex = .8,
  box.palette = list(
    "firebrick4", "dodgerblue4", "sienna4",
    "darkslategray4", "gray27", "orangered3", "rosybrown4"
  )
)
Since the spellings of the city names in the Turkey map do not match the city names in our data, character transformations are applied. Cities in the same cluster are displayed with the same color.
# Download the Turkey map object. The connection is closed explicitly once
# reading finishes (or fails) rather than being left for garbage collection.
turkey_con <- gzcon(url("https://biogeo.ucdavis.edu/data/gadm3.6/Rsp/gadm36_TUR_1_sp.rds"))
turkey <- tryCatch(readRDS(turkey_con), finally = close(turkey_con))
# The NAME_1 column is renamed to match the column name used in our data.
colnames(turkey@data)[colnames(turkey@data) == "NAME_1"] <- "Cities"
# First pass: turn every lowercase "g" into "ğ", then upper-case the names
# with Turkish casing rules. Names where this goes wrong are repaired below.
turkey@data$Cities <- gsub("g", "ğ", turkey@data$Cities)
turkey@data$Cities <- stri_trans_toupper(turkey@data$Cities, locale = "tr")
# Second pass: remaining spelling differences, as (pattern, replacement)
# pairs. They are applied one after another, in this order, with gsub().
city_name_fixes <- rbind(
  c("AFYON", "AFYONKARAHİSAR"),
  c("AĞRİ", "AĞRI"),
  c("ADİYAMAN", "ADIYAMAN"),
  c("BALİKESİR", "BALIKESİR"),
  c("BİNĞÖL", "BİNGÖL"),
  c("ÇANKİRİ", "ÇANKIRI"),
  c("DİYARBAKİR", "DİYARBAKIR"),
  c("ESKİSEHİR", "ESKİŞEHİR"),
  c("GÜMÜSHANE", "GÜMÜŞHANE"),
  c("K. MARAS", "KAHRAMANMARAŞ"),
  c("AYDİN", "AYDIN"),
  c("KİNKKALE", "KIRIKKALE"),
  c("KİRKLARELİ", "KIRKLARELİ"),
  c("KİRSEHİR", "KIRŞEHİR"),
  c("MUS", "MUŞ"),
  c("NEVSEHİR", "NEVŞEHİR"),
  c("ISTANBUL", "İSTANBUL"),
  c("IZMİR", "İZMİR"),
  c("USAK", "UŞAK"),
  c("ZİNĞULDAK", "ZONGULDAK"),
  c("SANLİURFA", "ŞANLIURFA"),
  c("SİRNAK", "ŞIRNAK"),
  c("YOZĞAT", "YOZGAT")
)
for (fix_row in seq_len(nrow(city_name_fixes))) {
  turkey@data$Cities <- gsub(
    city_name_fixes[fix_row, 1],
    city_name_fixes[fix_row, 2],
    turkey@data$Cities
  )
}
# Convert the map object into a plain table of polygon coordinates.
turkey_for <- fortify(turkey)
# Look-up table from polygon id (the row names of the map's attribute table)
# to city name, extended with the cluster number of each city. tibble() is
# used because data_frame() is deprecated.
cities_and_id <- tibble(
  id = rownames(turkey@data),
  Cities = turkey@data$Cities
) %>%
  left_join(classification_summary, by = "Cities")
# Attach city names and cluster numbers to every polygon coordinate row.
final_map <- left_join(turkey_for, cities_and_id, by = "id")
# Draw the map: one polygon per region, filled by cluster number, with the
# same seven colours that are used for the classification tree boxes.
cluster_colours <- c(
  "firebrick4", "dodgerblue4", "sienna4",
  "darkslategray4", "gray27", "orangered3", "rosybrown4"
)
ggplot(
  final_map,
  aes(x = long, y = lat, group = group, fill = as.factor(Group_Number))
) +
  geom_polygon(color = "white") +
  coord_map() +
  theme_void() +
  scale_fill_manual(values = cluster_colours) +
  labs(title = "Clusters Shown in Turkey Map", fill = "Cluster Numbers") +
  theme(
    plot.title = element_text(size = 20, hjust = .5),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 16)
  )
For any questions, you can send an e-mail to alkimcancelik33@gmail.com