# loading packages

import os

import pandas as pd
import numpy as np

# plotting packages
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as clrs

# Kmeans algorithm from scikit-learn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score


# load raw data
DATA_FOLDER = './'
raw = pd.read_csv(os.path.join(DATA_FOLDER, 'country risk 2019 data.csv'))

# check the raw data
print("Size of the dataset (row, col): ", raw.shape)
print("\nFirst 5 rows\n", raw.head(n=5))

Size of the dataset (row, col):  (121, 6)

First 5 rows
      Country Abbrev  Corruption  Peace  Legal  GDP Growth
0    Albania     AL          35  1.821  4.546       2.983
1    Algeria     DZ          35  2.219  4.435       2.553
2  Argentina     AR          45  1.989  5.087      -3.061
3    Armenia     AM          42  2.294  4.812       6.000
4  Australia     AU          77  1.419  8.363       1.713


# print summary statistics
print("\nSummary statistics\n", raw.describe())
print("\nCorrelation matrix\n", raw.corr())

Summary statistics
        Corruption       Peace       Legal  GDP Growth
count  121.000000  121.000000  121.000000  121.000000
mean    46.842975    2.001017    5.752529    2.657529
std     18.702499    0.461485    1.373932    2.563741
min     15.000000    1.072000    2.671000   -9.459000
25%     33.000000    1.699000    4.785000    1.249000
50%     41.000000    1.939000    5.455000    2.600000
75%     60.000000    2.294000    6.488000    4.000000
max     87.000000    3.369000    8.712000    7.800000

Correlation matrix
             Corruption     Peace     Legal  GDP Growth
Corruption    1.000000 -0.705002  0.938512   -0.123545
Peace        -0.705002  1.000000 -0.662233   -0.004428
Legal         0.938512 -0.662233  1.000000   -0.150369
GDP Growth   -0.123545 -0.004428 -0.150369    1.000000


# plot histograms
plt.figure(1)
raw['Corruption'].plot(kind = 'hist', title = 'Corruption', alpha = 0.5)

plt.figure(2)
raw['Peace'].plot(kind = 'hist', title = 'Peace', alpha = 0.5)

plt.figure(3)
raw['Legal'].plot(kind = 'hist', title = 'Legal', alpha = 0.5)

plt.figure(4)
raw['GDP Growth'].plot(kind = 'hist', title = 'GDP Growth', alpha = 0.5)

plt.show()


X = raw[['Peace', 'Legal', 'GDP Growth']]
X = (X - X.mean()) / X.std()
print(X.head(5))

      Peace     Legal  GDP Growth
0 -0.390081 -0.878158    0.126952
1  0.472352 -0.958948   -0.040772
2 -0.026039 -0.484397   -2.230541
3  0.634871 -0.684553    1.303747
4 -1.261182  1.900001   -0.368418


# https://stackoverflow.com/questions/41540751/sklearn-kmeans-equivalent-of-elbow-method

Ks = range(1, 10)
inertia = [KMeans(i).fit(X).inertia_ for i in Ks]

fig = plt.figure()
plt.plot(Ks, inertia, '-bo')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.show()


k = 3
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(X)

# print inertia & cluster center
print("inertia for k=3 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)

# take a quick look at the result
y = kmeans.labels_
print("cluster labels: ", y)

inertia for k=3 is 161.40859912320002
cluster centers:  [[ 1.35385404 -0.83412368 -1.11084884]
 [ 0.2194138  -0.55102769  0.60381864]
 [-0.85097477  1.02149992 -0.23897931]]
cluster labels:  [1 1 0 1 2 2 1 1 1 2 1 1 1 2 0 1 0 1 2 0 2 1 1 2 1 2 2 0 2 1 0 1 1 2 1 2 2
 1 1 2 1 1 1 1 2 2 1 1 0 2 1 2 2 2 2 1 1 2 2 2 0 1 2 1 1 2 1 1 2 0 1 1 1 1
 1 2 2 0 0 2 2 0 1 1 1 1 2 2 2 2 0 1 0 1 1 1 2 2 2 0 2 1 2 2 2 1 1 1 0 1 0
 1 0 2 2 2 2 1 0 1 0]


# set up the color
norm = clrs.Normalize(vmin=0.,vmax=y.max() + 0.8)
cmap = cm.viridis

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X.iloc[:,0], X.iloc[:,1], X.iloc[:,2], c=cmap(norm(y)), marker='o')

centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=100, alpha=0.5)

ax.set_xlabel('Peace')
ax.set_ylabel('Legal')
ax.set_zlabel('GDP Growth')

plt.show()


%matplotlib inline
import matplotlib.pyplot as plt

figs = [(0, 1), (0, 2), (1, 2)]
labels = ['Peace', 'Legal', 'GDP Growth']

for i in range(3):
    fig = plt.figure(i)
    plt.scatter(X.iloc[:,figs[i][0]], X.iloc[:,figs[i][1]], c=cmap(norm(y)), s=50)
    plt.scatter(centers[:, figs[i][0]], centers[:, figs[i][1]], c='black', s=200, alpha=0.5)
    plt.xlabel(labels[figs[i][0]])
    plt.ylabel(labels[figs[i][1]])

plt.show()


%matplotlib inline
import matplotlib.pyplot as plt

figs = [(0, 1), (0, 2), (1, 2)]
labels = ['Peace', 'Legal', 'GDP Growth']
colors = ['blue','green', 'red']

for i in range(3):
    fig = plt.figure(i, figsize=(8, 8))
    x_1 = figs[i][0]
    x_2 = figs[i][1]
    plt.scatter(X.iloc[:, x_1], X.iloc[:, x_2], c=y, s=0, alpha=0)
    plt.scatter(centers[:, x_1], centers[:, x_2], c='black', s=200, alpha=0.5)
    for j in range(X.shape[0]):
        plt.text(X.iloc[j, x_1], X.iloc[j, x_2], raw['Abbrev'].iloc[j], 
                 color=colors[y[j]], weight='semibold', horizontalalignment = 'center', verticalalignment = 'center')
    plt.xlabel(labels[x_1])
    plt.ylabel(labels[x_2])

plt.show()


result = pd.DataFrame({'Country':raw['Country'], 'Abbrev':raw['Abbrev'], 'Label':y})
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(result.sort_values('Label'))

                          Country Abbrev  Label
60                        Lebanon     LB      0
118                         Yemen     YE      0
112                       Ukraine     UA      0
110                        Turkey     TR      0
108           Trinidad and Tobago     TT      0
99                   South Africa     ZA      0
92                   Saudi Arabia     SA      0
90                         Russia     RU      0
81                       Pakistan     PK      0
78                        Nigeria     NG      0
77                      Nicaragua     NI      0
69                         Mexico     MX      0
48                           Iran     IR      0
30                        Ecuador     EC      0
27   Democratic Republic of Congo     CD      0
19                           Chad     TD      0
120                      Zimbabwe     ZW      0
2                       Argentina     AR      0
16                        Burundi     BI      0
14                         Brazil     BR      0
17                       Cameroon     CM      1
8                      Bangladesh     BD      1
10                          Benin     BJ      1
74                          Nepal     NP      1
73                     Mozambique     MZ      1
72                        Morocco     MA      1
71                     Montenegro     ME      1
70                        Moldova     FM      1
11                        Bolivia     BO      1
67                     Mauritania     MR      1
66                           Mali     ML      1
64                         Malawi     MW      1
63                     Madagascar     MG      1
61                        Liberia     LR      1
82                         Panama     PA      1
83                       Paraguay     PY      1
85                    Philippines     PH      1
119                        Zambia     ZM      1
1                         Algeria     DZ      1
117                       Vietnam     VI      1
111                        Uganda     UG      1
3                         Armenia     AM      1
109                       Tunisia     TN      1
107          The FYR of Macedonia     MK      1
106                      Thailand     TH      1
105                      Tanzania     TZ      1
101                     Sri Lanka     LK      1
95                   Sierra Leone     SL      1
94                         Serbia     RS      1
93                        Senegal     SN      1
6                      Azerbaijan     AZ      1
91                         Rwanda     RW      1
7                         Bahrain     BH      1
84                           Peru     PE      1
56                          Kenya     KE      1
0                         Albania     AL      1
24                        Croatia     HR      1
32                    El Salvador     SV      1
34                       Ethiopia     ET      1
55                     Kazakhstan     KZ      1
37                          Gabon     GA      1
38                        Georgia     GE      1
40                          Ghana     GH      1
31                          Egypt     EG      1
41                         Greece     GR      1
42                      Guatemala     GT      1
43                       Honduras     HN      1
46                          India     IN      1
29             Dominican Republic     DO      1
12         Bosnia and Herzegovina     BA      1
22                       Colombia     CO      1
21                          China     CN      1
50                         Israel     IL      1
15                       Bulgaria     BG      1
47                      Indonesia     ID      1
18                         Canada     CA      2
96                      Singapore     SG      2
97                       Slovakia     SK      2
116                       Uruguay     UY      2
98                       Slovenia     SI      2
5                         Austria     AT      2
100                         Spain     ES      2
28                        Denmark     DK      2
102                        Sweden     SE      2
114                United Kingdom     GB      2
103                   Switzerland     CH      2
104                        Taiwan     SY      2
26                 Czech Republic     CZ      2
113          United Arab Emirates     AE      2
25                         Cyprus     CY      2
20                          Chile     CL      2
4                       Australia     AU      2
23                     Costa Rica     CR      2
115                 United States     US      2
54                         Jordan     JO      2
88                          Qatar     QA      2
33                        Estonia     EE      2
57                  Korea (South)     KI      2
58                         Kuwait     KW      2
59                         Latvia     LV      2
53                          Japan     JP      2
52                        Jamaica     JM      2
62                      Lithuania     LT      2
51                          Italy     IT      2
65                       Malaysia     MY      2
49                        Ireland     IE      2
45                        Iceland     IS      2
13                       Botswana     BW      2
44                        Hungary     HU      2
76                    New Zealand     NZ      2
9                         Belgium     BE      2
79                         Norway     NO      2
80                           Oman     OM      2
39                        Germany     DE      2
36                         France     FR      2
35                        Finland     FI      2
86                         Poland     PL      2
87                       Portugal     PT      2
89                        Romania     RO      2
75                    Netherlands     NL      2
68                      Mauritius     MU      2


# Silhouette Analysis
range_n_clusters=[2,3,4,5,6,7,8,9,10]
for n_clusters in range_n_clusters:
    clusterer=KMeans(n_clusters=n_clusters, random_state=1)
    cluster_labels=clusterer.fit_predict(X)
    silhouette_avg=silhouette_score(X,cluster_labels)
    print("For n_clusters=", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

For n_clusters= 2 The average silhouette_score is : 0.350913952385216
For n_clusters= 3 The average silhouette_score is : 0.36018818321478785
For n_clusters= 4 The average silhouette_score is : 0.3396528908669481
For n_clusters= 5 The average silhouette_score is : 0.3443842097739338
For n_clusters= 6 The average silhouette_score is : 0.34771309340459694
For n_clusters= 7 The average silhouette_score is : 0.35497852344070874
For n_clusters= 8 The average silhouette_score is : 0.3554792385881377
For n_clusters= 9 The average silhouette_score is : 0.3322195951047023
For n_clusters= 10 The average silhouette_score is : 0.3424328346243841

Load raw data¶

Simple exploratory analysis¶

Print summary statistics¶

Plot histogram¶

K means cluster¶

Pick features & normalization¶

Perform elbow method¶

K means with k=3¶

Visualize the result (3D plot)¶

Visualize the result (3 2D plots)¶

Visualize the result (3 2D plots)¶

List the result¶