# loading packages
import os
import pandas as pd
import numpy as np
# plotting packages
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as clrs
# Kmeans algorithm from scikit-learn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
# load raw data
DATA_FOLDER = './'
raw = pd.read_csv(os.path.join(DATA_FOLDER, 'country risk 2019 data.csv'))
# check the raw data
print("Size of the dataset (row, col): ", raw.shape)
print("\nFirst 5 rows\n", raw.head(n=5))
Size of the dataset (row, col): (121, 6) First 5 rows Country Abbrev Corruption Peace Legal GDP Growth 0 Albania AL 35 1.821 4.546 2.983 1 Algeria DZ 35 2.219 4.435 2.553 2 Argentina AR 45 1.989 5.087 -3.061 3 Armenia AM 42 2.294 4.812 6.000 4 Australia AU 77 1.419 8.363 1.713
# print summary statistics
print("\nSummary statistics\n", raw.describe())
print("\nCorrelation matrix\n", raw.corr())
Summary statistics Corruption Peace Legal GDP Growth count 121.000000 121.000000 121.000000 121.000000 mean 46.842975 2.001017 5.752529 2.657529 std 18.702499 0.461485 1.373932 2.563741 min 15.000000 1.072000 2.671000 -9.459000 25% 33.000000 1.699000 4.785000 1.249000 50% 41.000000 1.939000 5.455000 2.600000 75% 60.000000 2.294000 6.488000 4.000000 max 87.000000 3.369000 8.712000 7.800000 Correlation matrix Corruption Peace Legal GDP Growth Corruption 1.000000 -0.705002 0.938512 -0.123545 Peace -0.705002 1.000000 -0.662233 -0.004428 Legal 0.938512 -0.662233 1.000000 -0.150369 GDP Growth -0.123545 -0.004428 -0.150369 1.000000
Note that distributions for GDP Growth is quite skewed.
# plot histograms
plt.figure(1)
raw['Corruption'].plot(kind = 'hist', title = 'Corruption', alpha = 0.5)
plt.figure(2)
raw['Peace'].plot(kind = 'hist', title = 'Peace', alpha = 0.5)
plt.figure(3)
raw['Legal'].plot(kind = 'hist', title = 'Legal', alpha = 0.5)
plt.figure(4)
raw['GDP Growth'].plot(kind = 'hist', title = 'GDP Growth', alpha = 0.5)
plt.show()
X = raw[['Peace', 'Legal', 'GDP Growth']]
X = (X - X.mean()) / X.std()
print(X.head(5))
Peace Legal GDP Growth 0 -0.390081 -0.878158 0.126952 1 0.472352 -0.958948 -0.040772 2 -0.026039 -0.484397 -2.230541 3 0.634871 -0.684553 1.303747 4 -1.261182 1.900001 -0.368418
The marginal gain of adding one cluster dropped quite a bit from k=3 to k=4. We will choose k=3 (not a clear cut though).
# https://stackoverflow.com/questions/41540751/sklearn-kmeans-equivalent-of-elbow-method
Ks = range(1, 10)
inertia = [KMeans(i).fit(X).inertia_ for i in Ks]
fig = plt.figure()
plt.plot(Ks, inertia, '-bo')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.show()
k = 3
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(X)
# print inertia & cluster center
print("inertia for k=3 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)
# take a quick look at the result
y = kmeans.labels_
print("cluster labels: ", y)
inertia for k=3 is 161.40859912320002 cluster centers: [[ 1.35385404 -0.83412368 -1.11084884] [ 0.2194138 -0.55102769 0.60381864] [-0.85097477 1.02149992 -0.23897931]] cluster labels: [1 1 0 1 2 2 1 1 1 2 1 1 1 2 0 1 0 1 2 0 2 1 1 2 1 2 2 0 2 1 0 1 1 2 1 2 2 1 1 2 1 1 1 1 2 2 1 1 0 2 1 2 2 2 2 1 1 2 2 2 0 1 2 1 1 2 1 1 2 0 1 1 1 1 1 2 2 0 0 2 2 0 1 1 1 1 2 2 2 2 0 1 0 1 1 1 2 2 2 0 2 1 2 2 2 1 1 1 0 1 0 1 0 2 2 2 2 1 0 1 0]
# set up the color
norm = clrs.Normalize(vmin=0.,vmax=y.max() + 0.8)
cmap = cm.viridis
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X.iloc[:,0], X.iloc[:,1], X.iloc[:,2], c=cmap(norm(y)), marker='o')
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=100, alpha=0.5)
ax.set_xlabel('Peace')
ax.set_ylabel('Legal')
ax.set_zlabel('GDP Growth')
plt.show()
%matplotlib inline
import matplotlib.pyplot as plt
figs = [(0, 1), (0, 2), (1, 2)]
labels = ['Peace', 'Legal', 'GDP Growth']
for i in range(3):
fig = plt.figure(i)
plt.scatter(X.iloc[:,figs[i][0]], X.iloc[:,figs[i][1]], c=cmap(norm(y)), s=50)
plt.scatter(centers[:, figs[i][0]], centers[:, figs[i][1]], c='black', s=200, alpha=0.5)
plt.xlabel(labels[figs[i][0]])
plt.ylabel(labels[figs[i][1]])
plt.show()
plot country abbreviations instead of dots.
%matplotlib inline
import matplotlib.pyplot as plt
figs = [(0, 1), (0, 2), (1, 2)]
labels = ['Peace', 'Legal', 'GDP Growth']
colors = ['blue','green', 'red']
for i in range(3):
fig = plt.figure(i, figsize=(8, 8))
x_1 = figs[i][0]
x_2 = figs[i][1]
plt.scatter(X.iloc[:, x_1], X.iloc[:, x_2], c=y, s=0, alpha=0)
plt.scatter(centers[:, x_1], centers[:, x_2], c='black', s=200, alpha=0.5)
for j in range(X.shape[0]):
plt.text(X.iloc[j, x_1], X.iloc[j, x_2], raw['Abbrev'].iloc[j],
color=colors[y[j]], weight='semibold', horizontalalignment = 'center', verticalalignment = 'center')
plt.xlabel(labels[x_1])
plt.ylabel(labels[x_2])
plt.show()
result = pd.DataFrame({'Country':raw['Country'], 'Abbrev':raw['Abbrev'], 'Label':y})
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
print(result.sort_values('Label'))
Country Abbrev Label 60 Lebanon LB 0 118 Yemen YE 0 112 Ukraine UA 0 110 Turkey TR 0 108 Trinidad and Tobago TT 0 99 South Africa ZA 0 92 Saudi Arabia SA 0 90 Russia RU 0 81 Pakistan PK 0 78 Nigeria NG 0 77 Nicaragua NI 0 69 Mexico MX 0 48 Iran IR 0 30 Ecuador EC 0 27 Democratic Republic of Congo CD 0 19 Chad TD 0 120 Zimbabwe ZW 0 2 Argentina AR 0 16 Burundi BI 0 14 Brazil BR 0 17 Cameroon CM 1 8 Bangladesh BD 1 10 Benin BJ 1 74 Nepal NP 1 73 Mozambique MZ 1 72 Morocco MA 1 71 Montenegro ME 1 70 Moldova FM 1 11 Bolivia BO 1 67 Mauritania MR 1 66 Mali ML 1 64 Malawi MW 1 63 Madagascar MG 1 61 Liberia LR 1 82 Panama PA 1 83 Paraguay PY 1 85 Philippines PH 1 119 Zambia ZM 1 1 Algeria DZ 1 117 Vietnam VI 1 111 Uganda UG 1 3 Armenia AM 1 109 Tunisia TN 1 107 The FYR of Macedonia MK 1 106 Thailand TH 1 105 Tanzania TZ 1 101 Sri Lanka LK 1 95 Sierra Leone SL 1 94 Serbia RS 1 93 Senegal SN 1 6 Azerbaijan AZ 1 91 Rwanda RW 1 7 Bahrain BH 1 84 Peru PE 1 56 Kenya KE 1 0 Albania AL 1 24 Croatia HR 1 32 El Salvador SV 1 34 Ethiopia ET 1 55 Kazakhstan KZ 1 37 Gabon GA 1 38 Georgia GE 1 40 Ghana GH 1 31 Egypt EG 1 41 Greece GR 1 42 Guatemala GT 1 43 Honduras HN 1 46 India IN 1 29 Dominican Republic DO 1 12 Bosnia and Herzegovina BA 1 22 Colombia CO 1 21 China CN 1 50 Israel IL 1 15 Bulgaria BG 1 47 Indonesia ID 1 18 Canada CA 2 96 Singapore SG 2 97 Slovakia SK 2 116 Uruguay UY 2 98 Slovenia SI 2 5 Austria AT 2 100 Spain ES 2 28 Denmark DK 2 102 Sweden SE 2 114 United Kingdom GB 2 103 Switzerland CH 2 104 Taiwan SY 2 26 Czech Republic CZ 2 113 United Arab Emirates AE 2 25 Cyprus CY 2 20 Chile CL 2 4 Australia AU 2 23 Costa Rica CR 2 115 United States US 2 54 Jordan JO 2 88 Qatar QA 2 33 Estonia EE 2 57 Korea (South) KI 2 58 Kuwait KW 2 59 Latvia LV 2 53 Japan JP 2 52 Jamaica JM 2 62 Lithuania LT 2 51 Italy IT 2 65 Malaysia MY 2 49 Ireland IE 2 45 Iceland IS 2 13 Botswana BW 2 44 Hungary HU 2 76 New Zealand NZ 2 9 Belgium BE 2 79 Norway NO 2 80 Oman OM 2 39 Germany DE 2 36 France FR 2 35 Finland FI 2 86 Poland PL 2 87 Portugal PT 2 89 Romania RO 2 75 Netherlands NL 2 68 Mauritius MU 2
# Silhouette Analysis
range_n_clusters=[2,3,4,5,6,7,8,9,10]
for n_clusters in range_n_clusters:
clusterer=KMeans(n_clusters=n_clusters, random_state=1)
cluster_labels=clusterer.fit_predict(X)
silhouette_avg=silhouette_score(X,cluster_labels)
print("For n_clusters=", n_clusters,
"The average silhouette_score is :", silhouette_avg)
For n_clusters= 2 The average silhouette_score is : 0.350913952385216 For n_clusters= 3 The average silhouette_score is : 0.36018818321478785 For n_clusters= 4 The average silhouette_score is : 0.3396528908669481 For n_clusters= 5 The average silhouette_score is : 0.3443842097739338 For n_clusters= 6 The average silhouette_score is : 0.34771309340459694 For n_clusters= 7 The average silhouette_score is : 0.35497852344070874 For n_clusters= 8 The average silhouette_score is : 0.3554792385881377 For n_clusters= 9 The average silhouette_score is : 0.3322195951047023 For n_clusters= 10 The average silhouette_score is : 0.3424328346243841