Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit09f1973

Browse files
authored
Clustering using PyGAD
Data clustering using the genetic algorithm. This example uses 3 clusters with artificial (non-real) data.
1 parent ece8d80, commit 09f1973

File tree

1 file changed

+134
-0
lines changed

1 file changed

+134
-0
lines changed

‎example_clustering_3.py

Lines changed: 134 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,134 @@
1+
import matplotlib.pyplot
import numpy
import pygad
4+
5+
def _uniform_samples(num_samples, start, end):
    """Draw ``num_samples`` values from ``numpy.random.random`` scaled to [start, end)."""
    return numpy.random.random(size=num_samples) * (end - start) + start


# Cluster 1: 20 artificial samples inside the box x1 in [0, 5), x2 in [2, 6).
cluster1_num_samples = 20
cluster1_x1_start = 0
cluster1_x1_end = 5
cluster1_x2_start = 2
cluster1_x2_end = 6
cluster1_x1 = _uniform_samples(cluster1_num_samples, cluster1_x1_start, cluster1_x1_end)
cluster1_x2 = _uniform_samples(cluster1_num_samples, cluster1_x2_start, cluster1_x2_end)

# Cluster 2: 20 artificial samples inside the box x1 in [4, 12), x2 in [14, 18).
cluster2_num_samples = 20
cluster2_x1_start = 4
cluster2_x1_end = 12
cluster2_x2_start = 14
cluster2_x2_end = 18
cluster2_x1 = _uniform_samples(cluster2_num_samples, cluster2_x1_start, cluster2_x1_end)
cluster2_x2 = _uniform_samples(cluster2_num_samples, cluster2_x2_start, cluster2_x2_end)

# Cluster 3: 20 artificial samples inside the box x1 in [12, 18), x2 in [8, 11).
cluster3_num_samples = 20
cluster3_x1_start = 12
cluster3_x1_end = 18
cluster3_x2_start = 8
cluster3_x2_end = 11
cluster3_x1 = _uniform_samples(cluster3_num_samples, cluster3_x1_start, cluster3_x1_end)
cluster3_x2 = _uniform_samples(cluster3_num_samples, cluster3_x2_start, cluster3_x2_end)

# Pair the two features of every sample: each cN has one row per sample
# and one column per feature.
c1 = numpy.array([cluster1_x1, cluster1_x2]).T
c2 = numpy.array([cluster2_x1, cluster2_x2]).T
c3 = numpy.array([cluster3_x1, cluster3_x2]).T

# Stack the three clusters row-wise into the single data set used for clustering.
data = numpy.concatenate((c1, c2, c3), axis=0)

# Plot the samples grouped by the cluster they were generated for, as a
# reference to compare the genetic algorithm's result against.
matplotlib.pyplot.scatter(cluster1_x1, cluster1_x2)
matplotlib.pyplot.scatter(cluster2_x1, cluster2_x2)
matplotlib.pyplot.scatter(cluster3_x1, cluster3_x2)
matplotlib.pyplot.title("Optimal Clustering")
matplotlib.pyplot.show()
46+
47+
def euclidean_distance(X, Y):
    """
    Calculate the euclidean distance between each row of X and the sample Y.

    :X should be a matrix of size (N, f) where N is the number of samples and
       f is the number of features for each sample.
    :Y should be of size f. In other words, it is a single sample.

    Both arguments may be NumPy arrays or plain nested sequences; they are
    converted with numpy.asarray before the subtraction.

    Returns a vector of N elements with the distances between the N samples
    and Y.
    """
    # asarray is a no-op for arrays and lets plain lists broadcast like arrays.
    differences = numpy.asarray(X) - numpy.asarray(Y)

    # Sum the squared per-feature differences along the feature axis.
    return numpy.sqrt(numpy.sum(numpy.power(differences, 2), axis=1))
57+
58+
def cluster_data(solution, solution_idx):
    """
    Clusters the data based on the current solution.

    The solution is read as num_clusters consecutive chunks of
    feature_vector_length genes; each chunk is one cluster center. Every
    sample of the module-level ``data`` is assigned to the center it is
    closest to according to euclidean_distance().

    :solution is the flat sequence of genes holding the cluster centers.
    :solution_idx is not used by this function.

    Returns a tuple of:
      cluster_centers: array of size (C, f), C clusters by f features.
      all_clusters_dists: array of size (C, N) with the distance between
          each cluster center and each of the N data samples.
      cluster_indices: 1D array with, for each sample, the index of the
          cluster with the smallest distance.
      clusters: list with C elements, each holding the indices of the
          samples within that cluster.
      clusters_sum_dist: array with C elements, each being the sum of the
          distances between a cluster's samples and its center.
    """
    # num_clusters, feature_vector_length and data are module-level names that
    # are only read here, so no `global` declaration is required.
    cluster_centers = numpy.array([
        solution[feature_vector_length * clust_idx:feature_vector_length * (clust_idx + 1)]
        for clust_idx in range(num_clusters)
    ])

    # One row per cluster center: its distance to every sample.
    all_clusters_dists = numpy.array([
        numpy.array(euclidean_distance(data, center))
        for center in cluster_centers
    ])

    # For each sample, the index of the nearest cluster center.
    cluster_indices = numpy.argmin(all_clusters_dists, axis=0)

    clusters = [
        numpy.where(cluster_indices == clust_idx)[0]
        for clust_idx in range(num_clusters)
    ]

    # Summing over an empty selection yields 0, so a cluster with no samples
    # contributes a distance of 0 without needing a special case.
    clusters_sum_dist = numpy.array([
        numpy.sum(all_clusters_dists[clust_idx, members])
        for clust_idx, members in enumerate(clusters)
    ])

    return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist
96+
97+
def fitness_func(solution, solution_idx):
    """
    Fitness of a solution: the reciprocal of the total distance between the
    samples and the cluster centers they are assigned to by cluster_data().

    A smaller total distance gives a larger fitness value.

    NOTE(review): this two-parameter signature appears to target older PyGAD
    releases; newer releases are documented as calling the fitness function
    with (ga_instance, solution, solution_idx) -- confirm against the
    installed PyGAD version.
    """
    # Only the last returned item, the per-cluster sums of distances, is needed.
    clusters_sum_dist = cluster_data(solution, solution_idx)[-1]

    # The tiny value 0.00000001 is added to the denominator to avoid a
    # division by zero in case the total distance is 0.
    return 1.0 / (numpy.sum(clusters_sum_dist) + 0.00000001)
104+
105+
# Each solution encodes one center per cluster, so it needs
# num_clusters * feature_vector_length genes.
num_clusters = 3
feature_vector_length = data.shape[1]
num_genes = num_clusters * feature_vector_length

# The initial gene range 0..20 spans the boxes the artificial samples were
# generated in.
ga_instance = pygad.GA(num_generations=100,
                       sol_per_pop=10,
                       init_range_low=0,
                       init_range_high=20,
                       num_parents_mating=5,
                       keep_parents=2,
                       num_genes=num_genes,
                       fitness_func=fitness_func,
                       suppress_warnings=True)

ga_instance.run()

best_solution, best_solution_fitness, best_solution_idx = ga_instance.best_solution()
print("Best solution is {bs}".format(bs=best_solution))
print("Fitness of the best solution is {bsf}".format(bsf=best_solution_fitness))
print("Best solution found after {gen} generations".format(gen=ga_instance.best_solution_generation))

# Re-cluster the data with the best solution to recover the assignments and centers.
cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist = cluster_data(best_solution, best_solution_idx)

# Draw each cluster's samples, then its center with a thicker marker outline.
for cluster_idx in range(num_clusters):
    cluster_x = data[clusters[cluster_idx], 0]
    cluster_y = data[clusters[cluster_idx], 1]
    matplotlib.pyplot.scatter(cluster_x, cluster_y)
    matplotlib.pyplot.scatter(cluster_centers[cluster_idx, 0], cluster_centers[cluster_idx, 1], linewidths=5)
matplotlib.pyplot.title("Clustering using PyGAD")
matplotlib.pyplot.show()

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp