KMR
kmeanspy.py
1 #!/usr/bin/env python3
2 # -*-coding: utf-8;-*-
3 
4 ## K-Means (2015-06-15)
5 
6 ## An example of K-Means implementation.
7 ## It can be run under MPI as follows:
8 ## $ mpiexec -n 4 python3 kmeanspy.py
9 
10 import random
11 from mpi4py import MPI
12 import kmr4py
13 
class K_Means:
    """Settings and working data of the K-Means example.

    The instance holds the run parameters plus two lists that are
    filled on demand: ``means`` (current cluster centers) and
    ``points`` (the data set).  Both start out as None.
    """

    def __init__(self):
        # Change the following variables
        self.n_iteration = 10
        self.grid_size = 1000
        self.dim = 3
        self.n_points = 10000
        self.n_means = 100
        # Filled by init_means() / init_points().
        self.means = None
        self.points = None

    def __str__(self):
        """Return a multi-line, human-readable dump of the settings."""
        report = [
            '#### Configuration ###########################',
            'Iteration = %d' % (self.n_iteration),
            'Grid size = %d' % (self.grid_size),
            'Dimension = %d' % (self.dim),
            'Number of clusters = %d' % (self.n_means),
            'Number of points = %d' % (self.n_points),
            '##############################################',
        ]
        return '\n'.join(report)

    def init_means(self):
        """Replace ``means`` with ``n_means`` random coordinate lists."""
        self.means = []
        self._fill_randoms(self.means, self.n_means)

    def init_points(self):
        """Replace ``points`` with ``n_points`` random coordinate lists."""
        self.points = []
        self._fill_randoms(self.points, self.n_points)

    def _fill_randoms(self, tlst, count):
        """Append ``count`` random vectors of length ``dim`` to ``tlst``.

        Each coordinate is an integer drawn with random.randint() from
        0 to ``grid_size - 1`` inclusive.
        """
        # extend() pulls the generator one vector at a time, so the
        # random numbers are drawn vector by vector, coordinate by
        # coordinate.
        tlst.extend(
            [random.randint(0, self.grid_size - 1) for _ in range(self.dim)]
            for _ in range(count)
        )
49 
def calc_sq_dist(v1, v2):
    """Return the squared Euclidean distance between two vectors.

    The vectors are walked pairwise with zip(), so any coordinates one
    vector has beyond the length of the other are ignored.
    """
    total = 0
    for (a, b) in zip(v1, v2):
        diff = a - b
        total += diff * diff
    return total
55 
# Emit Key:id of point(integer), Value:a point(list of integer)
def load_points(kv, kvi, kvo, i):
    """Emit every entry of kmeans.points into the output key-value stream.

    The key is the position of the point in kmeans.points and the value
    is the point itself.  The arguments kv, kvi and i are not used.
    """
    del kv, kvi, i
    for (point_id, coords) in enumerate(kmeans.points):
        kvo.add(point_id, coords)
61 
# Emit Key:id of nearest group, Value:a point(list of integer)
def calc_cluster(kv, kvi, kvo, i):
    """Assign one point to the mean it is closest to.

    kv is a (key, value) pair whose value is the point; the key is
    ignored, as are kvi and i.  The pair (index of the nearest entry of
    kmeans.means, point) is added to kvo.  On a tie the lowest index
    wins, and index 0 is emitted when kmeans.means is empty.
    """
    (_, point) = kv
    del kvi, i
    nearest_id = 0
    # Start with "no distance seen yet" instead of a fixed bound such as
    # grid_size ** 2: a squared distance can be as large as
    # dim * (grid_size - 1) ** 2, so a fixed bound would leave far-away
    # points stuck on cluster 0 regardless of which mean is nearest.
    nearest_dst = None
    for (idm, mean) in enumerate(kmeans.means):
        dst = calc_sq_dist(point, mean)
        if nearest_dst is None or dst < nearest_dst:
            nearest_id = idm
            nearest_dst = dst
    kvo.add(nearest_id, point)
74 
# Emit nothing
def copy_center(kv, kvi, kvo, i):
    """Store one received cluster center into kmeans.means.

    kv is a (cluster id, center) pair; the center overwrites the entry
    of kmeans.means at that id.  Nothing is added to kvo, and kvi and i
    are not used.
    """
    del kvi, kvo, i
    (cluster_id, center) = kv
    kmeans.means[cluster_id] = center
80 
# Emit Key:id of group(integer),
#      Value:coordinates of center of the group(list of numbers)
def update_cluster(kvvec, kvi, kvo):
    """Compute the center of one cluster and emit it.

    kvvec is a non-empty sequence of (cluster id, point) pairs.  The
    center is the per-coordinate arithmetic mean of the points; because
    it is computed with true division, its coordinates are floats under
    Python 3.  The pair (cluster id of the first entry, center) is added
    to kvo.  kvi is not used.
    """
    del kvi
    n_members = len(kvvec)
    # Transpose the points so that each tuple holds one coordinate of
    # every member; the dimension follows from the data itself.
    columns = zip(*(point for (_, point) in kvvec))
    center = [sum(column) / n_members for column in columns]
    # Emit with add(key, value), the same call the other emitters in
    # this file use.
    kvo.add(kvvec[0][0], center)
93 
94 
#### main
comm = MPI.COMM_WORLD
kmr = kmr4py.KMR("world")
kmeans = K_Means()
random.seed(1)

# Rank 0 reports the setup and draws the initial means; the broadcast
# then hands that list to every other rank.
if comm.rank == 0:
    print('Number of processes = %d' % (comm.size))
    print(kmeans)
    kmeans.init_means()
kmeans.means = comm.bcast(kmeans.means, root=0)
# Every rank fills its own kmeans.points.
kmeans.init_points()

for _ in range(kmeans.n_iteration):
    # One pass: load points -> assign to nearest mean -> shuffle ->
    # reduce to new centers -> replicate -> store them in kmeans.means.
    loaded = kmr.emptykvs.map_once(False, load_points, key="integer")
    assigned = loaded.map(calc_cluster, key="integer")
    grouped = assigned.shuffle()
    centers = grouped.reduce(update_cluster, key="integer")
    shared = centers.replicate()
    shared.map(copy_center)

    # Rank 0 prints the means after each pass.
    if comm.rank == 0:
        print('Cluster coordinates')
        for center in kmeans.means:
            print(center)

kmr.dismiss()
kmr4py.fin()
def _fill_randoms(self, tlst, count)
Definition: kmeanspy.py:43
def fin()
Definition: kmr4py.py:1357