Skip to content

Instantly share code, notes, and snippets.

@forhappy
Created September 21, 2011 06:27
Show Gist options
  • Save forhappy/1231401 to your computer and use it in GitHub Desktop.
Save forhappy/1231401 to your computer and use it in GitHub Desktop.
Python implementation of the k-means algorithm
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import cmath
import os.path
class KMeans:
    """K-means clustering over whitespace-separated numeric records.

    Input file format: one sample per line, ``<id> <x1> <x2> ... <xn>``,
    where ``<id>`` is an integer sequence number and the remaining
    fields are float attributes.

    @filename: Filename of input data.
    @knums:    Clusters number.
    """
    def __init__(self, filename, knums):
        self._filename = filename
        self._knums = knums
        self._dimension = 0
        # self._samples := [(seqx, x1, ..., xn), (seqy, y1, ..., yn), ...]
        self._samples = []
        # self._clusters := [[(flag, c1, ..., cn), (seqx, x1, ...), ...], ...]
        # Element 0 of each cluster is its center; flag == 1 marks a
        # center that moved during the last update pass.
        self._clusters = []
        self._open(self._filename)
        self._normalize()
        self._select(self._knums)

    def _normalize(self):
        """Rescale every attribute column of self._samples into [0, 1]."""
        samples = [list(s) for s in self._samples]
        for d in range(1, self._dimension + 1):
            column = [s[d] for s in samples]
            min_att = min(column)
            span = max(column) - min_att
            for s in samples:
                # A constant column carries no information; map it to 0.0
                # instead of dividing by zero.
                s[d] = (s[d] - min_att) / span if span != 0 else 0.0
        self._samples = [tuple(s) for s in samples]

    def _open(self, filename):
        """Load every record of `filename` into self._samples.

        Also derives self._dimension (attribute count) from the first
        record. The file is closed deterministically via `with` (the
        original left it open).
        """
        with open(filename, "r") as data_file:
            for line in data_file:
                fields = line.split(" ")
                sample = [int(fields[0])]
                sample.extend(float(f) for f in fields[1:])
                self._samples.append(tuple(sample))
        self._dimension = len(self._samples[0]) - 1

    def _select(self, knums):
        """Seed the clusters: the first `knums` samples become centers."""
        for i in range(knums):
            center = list(self._samples[i])
            center[0] = 0  # flag 0: center currently considered stable
            self._clusters.append([center])

    def _distance(self, va, vb):
        """Return the squared Euclidean distance of the first
        self._dimension components of coordinate sequences va and vb."""
        return sum((va[i] - vb[i]) ** 2 for i in range(self._dimension))

    def _means(self, va):
        """Return a new center tuple (1, m1, ..., mn) averaging the member
        samples in va (a list of (flag, x1, ..., xn) tuples).

        The leading 1 flags "center changed". Returns va unchanged when
        it is empty, so the caller can detect an empty cluster.
        """
        if not va:
            return va
        center = [1]
        for d in range(1, self._dimension + 1):
            center.append(sum(member[d] for member in va) / len(va))
        return tuple(center)

    def _equal(self, ta, tb):
        """True when ta and tb have identical coordinates (index 0, the
        flag, is ignored).

        NOTE(review): exact float comparison, kept from the original —
        convergence relies on the recomputed mean reproducing exactly
        once assignments stabilize.
        """
        if len(ta) != len(tb):
            return False
        for a, b in zip(ta[1:], tb[1:]):
            if a != b:
                return False
        return True

    def flush(self, filename):
        """Write the clustered samples to disk, one "Group" per cluster.

        @filename: Filename of output data.
        """
        with open(filename, "w") as foutput:
            for c in range(self._knums):
                foutput.write("Group %d" % c)
                for e in self._clusters[c][1:]:
                    foutput.write("%s" % repr(e))
                foutput.write("\n\n\n")
        print("Done.")

    def _reconstruct(self, idx):
        """Drop cluster idx's members and reset its center flag to 0,
        keeping the center coordinates for the next assignment pass."""
        old_center = self._clusters[idx][0]
        self._clusters[idx] = [[0] + list(old_center[1:])]

    def process(self):
        """Run Lloyd-style iterations (assign, then update centers)
        until no center moves.

        Bug fixed vs. the original: the best-cluster index is now reset
        per sample and set whenever `shortest` is assigned; previously
        it could carry a stale value from an earlier sample and append
        the point to the wrong cluster.
        """
        while True:
            # Assignment step: attach every sample to its nearest center.
            for e in self._samples:
                best = 0
                shortest = -1
                assigned = False
                for k in range(self._knums):
                    distance = self._distance(e[1:], self._clusters[k][0][1:])
                    if distance < 0.000001:
                        # Effectively on the center: assign immediately.
                        self._clusters[k].append(e)
                        assigned = True
                        break
                    if shortest == -1 or distance < shortest:
                        shortest = distance
                        best = k
                if not assigned:
                    self._clusters[best].append(e)
            # Update step: recompute each center from its members.
            for k in range(self._knums):
                new_center = self._means(self._clusters[k][1:])
                if len(new_center) == 0:
                    continue  # empty cluster keeps its old center
                if not self._equal(self._clusters[k][0], new_center):
                    self._clusters[k][0] = new_center  # flag 1 inside
            # Converged when no center carries the "changed" flag.
            moved = False
            for idx in range(self._knums):
                if self._clusters[idx][0][0] == 1:
                    moved = True
                    break
            if moved:
                for idx in range(self._knums):
                    self._reconstruct(idx)
            else:
                break
if __name__ == "__main__":
    # Cluster the iris data set into three groups and dump the result.
    clusterer = KMeans("./iris-1.dat", 3)
    clusterer.process()
    clusterer.flush("./k-means-out.dat")
1 5.1 3.5 1.4 0.2
2 4.9 3.0 1.4 0.2
3 4.7 3.2 1.3 0.2
4 4.6 3.1 1.5 0.2
5 5.0 3.6 1.4 0.2
6 5.4 3.9 1.7 0.4
7 4.6 3.4 1.4 0.3
8 5.0 3.4 1.5 0.2
9 4.4 2.9 1.4 0.2
10 4.9 3.1 1.5 0.1
11 5.4 3.7 1.5 0.2
12 4.8 3.4 1.6 0.2
13 4.8 3.0 1.4 0.1
14 4.3 3.0 1.1 0.1
15 5.8 4.0 1.2 0.2
16 5.7 4.4 1.5 0.4
17 5.4 3.9 1.3 0.4
18 5.1 3.5 1.4 0.3
19 5.7 3.8 1.7 0.3
20 5.1 3.8 1.5 0.3
21 5.4 3.4 1.7 0.2
22 5.1 3.7 1.5 0.4
23 4.6 3.6 1.0 0.2
24 5.1 3.3 1.7 0.5
25 4.8 3.4 1.9 0.2
26 5.0 3.0 1.6 0.2
27 5.0 3.4 1.6 0.4
28 5.2 3.5 1.5 0.2
29 5.2 3.4 1.4 0.2
30 4.7 3.2 1.6 0.2
31 4.8 3.1 1.6 0.2
32 5.4 3.4 1.5 0.4
33 5.2 4.1 1.5 0.1
34 5.5 4.2 1.4 0.2
35 4.9 3.1 1.5 0.1
36 5.0 3.2 1.2 0.2
37 5.5 3.5 1.3 0.2
38 4.9 3.1 1.5 0.1
39 4.4 3.0 1.3 0.2
40 5.1 3.4 1.5 0.2
41 5.0 3.5 1.3 0.3
42 4.5 2.3 1.3 0.3
43 4.4 3.2 1.3 0.2
44 5.0 3.5 1.6 0.6
45 5.1 3.8 1.9 0.4
46 4.8 3.0 1.4 0.3
47 5.1 3.8 1.6 0.2
48 4.6 3.2 1.4 0.2
49 5.3 3.7 1.5 0.2
50 5.0 3.3 1.4 0.2
51 7.0 3.2 4.7 1.4
52 6.4 3.2 4.5 1.5
53 6.9 3.1 4.9 1.5
54 5.5 2.3 4.0 1.3
55 6.5 2.8 4.6 1.5
56 5.7 2.8 4.5 1.3
57 6.3 3.3 4.7 1.6
58 4.9 2.4 3.3 1.0
59 6.6 2.9 4.6 1.3
60 5.2 2.7 3.9 1.4
61 5.0 2.0 3.5 1.0
62 5.9 3.0 4.2 1.5
63 6.0 2.2 4.0 1.0
64 6.1 2.9 4.7 1.4
65 5.6 2.9 3.6 1.3
66 6.7 3.1 4.4 1.4
67 5.6 3.0 4.5 1.5
68 5.8 2.7 4.1 1.0
69 6.2 2.2 4.5 1.5
70 5.6 2.5 3.9 1.1
71 5.9 3.2 4.8 1.8
72 6.1 2.8 4.0 1.3
73 6.3 2.5 4.9 1.5
74 6.1 2.8 4.7 1.2
75 6.4 2.9 4.3 1.3
76 6.6 3.0 4.4 1.4
77 6.8 2.8 4.8 1.4
78 6.7 3.0 5.0 1.7
79 6.0 2.9 4.5 1.5
80 5.7 2.6 3.5 1.0
81 5.5 2.4 3.8 1.1
82 5.5 2.4 3.7 1.0
83 5.8 2.7 3.9 1.2
84 6.0 2.7 5.1 1.6
85 5.4 3.0 4.5 1.5
86 6.0 3.4 4.5 1.6
87 6.7 3.1 4.7 1.5
88 6.3 2.3 4.4 1.3
89 5.6 3.0 4.1 1.3
90 5.5 2.5 4.0 1.3
91 5.5 2.6 4.4 1.2
92 6.1 3.0 4.6 1.4
93 5.8 2.6 4.0 1.2
94 5.0 2.3 3.3 1.0
95 5.6 2.7 4.2 1.3
96 5.7 3.0 4.2 1.2
97 5.7 2.9 4.2 1.3
98 6.2 2.9 4.3 1.3
99 5.1 2.5 3.0 1.1
100 5.7 2.8 4.1 1.3
101 6.3 3.3 6.0 2.5
102 5.8 2.7 5.1 1.9
103 7.1 3.0 5.9 2.1
104 6.3 2.9 5.6 1.8
105 6.5 3.0 5.8 2.2
106 7.6 3.0 6.6 2.1
107 4.9 2.5 4.5 1.7
108 7.3 2.9 6.3 1.8
109 6.7 2.5 5.8 1.8
110 7.2 3.6 6.1 2.5
111 6.5 3.2 5.1 2.0
112 6.4 2.7 5.3 1.9
113 6.8 3.0 5.5 2.1
114 5.7 2.5 5.0 2.0
115 5.8 2.8 5.1 2.4
116 6.4 3.2 5.3 2.3
117 6.5 3.0 5.5 1.8
118 7.7 3.8 6.7 2.2
119 7.7 2.6 6.9 2.3
120 6.0 2.2 5.0 1.5
121 6.9 3.2 5.7 2.3
122 5.6 2.8 4.9 2.0
123 7.7 2.8 6.7 2.0
124 6.3 2.7 4.9 1.8
125 6.7 3.3 5.7 2.1
126 7.2 3.2 6.0 1.8
127 6.2 2.8 4.8 1.8
128 6.1 3.0 4.9 1.8
129 6.4 2.8 5.6 2.1
130 7.2 3.0 5.8 1.6
131 7.4 2.8 6.1 1.9
132 7.9 3.8 6.4 2.0
133 6.4 2.8 5.6 2.2
134 6.3 2.8 5.1 1.5
135 6.1 2.6 5.6 1.4
136 7.7 3.0 6.1 2.3
137 6.3 3.4 5.6 2.4
138 6.4 3.1 5.5 1.8
139 6.0 3.0 4.8 1.8
140 6.9 3.1 5.4 2.1
141 6.7 3.1 5.6 2.4
142 6.9 3.1 5.1 2.3
143 5.8 2.7 5.1 1.9
144 6.8 3.2 5.9 2.3
145 6.7 3.3 5.7 2.5
146 6.7 3.0 5.2 2.3
147 6.3 2.5 5.0 1.9
148 6.5 3.0 5.2 2.0
149 6.2 3.4 5.4 2.3
150 5.9 3.0 5.1 1.8
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment