Created
September 21, 2011 06:27
-
-
Save forhappy/1231401 to your computer and use it in GitHub Desktop.
python implemtion k-means algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import os | |
import sys | |
import cmath | |
import os.path | |
class KMeans: | |
''' | |
@descriptions: K-means Algorithm implementation. | |
@filename: Filename of input data. | |
@knums: Clusters number. | |
''' | |
def __init__(self, filename, knums): | |
self._filename = filename; | |
self._knums = knums | |
self._dimension = 0 | |
"""self._samples := [(seqx, x1, x2, ..., xn), | |
(seqy, y1, y2, ..., yn), | |
..., | |
(seqz, z1, z2, ..., zn)]""" | |
self._samples= [] | |
"""self._clusters :=[[(0, c1, c2, ..., cn), (seqx, x1, x2, ..., xn), (seqy, y1, y2, ..., yn)], | |
[], | |
..., | |
[]]""" | |
self._clusters = [] | |
self._open(self._filename) | |
self._normalize() | |
#print self._samples | |
self._select(self._knums) | |
def _normalize(self): | |
""" | |
@description: Normalize the attributes of input data. | |
""" | |
new_samples = [] | |
for t in xrange(len(self._samples)): | |
st = list(self._samples[t]) | |
new_samples.append(st) | |
for t in xrange(len(self._samples)): | |
self._samples.pop() | |
for d in xrange(1, (self._dimension + 1)): | |
container_att = [] | |
for idx in xrange(len(new_samples)): | |
att = new_samples[idx][d] | |
container_att.append(att) | |
max_att = max(container_att) | |
min_att = min(container_att) | |
for idx in xrange(len(new_samples)): | |
new_att = (new_samples[idx][d] - min_att) / (max_att - min_att) | |
new_samples[idx][d] = new_att | |
for t in xrange(len(new_samples)): | |
st = tuple(new_samples[t]) | |
self._samples.append(st) | |
def _open(self, filename): | |
""" | |
@descriptions: Open the data file and fill each item into memory. | |
@filename : Filename of input data. | |
""" | |
data_file= open(self._filename, "r") | |
data_lines= data_file.readlines(); | |
for line in data_lines: | |
string_samples = line.split(" ") | |
integer_samples= [] | |
integer_samples.append(int(string_samples[0])) | |
for e in string_samples[1:]: | |
integer_samples.append(float(e)) | |
samples = tuple(integer_samples) | |
self._samples.append(samples) | |
#print self._samples | |
self._dimension = len(self._samples[0]) - 1 | |
#print self._dimension | |
def _select(self, knums): | |
""" | |
@descriptions: Choose the first knums cluster center. | |
@knums : Clusters number. | |
""" | |
for i in xrange(knums): | |
selected = self._samples[i] | |
temp = list(selected) | |
temp[0] = 0 | |
self._clusters.append([]) | |
self._clusters[i].append(temp) | |
#print self._clusters | |
def _distance(self, va, vb): | |
''' | |
@description: Return the (distance ** 2) of tuple va and tuple vb. | |
@va : tuple va (x1, x2, ..., xn) | |
@vb : tuple vb (y1, y2, ..., yn) | |
''' | |
distance = 0 | |
for i in xrange(self._dimension): | |
distance += (va[i] - vb[i]) * (va[i] - vb[i]) | |
#print distance | |
return distance | |
def _means(self, va): | |
""" | |
@description: Return the means of va. | |
@va : A tuple of list va, with the form [(flagx, x1, x2, ..., xn), | |
(flagy, y1, y2, ..., yn), | |
(flagz, z1, z2, ..., zn), ...] | |
""" | |
if (len(va) == 0): | |
return va | |
means_cluster = [] | |
means_cluster.append(1)#Indicate that the means has changed. | |
#print va | |
for d in xrange(self._dimension): | |
tmp = 0 | |
for i in xrange(len(va)): | |
tmp += va[i][d+1] | |
means_cluster.append(tmp/len(va)) | |
means = tuple(means_cluster) | |
return means | |
def _equal(self, ta, tb): | |
""" | |
@description: Check if tuple ta equals to tuple tb. | |
@ta : Tuple ta.(flagx, x1, x2, ..., xn) | |
@tb : Tuple tb.(flagy, y1, y1, ..., ym) | |
""" | |
if (len(ta) != len(tb)): | |
return False | |
for i in xrange(1, len(ta)): | |
if (ta[i] != tb[i]): | |
return False | |
return True | |
def flush(self, filename): | |
""" | |
@description: Flush data the disk. | |
@filename : Filename of output data. | |
""" | |
foutput = open(filename, "w") | |
for c in xrange(self._knums): | |
foutput.write("Group %d" % c) | |
for e in self._clusters[c][1:]: | |
foutput.write("%s" % repr(e)) | |
foutput.write("\n\n\n") | |
print("Done.") | |
foutput.close() | |
def _reconstruct(self, idx): | |
""" | |
@description: Reconstruct the cluster points. | |
@idx : Index of clusters, where clusters has the form as follows: | |
self._clusters :=[[(0, c1, c2, ..., cn), (seqx, x1, x2, ..., xn), (seqy, y1, y2, ..., yn)], | |
[], | |
..., | |
[]] | |
""" | |
new_cluster = [] | |
new_cluster.append(0) | |
for old_value in self._clusters[idx][0][1:]: | |
new_cluster.append(old_value) | |
for i in xrange(len(self._clusters[idx])): | |
self._clusters[idx].pop() | |
self._clusters[idx].insert(0, new_cluster) | |
def process(self): | |
""" | |
@description: Process data, calculating k-means and clustering. | |
""" | |
while True: | |
K = 0 | |
for e in self._samples: | |
#print e | |
shortest = -1 | |
for k in xrange(self._knums): | |
#for k in _clusters[] | |
#print e | |
#print self._clusters[k][0] | |
distance = self._distance(e[1:], self._clusters[k][0][1:]) | |
#print distance | |
if (distance < 0.000001): | |
# add e to the k-th cluster. | |
self._clusters[k].append(e) | |
break | |
else: | |
if (shortest == -1): | |
shortest = distance | |
else: | |
if (shortest > distance): | |
shortest = distance | |
K = k | |
if (k != self._knums - 1): | |
continue | |
else: | |
# add e to the k-th cluster | |
self._clusters[K].append(e) | |
#print self._clusters | |
for k in xrange(self._knums): | |
new_ktuple = self._means(self._clusters[k][1:]) | |
if (len(new_ktuple) == 0): | |
continue | |
if (self._equal(self._clusters[k][0], new_ktuple) == False): | |
self._clusters[k].pop(0) | |
self._clusters[k].insert(0, new_ktuple) | |
else: | |
continue | |
flag = 0 | |
for idx in xrange(self._knums): | |
if (self._clusters[idx][0][0] == 1): | |
flag = 1 | |
break | |
else: | |
continue | |
if (flag == 1): | |
for idx in xrange(self._knums): | |
self._reconstruct(idx) | |
else: | |
break | |
if __name__ =="__main__": | |
ikmeans = KMeans("./iris-1.dat", 3) | |
ikmeans.process() | |
ikmeans.flush("./k-means-out.dat") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 5.1 3.5 1.4 0.2 | |
2 4.9 3.0 1.4 0.2 | |
3 4.7 3.2 1.3 0.2 | |
4 4.6 3.1 1.5 0.2 | |
5 5.0 3.6 1.4 0.2 | |
6 5.4 3.9 1.7 0.4 | |
7 4.6 3.4 1.4 0.3 | |
8 5.0 3.4 1.5 0.2 | |
9 4.4 2.9 1.4 0.2 | |
10 4.9 3.1 1.5 0.1 | |
11 5.4 3.7 1.5 0.2 | |
12 4.8 3.4 1.6 0.2 | |
13 4.8 3.0 1.4 0.1 | |
14 4.3 3.0 1.1 0.1 | |
15 5.8 4.0 1.2 0.2 | |
16 5.7 4.4 1.5 0.4 | |
17 5.4 3.9 1.3 0.4 | |
18 5.1 3.5 1.4 0.3 | |
19 5.7 3.8 1.7 0.3 | |
20 5.1 3.8 1.5 0.3 | |
21 5.4 3.4 1.7 0.2 | |
22 5.1 3.7 1.5 0.4 | |
23 4.6 3.6 1.0 0.2 | |
24 5.1 3.3 1.7 0.5 | |
25 4.8 3.4 1.9 0.2 | |
26 5.0 3.0 1.6 0.2 | |
27 5.0 3.4 1.6 0.4 | |
28 5.2 3.5 1.5 0.2 | |
29 5.2 3.4 1.4 0.2 | |
30 4.7 3.2 1.6 0.2 | |
31 4.8 3.1 1.6 0.2 | |
32 5.4 3.4 1.5 0.4 | |
33 5.2 4.1 1.5 0.1 | |
34 5.5 4.2 1.4 0.2 | |
35 4.9 3.1 1.5 0.1 | |
36 5.0 3.2 1.2 0.2 | |
37 5.5 3.5 1.3 0.2 | |
38 4.9 3.1 1.5 0.1 | |
39 4.4 3.0 1.3 0.2 | |
40 5.1 3.4 1.5 0.2 | |
41 5.0 3.5 1.3 0.3 | |
42 4.5 2.3 1.3 0.3 | |
43 4.4 3.2 1.3 0.2 | |
44 5.0 3.5 1.6 0.6 | |
45 5.1 3.8 1.9 0.4 | |
46 4.8 3.0 1.4 0.3 | |
47 5.1 3.8 1.6 0.2 | |
48 4.6 3.2 1.4 0.2 | |
49 5.3 3.7 1.5 0.2 | |
50 5.0 3.3 1.4 0.2 | |
51 7.0 3.2 4.7 1.4 | |
52 6.4 3.2 4.5 1.5 | |
53 6.9 3.1 4.9 1.5 | |
54 5.5 2.3 4.0 1.3 | |
55 6.5 2.8 4.6 1.5 | |
56 5.7 2.8 4.5 1.3 | |
57 6.3 3.3 4.7 1.6 | |
58 4.9 2.4 3.3 1.0 | |
59 6.6 2.9 4.6 1.3 | |
60 5.2 2.7 3.9 1.4 | |
61 5.0 2.0 3.5 1.0 | |
62 5.9 3.0 4.2 1.5 | |
63 6.0 2.2 4.0 1.0 | |
64 6.1 2.9 4.7 1.4 | |
65 5.6 2.9 3.6 1.3 | |
66 6.7 3.1 4.4 1.4 | |
67 5.6 3.0 4.5 1.5 | |
68 5.8 2.7 4.1 1.0 | |
69 6.2 2.2 4.5 1.5 | |
70 5.6 2.5 3.9 1.1 | |
71 5.9 3.2 4.8 1.8 | |
72 6.1 2.8 4.0 1.3 | |
73 6.3 2.5 4.9 1.5 | |
74 6.1 2.8 4.7 1.2 | |
75 6.4 2.9 4.3 1.3 | |
76 6.6 3.0 4.4 1.4 | |
77 6.8 2.8 4.8 1.4 | |
78 6.7 3.0 5.0 1.7 | |
79 6.0 2.9 4.5 1.5 | |
80 5.7 2.6 3.5 1.0 | |
81 5.5 2.4 3.8 1.1 | |
82 5.5 2.4 3.7 1.0 | |
83 5.8 2.7 3.9 1.2 | |
84 6.0 2.7 5.1 1.6 | |
85 5.4 3.0 4.5 1.5 | |
86 6.0 3.4 4.5 1.6 | |
87 6.7 3.1 4.7 1.5 | |
88 6.3 2.3 4.4 1.3 | |
89 5.6 3.0 4.1 1.3 | |
90 5.5 2.5 4.0 1.3 | |
91 5.5 2.6 4.4 1.2 | |
92 6.1 3.0 4.6 1.4 | |
93 5.8 2.6 4.0 1.2 | |
94 5.0 2.3 3.3 1.0 | |
95 5.6 2.7 4.2 1.3 | |
96 5.7 3.0 4.2 1.2 | |
97 5.7 2.9 4.2 1.3 | |
98 6.2 2.9 4.3 1.3 | |
99 5.1 2.5 3.0 1.1 | |
100 5.7 2.8 4.1 1.3 | |
101 6.3 3.3 6.0 2.5 | |
102 5.8 2.7 5.1 1.9 | |
103 7.1 3.0 5.9 2.1 | |
104 6.3 2.9 5.6 1.8 | |
105 6.5 3.0 5.8 2.2 | |
106 7.6 3.0 6.6 2.1 | |
107 4.9 2.5 4.5 1.7 | |
108 7.3 2.9 6.3 1.8 | |
109 6.7 2.5 5.8 1.8 | |
110 7.2 3.6 6.1 2.5 | |
111 6.5 3.2 5.1 2.0 | |
112 6.4 2.7 5.3 1.9 | |
113 6.8 3.0 5.5 2.1 | |
114 5.7 2.5 5.0 2.0 | |
115 5.8 2.8 5.1 2.4 | |
116 6.4 3.2 5.3 2.3 | |
117 6.5 3.0 5.5 1.8 | |
118 7.7 3.8 6.7 2.2 | |
119 7.7 2.6 6.9 2.3 | |
120 6.0 2.2 5.0 1.5 | |
121 6.9 3.2 5.7 2.3 | |
122 5.6 2.8 4.9 2.0 | |
123 7.7 2.8 6.7 2.0 | |
124 6.3 2.7 4.9 1.8 | |
125 6.7 3.3 5.7 2.1 | |
126 7.2 3.2 6.0 1.8 | |
127 6.2 2.8 4.8 1.8 | |
128 6.1 3.0 4.9 1.8 | |
129 6.4 2.8 5.6 2.1 | |
130 7.2 3.0 5.8 1.6 | |
131 7.4 2.8 6.1 1.9 | |
132 7.9 3.8 6.4 2.0 | |
133 6.4 2.8 5.6 2.2 | |
134 6.3 2.8 5.1 1.5 | |
135 6.1 2.6 5.6 1.4 | |
136 7.7 3.0 6.1 2.3 | |
137 6.3 3.4 5.6 2.4 | |
138 6.4 3.1 5.5 1.8 | |
139 6.0 3.0 4.8 1.8 | |
140 6.9 3.1 5.4 2.1 | |
141 6.7 3.1 5.6 2.4 | |
142 6.9 3.1 5.1 2.3 | |
143 5.8 2.7 5.1 1.9 | |
144 6.8 3.2 5.9 2.3 | |
145 6.7 3.3 5.7 2.5 | |
146 6.7 3.0 5.2 2.3 | |
147 6.3 2.5 5.0 1.9 | |
148 6.5 3.0 5.2 2.0 | |
149 6.2 3.4 5.4 2.3 | |
150 5.9 3.0 5.1 1.8 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment