Last active
January 18, 2017 01:10
-
-
Save mthomure/1d6399376d4ff0c90de3fe253fc88174 to your computer and use it in GitHub Desktop.
Simple k-means in Clojure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns k-means | |
(:import [org.apache.commons.math3.ml.clustering | |
Clusterable KMeansPlusPlusClusterer] | |
[org.apache.commons.math3.ml.distance EuclideanDistance])) | |
;; dependency: [org.apache.commons/commons-math3 "3.6.1"] | |
;; Adapter record implementing Commons Math's `Clusterable` interface.
;; `getPoint` hands back the wrapped value `x` unchanged; `k-means` below
;; constructs these around double arrays.
(defrecord ClusterableWrapper [x]
  Clusterable
  (getPoint [_] x))
(defn k-means
  "Clusters `features` (a collection of numeric rows) with
  KMeansPlusPlusClusterer.

  Keyword options:
    :k               number of clusters (default 10)
    :max-iterations  iteration cap passed to the clusterer (default 10000)

  Returns a map with
    :centroids    seq of vectors, one per cluster center
    :assignments  seq of cluster indexes, one per row of `features`,
                  in input order."
  [features & {:keys [k max-iterations]
               :or {k 10
                    max-iterations 10000}}]
  (let [wrapped (mapv (fn [row] (ClusterableWrapper. (double-array row)))
                      features)
        clusterer (KMeansPlusPlusClusterer. k max-iterations)
        clusters (.cluster clusterer wrapped)
        ;; Reverse lookup: wrapped point -> index of the cluster holding it.
        point->index (into {}
                           (mapcat (fn [idx cluster]
                                     (map (fn [pt] [pt idx])
                                          (.getPoints cluster)))
                                   (range)
                                   clusters))]
    {:centroids (map (fn [cluster]
                       (into [] (.getPoint (.getCenter cluster))))
                     clusters)
     :assignments (map point->index wrapped)}))
(defn- k-means-assign-1
  "Returns the index within `centroids` of the centroid for which
  `(.compute measure point centroid)` is smallest."
  [measure centroids point]
  (let [indexed (map-indexed vector centroids)
        distance-to (fn [[_ centroid]] (.compute measure point centroid))]
    ;; min-key picks the [index centroid] pair with the least distance;
    ;; only the index is returned.
    (first (apply min-key distance-to indexed))))
(defn k-means-assign
  "Index of nearest centroid.

  For each element of `points`, yields the position in `centroids` of the
  centroid with the smallest Euclidean distance to it. The result is a lazy
  sequence in the same order as `points`."
  [centroids points]
  (let [euclidean (EuclideanDistance.)
        centroid-arrays (map double-array centroids)]
    (for [point points]
      (k-means-assign-1 euclidean centroid-arrays (double-array point)))))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(require '[k-means :refer :all]) | |
(defn blob
  "Isotropic Gaussian blob for clustering.

  Keyword options:
    :num-samples   number of points to generate (default 12)
    :num-features  number of coordinates per point (default 2)
    :cluster-std   standard deviation of the per-coordinate noise added
                   around the center (default 1.0)
    :center-box    [min max] range from which each center coordinate is
                   drawn uniformly (default [-10.0 10.0])

  Returns {:centroid <seq of center coordinates>
           :points   <seq of num-samples coordinate vectors>}."
  [& {:keys [num-samples num-features cluster-std center-box]
      :or {num-samples 12
           num-features 2
           cluster-std 1.0
           center-box [-10.0 10.0]}}]
  (let [[cmin cmax] center-box
        crange (- cmax cmin)
        rng (java.util.Random.)
        center (repeatedly num-features #(-> (rand) (* crange) (+ cmin)))
        ;; The noise has to be zero-mean Gaussian. `(rand)` is uniform on
        ;; [0, 1), which shifted every point to the positive side of the
        ;; center, so `:centroid` was never the middle of the blob.
        points (->> (repeatedly #(* cluster-std (.nextGaussian rng)))
                    (partition num-features)
                    (take num-samples)
                    (map #(mapv + center %)))]
    {:centroid center
     :points points}))
;; Three random blobs of two points each.
(def blobs (repeatedly 3 (fn [] (blob :num-samples 2))))
;; Sample value from one run (the data is random, so yours will differ):
;; ({:centroid (3.6065806787961474 -3.4831673166556465),
;;   :points
;;   ([3.938402488879002 -2.574654565134169]
;;    [4.41362664557822 -3.4196443259839007])}
;;  {:centroid (-9.33722290094038 -7.7122443338667885),
;;   :points
;;   ([-8.60315679311847 -7.352826777403295]
;;    [-8.58933442539004 -7.634666257054288])}
;;  {:centroid (-7.611451089070584 -3.6657499059097542),
;;   :points
;;   ([-7.331312967407008 -2.7336492340643788]
;;    [-6.640609061939679 -2.974514030116507])})

;; Cluster all six points into three groups.
(def model (-> (mapcat :points blobs)
               (k-means :k 3)))
;; Sample value from the same run:
;; {:centroids
;;  ([-8.596245609254256 -7.493746517228791]
;;   [4.176014567228611 -2.997149445559035]
;;   [-6.985961014673343 -2.854081632090443]),
;;  :assignments (1 1 0 0 2 2)}

;; Re-assign the same points to the fitted centroids.
(k-means-assign (:centroids model) (mapcat :points blobs))
;; In that run the result matched the model's :assignments:
;; (1 1 0 0 2 2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment