Main MRPT website > C++ reference for MRPT 1.5.7
kmeans++/KMeans.cpp
Go to the documentation of this file.
1 /* +---------------------------------------------------------------------------+
2  | Mobile Robot Programming Toolkit (MRPT) |
3  | http://www.mrpt.org/ |
4  | |
5  | Copyright (c) 2005-2017, Individual contributors, see AUTHORS file |
6  | See: http://www.mrpt.org/Authors - All rights reserved. |
7  | Released under BSD License. See details in http://www.mrpt.org/License |
8  +---------------------------------------------------------------------------+ */
9 // See KMeans.h
10 //
11 // Author: David Arthur (darthur@gmail.com), 2009
12 
13 // Includes
14 #include "KMeans.h"
15 #include "KmTree.h"
16 #include <mrpt/utils/mrpt_macros.h>
17 #include <sstream>
18 #include <time.h>
19 #include <vector>
20 using namespace std;
21 
22 // Logging
23 static vector<ostream*> gLogOutputs;
24 static vector<ostream*> gVerboseLogOutputs;
25 #define LOG(verbose, text) { \
26  vector<ostream*> &outputs = (verbose? gVerboseLogOutputs : gLogOutputs); \
27  if (outputs.size() > 0) { \
28  ostringstream string_stream; \
29  string_stream << text; \
30  for (int i = 0; i < (int)outputs.size(); i++) \
31  *(outputs[i]) << string_stream.str(); \
32  } \
33 }
34 void AddKMeansLogging(std::ostream *out, bool verbose) {
35  if (verbose)
36  gVerboseLogOutputs.push_back(out);
37  gLogOutputs.push_back(out);
38 }
40  gLogOutputs.clear();
41  gVerboseLogOutputs.clear();
42 }
43 
44 // Returns the number of seconds since the program began execution.
45 static double GetSeconds() {
46  return double(clock()) / CLOCKS_PER_SEC;
47 }
48 
49 // See KMeans.h
50 // Performs one full execution of k-means, logging any relevant information, and tracking meta
51 // statistics for the run. If min or max values are negative, they are treated as unset.
52 // best_centers and best_assignment can be 0, in which case they are not set.
53 static void RunKMeansOnce(const KmTree &tree, int n, int k, int d, Scalar *points, Scalar *centers,
54  Scalar *min_cost, Scalar *max_cost, Scalar *total_cost,
55  double start_time, double *min_time, double *max_time,
56  double *total_time, Scalar *best_centers, int *best_assignment) {
58  const Scalar kEpsilon = Scalar(1e-8); // Used to determine when to terminate k-means
59 
60  // Do iterations of k-means until the cost stabilizes
61  Scalar old_cost = 0;
62  bool is_done = false;
63  for (int iteration = 0; !is_done; iteration++) {
64  Scalar new_cost = tree.DoKMeansStep(k, centers, 0);
65  is_done = (iteration > 0 && new_cost >= (1 - kEpsilon) * old_cost);
66  old_cost = new_cost;
67  LOG(true, "Completed iteration #" << (iteration+1) << ", cost=" << new_cost << "..." << endl);
68  }
69  double this_time = GetSeconds() - start_time;
70 
71  // Log the clustering we found
72  LOG(false, "Completed run: cost=" << old_cost << " (" << this_time << " seconds)" << endl);
73 
74  // Handle a new min cost, updating best_centers and best_assignment as appropriate
75  if (*min_cost < 0 || old_cost < *min_cost) {
76  *min_cost = old_cost;
77  if (best_assignment != 0)
78  tree.DoKMeansStep(k, centers, best_assignment);
79  if (best_centers != 0)
80  memcpy(best_centers, centers, sizeof(Scalar)*k*d);
81  }
82 
83  // Update all other aggregate stats
84  if (*max_cost < old_cost) *max_cost = old_cost;
85  *total_cost += old_cost;
86  if (*min_time < 0 || *min_time > this_time)
87  *min_time = this_time;
88  if (*max_time < this_time) *max_time = this_time;
89  *total_time += this_time;
90 }
91 
92 // Outputs all meta-stats for a set of k-means or k-means++ runs.
93 void LogMetaStats(Scalar min_cost, Scalar max_cost, Scalar total_cost,
94  double min_time, double max_time, double total_time, int num_attempts) {
95  LOG(false, "Aggregate info over " << num_attempts << " runs:" << endl);
96  LOG(false, " Cost: min=" << min_cost << " average=" << (total_cost / num_attempts)
97  << " max=" << max_cost << endl);
98  LOG(false, " Time: min=" << min_time << " average=" << (total_time / num_attempts)
99  << " max=" << max_time << endl << endl);
100 }
101 
102 // See KMeans.h
103 Scalar RunKMeans(int n, int k, int d, Scalar *points, int attempts,
104  Scalar *ret_centers, int *ret_assignment) {
105  KM_ASSERT(k >= 1);
106 
107  // Create the tree and log
108  LOG(false, "Running k-means..." << endl);
109  KmTree tree(n, d, points);
110  LOG(false, "Done preprocessing..." << endl);
111 
112  // Initialization
113  Scalar *centers = (Scalar*)malloc(sizeof(Scalar)*k*d);
114  int *unused_centers = (int*)malloc(sizeof(int)*n);
115  KM_ASSERT(centers != 0 && unused_centers != 0);
116  Scalar min_cost = -1, max_cost = -1, total_cost = 0;
117  double min_time = -1, max_time = -1, total_time = 0;
118 
119  // Handle k > n
120  if (k > n) {
121  memset(centers + n*d, -1, (k-d)*sizeof(Scalar));
122  k = n;
123  }
124 
125  // Run all the attempts
126  for (int attempt = 0; attempt < attempts; attempt++) {
127  double start_time = GetSeconds();
128 
129  // Choose centers uniformly at random
130  for (int i = 0; i < n; i++)
131  unused_centers[i] = i;
132  int num_unused_centers = n;
133  for (int i = 0; i < k; i++) {
134  int j = GetRandom(num_unused_centers--);
135  memcpy(centers + i*d, points + unused_centers[j]*d, d*sizeof(Scalar));
136  unused_centers[j] = unused_centers[num_unused_centers];
137  }
138 
139  // Run k-means
140  RunKMeansOnce(tree, n, k, d, points, centers, &min_cost, &max_cost, &total_cost, start_time,
141  &min_time, &max_time, &total_time, ret_centers, ret_assignment);
142  }
143  LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);
144 
145  // Clean up and return
146  free(unused_centers);
147  free(centers);
148  return min_cost;
149 }
150 
151 // See KMeans.h
152 Scalar RunKMeansPlusPlus(int n, int k, int d, Scalar *points, int attempts,
153  Scalar *ret_centers, int *ret_assignment) {
154  KM_ASSERT(k >= 1);
155 
156  // Create the tree and log
157  LOG(false, "Running k-means++..." << endl);
158  KmTree tree(n, d, points);
159  LOG(false, "Done preprocessing..." << endl);
160 
161  // Initialization
162  Scalar *centers = (Scalar*)malloc(sizeof(Scalar)*k*d);
163  KM_ASSERT(centers != 0);
164  Scalar min_cost = -1, max_cost = -1, total_cost = 0;
165  double min_time = -1, max_time = -1, total_time = 0;
166 
167  // Run all the attempts
168  for (int attempt = 0; attempt < attempts; attempt++) {
169  double start_time = GetSeconds();
170 
171  // Choose centers using k-means++ seeding
172  tree.SeedKMeansPlusPlus(k, centers);
173 
174  // Run k-means
175  RunKMeansOnce(tree, n, k, d, points, centers, &min_cost, &max_cost, &total_cost, start_time,
176  &min_time, &max_time, &total_time, ret_centers, ret_assignment);
177  }
178  LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);
179 
180  // Clean up and return
181  free(centers);
182  return min_cost;
183 }
void BASE_IMPEXP memcpy(void *dest, size_t destSize, const void *src, size_t copyCount) MRPT_NO_THROWS
An OS and compiler independent version of "memcpy".
Definition: os.cpp:358
Scalar RunKMeans(int n, int k, int d, Scalar *points, int attempts, Scalar *ret_centers, int *ret_assignment)
GLenum GLsizei n
Definition: glext.h:4618
STL namespace.
GLsizei const GLfloat * points
Definition: glext.h:4797
#define MRPT_UNUSED_PARAM(a)
Can be used to avoid "not used parameters" warnings from the compiler.
#define LOG(verbose, text)
Scalar RunKMeansPlusPlus(int n, int k, int d, Scalar *points, int attempts, Scalar *ret_centers, int *ret_assignment)
Scalar DoKMeansStep(int k, Scalar *centers, int *assignment) const
Definition: KmTree.cpp:54
void ClearKMeansLogging()
int GetRandom(int n)
Definition: KmUtils.h:96
Definition: KmTree.h:35
static vector< ostream * > gVerboseLogOutputs
#define KM_ASSERT(expression)
Definition: KmUtils.h:84
Scalar SeedKMeansPlusPlus(int k, Scalar *centers) const
Definition: KmTree.cpp:292
void AddKMeansLogging(std::ostream *out, bool verbose)
static void RunKMeansOnce(const KmTree &tree, int n, int k, int d, Scalar *points, Scalar *centers, Scalar *min_cost, Scalar *max_cost, Scalar *total_cost, double start_time, double *min_time, double *max_time, double *total_time, Scalar *best_centers, int *best_assignment)
static double GetSeconds()
static vector< ostream * > gLogOutputs
double Scalar
Definition: KmUtils.h:41
void LogMetaStats(Scalar min_cost, Scalar max_cost, Scalar total_cost, double min_time, double max_time, double total_time, int num_attempts)



Page generated by Doxygen 1.8.14 for MRPT 1.5.7 Git: 5902e14cc Wed Apr 24 15:04:01 2019 +0200 at lun oct 28 01:39:17 CET 2019