gkc.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. /*
  2. * Copyright (c) 2016 Fastly, Inc.
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a copy
  5. * of this software and associated documentation files (the "Software"), to
  6. * deal in the Software without restriction, including without limitation the
  7. * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8. * sell copies of the Software, and to permit persons to whom the Software is
  9. * furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be included in
  12. * all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20. * IN THE SOFTWARE.
  21. */
  22. /*
  23. * A streaming quantile library.
  24. *
  25. *
  26. * A `gkc_summary` structure is used to summarize observations
  27. * within a given error range. Observations are inserted using
  28. * `gkc_insert_value`, quantile queries can then be performed with
  29. * `gkc_query` against the summary.
  30. * Provided two summaries are using the same epsilon, they can be merged
  31. * by calling `gkc_combine`.
  32. *
  33. * The algorithm guaranties a bounded memory usage to:
  34. * (11/(2 x epsilon))*log(2 * epsilon * N)
  35. *
  36. * For epsilon = 0.01 and N = 2^64, this is only 10k max in the
  37. * theoritical worse case. In practice, it's reliably using less:
  38. * inserting random data gets us * ~100 max insertions for > 50 millions
  39. * of entries.
  40. *
  41. * See www.cis.upenn.edu/~sanjeev/papers/sigmod01_quantiles.pdf for
  42. * the paper describing this algorithm and data structure.
  43. *
  44. */
  45. #include <stdio.h>
  46. #include <stdlib.h>
  47. #include <string.h>
  48. #include <limits.h>
  49. #include <inttypes.h>
  50. struct list {
  51. struct list *prev, *next;
  52. };
  53. struct gkc_summary {
  54. size_t nr_elems;
  55. double epsilon;
  56. uint64_t alloced;
  57. uint64_t max_alloced;
  58. struct list head;
  59. struct freelist *fl;
  60. };
  61. static inline int list_empty(struct list *l)
  62. {
  63. return l->next == l;
  64. }
  65. static inline void list_init(struct list *n)
  66. {
  67. n->next = n;
  68. n->prev = n;
  69. }
  70. static inline void list_del(struct list *n)
  71. {
  72. n->next->prev = n->prev;
  73. n->prev->next = n->next;
  74. }
  75. static inline void list_add(struct list *l, struct list *n)
  76. {
  77. n->next = l->next;
  78. n->next->prev = n;
  79. l->next = n;
  80. n->prev = l;
  81. }
  82. static inline void list_add_tail(struct list *l, struct list *n)
  83. {
  84. list_add(l->prev, n);
  85. }
  86. #undef offsetof
  87. #ifdef __compiler_offsetof
  88. #define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
  89. #else
  90. #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
  91. #endif
  92. #define container_of(ptr, type, member) ((type *)((char *)(ptr) - offsetof(type, member)))
  93. struct freelist {
  94. struct freelist *next;
  95. };
  96. static int ullog2(uint64_t x)
  97. {
  98. return 63 - __builtin_clzll(x);
  99. }
  100. struct gkc_tuple {
  101. uint64_t value;
  102. double g;
  103. uint64_t delta;
  104. struct list node;
  105. };
  106. #define list_to_tuple(ln) (container_of((ln), struct gkc_tuple, node))
  107. void gkc_summary_init(struct gkc_summary *s, double epsilon)
  108. {
  109. list_init(&s->head);
  110. s->epsilon = epsilon;
  111. }
  112. struct gkc_summary *gkc_summary_alloc(double epsilon)
  113. {
  114. struct gkc_summary *s;
  115. s = calloc(1, sizeof(*s));
  116. gkc_summary_init(s, epsilon);
  117. return s;
  118. }
  119. #include <assert.h>
  120. /* debug only, checks a number of properties that s must satisfy at all times */
  121. void gkc_sanity_check(struct gkc_summary *s)
  122. {
  123. uint64_t nr_elems, nr_alloced;
  124. struct list *cur;
  125. struct gkc_tuple *tcur;
  126. nr_elems = 0;
  127. nr_alloced = 0;
  128. cur = s->head.next;
  129. while (cur != &s->head) {
  130. tcur = list_to_tuple(cur);
  131. cur = cur->next;
  132. nr_elems += tcur->g;
  133. nr_alloced++;
  134. if (s->nr_elems > (1/s->epsilon)) {
  135. /* there must be enough observations for this to become true */
  136. assert(tcur->g + tcur->delta <= (s->nr_elems * s->epsilon * 2));
  137. }
  138. assert(nr_alloced <= s->alloced);
  139. }
  140. assert(nr_elems == s->nr_elems);
  141. assert(nr_alloced == s->alloced);
  142. }
  143. static struct gkc_tuple *gkc_alloc(struct gkc_summary *s)
  144. {
  145. s->alloced++;
  146. if (s->alloced > s->max_alloced) {
  147. s->max_alloced = s->alloced;
  148. }
  149. if (s->fl) {
  150. void *ret;
  151. ret = s->fl;
  152. s->fl = s->fl->next;
  153. return ret;
  154. }
  155. return malloc(sizeof(struct gkc_tuple));
  156. }
  157. static void gkc_free(struct gkc_summary *s, struct gkc_tuple *p)
  158. {
  159. struct freelist *flp = (void *)p;
  160. s->alloced--;
  161. flp->next = s->fl;
  162. s->fl = flp;
  163. }
  164. void gkc_summary_free(struct gkc_summary *s)
  165. {
  166. struct freelist *fl;
  167. struct list *cur;
  168. cur = s->head.next;
  169. while (cur != &s->head) {
  170. struct list *next;
  171. next = cur->next;
  172. gkc_free(s, list_to_tuple(cur));
  173. cur = next;
  174. }
  175. fl = s->fl;
  176. while (fl) {
  177. void *p;
  178. p = fl;
  179. fl = fl->next;
  180. free(p);
  181. }
  182. free(s);
  183. }
  184. uint64_t gkc_query(struct gkc_summary *s, double q)
  185. {
  186. struct list *cur, *next;
  187. int rank;
  188. double gi;
  189. double ne;
  190. rank = 0.5 + q * s->nr_elems;
  191. ne = s->nr_elems * s->epsilon;
  192. gi = 0;
  193. if (list_empty(&s->head)) {
  194. return 0;
  195. }
  196. cur = s->head.next;
  197. while (1) {
  198. struct gkc_tuple *tcur, *tnext;
  199. tcur = list_to_tuple(cur);
  200. next = cur->next;
  201. if (next == &s->head) {
  202. return tcur->value;
  203. }
  204. tnext = list_to_tuple(next);
  205. gi += tcur->g;
  206. if ((rank + ne) < (gi + tnext->g + tnext->delta)) {
  207. if ((rank + ne) < (gi + tnext->g)) {
  208. return tcur->value;
  209. }
  210. return tnext->value;
  211. }
  212. cur = next;
  213. }
  214. }
  215. static uint64_t band(struct gkc_summary *s, uint64_t delta)
  216. {
  217. uint64_t diff;
  218. diff = 1 + (s->epsilon * s->nr_elems * 2) - delta;
  219. return ullog2(diff);
  220. }
  221. static void gkc_compress(struct gkc_summary *s)
  222. {
  223. int max_compress;
  224. struct list *cur, *prev;
  225. struct gkc_tuple *tcur, *tprev;
  226. uint64_t bi, b_plus_1;
  227. max_compress = 2 * s->epsilon * s->nr_elems;
  228. if (s->nr_elems < 2) {
  229. return;
  230. }
  231. prev = s->head.prev;
  232. cur = prev->prev;
  233. while (cur != &s->head) {
  234. tcur = list_to_tuple(cur);
  235. tprev = list_to_tuple(prev);
  236. b_plus_1 = band(s, tprev->delta);
  237. bi = band(s, tcur->delta);
  238. if (bi <= b_plus_1 && ((tcur->g + tprev->g + tprev->delta) <= max_compress)) {
  239. tprev->g += tcur->g;
  240. list_del(cur);
  241. gkc_free(s, tcur);
  242. cur = prev->prev;
  243. continue;
  244. }
  245. prev = cur;
  246. cur = cur->prev;
  247. }
  248. }
  249. void gkc_insert_value(struct gkc_summary *s, double value)
  250. {
  251. struct list *cur = NULL;
  252. struct gkc_tuple *new, *tcur, *tnext = NULL;
  253. new = gkc_alloc(s);
  254. memset(new, 0, sizeof(*new));
  255. new->value = value;
  256. new->g = 1;
  257. list_init(&new->node);
  258. s->nr_elems++;
  259. /* first insert */
  260. if (list_empty(&s->head)) {
  261. list_add(&s->head, &new->node);
  262. return;
  263. }
  264. cur = s->head.next;
  265. tcur = list_to_tuple(cur);
  266. /* v < v0, new min */
  267. if (tcur->value > new->value) {
  268. list_add(&s->head, &new->node);
  269. goto out;
  270. }
  271. double gi = 0;
  272. while (cur->next != &s->head) {
  273. tnext = list_to_tuple(cur->next);
  274. tcur = list_to_tuple(cur);
  275. gi += tcur->g;
  276. if (tcur->value <= new->value && new->value < tnext->value) {
  277. /* INSERT "(v, 1, Δ)" into S between vi and vi+1; */
  278. new->delta = tcur->g + tcur->delta - 1;
  279. list_add(cur, &new->node);
  280. goto out;
  281. }
  282. cur = cur->next;
  283. }
  284. /* v > vs-1, new max */
  285. list_add_tail(&s->head, &new->node);
  286. out:
  287. if (s->nr_elems % (int)(1/(2*s->epsilon))) {
  288. gkc_compress(s);
  289. }
  290. }
  291. void gkc_print_summary(struct gkc_summary *s)
  292. {
  293. struct gkc_tuple *tcur;
  294. struct list *cur;
  295. fprintf(stderr, "nr_elems: %zu, epsilon: %.02f, alloced: %" PRIu64 ", overfilled: %.02f, max_alloced: %" PRIu64 "\n",
  296. s->nr_elems, s->epsilon, s->alloced, 2 * s->epsilon * s->nr_elems, s->max_alloced);
  297. if (list_empty(&s->head)) {
  298. fprintf(stderr, "Empty summary\n");
  299. return;
  300. }
  301. cur = s->head.next;
  302. while (cur != &s->head) {
  303. tcur = list_to_tuple(cur);
  304. fprintf(stderr, "(v: %" PRIu64 ", g: %.02f, d: %" PRIu64 ") ", tcur->value, tcur->g, tcur->delta);
  305. cur = cur->next;
  306. }
  307. fprintf(stderr, "\n");
  308. }
  309. struct gkc_summary *gkc_combine(struct gkc_summary *s1, struct gkc_summary *s2)
  310. {
  311. struct gkc_summary *snew;
  312. struct list *cur1, *cur2;
  313. struct gkc_tuple *tcur1, *tcur2, *tnew;
  314. if (s1->epsilon != s2->epsilon) {
  315. return NULL;
  316. }
  317. snew = gkc_summary_alloc(s1->epsilon);
  318. cur1 = s1->head.next;
  319. cur2 = s2->head.next;
  320. while (cur1 != &s1->head && cur2 != &s2->head) {
  321. tcur1 = list_to_tuple(cur1);
  322. tcur2 = list_to_tuple(cur2);
  323. tnew = gkc_alloc(snew);
  324. if (tcur1->value < tcur2->value) {
  325. tnew->value = tcur1->value;
  326. tnew->g = tcur1->g;
  327. tnew->delta = tcur1->delta;
  328. cur1 = cur1->next;
  329. } else {
  330. tnew->value = tcur2->value;
  331. tnew->g = tcur2->g;
  332. tnew->delta = tcur2->delta;
  333. cur2 = cur2->next;
  334. }
  335. list_add_tail(&snew->head, &tnew->node);
  336. snew->nr_elems += tnew->g;
  337. }
  338. while (cur1 != &s1->head) {
  339. tcur1 = list_to_tuple(cur1);
  340. tnew = gkc_alloc(snew);
  341. tnew->value = tcur1->value;
  342. tnew->g = tcur1->g;
  343. tnew->delta = tcur1->delta;
  344. list_add_tail(&snew->head, &tnew->node);
  345. snew->nr_elems += tnew->g;
  346. cur1 = cur1->next;
  347. }
  348. while (cur2 != &s2->head) {
  349. tcur2 = list_to_tuple(cur2);
  350. tnew = gkc_alloc(snew);
  351. tnew->value = tcur2->value;
  352. tnew->g = tcur2->g;
  353. tnew->delta = tcur2->delta;
  354. list_add_tail(&snew->head, &tnew->node);
  355. snew->nr_elems += tnew->g;
  356. cur2 = cur2->next;
  357. }
  358. snew->max_alloced = snew->alloced;
  359. gkc_compress(snew);
  360. return snew;
  361. }