/src/arrow/cpp/src/arrow/util/tdigest.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | // approximate quantiles from arbitrary length dataset with O(1) space |
19 | | // based on 'Computing Extremely Accurate Quantiles Using t-Digests' from Dunning & Ertl |
20 | | // - https://arxiv.org/abs/1902.04023 |
21 | | // - https://github.com/tdunning/t-digest |
22 | | |
23 | | #pragma once |
24 | | |
25 | | #include <cmath> |
26 | | #include <memory> |
27 | | #include <vector> |
28 | | |
29 | | #include "arrow/util/logging.h" |
30 | | #include "arrow/util/macros.h" |
31 | | #include "arrow/util/visibility.h" |
32 | | |
33 | | namespace arrow { |
34 | | |
35 | | class Status; |
36 | | |
37 | | namespace internal { |
38 | | |
39 | | class ARROW_EXPORT TDigest { |
40 | | public: |
41 | | explicit TDigest(uint32_t delta = 100, uint32_t buffer_size = 500); |
42 | | ~TDigest(); |
43 | | TDigest(TDigest&&); |
44 | | TDigest& operator=(TDigest&&); |
45 | | |
46 | | // reset and re-use this tdigest |
47 | | void Reset(); |
48 | | |
49 | | // validate data integrity |
50 | | Status Validate() const; |
51 | | |
52 | | // dump internal data, only for debug |
53 | | void Dump() const; |
54 | | |
55 | | // buffer a single data point, consume internal buffer if full |
56 | | // this function is intensively called and performance critical |
57 | | // call it only if you are sure no NAN exists in input data |
58 | 0 | void Add(double value) { |
59 | 0 | DCHECK(!std::isnan(value)) << "cannot add NAN"; |
60 | 0 | if (ARROW_PREDICT_FALSE(input_.size() == input_.capacity())) { |
61 | 0 | MergeInput(); |
62 | 0 | } |
63 | 0 | input_.push_back(value); |
64 | 0 | } |
65 | | |
66 | | // skip NAN on adding |
67 | | template <typename T> |
68 | 0 | typename std::enable_if<std::is_floating_point<T>::value>::type NanAdd(T value) { |
69 | 0 | if (!std::isnan(value)) Add(value); |
70 | 0 | } |
71 | | |
72 | | template <typename T> |
73 | | typename std::enable_if<std::is_integral<T>::value>::type NanAdd(T value) { |
74 | | Add(static_cast<double>(value)); |
75 | | } |
76 | | |
77 | | // merge with other t-digests, called infrequently |
78 | | void Merge(const std::vector<TDigest>& others); |
79 | | void Merge(const TDigest& other); |
80 | | |
81 | | // calculate quantile |
82 | | double Quantile(double q) const; |
83 | | |
84 | 0 | double Min() const { return Quantile(0); } |
85 | 0 | double Max() const { return Quantile(1); } |
86 | | double Mean() const; |
87 | | |
88 | | // check if this tdigest contains no valid data points |
89 | | bool is_empty() const; |
90 | | |
91 | | private: |
92 | | // merge input data with current tdigest |
93 | | void MergeInput() const; |
94 | | |
95 | | // input buffer, size = buffer_size * sizeof(double) |
96 | | mutable std::vector<double> input_; |
97 | | |
98 | | // hide other members with pimpl |
99 | | class TDigestImpl; |
100 | | std::unique_ptr<TDigestImpl> impl_; |
101 | | }; |
102 | | |
103 | | } // namespace internal |
104 | | } // namespace arrow |