-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcountmin.go
135 lines (122 loc) · 3.13 KB
/
countmin.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package countmin
import (
"errors"
"hash/fnv"
"math"
)
// MAXINT64 is max 64 bit integer size
const MAXINT64 = 1<<63 - 1
// CountMin holds count table for calculating
// frequencies from a given input (usually a stream of data)
type CountMin struct {
depth int
width int
size int64
eps float64
confidence float64
table [][]int64
}
// New CountMin given a depth and width of the frequency
// table to use. Returns the newly created CountMin
func New(depth, width int) *CountMin {
cm := &CountMin{
depth: depth,
width: width,
eps: 2.0 / float64(width),
confidence: 1.0 - 1.0/math.Pow(2, float64(depth)),
}
cm.initTable()
return cm
}
// NewWithEpsCount initializes a CountMin given a confidence
// to ensure between 0 < 1.0 which effects number of hashes used
// and an epsilon between 0 and 1 with a smaller value for
// higher precision as it affects the width of the table
func NewWithEpsCount(confidence, eps float64) *CountMin {
if confidence >= 1.0 {
confidence = 0.99999
}
cm := &CountMin{
eps: eps,
confidence: confidence,
width: int(math.Ceil(float64(2.0) / eps)),
depth: int(math.Ceil(-math.Log(1-confidence) / math.Log(2))),
}
cm.initTable()
return cm
}
func (cm *CountMin) initTable() {
cm.table = make([][]int64, cm.depth)
for i := range cm.table {
cm.table[i] = make([]int64, cm.width)
}
}
// RelativeError returns epsilon of CountMin
func (cm *CountMin) RelativeError() float64 {
return cm.eps
}
// Confidence returns confidence of CountMin
func (cm *CountMin) Confidence() float64 {
return cm.confidence
}
// Size returns total size of CountMin
func (cm *CountMin) Size() int64 {
return cm.size
}
// Add inserts an item in bytes and a count associated
// which will also increment size of CountMin
func (cm *CountMin) Add(item []byte, count int64) {
if count < 0 {
return
}
hashed := cm.hasher(item)
for i := 0; i < cm.depth; i++ {
cm.table[i][hashed[i]] += count
}
cm.size += count
}
// Count gets an associated total count for an item
// in the CountMin
func (cm *CountMin) Count(item []byte) int64 {
var answer int64 = MAXINT64
var val int64
hashed := cm.hasher(item)
for i := 0; i < cm.depth; i++ {
val = cm.table[i][hashed[i]]
if val < answer {
answer = val
}
}
return answer
}
// Merge two CountMin tables and return a new table of their union
func Merge(cm1, cm2 *CountMin) (*CountMin, error) {
if cm1.depth != cm2.depth {
return nil, errors.New("Depth does not match for merging")
}
if cm1.width != cm2.width {
return nil, errors.New("Width does not match for merging")
}
newCm := New(cm1.depth, cm2.width)
var val int64
for i := 0; i < newCm.depth; i++ {
for j := 0; j < newCm.width; j++ {
val = cm1.table[i][j] + cm2.table[i][j]
newCm.table[i][j] = val
}
}
newCm.size = cm1.size + cm2.size
return newCm, nil
}
func (cm *CountMin) hasher(item []byte) []int64 {
total := cm.depth
max := int64(cm.width)
result := make([]int64, total)
hash := fnv.New32a()
hash.Write(item)
sum := int64(hash.Sum32())
for i := 0; i < total; i++ {
result[i] = int64(math.Abs(float64((sum * int64(i)) % max)))
}
return result
}