26
26
27
27
import sys
28
28
from decimal import Decimal
29
+ import logging
29
30
import math
30
31
from optparse import OptionParser
32
+ from collections import namedtuple
31
33
32
34
class MVSD (object ):
33
35
""" A class that calculates a running Mean / Variance / Standard Deviation"""
@@ -63,6 +65,8 @@ def sd(self):
63
65
def mean (self ):
64
66
return self .m
65
67
68
# A single parsed input record: ``value`` is the sample itself and ``count``
# is how many occurrences of that sample the record stands for.
DataPoint = namedtuple('DataPoint', 'value count')
69
+
66
70
def test_mvsd ():
67
71
mvsd = MVSD ()
68
72
for x in range (10 ):
@@ -72,28 +76,35 @@ def test_mvsd():
72
76
assert '%.2f' % mvsd .var () == "8.25"
73
77
assert '%.14f' % mvsd .sd () == "2.87228132326901"
74
78
75
def load_stream(input_stream, agg):
    """Yield a DataPoint for every parseable line of ``input_stream``.

    Args:
        input_stream: iterable of text lines (for example ``sys.stdin``).
        agg: when true, every line must hold two whitespace-separated
            columns, ``value`` then ``count``; when false, every line is a
            single value and is given a count of 1.

    Yields:
        DataPoint(value, count) with ``value`` as a Decimal and ``count`` as
        an int.

    Blank lines are skipped silently.  Lines that cannot be parsed are
    logged, reported on stderr and skipped; they never abort the stream.
    """
    for line in input_stream:
        clean_line = line.strip()
        if not clean_line:
            # skip empty lines (ie: newlines)
            continue
        if clean_line[0] in ['"', "'"]:
            clean_line = clean_line.strip("\"'")
        try:
            if agg:
                # Parse the cleaned line (so quote stripping applies here too)
                # and split on any run of whitespace, so tabs and repeated
                # spaces between the two columns are accepted.
                value, count = clean_line.split()
                point = DataPoint(Decimal(value), int(count))
            else:
                point = DataPoint(Decimal(clean_line), 1)
        except (ValueError, ArithmeticError):
            # ValueError: wrong column count or a non-integer count.
            # ArithmeticError: decimal.InvalidOperation for a bad number.
            logging.exception('failed %r', line)
            sys.stderr.write("invalid line %r\n" % line)
            continue
        # Yield outside the try block so that closing the generator (or an
        # exception thrown into it by the consumer) is never swallowed by
        # the parse-error handler above.
        yield point
87
96
88
def median(values, key=None):
    """Return the median of ``values``.

    Args:
        values: a sized collection of items (it is not modified; a sorted
            copy is made).
        key: optional one-argument function that extracts the numeric value
            from each item.  It is used both for ordering and for the
            returned result.  Defaults to the identity function.

    Returns:
        The middle element's key for an odd number of items, otherwise the
        two middle keys summed and divided by 2 with the ``/`` operator.

    Raises:
        IndexError: if ``values`` is empty.
    """
    if not key:
        key = lambda x: x
    length = len(values)
    # Floor division keeps the indices integral on both Python 2 and 3;
    # plain ``/`` would produce floats on Python 3, which cannot index a list.
    if length % 2:
        median_indeces = [length // 2]
    else:
        median_indeces = [length // 2 - 1, length // 2]

    values = sorted(values, key=key)
    return sum(map(key, [values[i] for i in median_indeces])) / len(median_indeces)
97
108
98
109
def test_median ():
99
110
assert 6 == median ([8 ,7 ,9 ,1 ,2 ,6 ,3 ]) # odd-sized list
@@ -117,11 +128,13 @@ def histogram(stream, options):
117
128
if options .min :
118
129
min_v = Decimal (options .min )
119
130
else :
120
- min_v = min (data )
131
+ min_v = min (data , key = lambda x : x .value )
132
+ min_v = min_v .value
121
133
if options .max :
122
134
max_v = Decimal (options .max )
123
135
else :
124
- max_v = max (data )
136
+ max_v = max (data , key = lambda x : x .value )
137
+ max_v = max_v .value
125
138
126
139
if not max_v > min_v :
127
140
raise ValueError ('max must be > min. max:%s min:%s' % (max_v , min_v ))
@@ -163,18 +176,18 @@ def histogram(stream, options):
163
176
samples = 0
164
177
mvsd = MVSD ()
165
178
accepted_data = []
166
- for value in data :
167
- samples += 1
179
+ for record in data :
180
+ samples += record . count
168
181
if options .mvsd :
169
- mvsd .add (value )
170
- accepted_data .append (value )
182
+ mvsd .add (record . value , record . count )
183
+ accepted_data .append (record )
171
184
# find the bucket this goes in
172
- if value < min_v or value > max_v :
173
- skipped += 1
185
+ if record . value < min_v or record . value > max_v :
186
+ skipped += record . count
174
187
continue
175
188
for bucket_postion , boundary in enumerate (boundaries ):
176
- if value <= boundary :
177
- bucket_counts [bucket_postion ] += 1
189
+ if record . value <= boundary :
190
+ bucket_counts [bucket_postion ] += record . count
178
191
break
179
192
180
193
# auto-pick the hash scale
@@ -185,7 +198,7 @@ def histogram(stream, options):
185
198
if skipped :
186
199
print "# %d value%s outside of min/max" % (skipped , skipped > 1 and 's' or '' )
187
200
if options .mvsd :
188
- print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd .mean (), mvsd .var (), mvsd .sd (), median (accepted_data ))
201
+ print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd .mean (), mvsd .var (), mvsd .sd (), median (accepted_data , key = lambda x : x . value ))
189
202
print "# each ∎ represents a count of %d" % bucket_scale
190
203
bucket_min = min_v
191
204
bucket_max = min_v
@@ -202,6 +215,8 @@ def histogram(stream, options):
202
215
if __name__ == "__main__" :
203
216
parser = OptionParser ()
204
217
parser .usage = "cat data | %prog [options]"
218
+ parser .add_option ("-a" , "--agg" , dest = "agg" , default = False , action = "store_true" ,
219
+ help = "Two column input format, space seperated with key<space>value" )
205
220
parser .add_option ("-m" , "--min" , dest = "min" ,
206
221
help = "minimum value for graph" )
207
222
parser .add_option ("-x" , "--max" , dest = "max" ,
@@ -219,5 +234,5 @@ def histogram(stream, options):
219
234
parser .print_usage ()
220
235
print "for more help use --help"
221
236
sys .exit (1 )
222
- histogram (load_stream (sys .stdin ), options )
237
+ histogram (load_stream (sys .stdin , options . agg ), options )
223
238
0 commit comments