Skip to content

Commit 7a761b4

Browse files
committed
Merge pull request #17 from jehiah/histogram_agg_17
Support ingesting Aggregate data from histogram.py
2 parents 3658aec + 8d9143e commit 7a761b4

File tree

2 files changed

+34
-19
lines changed

2 files changed

+34
-19
lines changed

data_hacks/bar_chart.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ def run(input_stream, options):
4141
data = defaultdict(lambda:0)
4242
for row in input_stream:
4343
if options.agg_values:
44-
kv = row.split(' ',2);
44+
kv = row.replace('\t', ' ').split(' ',2);
4545
data[kv[0]]+= int(kv[1])
4646
else:
4747
data[row]+=1

data_hacks/histogram.py

100644 → 100755
+33 -18
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,10 @@
2626

2727
import sys
2828
from decimal import Decimal
29+
import logging
2930
import math
3031
from optparse import OptionParser
32+
from collections import namedtuple
3133

3234
class MVSD(object):
3335
""" A class that calculates a running Mean / Variance / Standard Deviation"""
@@ -63,6 +65,8 @@ def sd(self):
6365
def mean(self):
6466
return self.m
6567

68+
DataPoint = namedtuple('DataPoint', ['value', 'count'])
69+
6670
def test_mvsd():
6771
mvsd = MVSD()
6872
for x in range(10):
@@ -72,28 +76,35 @@ def test_mvsd():
7276
assert '%.2f' % mvsd.var() == "8.25"
7377
assert '%.14f' % mvsd.sd() == "2.87228132326901"
7478

75-
def load_stream(input_stream):
79+
def load_stream(input_stream, agg):
7680
for line in input_stream:
7781
clean_line = line.strip()
7882
if not clean_line:
7983
# skip empty lines (ie: newlines)
8084
continue
8185
if clean_line[0] in ['"', "'"]:
82-
clean_line = clean_line.strip('"').strip("'")
86+
clean_line = clean_line.strip("\"'")
8387
try:
84-
yield Decimal(clean_line)
88+
if agg:
89+
value, count = line.replace("\t", ' ').split(' ', 2)
90+
yield DataPoint(Decimal(value), int(count))
91+
continue
92+
yield DataPoint(Decimal(clean_line), 1)
8593
except:
94+
logging.exception('failed %r', line)
8695
print >>sys.stderr, "invalid line %r" % line
8796

88-
def median(values):
97+
def median(values, key=None):
98+
if not key:
99+
key= lambda x: x
89100
length = len(values)
90101
if length%2:
91102
median_indeces = [length/2]
92103
else:
93104
median_indeces = [length/2-1, length/2]
94105

95-
values = sorted(values)
96-
return sum([values[i] for i in median_indeces]) / len(median_indeces)
106+
values = sorted(values, key=key)
107+
return sum(map(key, [values[i] for i in median_indeces])) / len(median_indeces)
97108

98109
def test_median():
99110
assert 6 == median([8,7,9,1,2,6,3]) # odd-sized list
@@ -117,11 +128,13 @@ def histogram(stream, options):
117128
if options.min:
118129
min_v = Decimal(options.min)
119130
else:
120-
min_v = min(data)
131+
min_v = min(data, key=lambda x: x.value)
132+
min_v = min_v.value
121133
if options.max:
122134
max_v = Decimal(options.max)
123135
else:
124-
max_v = max(data)
136+
max_v = max(data, key=lambda x: x.value)
137+
max_v = max_v.value
125138

126139
if not max_v > min_v:
127140
raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
@@ -163,18 +176,18 @@ def histogram(stream, options):
163176
samples = 0
164177
mvsd = MVSD()
165178
accepted_data = []
166-
for value in data:
167-
samples +=1
179+
for record in data:
180+
samples += record.count
168181
if options.mvsd:
169-
mvsd.add(value)
170-
accepted_data.append(value)
182+
mvsd.add(record.value, record.count)
183+
accepted_data.append(record)
171184
# find the bucket this goes in
172-
if value < min_v or value > max_v:
173-
skipped +=1
185+
if record.value < min_v or record.value > max_v:
186+
skipped += record.count
174187
continue
175188
for bucket_postion, boundary in enumerate(boundaries):
176-
if value <= boundary:
177-
bucket_counts[bucket_postion] +=1
189+
if record.value <= boundary:
190+
bucket_counts[bucket_postion] += record.count
178191
break
179192

180193
# auto-pick the hash scale
@@ -185,7 +198,7 @@ def histogram(stream, options):
185198
if skipped:
186199
print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '')
187200
if options.mvsd:
188-
print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data))
201+
print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data, key=lambda x: x.value))
189202
print "# each ∎ represents a count of %d" % bucket_scale
190203
bucket_min = min_v
191204
bucket_max = min_v
@@ -202,6 +215,8 @@ def histogram(stream, options):
202215
if __name__ == "__main__":
203216
parser = OptionParser()
204217
parser.usage = "cat data | %prog [options]"
218+
parser.add_option("-a", "--agg", dest="agg", default=False, action="store_true",
219+
help="Two column input format, space seperated with key<space>value")
205220
parser.add_option("-m", "--min", dest="min",
206221
help="minimum value for graph")
207222
parser.add_option("-x", "--max", dest="max",
@@ -219,5 +234,5 @@ def histogram(stream, options):
219234
parser.print_usage()
220235
print "for more help use --help"
221236
sys.exit(1)
222-
histogram(load_stream(sys.stdin), options)
237+
histogram(load_stream(sys.stdin, options.agg), options)
223238

0 commit comments

Comments
 (0)