Skip to content

Commit 1f797ff

Browse files
committed
Add support for importable histogram print function
Running histogram.py on a data file is cool, but most of my data is kept in local variables in scripts. Introduce a print_histogram function which accepts a list of samples and prints the same histogram that would be produced if the list were exported to a file and fed to histogram.py. Issue: #37 Signed-off-by: Gal Pressman <[email protected]>
1 parent f5e933f commit 1f797ff

File tree

2 files changed

+30
-20
lines changed

2 files changed

+30
-20
lines changed

data_hacks/__init__.py

Whitespace-only changes.

data_hacks/histogram.py

+30-20
Original file line numberDiff line numberDiff line change
@@ -121,27 +121,29 @@ def test_median():
121121
assert "4.50" == "%.2f" % median([4.0, 5, 2, 1, 9, 10])
122122

123123

124-
def histogram(stream, options):
124+
def _histogram(stream, minimum=None, maximum=None, num_buckets=None, logscale=False,
125+
custbuckets=None, calc_mvsd=True,
126+
bucket_format='%10.4f', calc_percentage=False, dot='∎'):
125127
"""
126128
Loop over the stream and add each entry to the dataset, printing out at the
127129
end.
128130
129131
stream yields Decimal()
130132
"""
131-
if not options.min or not options.max:
133+
if not minimum or not maximum:
132134
# glob the iterator here so we can do min/max on it
133135
data = list(stream)
134136
else:
135137
data = stream
136138
bucket_scale = 1
137139

138-
if options.min:
139-
min_v = Decimal(options.min)
140+
if minimum:
141+
min_v = Decimal(minimum)
140142
else:
141143
min_v = min(data, key=lambda x: x.value)
142144
min_v = min_v.value
143-
if options.max:
144-
max_v = Decimal(options.max)
145+
if maximum:
146+
max_v = Decimal(maximum)
145147
else:
146148
max_v = max(data, key=lambda x: x.value)
147149
max_v = max_v.value
@@ -151,11 +153,9 @@ def histogram(stream, options):
151153
diff = max_v - min_v
152154

153155
boundaries = []
154-
bucket_counts = []
155-
buckets = 0
156156

157-
if options.custbuckets:
158-
bound = options.custbuckets.split(',')
157+
if custbuckets:
158+
bound = custbuckets.split(',')
159159
bound_sort = sorted(map(Decimal, bound))
160160

161161
# if the last value is smaller than the maximum, replace it
@@ -174,8 +174,8 @@ def histogram(stream, options):
174174
# so no need to do a -1!
175175
bucket_counts = [0 for x in range(len(boundaries))]
176176
buckets = len(boundaries)
177-
elif options.logscale:
178-
buckets = options.buckets and int(options.buckets) or 10
177+
elif logscale:
178+
buckets = num_buckets and int(num_buckets) or 10
179179
if buckets <= 0:
180180
raise ValueError('# of buckets must be > 0')
181181

@@ -202,7 +202,7 @@ def log_steps(k, n):
202202
for step in log_steps(buckets, diff):
203203
boundaries.append(min_v + step)
204204
else:
205-
buckets = options.buckets and int(options.buckets) or 10
205+
buckets = num_buckets and int(num_buckets) or 10
206206
if buckets <= 0:
207207
raise ValueError('# of buckets must be > 0')
208208
step = diff / buckets
@@ -216,7 +216,7 @@ def log_steps(k, n):
216216
accepted_data = []
217217
for record in data:
218218
samples += record.count
219-
if options.mvsd:
219+
if calc_mvsd:
220220
mvsd.add(record.value, record.count)
221221
accepted_data.append(record)
222222
# find the bucket this goes in
@@ -237,29 +237,39 @@ def log_steps(k, n):
237237
if skipped:
238238
print("# %d value%s outside of min/max" %
239239
(skipped, skipped > 1 and 's' or ''))
240-
if options.mvsd:
240+
if calc_mvsd:
241241
print("# Mean = %f; Variance = %f; SD = %f; Median %f" %
242242
(mvsd.mean(), mvsd.var(), mvsd.sd(),
243243
median(accepted_data, key=lambda x: x.value)))
244-
print "# each " + options.dot + " represents a count of %d" % bucket_scale
245-
bucket_min = min_v
244+
print "# each " + dot + " represents a count of %d" % bucket_scale
246245
bucket_max = min_v
247246
percentage = ""
248-
format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s'
247+
format_string = bucket_format + ' - ' + bucket_format + ' [%6d]: %s%s'
249248
for bucket in range(buckets):
250249
bucket_min = bucket_max
251250
bucket_max = boundaries[bucket]
252251
bucket_count = bucket_counts[bucket]
253252
star_count = 0
254253
if bucket_count:
255254
star_count = bucket_count / bucket_scale
256-
if options.percentage:
255+
if calc_percentage:
257256
percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) /
258257
Decimal(samples))
259-
print format_string % (bucket_min, bucket_max, bucket_count, options.dot *
258+
print format_string % (bucket_min, bucket_max, bucket_count, dot *
260259
star_count, percentage)
261260

262261

262+
def histogram(stream, options):
    """
    Print a histogram of stream, configured from an options object.

    Thin adapter around _histogram: each attribute read from options
    (min, max, buckets, logscale, custbuckets, mvsd, format, percentage,
    dot) is forwarded to the correspondingly named keyword parameter.
    NOTE(review): options is presumably the parsed command-line options
    object -- confirm against the caller.
    """
    _histogram(
        stream,
        minimum=options.min,
        maximum=options.max,
        num_buckets=options.buckets,
        logscale=options.logscale,
        custbuckets=options.custbuckets,
        calc_mvsd=options.mvsd,
        bucket_format=options.format,
        calc_percentage=options.percentage,
        dot=options.dot
    )
266+
267+
268+
def print_histogram(samples, **kwargs):
    """
    Print a histogram for an in-memory collection of samples.

    Every sample is converted with str() and the resulting list is handed
    to load_stream (with both of its remaining arguments set to False);
    the result is passed to _histogram. Any keyword arguments are forwarded
    to _histogram unchanged.
    """
    text_samples = list(map(str, samples))
    records = load_stream(text_samples, False, False)
    _histogram(records, **kwargs)
271+
272+
263273
if __name__ == "__main__":
264274
parser = OptionParser()
265275
parser.usage = "cat data | %prog [options]"

0 commit comments

Comments
 (0)