diff --git a/data_hacks/__init__.py b/data_hacks/__init__.py new file mode 100644 index 0000000..8a7df13 --- /dev/null +++ b/data_hacks/__init__.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import +import sys +from data_hacks.bar_chart import run as bar_chart +from data_hacks import histogram as hist +from data_hacks.ninety_five_percent import run as ninety_five_percent +from data_hacks.sample import run as sample + + +class BarChartOpt(object): + + def __init__( + self, agg_value_key=False, agg_key_value=False, sort_keys=True, + sort_values=False, reverse_sort=False, numeric_sort=False, + percentage=False, dot="∎"): + self.agg_value_key = agg_value_key + self.agg_key_value = agg_key_value + self.sort_keys = sort_keys + self.sort_values = sort_values + self.reverse_sort = reverse_sort + self.numeric_sort = numeric_sort + self.percentage = percentage + self.dot = dot + + +class HistogramOpt(object): + + def __init__( + self, agg_value_key=False, agg_key_value=False, min=None, + max=None, buckets=None, logscale=False, custbuckets=None, + mvsd=True, format="%10.4f", percentage=False, dot="∎"): + self.agg_value_key = agg_value_key + self.agg_key_value = agg_key_value + self.min = min + self.max = max + self.buckets = buckets + self.logscale = logscale + self.custbuckets = custbuckets + self.mvsd = mvsd + self.format = format + self.percentage = percentage + self.dot = dot + + +def histogram(stream, options, output=sys.stdout): + hist.histogram(hist.load_stream( + stream, options.agg_value_key, options.agg_key_value), options, output) diff --git a/data_hacks/bar_chart.py b/data_hacks/bar_chart.py index c68af8b..2d01ca0 100755 --- a/data_hacks/bar_chart.py +++ b/data_hacks/bar_chart.py @@ -20,6 +20,7 @@ https://github.com/bitly/data_hacks """ +from __future__ import print_function import sys import math from collections import defaultdict @@ -37,7 +38,7 @@ def load_stream(input_stream): if clean_line: yield clean_line -def run(input_stream, options): +def run(input_stream, options, output=sys.stdout): data = defaultdict(int) total = 0 for row in input_stream: @@ -54,20 +55,22 @@ def run(input_stream, options): else: data[row] += 1 total += 1 - + if not data: - print "Error: no data" + print("Error: no data", file=output) sys.exit(1) - + max_length = max([len(key) for key in data.keys()]) max_length = min(max_length, 50) value_characters = 80 - max_length max_value = max(data.values()) scale = int(math.ceil(float(max_value) / value_characters)) scale = max(1, scale) - - print "# each " + options.dot + " represents a count of %d. total %d" % (scale, total) - + + print( + "# each " + options.dot + " represents a count of %d. total %d" % (scale, total), + file=output) + if options.sort_values: data = [[value, key] for key, value in data.items()] data.sort(key=lambda x: x[0], reverse=options.reverse_sort) @@ -79,13 +82,15 @@ def run(input_stream, options): data.sort(key=lambda x: (Decimal(x[1])), reverse=options.reverse_sort) else: data.sort(key=lambda x: x[1], reverse=options.reverse_sort) - + str_format = "%" + str(max_length) + "s [%6d] %s%s" percentage = "" for value, key in data: if options.percentage: percentage = " (%0.2f%%)" % (100 * Decimal(value) / Decimal(total)) - print str_format % (key[:max_length], value, (value / scale) * options.dot, percentage) + print( + str_format % (key[:max_length], value, (value / scale) * options.dot, percentage), + file=output) if __name__ == "__main__": parser = OptionParser() @@ -107,10 +112,9 @@ def run(input_stream, options): parser.add_option("--dot", dest="dot", default='∎', help="Dot representation") (options, args) = parser.parse_args() - + if sys.stdin.isatty(): parser.print_usage() - print "for more help use --help" + print("for more help use --help") sys.exit(1) run(load_stream(sys.stdin), options) - diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py index 3d16cc8..f4db5a8 100755 --- a/data_hacks/histogram.py +++ b/data_hacks/histogram.py @@ -24,6 +24,7 @@ https://github.com/bitly/data_hacks """ +from __future__ import print_function import sys from decimal import Decimal import logging @@ -97,7 +98,7 @@ def load_stream(input_stream, agg_value_key, agg_key_value): yield DataPoint(Decimal(clean_line), 1) except: logging.exception('failed %r', line) - print >>sys.stderr, "invalid line %r" % line + print("invalid line %r" % line, file=sys.stderr) def median(values, key=None): @@ -121,7 +122,7 @@ def test_median(): assert "4.50" == "%.2f" % median([4.0, 5, 2, 1, 9, 10]) -def histogram(stream, options): +def histogram(stream, options, output=sys.stdout): """ Loop over the stream and add each entry to the dataset, printing out at the end. @@ -233,15 +234,17 @@ def log_steps(k, n): bucket_scale = int(max(bucket_counts) / 75) print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" % - (samples, min_v, max_v)) + (samples, min_v, max_v), file=output) if skipped: print("# %d value%s outside of min/max" % - (skipped, skipped > 1 and 's' or '')) + (skipped, skipped > 1 and 's' or ''), file=output) if options.mvsd: print("# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), - median(accepted_data, key=lambda x: x.value))) - print "# each " + options.dot + " represents a count of %d" % bucket_scale + median(accepted_data, key=lambda x: x.value)), file=output) + print( + "# each " + options.dot + " represents a count of %d" % bucket_scale, + file=output) bucket_min = min_v bucket_max = min_v percentage = "" @@ -256,8 +259,8 @@ def log_steps(k, n): if options.percentage: percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples)) - print format_string % (bucket_min, bucket_max, bucket_count, options.dot * - star_count, percentage) + print(format_string % (bucket_min, bucket_max, bucket_count, options.dot * + star_count, percentage), file=output) if __name__ == "__main__": @@ -294,7 +297,7 @@ def log_steps(k, n): if sys.stdin.isatty(): # if isatty() that means it's run without anything piped into it parser.print_usage() - print "for more help use --help" + print("for more help use --help") sys.exit(1) histogram(load_stream(sys.stdin, options.agg_value_key, options.agg_key_value), options) diff --git a/data_hacks/ninety_five_percent.py b/data_hacks/ninety_five_percent.py index 1410227..0394aae 100755 --- a/data_hacks/ninety_five_percent.py +++ b/data_hacks/ninety_five_percent.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# +# # Copyright 2010 Bitly # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -20,14 +20,15 @@ https://github.com/bitly/data_hacks """ +from __future__ import print_function import sys import os from decimal import Decimal -def run(): +def run(stream=sys.stdin, output=sys.stdout): count = 0 data = {} - for line in sys.stdin: + for line in stream: line = line.strip() if not line: # skip empty lines (ie: newlines) @@ -37,9 +38,9 @@ def run(): count +=1 data[t] = data.get(t, 0) + 1 except: - print >>sys.stderr, "invalid line %r" % line - print calc_95(data, count) - + print("invalid line %r" % line, file=sys.stderr) + print(calc_95(data, count), file=output) + def calc_95(data, count): # find the time it took for x entry, where x is the threshold threshold = Decimal(count) * Decimal('.95') @@ -54,6 +55,6 @@ def calc_95(data, count): if __name__ == "__main__": if sys.stdin.isatty() or '--help' in sys.argv or '-h' in sys.argv: - print "Usage: cat data | %s" % os.path.basename(sys.argv[0]) + print("Usage: cat data | %s" % os.path.basename(sys.argv[0])) sys.exit(1) run() diff --git a/data_hacks/sample.py b/data_hacks/sample.py index c3296ab..9206edb 100755 --- a/data_hacks/sample.py +++ b/data_hacks/sample.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# +# # Copyright 2010 Bitly # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -20,16 +20,16 @@ https://github.com/bitly/data_hacks """ +from __future__ import print_function import sys import random from optparse import OptionParser from decimal import Decimal -def run(sample_rate): - input_stream = sys.stdin +def run(sample_rate, input_stream=sys.stdin, output=sys.stdout): for line in input_stream: if random.randint(1,100) <= sample_rate: - sys.stdout.write(line) + output.write(line) def get_sample_rate(rate_string): """ return a rate as a percentage""" @@ -49,17 +49,17 @@ def get_sample_rate(rate_string): parser = OptionParser(usage="cat data | %prog [options] [sample_rate]") parser.add_option("--verbose", dest="verbose", default=False, action="store_true") (options, args) = parser.parse_args() - + if not args or sys.stdin.isatty(): parser.print_usage() sys.exit(1) - + try: sample_rate = get_sample_rate(sys.argv[-1]) except ValueError, e: - print >>sys.stderr, e + print(e, file=sys.stderr) parser.print_usage() sys.exit(1) if options.verbose: - print >>sys.stderr, "Sample rate is %d%%" % sample_rate + print("Sample rate is %d%%" % sample_rate, file=sys.stderr) run(sample_rate)