
Commit fe6752c

benchmark: put downloaded sites into a configurable subdir
1 parent 2e5cce1 commit fe6752c


4 files changed: +73, -34 lines


splash/benchmark/README.rst

+2 -2
@@ -3,7 +3,7 @@ This directory contains a preliminary version of splash benchmark suite.
 To use it, do the following:
 
 - install ``httrack``
-- create a directory for downloaded files, e.g. ``files``
-- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark
+- run ``python download_sites.py``, it will create ``sites`` subdirectory in
+  current directory and download sites to be used in the benchmark there
 - run ``python benchmark.py`` to run the benchmark
 
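
A quick way to confirm the new layout after following the steps above: ``download_sites.py`` leaves the mirrored pages under ``sites/localhost_8806/``, which is exactly the pattern ``benchmark.py`` globs for. A minimal sketch, assuming the scripts were run from this directory with the default ``--sites-dir``:

    # Hypothetical sanity check, not part of the suite: list the pages that
    # benchmark.py will pick up with the default --sites-dir of 'sites'.
    import os
    from glob import glob

    pages = glob(os.path.join('sites', 'localhost_8806', '*.html'))
    print('\n'.join(pages) if pages else 'no pages downloaded yet')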

splash/benchmark/benchmark.py

+14 -4
@@ -17,6 +17,7 @@
 from multiprocessing.pool import ThreadPool
 from pprint import pformat
 from time import time
+import re
 
 import requests
 from splash.benchmark.file_server import serve_files
@@ -65,10 +66,9 @@ def make_render_png_lua_req(splash, params):
 
 #: Port at which static pages will be served.
 PORT = 8806
-#: Static pages to be used in the benchmark.
-PAGES = glob('localhost_8806/*.html')
 #: Combinations of width & height to test.
 WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
+#: Splash log filename.
 SPLASH_LOG = 'splash.log'
 #: This script is used to collect maxrss & cpu time from splash process.
 GET_PERF_STATS_SCRIPT = """
@@ -85,14 +85,24 @@ def make_render_png_lua_req(splash, params):
                     help='Request thread count')
 parser.add_argument('--request-count', type=int, default=10,
                     help='Benchmark request count')
+parser.add_argument('--sites-dir', type=str, default='sites',
+                    help='Directory with downloaded sites')
 
 
 def generate_requests(splash, args):
     log = logging.getLogger('generate_requests')
     log.info("Using pRNG seed: %s", args.seed)
+
+    # Static pages (relative to sites_dir) to be used in the benchmark.
+    pages = [re.sub('^%s/' % args.sites_dir, '', v)
+             for v in glob(os.path.join(args.sites_dir, 'localhost_8806',
+                                        '*.html'))]
+    for p in pages:
+        log.info("Using page for benchmark: %s", p)
+
     rng = random.Random(args.seed)
     for i in xrange(args.request_count):
-        page = rng.choice(PAGES)
+        page = rng.choice(pages)
         width, height = rng.choice(WIDTH_HEIGHT)
         req_factory = rng.choice(REQ_FACTORIES)
         url = 'http://localhost:%d/%s' % (PORT, page)
@@ -140,7 +150,7 @@ def main():
                         '--disable-xvfb',
                         '--max-timeout=600'])
 
-    with splash, serve_files(PORT):
+    with splash, serve_files(PORT, args.sites_dir):
         start_time = time()
         results = parallel_map(invoke_request, generate_requests(splash, args),
                                args.thread_count)
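
For reference, here is what the new page selection in ``generate_requests`` amounts to, pulled out as a standalone sketch (the values are assumptions: the default ``--sites-dir`` of ``sites`` and the module's ``PORT`` of 8806). The ``--sites-dir`` prefix is stripped from each globbed path so that the remainder can be used as a URL path on the local file server:

    # Standalone sketch of the page-to-URL mapping (assumed default values).
    import os
    import re
    from glob import glob

    sites_dir = 'sites'   # --sites-dir default in benchmark.py
    port = 8806           # PORT in benchmark.py

    pages = [re.sub('^%s/' % sites_dir, '', path)
             for path in glob(os.path.join(sites_dir, 'localhost_8806', '*.html'))]
    urls = ['http://localhost:%d/%s' % (port, page) for page in pages]
    print('\n'.join(urls))

The prefix stripping assumes '/'-separated paths; ``os.path.relpath(path, sites_dir)`` would give the same relative path in a platform-neutral way.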

splash/benchmark/download_sites.py

mode changed 100644 → 100755
+39 -23
@@ -1,4 +1,13 @@
+#!/usr/bin/env python
+
+"""
+Site downloader script for Splash benchmark suite.
+"""
+
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+import errno
 import json
+import os
 import re
 import subprocess
 from urlparse import urlsplit
@@ -9,7 +18,7 @@
 from splash.benchmark.file_server import serve_files
 from splash.tests.stress import lua_runonce
 
-script_html = """
+SCRIPT_HTML = """
 function main(splash)
     splash:set_images_enabled(false)
     splash:go(splash.args.url)
@@ -18,24 +27,19 @@
 end
 """
 
-script_png = """
-
-function main(splash)
-    splash:go(splash.args.url)
-    splash:wait(0.5)
-    return splash:png()
-end
-"""
-
-
+#: This UA is used by httrack to mimic Splash requests when downloading sites.
 USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34"
 
-
 PORT = 8806
 
+parser = ArgumentParser(description=__doc__,
+                        formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--sites-dir', default='sites',
+                    help='Directory for downloaded sites')
 
-def preprocess_main_page(url):
-    out = json.loads(lua_runonce(script_html, url=url,
+
+def preprocess_main_page(sites_dir, url):
+    out = json.loads(lua_runonce(SCRIPT_HTML, url=url,
                                  splash_args=['--disable-lua-sandbox',
                                               '--disable-xvfb',
                                               '--max-timeout=600'],
@@ -56,13 +60,13 @@ def preprocess_main_page(url):
     out['html'] = html.tostring(root, encoding='utf-8',
                                 doctype='<!DOCTYPE html>')
     filename = re.sub(r'[^\w]+', '_', url) + '.html'
-    with open(filename, 'w') as f:
+    with open(os.path.join(sites_dir, filename), 'w') as f:
         f.write(out['html'])
     return filename
 
 
-def download_sites(sites):
-    local_files = [preprocess_main_page(s) for s in sites]
+def download_sites(sites_dir, sites):
+    local_files = [preprocess_main_page(sites_dir, s) for s in sites]
 
     local_urls = [
         'http://localhost:%(port)d/%(filename)s' % {
@@ -75,12 +79,20 @@ def download_sites(sites):
            '-%P',  # Try parsing links in non-href/src sections
            '-F', USERAGENT,  # Emulate splash UA
            '--depth=1']
-    subprocess.check_call(['httrack'] + args + local_urls)
-
-
-if __name__ == '__main__':
-    with serve_files(PORT):
-        download_sites([
+    subprocess.check_call(['httrack'] + args + local_urls, cwd=sites_dir)
+
+
+def main():
+    args = parser.parse_args()
+    try:
+        os.makedirs(args.sites_dir)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+        elif not os.path.isdir(args.sites_dir):
+            raise RuntimeError("Not a directory: %s" % args.sites_dir)
+    with serve_files(PORT, args.sites_dir):
+        download_sites(args.sites_dir, [
             'http://www.wikipedia.org',
             'http://www.google.com',
             'http://www.reddit.com',
@@ -89,3 +101,7 @@ def download_sites(sites):
            # "http://blog.pinterest.com",
            # "http://imgur.com",
         ])
+
+
+if __name__ == '__main__':
+    main()
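
The directory handling added in ``main()`` is the usual Python 2 create-if-missing idiom (the script targets Python 2, as the ``urlparse`` import shows). Shown below as a standalone helper; the helper name is illustrative and not part of the suite:

    # Illustrative helper mirroring main()'s directory handling: create the
    # sites directory, tolerate it already existing, refuse a non-directory.
    import errno
    import os


    def ensure_sites_dir(path):  # hypothetical name, not in the suite
        try:
            os.makedirs(path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
            elif not os.path.isdir(path):
                raise RuntimeError("Not a directory: %s" % path)

On Python 3.2+ the create-if-missing part can be written directly as ``os.makedirs(path, exist_ok=True)``.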

splash/benchmark/file_server.py

mode changed 100644 → 100755
+18 -5
@@ -1,10 +1,22 @@
+#!/usr/bin/env python
+
+"""
+Simple static file server.
+"""
+
+import argparse
+import os
 import SimpleHTTPServer
 import SocketServer
 import subprocess
-import sys
 from contextlib import contextmanager
 
 
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument('port', type=int, help='Port number to listen at')
+parser.add_argument('directory', type=str, help='Directory to serve')
+
+
 class ReusingTCPServer(SocketServer.TCPServer):
     allow_reuse_address = True
 
@@ -15,17 +27,18 @@ def address_string(self):
 
 
 @contextmanager
-def serve_files(port):
+def serve_files(port, directory):
     """Serve files from current directory statically in a subprocess."""
     site_server = subprocess.Popen(['python', '-m', __name__,
-                                    str(port)])
+                                    str(port), directory])
     try:
         yield
     finally:
         site_server.terminate()
 
 
 if __name__ == '__main__':
-    port = int(sys.argv[1])
-    server = ReusingTCPServer(("", port), RequestHandler)
+    args = parser.parse_args()
+    os.chdir(args.directory)
+    server = ReusingTCPServer(("", args.port), RequestHandler)
     server.serve_forever()
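
For context, a minimal usage sketch of the updated context manager (assumptions: the ``splash`` package is importable, a ``sites`` directory exists, and port 8806 is free). Files under the given directory become reachable at ``http://localhost:<port>/<relative path>`` while the block is active:

    # Minimal usage sketch under the assumptions above; Python 2, like the suite.
    import time
    import urllib2

    from splash.benchmark.file_server import serve_files

    with serve_files(8806, 'sites'):
        time.sleep(1)  # the server runs in a subprocess; give it a moment to bind
        listing = urllib2.urlopen('http://localhost:8806/').read()
        print(listing[:200])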
