
Commit d78e5c5 ("first commit")
Committed Mar 9, 2023 · 0 parents

9 files changed: +294 additions, 0 deletions
 

‎.devcontainer/devcontainer.json

+24
@@ -0,0 +1,24 @@
{
    "name": "PySpark local cluster",
    "dockerComposeFile": ["./docker-compose.yaml"],
    "service": "spark",
    "workspaceFolder": "/home/jovyan/code",

    "customizations": {
        "vscode": {
            "settings": {
                "terminal.integrated.profiles.linux": {
                    "bash": {
                        "path": "/bin/bash"
                    }
                },
                "terminal.integrated.defaultProfile.linux": "bash",
                "python.linting.enabled": true,
                "python.linting.pylintEnabled": true
            },
            "extensions": [
                "ms-python.python"
            ]
        }
    }
}
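
The settings above assume the repo is mounted at the path given by "workspaceFolder"; the compose file below provides that mount. As a quick sanity check (a minimal sketch, not part of the committed files), you can confirm the mount from a terminal or notebook cell inside the container:

```python
# Sketch, not part of this commit: confirm the workspace mount matches
# devcontainer.json's "workspaceFolder" once the container is running.
import os

workspace = "/home/jovyan/code"
print(os.path.isdir(workspace))       # expected: True
print(sorted(os.listdir(workspace)))  # should include README.md, test.ipynb
```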

‎.devcontainer/docker-compose.yaml

+9
@@ -0,0 +1,9 @@
version: '3'
services:
  spark:
    image: jupyter/pyspark-notebook
    volumes:
      - ..:/home/jovyan/code
    ports:
      - "8888:8888"
    command: start.sh jupyter notebook --NotebookApp.token='' --NotebookApp.disable_check_xsrf=true --NotebookApp.allow_origin='*' --NotebookApp.ip='0.0.0.0'
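
With the token disabled and port 8888 published, the Jupyter server should be reachable from the host. A minimal sketch of a reachability check, assuming the requests package (which ships in the jupyter/pyspark-notebook image); not part of this commit:

```python
# Sketch, not part of this commit: ping the Jupyter server from the host
# once the container is up. GET /api returns the server version as JSON.
import requests

resp = requests.get("http://localhost:8888/api", timeout=5)
print(resp.status_code)  # expected: 200
print(resp.json())       # e.g. {"version": "..."}
```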

‎.gitattributes

+1
@@ -0,0 +1 @@
* text=auto

‎.gitignore

+160
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

‎README.md

+41
@@ -0,0 +1,41 @@
# Local PySpark dev environment

This repo provides everything needed for a self-contained, local, single-node PySpark "cluster" running on your laptop, including a Jupyter notebook environment.

It uses [Visual Studio Code](https://code.visualstudio.com/) and the [devcontainer feature](https://code.visualstudio.com/docs/devcontainers/containers) to run the Spark/Jupyter server in Docker, connected to a VS Code dev environment frontend.

## Requirements

- Install [Docker Desktop](https://www.docker.com/products/docker-desktop/) (you don't have to be a Docker super-expert :-))
- Install [Visual Studio Code](https://code.visualstudio.com/download)
- Install the [VS Code Remote Development pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack)

## Setup

1. Install the required tools
1. Clone this repo to your laptop
1. Open the local repo folder in VS Code
1. Open the [VS Code command palette](https://code.visualstudio.com/docs/getstarted/userinterface#_command-palette) and select/type 'Reopen in Container'
1. Wait while the devcontainer is built and initialized; this may take several minutes
1. Open [test.ipynb](./test.ipynb) in VS Code
1. If you get an HTTP warning, click 'Yes'

   ![HTTP warning](./media/http_warning.png)

1. Wait a few moments for the Jupyter kernel to initialize. If the button in the upper right still says 'Select Kernel' after about 30 seconds, click it and select the option with 'ipykernel'

   ![Choose kernel](./media/select_kernel.png)

   ![ipykernel](./media/ipykernel.png)

1. Run each cell in the notebook, in order, and check the output of cell 3 (a quick extra sanity check is sketched after this section)
1. Have fun exploring [PySpark](https://sparkbyexamples.com/pyspark-tutorial/)!
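
Once the kernel is running, the committed notebook builds the SparkSession for you; if you want an independent sanity check, here is a minimal sketch you could paste into a fresh cell (not part of the committed files):

```python
# Sketch, not part of this commit: confirm Spark works inside the container.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("smoke-test").getOrCreate()
print(spark.version)           # Spark version bundled with the image
print(spark.range(5).count())  # expected: 5
```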

‎media/http_warning.png

40 KB

‎media/ipykernel.png

89.5 KB

‎media/select_kernel.png

99.7 KB

‎test.ipynb

+59
@@ -0,0 +1,59 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Import SparkSession\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "# Create SparkSession \n",
    "spark = SparkSession.builder \\\n",
    "    .master(\"local[1]\") \\\n",
    "    .appName(\"test\") \\\n",
    "    .getOrCreate()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "data = [\n",
    "    ('James','','Smith','1991-04-01','M',3000),\n",
    "    ('Michael','Rose','','2000-05-19','M',4000),\n",
    "    ('Robert','','Williams','1978-09-05','M',4000),\n",
    "    ('Maria','Anne','Jones','1967-12-01','F',4000),\n",
    "    ('Jen','Mary','Brown','1980-02-17','F',-1)\n",
    "]\n",
    "\n",
    "columns = [\"firstname\",\"middlename\",\"lastname\",\"dob\",\"gender\",\"salary\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.createDataFrame(data=data, schema = columns)\n",
    "\n",
    "df.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
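
Cell 3 above prints the five-row DataFrame via df.show(). If you want to go one step beyond the committed notebook, here is a hedged sketch of a follow-up cell (not part of this commit) that reuses the same df:

```python
# Sketch, not part of this commit: average salary by gender, skipping the
# sentinel -1 row. Assumes `df` from the notebook cells above.
from pyspark.sql import functions as F

(df.filter(F.col("salary") > 0)
   .groupBy("gender")
   .agg(F.avg("salary").alias("avg_salary"))
   .show())
```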
