Skip to content

Commit d1fa038

Browse files
Lesson 05 updated
1 parent a1cdff9 commit d1fa038

File tree

2 files changed

+126
-162
lines changed

2 files changed

+126
-162
lines changed

Diff for: Lesson05/5Activity.ipynb renamed to Lesson05/Activity01.ipynb

+45-63
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,20 @@
1414
},
1515
"cells": [
1616
{
17+
"cell_type": "code",
1718
"metadata": {
1819
"id": "Y-IWQkicJBML",
1920
"colab_type": "code",
21+
"outputId": "c27555c3-d97c-4b30-85e6-056ee57b620f",
2022
"colab": {
2123
"base_uri": "https://localhost:8080/",
2224
"height": 71
23-
},
24-
"outputId": "c27555c3-d97c-4b30-85e6-056ee57b620f"
25+
}
2526
},
26-
"cell_type": "code",
2727
"source": [
2828
"!pip install pyspark"
2929
],
30-
"execution_count": 1,
30+
"execution_count": 0,
3131
"outputs": [
3232
{
3333
"output_type": "stream",
@@ -40,12 +40,12 @@
4040
]
4141
},
4242
{
43+
"cell_type": "code",
4344
"metadata": {
4445
"id": "HuUYUBZOKJmO",
4546
"colab_type": "code",
4647
"colab": {}
4748
},
48-
"cell_type": "code",
4949
"source": [
5050
"from pyspark import SparkContext\n",
5151
"sc = SparkContext()\n",
@@ -56,54 +56,36 @@
5656
"outputs": []
5757
},
5858
{
59+
"cell_type": "markdown",
5960
"metadata": {
6061
"id": "Fh9724PMLRzi",
6162
"colab_type": "text"
6263
},
63-
"cell_type": "markdown",
6464
"source": [
6565
""
6666
]
6767
},
6868
{
69+
"cell_type": "code",
6970
"metadata": {
7071
"id": "YTuBXHztKLlR",
7172
"colab_type": "code",
72-
"colab": {
73-
"base_uri": "https://localhost:8080/",
74-
"height": 438
75-
},
76-
"outputId": "6d8d25e9-d440-4c60-c78e-d48c3a754351"
73+
"colab": {}
7774
},
78-
"cell_type": "code",
7975
"source": [
8076
"from pyspark import SparkContext\n",
8177
"sc = SparkContext()"
8278
],
83-
"execution_count": 3,
84-
"outputs": [
85-
{
86-
"output_type": "error",
87-
"ename": "ValueError",
88-
"evalue": "ignored",
89-
"traceback": [
90-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
91-
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
92-
"\u001b[0;32m<ipython-input-3-33ce3f59c0b1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSparkContext\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSparkContext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
93-
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/pyspark/context.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)\u001b[0m\n\u001b[1;32m 113\u001b[0m \"\"\"\n\u001b[1;32m 114\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_callsite\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfirst_spark_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mCallSite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 115\u001b[0;31m \u001b[0mSparkContext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ensure_initialized\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgateway\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgateway\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 116\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,\n",
94-
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/pyspark/context.py\u001b[0m in \u001b[0;36m_ensure_initialized\u001b[0;34m(cls, instance, gateway, conf)\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[0;34m\" created by %s at %s:%s \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 313\u001b[0m % (currentAppName, currentMaster,\n\u001b[0;32m--> 314\u001b[0;31m callsite.function, callsite.file, callsite.linenum))\n\u001b[0m\u001b[1;32m 315\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0mSparkContext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_active_spark_context\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minstance\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
95-
"\u001b[0;31mValueError\u001b[0m: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at <ipython-input-2-817b62176d5d>:2 "
96-
]
97-
}
98-
]
79+
"execution_count": 0,
80+
"outputs": []
9981
},
10082
{
83+
"cell_type": "code",
10184
"metadata": {
10285
"id": "n4uh3owiKOxo",
10386
"colab_type": "code",
10487
"colab": {}
10588
},
106-
"cell_type": "code",
10789
"source": [
10890
"from pyspark.sql import SQLContext\n",
10991
"sqlc = SQLContext(sc)"
@@ -112,34 +94,34 @@
11294
"outputs": []
11395
},
11496
{
97+
"cell_type": "code",
11598
"metadata": {
11699
"id": "RYAvQpGuKV6j",
117100
"colab_type": "code",
118101
"colab": {}
119102
},
120-
"cell_type": "code",
121103
"source": [
122104
""
123105
],
124106
"execution_count": 0,
125107
"outputs": []
126108
},
127109
{
110+
"cell_type": "code",
128111
"metadata": {
129112
"id": "vkQ94MizKb-q",
130113
"colab_type": "code",
114+
"outputId": "0919208d-e967-4bee-b010-2347dbb885c1",
131115
"colab": {
132116
"base_uri": "https://localhost:8080/",
133117
"height": 204
134-
},
135-
"outputId": "0919208d-e967-4bee-b010-2347dbb885c1"
118+
}
136119
},
137-
"cell_type": "code",
138120
"source": [
139121
"df = sqlc.read.format('com.databricks.spark.csv').options(header = 'true', inferschema = 'true').load('iris.csv')\n",
140122
"df.show(5)\n"
141123
],
142-
"execution_count": 6,
124+
"execution_count": 0,
143125
"outputs": [
144126
{
145127
"output_type": "stream",
@@ -161,12 +143,12 @@
161143
]
162144
},
163145
{
146+
"cell_type": "code",
164147
"metadata": {
165148
"id": "ivN5jB6kKkL-",
166149
"colab_type": "code",
167150
"colab": {}
168151
},
169-
"cell_type": "code",
170152
"source": [
171153
"from pyspark.sql.functions import mean\n",
172154
"avg_sl = df.select(mean('Sepallength')).toPandas()['avg(Sepallength)']\n"
@@ -175,22 +157,22 @@
175157
"outputs": []
176158
},
177159
{
160+
"cell_type": "code",
178161
"metadata": {
179162
"id": "6dPaqDT5Kt-t",
180163
"colab_type": "code",
164+
"outputId": "4a87a76b-b7b7-4aad-c3d5-0de36350fcb0",
181165
"colab": {
182166
"base_uri": "https://localhost:8080/",
183167
"height": 136
184-
},
185-
"outputId": "4a87a76b-b7b7-4aad-c3d5-0de36350fcb0"
168+
}
186169
},
187-
"cell_type": "code",
188170
"source": [
189171
"y = df\n",
190172
"y = y.na.fill(float(avg_sl),['Sepallength'])\n",
191173
"y.describe().show(1)\n"
192174
],
193-
"execution_count": 9,
175+
"execution_count": 0,
194176
"outputs": [
195177
{
196178
"output_type": "stream",
@@ -208,12 +190,12 @@
208190
]
209191
},
210192
{
193+
"cell_type": "code",
211194
"metadata": {
212195
"id": "hZ3h4SwmKw5H",
213196
"colab_type": "code",
214197
"colab": {}
215198
},
216-
"cell_type": "code",
217199
"source": [
218200
"from pyspark.mllib.stat import Statistics\n",
219201
"import pandas as pd\n"
@@ -222,25 +204,25 @@
222204
"outputs": []
223205
},
224206
{
207+
"cell_type": "code",
225208
"metadata": {
226209
"id": "EYhtf-TDK2om",
227210
"colab_type": "code",
228211
"colab": {}
229212
},
230-
"cell_type": "code",
231213
"source": [
232214
"z = y.fillna(1)"
233215
],
234216
"execution_count": 0,
235217
"outputs": []
236218
},
237219
{
220+
"cell_type": "code",
238221
"metadata": {
239222
"id": "xTEVwpImK4S4",
240223
"colab_type": "code",
241224
"colab": {}
242225
},
243-
"cell_type": "code",
244226
"source": [
245227
"a = z.drop('Species') \n",
246228
"features = a.rdd.map(lambda row: row[0:])\n"
@@ -249,35 +231,35 @@
249231
"outputs": []
250232
},
251233
{
234+
"cell_type": "code",
252235
"metadata": {
253236
"id": "uHjpSzLhK6Hd",
254237
"colab_type": "code",
255238
"colab": {}
256239
},
257-
"cell_type": "code",
258240
"source": [
259241
"correlation_matrix = Statistics.corr(features, method=\"pearson\")"
260242
],
261243
"execution_count": 0,
262244
"outputs": []
263245
},
264246
{
247+
"cell_type": "code",
265248
"metadata": {
266249
"id": "90YKC2n5K61I",
267250
"colab_type": "code",
251+
"outputId": "fd7c04d5-ea85-4477-adbf-cd104606880e",
268252
"colab": {
269253
"base_uri": "https://localhost:8080/",
270254
"height": 173
271-
},
272-
"outputId": "fd7c04d5-ea85-4477-adbf-cd104606880e"
255+
}
273256
},
274-
"cell_type": "code",
275257
"source": [
276258
"correlation_df = pd.DataFrame(correlation_matrix)\n",
277259
"correlation_df.index, correlation_df.columns = a.columns, a.columns\n",
278260
"correlation_df\n"
279261
],
280-
"execution_count": 14,
262+
"execution_count": 0,
281263
"outputs": [
282264
{
283265
"output_type": "execute_result",
@@ -356,22 +338,22 @@
356338
]
357339
},
358340
{
341+
"cell_type": "code",
359342
"metadata": {
360343
"id": "u6lph3zyK8wZ",
361344
"colab_type": "code",
345+
"outputId": "8b987d54-97cc-4d61-cca5-9968fbbc020d",
362346
"colab": {
363347
"base_uri": "https://localhost:8080/",
364348
"height": 34
365-
},
366-
"outputId": "8b987d54-97cc-4d61-cca5-9968fbbc020d"
349+
}
367350
},
368-
"cell_type": "code",
369351
"source": [
370352
"import pandas as pd\n",
371353
"dat = y.toPandas()\n",
372354
"type(dat)\n"
373355
],
374-
"execution_count": 15,
356+
"execution_count": 0,
375357
"outputs": [
376358
{
377359
"output_type": "execute_result",
@@ -388,24 +370,24 @@
388370
]
389371
},
390372
{
373+
"cell_type": "code",
391374
"metadata": {
392375
"id": "Wgsl7lqvK-y_",
393376
"colab_type": "code",
377+
"outputId": "fc9658c0-cbb9-4e37-cd2f-26b529a82c00",
394378
"colab": {
395379
"base_uri": "https://localhost:8080/",
396380
"height": 369
397-
},
398-
"outputId": "fc9658c0-cbb9-4e37-cd2f-26b529a82c00"
381+
}
399382
},
400-
"cell_type": "code",
401383
"source": [
402384
"import matplotlib.pyplot as plt\n",
403385
"import seaborn as sns\n",
404386
"%matplotlib inline\n",
405387
"sns.lmplot(x = \"Sepallength\", y = \"Petallength\", data = dat)\n",
406388
"plt.show()\n"
407389
],
408-
"execution_count": 16,
390+
"execution_count": 0,
409391
"outputs": [
410392
{
411393
"output_type": "display_data",
@@ -422,22 +404,22 @@
422404
]
423405
},
424406
{
407+
"cell_type": "code",
425408
"metadata": {
426409
"id": "I9llgXa5LB3Y",
427410
"colab_type": "code",
411+
"outputId": "192b63b3-db45-4b3d-b880-8adf67cc5dbf",
428412
"colab": {
429413
"base_uri": "https://localhost:8080/",
430414
"height": 369
431-
},
432-
"outputId": "192b63b3-db45-4b3d-b880-8adf67cc5dbf"
415+
}
433416
},
434-
"cell_type": "code",
435417
"source": [
436418
"import seaborn as sns\n",
437419
"sns.lmplot(x = \"Sepallength\", y = \"Petalwidth\", data = dat)\n",
438420
"plt.show()\n"
439421
],
440-
"execution_count": 17,
422+
"execution_count": 0,
441423
"outputs": [
442424
{
443425
"output_type": "display_data",
@@ -454,21 +436,21 @@
454436
]
455437
},
456438
{
439+
"cell_type": "code",
457440
"metadata": {
458441
"id": "-0P4bwUoLDWh",
459442
"colab_type": "code",
443+
"outputId": "85034f66-1b36-4903-e097-a7075bb34eb8",
460444
"colab": {
461445
"base_uri": "https://localhost:8080/",
462446
"height": 369
463-
},
464-
"outputId": "85034f66-1b36-4903-e097-a7075bb34eb8"
447+
}
465448
},
466-
"cell_type": "code",
467449
"source": [
468450
"sns.lmplot(x = \"Petallength\", y = \"Petalwidth\", data = dat)\n",
469451
"plt.show()\n"
470452
],
471-
"execution_count": 18,
453+
"execution_count": 0,
472454
"outputs": [
473455
{
474456
"output_type": "display_data",

0 commit comments

Comments
 (0)