|
14 | 14 | },
|
15 | 15 | "cells": [
|
16 | 16 | {
|
| 17 | + "cell_type": "code", |
17 | 18 | "metadata": {
|
18 | 19 | "id": "Y-IWQkicJBML",
|
19 | 20 | "colab_type": "code",
|
| 21 | + "outputId": "c27555c3-d97c-4b30-85e6-056ee57b620f", |
20 | 22 | "colab": {
|
21 | 23 | "base_uri": "https://localhost:8080/",
|
22 | 24 | "height": 71
|
23 |
| - }, |
24 |
| - "outputId": "c27555c3-d97c-4b30-85e6-056ee57b620f" |
| 25 | + } |
25 | 26 | },
|
26 |
| - "cell_type": "code", |
27 | 27 | "source": [
|
28 | 28 | "!pip install pyspark"
|
29 | 29 | ],
|
30 |
| - "execution_count": 1, |
| 30 | + "execution_count": 0, |
31 | 31 | "outputs": [
|
32 | 32 | {
|
33 | 33 | "output_type": "stream",
|
|
40 | 40 | ]
|
41 | 41 | },
|
42 | 42 | {
|
| 43 | + "cell_type": "code", |
43 | 44 | "metadata": {
|
44 | 45 | "id": "HuUYUBZOKJmO",
|
45 | 46 | "colab_type": "code",
|
46 | 47 | "colab": {}
|
47 | 48 | },
|
48 |
| - "cell_type": "code", |
49 | 49 | "source": [
|
50 | 50 | "from pyspark import SparkContext\n",
|
51 | 51 | "sc = SparkContext()\n",
|
|
56 | 56 | "outputs": []
|
57 | 57 | },
|
58 | 58 | {
|
| 59 | + "cell_type": "markdown", |
59 | 60 | "metadata": {
|
60 | 61 | "id": "Fh9724PMLRzi",
|
61 | 62 | "colab_type": "text"
|
62 | 63 | },
|
63 |
| - "cell_type": "markdown", |
64 | 64 | "source": [
|
65 | 65 | ""
|
66 | 66 | ]
|
67 | 67 | },
|
68 | 68 | {
|
| 69 | + "cell_type": "code", |
69 | 70 | "metadata": {
|
70 | 71 | "id": "YTuBXHztKLlR",
|
71 | 72 | "colab_type": "code",
|
72 |
| - "colab": { |
73 |
| - "base_uri": "https://localhost:8080/", |
74 |
| - "height": 438 |
75 |
| - }, |
76 |
| - "outputId": "6d8d25e9-d440-4c60-c78e-d48c3a754351" |
| 73 | + "colab": {} |
77 | 74 | },
|
78 |
| - "cell_type": "code", |
79 | 75 | "source": [
|
80 | 76 | "from pyspark import SparkContext\n",
|
81 | 77 | "sc = SparkContext()"
|
82 | 78 | ],
|
83 |
| - "execution_count": 3, |
84 |
| - "outputs": [ |
85 |
| - { |
86 |
| - "output_type": "error", |
87 |
| - "ename": "ValueError", |
88 |
| - "evalue": "ignored", |
89 |
| - "traceback": [ |
90 |
| - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
91 |
| - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", |
92 |
| - "\u001b[0;32m<ipython-input-3-33ce3f59c0b1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSparkContext\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSparkContext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", |
93 |
| - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/pyspark/context.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)\u001b[0m\n\u001b[1;32m 113\u001b[0m \"\"\"\n\u001b[1;32m 114\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_callsite\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfirst_spark_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mCallSite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 115\u001b[0;31m \u001b[0mSparkContext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ensure_initialized\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgateway\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgateway\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 116\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,\n", |
94 |
| - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/pyspark/context.py\u001b[0m in \u001b[0;36m_ensure_initialized\u001b[0;34m(cls, instance, gateway, conf)\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[0;34m\" created by %s at %s:%s \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 313\u001b[0m % (currentAppName, currentMaster,\n\u001b[0;32m--> 314\u001b[0;31m callsite.function, callsite.file, callsite.linenum))\n\u001b[0m\u001b[1;32m 315\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0mSparkContext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_active_spark_context\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minstance\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", |
95 |
| - "\u001b[0;31mValueError\u001b[0m: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at <ipython-input-2-817b62176d5d>:2 " |
96 |
| - ] |
97 |
| - } |
98 |
| - ] |
| 79 | + "execution_count": 0, |
| 80 | + "outputs": [] |
99 | 81 | },
|
100 | 82 | {
|
| 83 | + "cell_type": "code", |
101 | 84 | "metadata": {
|
102 | 85 | "id": "n4uh3owiKOxo",
|
103 | 86 | "colab_type": "code",
|
104 | 87 | "colab": {}
|
105 | 88 | },
|
106 |
| - "cell_type": "code", |
107 | 89 | "source": [
|
108 | 90 | "from pyspark.sql import SQLContext\n",
|
109 | 91 | "sqlc = SQLContext(sc)"
|
|
112 | 94 | "outputs": []
|
113 | 95 | },
|
114 | 96 | {
|
| 97 | + "cell_type": "code", |
115 | 98 | "metadata": {
|
116 | 99 | "id": "RYAvQpGuKV6j",
|
117 | 100 | "colab_type": "code",
|
118 | 101 | "colab": {}
|
119 | 102 | },
|
120 |
| - "cell_type": "code", |
121 | 103 | "source": [
|
122 | 104 | ""
|
123 | 105 | ],
|
124 | 106 | "execution_count": 0,
|
125 | 107 | "outputs": []
|
126 | 108 | },
|
127 | 109 | {
|
| 110 | + "cell_type": "code", |
128 | 111 | "metadata": {
|
129 | 112 | "id": "vkQ94MizKb-q",
|
130 | 113 | "colab_type": "code",
|
| 114 | + "outputId": "0919208d-e967-4bee-b010-2347dbb885c1", |
131 | 115 | "colab": {
|
132 | 116 | "base_uri": "https://localhost:8080/",
|
133 | 117 | "height": 204
|
134 |
| - }, |
135 |
| - "outputId": "0919208d-e967-4bee-b010-2347dbb885c1" |
| 118 | + } |
136 | 119 | },
|
137 |
| - "cell_type": "code", |
138 | 120 | "source": [
|
139 | 121 | "df = sqlc.read.format('com.databricks.spark.csv').options(header = 'true', inferschema = 'true').load('iris.csv')\n",
|
140 | 122 | "df.show(5)\n"
|
141 | 123 | ],
|
142 |
| - "execution_count": 6, |
| 124 | + "execution_count": 0, |
143 | 125 | "outputs": [
|
144 | 126 | {
|
145 | 127 | "output_type": "stream",
|
|
161 | 143 | ]
|
162 | 144 | },
|
163 | 145 | {
|
| 146 | + "cell_type": "code", |
164 | 147 | "metadata": {
|
165 | 148 | "id": "ivN5jB6kKkL-",
|
166 | 149 | "colab_type": "code",
|
167 | 150 | "colab": {}
|
168 | 151 | },
|
169 |
| - "cell_type": "code", |
170 | 152 | "source": [
|
171 | 153 | "from pyspark.sql.functions import mean\n",
|
172 | 154 | "avg_sl = df.select(mean('Sepallength')).toPandas()['avg(Sepallength)']\n"
|
|
175 | 157 | "outputs": []
|
176 | 158 | },
|
177 | 159 | {
|
| 160 | + "cell_type": "code", |
178 | 161 | "metadata": {
|
179 | 162 | "id": "6dPaqDT5Kt-t",
|
180 | 163 | "colab_type": "code",
|
| 164 | + "outputId": "4a87a76b-b7b7-4aad-c3d5-0de36350fcb0", |
181 | 165 | "colab": {
|
182 | 166 | "base_uri": "https://localhost:8080/",
|
183 | 167 | "height": 136
|
184 |
| - }, |
185 |
| - "outputId": "4a87a76b-b7b7-4aad-c3d5-0de36350fcb0" |
| 168 | + } |
186 | 169 | },
|
187 |
| - "cell_type": "code", |
188 | 170 | "source": [
|
189 | 171 | "y = df\n",
|
190 | 172 | "y = y.na.fill(float(avg_sl),['Sepallength'])\n",
|
191 | 173 | "y.describe().show(1)\n"
|
192 | 174 | ],
|
193 |
| - "execution_count": 9, |
| 175 | + "execution_count": 0, |
194 | 176 | "outputs": [
|
195 | 177 | {
|
196 | 178 | "output_type": "stream",
|
|
208 | 190 | ]
|
209 | 191 | },
|
210 | 192 | {
|
| 193 | + "cell_type": "code", |
211 | 194 | "metadata": {
|
212 | 195 | "id": "hZ3h4SwmKw5H",
|
213 | 196 | "colab_type": "code",
|
214 | 197 | "colab": {}
|
215 | 198 | },
|
216 |
| - "cell_type": "code", |
217 | 199 | "source": [
|
218 | 200 | "from pyspark.mllib.stat import Statistics\n",
|
219 | 201 | "import pandas as pd\n"
|
|
222 | 204 | "outputs": []
|
223 | 205 | },
|
224 | 206 | {
|
| 207 | + "cell_type": "code", |
225 | 208 | "metadata": {
|
226 | 209 | "id": "EYhtf-TDK2om",
|
227 | 210 | "colab_type": "code",
|
228 | 211 | "colab": {}
|
229 | 212 | },
|
230 |
| - "cell_type": "code", |
231 | 213 | "source": [
|
232 | 214 | "z = y.fillna(1)"
|
233 | 215 | ],
|
234 | 216 | "execution_count": 0,
|
235 | 217 | "outputs": []
|
236 | 218 | },
|
237 | 219 | {
|
| 220 | + "cell_type": "code", |
238 | 221 | "metadata": {
|
239 | 222 | "id": "xTEVwpImK4S4",
|
240 | 223 | "colab_type": "code",
|
241 | 224 | "colab": {}
|
242 | 225 | },
|
243 |
| - "cell_type": "code", |
244 | 226 | "source": [
|
245 | 227 | "a = z.drop('Species') \n",
|
246 | 228 | "features = a.rdd.map(lambda row: row[0:])\n"
|
|
249 | 231 | "outputs": []
|
250 | 232 | },
|
251 | 233 | {
|
| 234 | + "cell_type": "code", |
252 | 235 | "metadata": {
|
253 | 236 | "id": "uHjpSzLhK6Hd",
|
254 | 237 | "colab_type": "code",
|
255 | 238 | "colab": {}
|
256 | 239 | },
|
257 |
| - "cell_type": "code", |
258 | 240 | "source": [
|
259 | 241 | "correlation_matrix = Statistics.corr(features, method=\"pearson\")"
|
260 | 242 | ],
|
261 | 243 | "execution_count": 0,
|
262 | 244 | "outputs": []
|
263 | 245 | },
|
264 | 246 | {
|
| 247 | + "cell_type": "code", |
265 | 248 | "metadata": {
|
266 | 249 | "id": "90YKC2n5K61I",
|
267 | 250 | "colab_type": "code",
|
| 251 | + "outputId": "fd7c04d5-ea85-4477-adbf-cd104606880e", |
268 | 252 | "colab": {
|
269 | 253 | "base_uri": "https://localhost:8080/",
|
270 | 254 | "height": 173
|
271 |
| - }, |
272 |
| - "outputId": "fd7c04d5-ea85-4477-adbf-cd104606880e" |
| 255 | + } |
273 | 256 | },
|
274 |
| - "cell_type": "code", |
275 | 257 | "source": [
|
276 | 258 | "correlation_df = pd.DataFrame(correlation_matrix)\n",
|
277 | 259 | "correlation_df.index, correlation_df.columns = a.columns, a.columns\n",
|
278 | 260 | "correlation_df\n"
|
279 | 261 | ],
|
280 |
| - "execution_count": 14, |
| 262 | + "execution_count": 0, |
281 | 263 | "outputs": [
|
282 | 264 | {
|
283 | 265 | "output_type": "execute_result",
|
|
356 | 338 | ]
|
357 | 339 | },
|
358 | 340 | {
|
| 341 | + "cell_type": "code", |
359 | 342 | "metadata": {
|
360 | 343 | "id": "u6lph3zyK8wZ",
|
361 | 344 | "colab_type": "code",
|
| 345 | + "outputId": "8b987d54-97cc-4d61-cca5-9968fbbc020d", |
362 | 346 | "colab": {
|
363 | 347 | "base_uri": "https://localhost:8080/",
|
364 | 348 | "height": 34
|
365 |
| - }, |
366 |
| - "outputId": "8b987d54-97cc-4d61-cca5-9968fbbc020d" |
| 349 | + } |
367 | 350 | },
|
368 |
| - "cell_type": "code", |
369 | 351 | "source": [
|
370 | 352 | "import pandas as pd\n",
|
371 | 353 | "dat = y.toPandas()\n",
|
372 | 354 | "type(dat)\n"
|
373 | 355 | ],
|
374 |
| - "execution_count": 15, |
| 356 | + "execution_count": 0, |
375 | 357 | "outputs": [
|
376 | 358 | {
|
377 | 359 | "output_type": "execute_result",
|
|
388 | 370 | ]
|
389 | 371 | },
|
390 | 372 | {
|
| 373 | + "cell_type": "code", |
391 | 374 | "metadata": {
|
392 | 375 | "id": "Wgsl7lqvK-y_",
|
393 | 376 | "colab_type": "code",
|
| 377 | + "outputId": "fc9658c0-cbb9-4e37-cd2f-26b529a82c00", |
394 | 378 | "colab": {
|
395 | 379 | "base_uri": "https://localhost:8080/",
|
396 | 380 | "height": 369
|
397 |
| - }, |
398 |
| - "outputId": "fc9658c0-cbb9-4e37-cd2f-26b529a82c00" |
| 381 | + } |
399 | 382 | },
|
400 |
| - "cell_type": "code", |
401 | 383 | "source": [
|
402 | 384 | "import matplotlib.pyplot as plt\n",
|
403 | 385 | "import seaborn as sns\n",
|
404 | 386 | "%matplotlib inline\n",
|
405 | 387 | "sns.lmplot(x = \"Sepallength\", y = \"Petallength\", data = dat)\n",
|
406 | 388 | "plt.show()\n"
|
407 | 389 | ],
|
408 |
| - "execution_count": 16, |
| 390 | + "execution_count": 0, |
409 | 391 | "outputs": [
|
410 | 392 | {
|
411 | 393 | "output_type": "display_data",
|
|
422 | 404 | ]
|
423 | 405 | },
|
424 | 406 | {
|
| 407 | + "cell_type": "code", |
425 | 408 | "metadata": {
|
426 | 409 | "id": "I9llgXa5LB3Y",
|
427 | 410 | "colab_type": "code",
|
| 411 | + "outputId": "192b63b3-db45-4b3d-b880-8adf67cc5dbf", |
428 | 412 | "colab": {
|
429 | 413 | "base_uri": "https://localhost:8080/",
|
430 | 414 | "height": 369
|
431 |
| - }, |
432 |
| - "outputId": "192b63b3-db45-4b3d-b880-8adf67cc5dbf" |
| 415 | + } |
433 | 416 | },
|
434 |
| - "cell_type": "code", |
435 | 417 | "source": [
|
436 | 418 | "import seaborn as sns\n",
|
437 | 419 | "sns.lmplot(x = \"Sepallength\", y = \"Petalwidth\", data = dat)\n",
|
438 | 420 | "plt.show()\n"
|
439 | 421 | ],
|
440 |
| - "execution_count": 17, |
| 422 | + "execution_count": 0, |
441 | 423 | "outputs": [
|
442 | 424 | {
|
443 | 425 | "output_type": "display_data",
|
|
454 | 436 | ]
|
455 | 437 | },
|
456 | 438 | {
|
| 439 | + "cell_type": "code", |
457 | 440 | "metadata": {
|
458 | 441 | "id": "-0P4bwUoLDWh",
|
459 | 442 | "colab_type": "code",
|
| 443 | + "outputId": "85034f66-1b36-4903-e097-a7075bb34eb8", |
460 | 444 | "colab": {
|
461 | 445 | "base_uri": "https://localhost:8080/",
|
462 | 446 | "height": 369
|
463 |
| - }, |
464 |
| - "outputId": "85034f66-1b36-4903-e097-a7075bb34eb8" |
| 447 | + } |
465 | 448 | },
|
466 |
| - "cell_type": "code", |
467 | 449 | "source": [
|
468 | 450 | "sns.lmplot(x = \"Petallength\", y = \"Petalwidth\", data = dat)\n",
|
469 | 451 | "plt.show()\n"
|
470 | 452 | ],
|
471 |
| - "execution_count": 18, |
| 453 | + "execution_count": 0, |
472 | 454 | "outputs": [
|
473 | 455 | {
|
474 | 456 | "output_type": "display_data",
|
|
0 commit comments