-
Notifications
You must be signed in to change notification settings - Fork 77
/
Telling-Stories-with-Data.toc
359 lines (359 loc) · 28.3 KB
/
Telling-Stories-with-Data.toc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
\contentsline {fm}{Preface}{ix}{chapter*.1}%
\contentsline {chapter}{Preface}{ix}{chapter*.1}%
\contentsline {section}{Audience and assumed background}{x}{section*.2}%
\contentsline {section}{Structure and content}{x}{section*.3}%
\contentsline {section}{Pedagogy and key features}{xii}{section*.4}%
\contentsline {section}{Software information and conventions}{xiii}{section*.5}%
\contentsline {section}{About the author}{xiv}{section*.6}%
\contentsline {section}{Acknowledgments}{xiv}{section*.7}%
\contentsline {part}{I\hspace {1em}Foundations}{1}{part.1}%
\contentsline {chapter}{\numberline {1}Telling stories with data}{3}{chapter.1}%
\contentsline {section}{\numberline {1.1}On telling stories}{3}{section.1.1}%
\contentsline {section}{\numberline {1.2}Workflow components}{4}{section.1.2}%
\contentsline {section}{\numberline {1.3}Telling stories with data}{6}{section.1.3}%
\contentsline {section}{\numberline {1.4}How do our worlds become data?}{11}{section.1.4}%
\contentsline {section}{\numberline {1.5}What is data science and how should we use it to learn about the world?}{12}{section.1.5}%
\contentsline {section}{\numberline {1.6}Exercises}{14}{section.1.6}%
\contentsline {subsection}{Questions}{14}{section*.10}%
\contentsline {subsection}{Tutorial}{15}{section*.11}%
\contentsline {chapter}{\numberline {2}Drinking from a fire hose}{17}{chapter.2}%
\contentsline {section}{\numberline {2.1}Hello, World!}{18}{section.2.1}%
\contentsline {section}{\numberline {2.2}Australian elections}{19}{section.2.2}%
\contentsline {subsection}{\numberline {2.2.1}Plan}{19}{subsection.2.2.1}%
\contentsline {subsection}{\numberline {2.2.2}Simulate}{19}{subsection.2.2.2}%
\contentsline {subsection}{\numberline {2.2.3}Acquire}{23}{subsection.2.2.3}%
\contentsline {subsection}{\numberline {2.2.4}Explore}{28}{subsection.2.2.4}%
\contentsline {subsection}{\numberline {2.2.5}Share}{29}{subsection.2.2.5}%
\contentsline {section}{\numberline {2.3}Toronto's unhoused population}{30}{section.2.3}%
\contentsline {subsection}{\numberline {2.3.1}Plan}{30}{subsection.2.3.1}%
\contentsline {subsection}{\numberline {2.3.2}Simulate}{31}{subsection.2.3.2}%
\contentsline {subsection}{\numberline {2.3.3}Acquire}{33}{subsection.2.3.3}%
\contentsline {subsection}{\numberline {2.3.4}Explore}{34}{subsection.2.3.4}%
\contentsline {subsection}{\numberline {2.3.5}Share}{36}{subsection.2.3.5}%
\contentsline {section}{\numberline {2.4}Neonatal mortality}{37}{section.2.4}%
\contentsline {subsection}{\numberline {2.4.1}Plan}{37}{subsection.2.4.1}%
\contentsline {subsection}{\numberline {2.4.2}Simulate}{37}{subsection.2.4.2}%
\contentsline {subsection}{\numberline {2.4.3}Acquire}{41}{subsection.2.4.3}%
\contentsline {subsection}{\numberline {2.4.4}Explore}{43}{subsection.2.4.4}%
\contentsline {subsection}{\numberline {2.4.5}Share}{44}{subsection.2.4.5}%
\contentsline {section}{\numberline {2.5}Concluding remarks}{45}{section.2.5}%
\contentsline {section}{\numberline {2.6}Exercises}{45}{section.2.6}%
\contentsline {subsection}{Scales}{45}{section*.20}%
\contentsline {subsection}{Questions}{46}{section*.21}%
\contentsline {subsection}{Tutorial}{47}{section*.22}%
\contentsline {chapter}{\numberline {3}Reproducible workflows}{49}{chapter.3}%
\contentsline {section}{\numberline {3.1}Introduction}{50}{section.3.1}%
\contentsline {section}{\numberline {3.2}Quarto}{53}{section.3.2}%
\contentsline {subsection}{\numberline {3.2.1}Getting started}{53}{subsection.3.2.1}%
\contentsline {subsection}{\numberline {3.2.2}Top matter}{54}{subsection.3.2.2}%
\contentsline {subsection}{\numberline {3.2.3}Essential commands}{55}{subsection.3.2.3}%
\contentsline {subsection}{\numberline {3.2.4}R chunks}{56}{subsection.3.2.4}%
\contentsline {subsection}{\numberline {3.2.5}Equations}{58}{subsection.3.2.5}%
\contentsline {subsection}{\numberline {3.2.6}Cross-references}{60}{subsection.3.2.6}%
\contentsline {section}{\numberline {3.3}R Projects and file structure}{62}{section.3.3}%
\contentsline {section}{\numberline {3.4}Version control}{64}{section.3.4}%
\contentsline {subsection}{\numberline {3.4.1}Git}{65}{subsection.3.4.1}%
\contentsline {subsection}{\numberline {3.4.2}GitHub}{66}{subsection.3.4.2}%
\contentsline {section}{\numberline {3.5}Using R in practice}{69}{section.3.5}%
\contentsline {subsection}{\numberline {3.5.1}Dealing with errors}{69}{subsection.3.5.1}%
\contentsline {subsection}{\numberline {3.5.2}Reproducible examples}{70}{subsection.3.5.2}%
\contentsline {subsection}{\numberline {3.5.3}Mentality}{71}{subsection.3.5.3}%
\contentsline {subsection}{\numberline {3.5.4}Code comments and style}{72}{subsection.3.5.4}%
\contentsline {subsection}{\numberline {3.5.5}Tests}{73}{subsection.3.5.5}%
\contentsline {section}{\numberline {3.6}Efficiency}{73}{section.3.6}%
\contentsline {subsection}{\numberline {3.6.1}Sharing a code environment}{74}{subsection.3.6.1}%
\contentsline {subsection}{\numberline {3.6.2}Code linting and styling}{75}{subsection.3.6.2}%
\contentsline {subsection}{\numberline {3.6.3}Code review}{76}{subsection.3.6.3}%
\contentsline {subsection}{\numberline {3.6.4}Code refactoring}{77}{subsection.3.6.4}%
\contentsline {subsection}{\numberline {3.6.5}Parallel processing}{78}{subsection.3.6.5}%
\contentsline {section}{\numberline {3.7}Concluding remarks}{80}{section.3.7}%
\contentsline {section}{\numberline {3.8}Exercises}{80}{section.3.8}%
\contentsline {subsection}{Scales}{80}{section*.29}%
\contentsline {subsection}{Questions}{81}{section*.30}%
\contentsline {subsection}{Tutorial}{82}{section*.31}%
\contentsline {subsection}{Paper}{82}{section*.32}%
\contentsline {part}{II\hspace {1em}Communication}{83}{part.2}%
\contentsline {chapter}{\numberline {4}Writing research}{85}{chapter.4}%
\contentsline {section}{\numberline {4.1}Introduction}{87}{section.4.1}%
\contentsline {section}{\numberline {4.2}Writing}{87}{section.4.2}%
\contentsline {section}{\numberline {4.3}Asking questions}{89}{section.4.3}%
\contentsline {subsection}{\numberline {4.3.1}Data-first}{90}{subsection.4.3.1}%
\contentsline {subsection}{\numberline {4.3.2}Question-first}{91}{subsection.4.3.2}%
\contentsline {section}{\numberline {4.4}Answering questions}{92}{section.4.4}%
\contentsline {section}{\numberline {4.5}Components of a paper}{94}{section.4.5}%
\contentsline {subsection}{\numberline {4.5.1}Title}{95}{subsection.4.5.1}%
\contentsline {subsection}{\numberline {4.5.2}Abstract}{96}{subsection.4.5.2}%
\contentsline {subsection}{\numberline {4.5.3}Introduction}{99}{subsection.4.5.3}%
\contentsline {subsection}{\numberline {4.5.4}Data}{100}{subsection.4.5.4}%
\contentsline {subsection}{\numberline {4.5.5}Model}{101}{subsection.4.5.5}%
\contentsline {subsection}{\numberline {4.5.6}Results}{103}{subsection.4.5.6}%
\contentsline {subsection}{\numberline {4.5.7}Discussion}{103}{subsection.4.5.7}%
\contentsline {subsection}{\numberline {4.5.8}Brevity, typos, and grammar}{103}{subsection.4.5.8}%
\contentsline {subsection}{\numberline {4.5.9}Rules}{104}{subsection.4.5.9}%
\contentsline {section}{\numberline {4.6}Exercises}{105}{section.4.6}%
\contentsline {subsection}{Scales}{105}{section*.34}%
\contentsline {subsection}{Questions}{105}{section*.35}%
\contentsline {subsection}{Tutorial}{106}{section*.36}%
\contentsline {chapter}{\numberline {5}Static communication}{109}{chapter.5}%
\contentsline {section}{\numberline {5.1}Introduction}{110}{section.5.1}%
\contentsline {section}{\numberline {5.2}Graphs}{111}{section.5.2}%
\contentsline {subsection}{\numberline {5.2.1}Bar charts}{115}{subsection.5.2.1}%
\contentsline {subsubsection}{\numberline {5.2.1.1}Themes}{117}{subsubsection.5.2.1.1}%
\contentsline {subsubsection}{\numberline {5.2.1.2}Facets}{119}{subsubsection.5.2.1.2}%
\contentsline {subsubsection}{\numberline {5.2.1.3}Colors}{121}{subsubsection.5.2.1.3}%
\contentsline {subsection}{\numberline {5.2.2}Scatterplots}{123}{subsection.5.2.2}%
\contentsline {subsection}{\numberline {5.2.3}Line plots}{128}{subsection.5.2.3}%
\contentsline {subsection}{\numberline {5.2.4}Histograms}{131}{subsection.5.2.4}%
\contentsline {subsection}{\numberline {5.2.5}Boxplots}{136}{subsection.5.2.5}%
\contentsline {section}{\numberline {5.3}Tables}{138}{section.5.3}%
\contentsline {subsection}{\numberline {5.3.1}Showing part of a dataset}{139}{subsection.5.3.1}%
\contentsline {subsection}{\numberline {5.3.2}Improving the formatting}{140}{subsection.5.3.2}%
\contentsline {subsection}{\numberline {5.3.3}Communicating summary statistics}{141}{subsection.5.3.3}%
\contentsline {subsection}{\numberline {5.3.4}Display regression results}{143}{subsection.5.3.4}%
\contentsline {section}{\numberline {5.4}Maps}{145}{section.5.4}%
\contentsline {subsection}{\numberline {5.4.1}Australian polling places}{146}{subsection.5.4.1}%
\contentsline {subsection}{\numberline {5.4.2}United States military bases}{149}{subsection.5.4.2}%
\contentsline {subsection}{\numberline {5.4.3}Geocoding}{150}{subsection.5.4.3}%
\contentsline {section}{\numberline {5.5}Concluding remarks}{154}{section.5.5}%
\contentsline {section}{\numberline {5.6}Exercises}{154}{section.5.6}%
\contentsline {subsection}{Scales}{154}{section*.74}%
\contentsline {subsection}{Questions}{154}{section*.75}%
\contentsline {subsection}{Tutorial}{156}{section*.76}%
\contentsline {subsection}{Paper}{156}{section*.77}%
\contentsline {part}{III\hspace {1em}Acquisition}{157}{part.3}%
\contentsline {chapter}{\numberline {6}Farm data}{159}{chapter.6}%
\contentsline {section}{\numberline {6.1}Introduction}{160}{section.6.1}%
\contentsline {section}{\numberline {6.2}Measurement}{162}{section.6.2}%
\contentsline {subsection}{\numberline {6.2.1}Properties of measurements}{164}{subsection.6.2.1}%
\contentsline {subsection}{\numberline {6.2.2}Measurement error}{165}{subsection.6.2.2}%
\contentsline {subsection}{\numberline {6.2.3}Missing data}{167}{subsection.6.2.3}%
\contentsline {section}{\numberline {6.3}Censuses and other government data}{168}{section.6.3}%
\contentsline {subsection}{\numberline {6.3.1}Canada}{170}{subsection.6.3.1}%
\contentsline {subsection}{\numberline {6.3.2}United States}{172}{subsection.6.3.2}%
\contentsline {subsubsection}{\numberline {6.3.2.1}Census}{172}{subsubsection.6.3.2.1}%
\contentsline {subsubsection}{\numberline {6.3.2.2}American Community Survey}{173}{subsubsection.6.3.2.2}%
\contentsline {section}{\numberline {6.4}Sampling essentials}{178}{section.6.4}%
\contentsline {subsection}{\numberline {6.4.1}Sampling in Dublin and Reading}{180}{subsection.6.4.1}%
\contentsline {subsubsection}{\numberline {6.4.1.1}Survey of Dublin in 1798}{180}{subsubsection.6.4.1.1}%
\contentsline {subsubsection}{\numberline {6.4.1.2}Survey of working-class households in Reading in 1912}{181}{subsubsection.6.4.1.2}%
\contentsline {subsection}{\numberline {6.4.2}Probabilistic sampling}{182}{subsection.6.4.2}%
\contentsline {subsubsection}{\numberline {6.4.2.1}Inference for probability samples}{187}{subsubsection.6.4.2.1}%
\contentsline {subsection}{\numberline {6.4.3}Non-probability samples}{191}{subsection.6.4.3}%
\contentsline {section}{\numberline {6.5}Exercises}{193}{section.6.5}%
\contentsline {subsection}{Scales}{193}{section*.86}%
\contentsline {subsection}{Questions}{193}{section*.87}%
\contentsline {subsection}{Tutorial}{194}{section*.88}%
\contentsline {chapter}{\numberline {7}Gather data}{195}{chapter.7}%
\contentsline {section}{\numberline {7.1}Introduction}{196}{section.7.1}%
\contentsline {section}{\numberline {7.2}APIs}{197}{section.7.2}%
\contentsline {subsection}{\numberline {7.2.1}arXiv, NASA, and Dataverse}{198}{subsection.7.2.1}%
\contentsline {subsubsection}{\numberline {7.2.1.1}arXiv}{198}{subsubsection.7.2.1.1}%
\contentsline {subsubsection}{\numberline {7.2.1.2}NASA Astronomy Picture of the Day}{199}{subsubsection.7.2.1.2}%
\contentsline {subsubsection}{\numberline {7.2.1.3}Dataverse}{200}{subsubsection.7.2.1.3}%
\contentsline {subsection}{\numberline {7.2.2}Spotify}{200}{subsection.7.2.2}%
\contentsline {section}{\numberline {7.3}Web scraping}{205}{section.7.3}%
\contentsline {subsection}{\numberline {7.3.1}Principles}{205}{subsection.7.3.1}%
\contentsline {subsection}{\numberline {7.3.2}HTML/CSS essentials}{207}{subsection.7.3.2}%
\contentsline {subsection}{\numberline {7.3.3}Book information}{208}{subsection.7.3.3}%
\contentsline {subsection}{\numberline {7.3.4}Prime Ministers of the United Kingdom}{211}{subsection.7.3.4}%
\contentsline {subsection}{\numberline {7.3.5}Iteration}{218}{subsection.7.3.5}%
\contentsline {section}{\numberline {7.4}PDFs}{219}{section.7.4}%
\contentsline {subsection}{\numberline {7.4.1}\emph {Jane Eyre}}{220}{subsection.7.4.1}%
\contentsline {subsection}{\numberline {7.4.2}Total Fertility Rate in the United States}{223}{subsection.7.4.2}%
\contentsline {subsection}{\numberline {7.4.3}Optical Character Recognition}{230}{subsection.7.4.3}%
\contentsline {section}{\numberline {7.5}Exercises}{233}{section.7.5}%
\contentsline {subsection}{Scales}{233}{section*.104}%
\contentsline {subsection}{Questions}{233}{section*.105}%
\contentsline {subsection}{Tutorial}{236}{section*.106}%
\contentsline {chapter}{\numberline {8}Hunt data}{237}{chapter.8}%
\contentsline {section}{\numberline {8.1}Introduction}{238}{section.8.1}%
\contentsline {section}{\numberline {8.2}Field experiments and randomized controlled trials}{241}{section.8.2}%
\contentsline {subsection}{\numberline {8.2.1}Randomization}{241}{subsection.8.2.1}%
\contentsline {subsection}{\numberline {8.2.2}Simulated example: cats or dogs}{242}{subsection.8.2.2}%
\contentsline {subsection}{\numberline {8.2.3}Treatment and control}{245}{subsection.8.2.3}%
\contentsline {subsection}{\numberline {8.2.4}Fisher's tea party}{248}{subsection.8.2.4}%
\contentsline {subsection}{\numberline {8.2.5}Ethical foundations}{249}{subsection.8.2.5}%
\contentsline {subsubsection}{\numberline {8.2.5.1}Tuskegee Syphilis Study}{249}{subsubsection.8.2.5.1}%
\contentsline {subsubsection}{\numberline {8.2.5.2}Extracorporeal membrane oxygenation (ECMO)}{250}{subsubsection.8.2.5.2}%
\contentsline {section}{\numberline {8.3}Surveys}{251}{section.8.3}%
\contentsline {subsection}{\numberline {8.3.1}Democracy Fund Voter Study Group}{253}{subsection.8.3.1}%
\contentsline {section}{\numberline {8.4}RCT examples}{256}{section.8.4}%
\contentsline {subsection}{\numberline {8.4.1}The Oregon Health Insurance Experiment}{256}{subsection.8.4.1}%
\contentsline {subsection}{\numberline {8.4.2}Civic Honesty Around The Globe}{257}{subsection.8.4.2}%
\contentsline {section}{\numberline {8.5}A/B testing}{260}{section.8.5}%
\contentsline {subsection}{\numberline {8.5.1}Upworthy}{263}{subsection.8.5.1}%
\contentsline {section}{\numberline {8.6}Exercises}{266}{section.8.6}%
\contentsline {subsection}{Scales}{266}{section*.111}%
\contentsline {subsection}{Questions}{266}{section*.112}%
\contentsline {subsection}{Tutorial}{267}{section*.113}%
\contentsline {subsection}{Paper}{267}{section*.114}%
\contentsline {part}{IV\hspace {1em}Preparation}{269}{part.4}%
\contentsline {chapter}{\numberline {9}Clean and prepare}{271}{chapter.9}%
\contentsline {section}{\numberline {9.1}Introduction}{272}{section.9.1}%
\contentsline {section}{\numberline {9.2}Workflow}{274}{section.9.2}%
\contentsline {subsection}{\numberline {9.2.1}Save the original, unedited data}{274}{subsection.9.2.1}%
\contentsline {subsection}{\numberline {9.2.2}Plan}{275}{subsection.9.2.2}%
\contentsline {subsection}{\numberline {9.2.3}Start small}{276}{subsection.9.2.3}%
\contentsline {subsection}{\numberline {9.2.4}Write tests and documentation}{278}{subsection.9.2.4}%
\contentsline {subsection}{\numberline {9.2.5}Iterate, generalize, and update}{283}{subsection.9.2.5}%
\contentsline {section}{\numberline {9.3}Checking and testing}{283}{section.9.3}%
\contentsline {subsection}{\numberline {9.3.1}Graphs}{283}{subsection.9.3.1}%
\contentsline {subsection}{\numberline {9.3.2}Counts}{284}{subsection.9.3.2}%
\contentsline {subsection}{\numberline {9.3.3}Tests}{285}{subsection.9.3.3}%
\contentsline {subsubsection}{\numberline {9.3.3.1}Aspects to test}{288}{subsubsection.9.3.3.1}%
\contentsline {subsubsection}{\numberline {9.3.3.2}Class}{290}{subsubsection.9.3.3.2}%
\contentsline {subsubsection}{\numberline {9.3.3.3}Dates}{292}{subsubsection.9.3.3.3}%
\contentsline {section}{\numberline {9.4}Simulated example: running times}{298}{section.9.4}%
\contentsline {section}{\numberline {9.5}Names}{301}{section.9.5}%
\contentsline {subsection}{\numberline {9.5.1}Machine-readable}{302}{subsection.9.5.1}%
\contentsline {subsection}{\numberline {9.5.2}Human-readable}{303}{subsection.9.5.2}%
\contentsline {section}{\numberline {9.6}1996 Tanzanian DHS}{305}{section.9.6}%
\contentsline {section}{\numberline {9.7}2019 Kenyan census}{311}{section.9.7}%
\contentsline {subsection}{\numberline {9.7.1}Gather and clean}{312}{subsection.9.7.1}%
\contentsline {subsubsection}{\numberline {9.7.1.1}Make rectangular}{312}{subsubsection.9.7.1.1}%
\contentsline {subsubsection}{\numberline {9.7.1.2}Validity}{315}{subsubsection.9.7.1.2}%
\contentsline {subsubsection}{\numberline {9.7.1.3}Internal consistency}{316}{subsubsection.9.7.1.3}%
\contentsline {subsection}{\numberline {9.7.2}Check and test}{316}{subsection.9.7.2}%
\contentsline {subsection}{\numberline {9.7.3}Tidy-up}{317}{subsection.9.7.3}%
\contentsline {section}{\numberline {9.8}Exercises}{320}{section.9.8}%
\contentsline {subsection}{Scales}{320}{section*.127}%
\contentsline {subsection}{Questions}{321}{section*.128}%
\contentsline {subsection}{Tutorial}{322}{section*.129}%
\contentsline {chapter}{\numberline {10}Store and share}{323}{chapter.10}%
\contentsline {section}{\numberline {10.1}Introduction}{324}{section.10.1}%
\contentsline {section}{\numberline {10.2}Plan}{325}{section.10.2}%
\contentsline {section}{\numberline {10.3}Share}{326}{section.10.3}%
\contentsline {subsection}{\numberline {10.3.1}GitHub}{326}{subsection.10.3.1}%
\contentsline {subsection}{\numberline {10.3.2}R packages for data}{327}{subsection.10.3.2}%
\contentsline {subsection}{\numberline {10.3.3}Depositing data}{329}{subsection.10.3.3}%
\contentsline {section}{\numberline {10.4}Data documentation}{330}{section.10.4}%
\contentsline {section}{\numberline {10.5}Personally identifying information}{331}{section.10.5}%
\contentsline {subsection}{\numberline {10.5.1}Hashing}{333}{subsection.10.5.1}%
\contentsline {subsection}{\numberline {10.5.2}Simulation}{335}{subsection.10.5.2}%
\contentsline {subsection}{\numberline {10.5.3}Differential privacy}{335}{subsection.10.5.3}%
\contentsline {section}{\numberline {10.6}Data efficiency}{338}{section.10.6}%
\contentsline {subsection}{\numberline {10.6.1}Iteration}{338}{subsection.10.6.1}%
\contentsline {subsection}{\numberline {10.6.2}Apache Arrow}{340}{subsection.10.6.2}%
\contentsline {section}{\numberline {10.7}Exercises}{342}{section.10.7}%
\contentsline {subsection}{Scales}{342}{section*.131}%
\contentsline {subsection}{Questions}{342}{section*.132}%
\contentsline {subsection}{Tutorial}{342}{section*.133}%
\contentsline {subsection}{Paper}{343}{section*.134}%
\contentsline {part}{V\hspace {1em}Modeling}{345}{part.5}%
\contentsline {chapter}{\numberline {11}Exploratory data analysis}{347}{chapter.11}%
\contentsline {section}{\numberline {11.1}Introduction}{348}{section.11.1}%
\contentsline {section}{\numberline {11.2}1975 United States population and income data}{349}{section.11.2}%
\contentsline {section}{\numberline {11.3}Missing data}{352}{section.11.3}%
\contentsline {section}{\numberline {11.4}TTC subway delays}{355}{section.11.4}%
\contentsline {subsection}{\numberline {11.4.1}Distribution and properties of individual variables}{357}{subsection.11.4.1}%
\contentsline {subsection}{\numberline {11.4.2}Relationships between variables}{367}{subsection.11.4.2}%
\contentsline {section}{\numberline {11.5}Airbnb listings in London, England}{369}{section.11.5}%
\contentsline {subsection}{\numberline {11.5.1}Distribution and properties of individual variables}{371}{subsection.11.5.1}%
\contentsline {subsection}{\numberline {11.5.2}Relationships between variables}{381}{subsection.11.5.2}%
\contentsline {section}{\numberline {11.6}Concluding remarks}{384}{section.11.6}%
\contentsline {section}{\numberline {11.7}Exercises}{385}{section.11.7}%
\contentsline {subsection}{Scales}{385}{section*.155}%
\contentsline {subsection}{Questions}{385}{section*.156}%
\contentsline {subsection}{Tutorial}{386}{section*.157}%
\contentsline {chapter}{\numberline {12}Linear models}{387}{chapter.12}%
\contentsline {section}{\numberline {12.1}Introduction}{388}{section.12.1}%
\contentsline {section}{\numberline {12.2}Simple linear regression}{389}{section.12.2}%
\contentsline {subsection}{\numberline {12.2.1}Simulated example: running times}{393}{subsection.12.2.1}%
\contentsline {section}{\numberline {12.3}Multiple linear regression}{401}{section.12.3}%
\contentsline {subsection}{\numberline {12.3.1}Simulated example: running times with rain and humidity}{401}{subsection.12.3.1}%
\contentsline {section}{\numberline {12.4}Building models}{408}{section.12.4}%
\contentsline {section}{\numberline {12.5}Concluding remarks}{416}{section.12.5}%
\contentsline {section}{\numberline {12.6}Exercises}{416}{section.12.6}%
\contentsline {subsection}{Scales}{416}{section*.171}%
\contentsline {subsection}{Questions}{417}{section*.172}%
\contentsline {subsection}{Tutorial}{417}{section*.173}%
\contentsline {subsection}{Paper}{418}{section*.174}%
\contentsline {chapter}{\numberline {13}Generalized linear models}{419}{chapter.13}%
\contentsline {section}{\numberline {13.1}Introduction}{420}{section.13.1}%
\contentsline {section}{\numberline {13.2}Logistic regression}{420}{section.13.2}%
\contentsline {subsection}{\numberline {13.2.1}Simulated example: day or night}{422}{subsection.13.2.1}%
\contentsline {subsection}{\numberline {13.2.2}Political support in the United States}{426}{subsection.13.2.2}%
\contentsline {section}{\numberline {13.3}Poisson regression}{433}{section.13.3}%
\contentsline {subsection}{\numberline {13.3.1}Simulated example: number of As by department}{434}{subsection.13.3.1}%
\contentsline {subsection}{\numberline {13.3.2}Letters used in \emph {Jane Eyre}}{437}{subsection.13.3.2}%
\contentsline {section}{\numberline {13.4}Negative binomial regression}{443}{section.13.4}%
\contentsline {subsection}{\numberline {13.4.1}Mortality in Alberta, Canada}{443}{subsection.13.4.1}%
\contentsline {section}{\numberline {13.5}Multilevel modeling}{450}{section.13.5}%
\contentsline {subsection}{\numberline {13.5.1}Simulated example: political support}{451}{subsection.13.5.1}%
\contentsline {subsection}{\numberline {13.5.2}Austen, Brontë, Dickens, and Shakespeare}{453}{subsection.13.5.2}%
\contentsline {section}{\numberline {13.6}Concluding remarks}{456}{section.13.6}%
\contentsline {section}{\numberline {13.7}Exercises}{457}{section.13.7}%
\contentsline {subsection}{Scales}{457}{section*.199}%
\contentsline {subsection}{Questions}{457}{section*.200}%
\contentsline {subsection}{Tutorial}{458}{section*.201}%
\contentsline {subsection}{Paper}{458}{section*.202}%
\contentsline {part}{VI\hspace {1em}Applications}{459}{part.6}%
\contentsline {chapter}{\numberline {14}Causality from observational data}{461}{chapter.14}%
\contentsline {section}{\numberline {14.1}Introduction}{462}{section.14.1}%
\contentsline {section}{\numberline {14.2}Directed Acyclic Graphs}{463}{section.14.2}%
\contentsline {subsection}{\numberline {14.2.1}Confounder}{463}{subsection.14.2.1}%
\contentsline {subsection}{\numberline {14.2.2}Mediator}{465}{subsection.14.2.2}%
\contentsline {subsection}{\numberline {14.2.3}Collider}{465}{subsection.14.2.3}%
\contentsline {section}{\numberline {14.3}Two common paradoxes}{467}{section.14.3}%
\contentsline {subsection}{\numberline {14.3.1}Simpson's paradox}{467}{subsection.14.3.1}%
\contentsline {subsection}{\numberline {14.3.2}Berkson's paradox}{470}{subsection.14.3.2}%
\contentsline {section}{\numberline {14.4}Difference-in-differences}{471}{section.14.4}%
\contentsline {subsection}{\numberline {14.4.1}Simulated example: tennis serve speed}{472}{subsection.14.4.1}%
\contentsline {subsection}{\numberline {14.4.2}Assumptions}{476}{subsection.14.4.2}%
\contentsline {subsection}{\numberline {14.4.3}French newspaper prices between 1960 and 1974}{477}{subsection.14.4.3}%
\contentsline {section}{\numberline {14.5}Propensity score matching}{481}{section.14.5}%
\contentsline {subsection}{\numberline {14.5.1}Simulated example: free shipping}{482}{subsection.14.5.1}%
\contentsline {section}{\numberline {14.6}Regression discontinuity design}{486}{section.14.6}%
\contentsline {subsection}{\numberline {14.6.1}Simulated example: income and grades}{487}{subsection.14.6.1}%
\contentsline {subsection}{\numberline {14.6.2}Assumptions}{490}{subsection.14.6.2}%
\contentsline {subsection}{\numberline {14.6.3}Alcohol and crime in California}{493}{subsection.14.6.3}%
\contentsline {section}{\numberline {14.7}Instrumental variables}{496}{section.14.7}%
\contentsline {subsection}{\numberline {14.7.1}Simulated example: health status, smoking, and tax rates}{498}{subsection.14.7.1}%
\contentsline {subsection}{\numberline {14.7.2}Assumptions}{501}{subsection.14.7.2}%
\contentsline {section}{\numberline {14.8}Exercises}{503}{section.14.8}%
\contentsline {subsection}{Scales}{503}{section*.229}%
\contentsline {subsection}{Questions}{503}{section*.230}%
\contentsline {subsection}{Tutorial}{504}{section*.231}%
\contentsline {chapter}{\numberline {15}Multilevel regression with post-stratification}{505}{chapter.15}%
\contentsline {section}{\numberline {15.1}Introduction}{506}{section.15.1}%
\contentsline {section}{\numberline {15.2}Simulated example: coffee or tea?}{509}{section.15.2}%
\contentsline {subsection}{\numberline {15.2.1}Construct a population and biased sample}{509}{subsection.15.2.1}%
\contentsline {subsection}{\numberline {15.2.2}Model the sample}{511}{subsection.15.2.2}%
\contentsline {subsection}{\numberline {15.2.3}Post-stratification dataset}{513}{subsection.15.2.3}%
\contentsline {section}{\numberline {15.3}Forecasting the 2020 United States election}{515}{section.15.3}%
\contentsline {subsection}{\numberline {15.3.1}Survey data}{515}{subsection.15.3.1}%
\contentsline {subsection}{\numberline {15.3.2}Post-stratification data}{516}{subsection.15.3.2}%
\contentsline {subsection}{\numberline {15.3.3}Model the sample}{517}{subsection.15.3.3}%
\contentsline {subsection}{\numberline {15.3.4}Post-stratify}{521}{subsection.15.3.4}%
\contentsline {section}{\numberline {15.4}Exercises}{524}{section.15.4}%
\contentsline {subsection}{Scales}{524}{section*.241}%
\contentsline {subsection}{Questions}{524}{section*.242}%
\contentsline {subsection}{Tutorial}{525}{section*.243}%
\contentsline {subsection}{Paper}{525}{section*.244}%
\contentsline {chapter}{\numberline {16}Text as data}{527}{chapter.16}%
\contentsline {section}{\numberline {16.1}Introduction}{528}{section.16.1}%
\contentsline {section}{\numberline {16.2}Text cleaning and preparation}{529}{section.16.2}%
\contentsline {subsection}{\numberline {16.2.1}Stop words}{530}{subsection.16.2.1}%
\contentsline {subsection}{\numberline {16.2.2}Case, numbers, and punctuation}{532}{subsection.16.2.2}%
\contentsline {subsection}{\numberline {16.2.3}Typos and uncommon words}{533}{subsection.16.2.3}%
\contentsline {subsection}{\numberline {16.2.4}Tuples}{534}{subsection.16.2.4}%
\contentsline {subsection}{\numberline {16.2.5}Stemming and lemmatizing}{536}{subsection.16.2.5}%
\contentsline {subsection}{\numberline {16.2.6}Duplication}{536}{subsection.16.2.6}%
\contentsline {section}{\numberline {16.3}Term Frequency-Inverse Document Frequency (TF-IDF)}{537}{section.16.3}%
\contentsline {subsection}{\numberline {16.3.1}Distinguishing horoscopes}{537}{subsection.16.3.1}%
\contentsline {section}{\numberline {16.4}Topic models}{540}{section.16.4}%
\contentsline {subsection}{\numberline {16.4.1}What is talked about in the Canadian parliament?}{542}{subsection.16.4.1}%
\contentsline {section}{\numberline {16.5}Exercises}{545}{section.16.5}%
\contentsline {subsection}{Scales}{545}{section*.247}%
\contentsline {subsection}{Questions}{545}{section*.248}%
\contentsline {subsection}{Tutorial}{546}{section*.249}%
\contentsline {chapter}{\numberline {17}Concluding remarks}{547}{chapter.17}%
\contentsline {section}{\numberline {17.1}Concluding remarks}{547}{section.17.1}%
\contentsline {section}{\numberline {17.2}Some outstanding issues}{548}{section.17.2}%
\contentsline {section}{\numberline {17.3}Next steps}{550}{section.17.3}%
\contentsline {section}{\numberline {17.4}Exercises}{552}{section.17.4}%
\contentsline {subsection}{Questions}{552}{section*.250}%
\contentsline {fm}{References}{553}{chapter*.252}%
\contentsline {chapter}{References}{553}{chapter*.252}%
\contentsline {chapter}{Index}{593}{chapter*.252}%
\contentsline {section}{}{593}{chapter*.252}%