@@ -76,6 +76,55 @@ process and query them using a SQL interface:
76
76
|
77
77
""".stripMargin)
78
78
spark.sql("SELECT sampleId,contigName,start,end,cigar FROM reads").show(5)
79
+
80
+ Implicit partition pruning for BAM data source
81
+ ##############################################
82
+
83
+ BAM data source supports implicit `partition pruning <https://docs.oracle.com/database/121/VLDBG/GUID-E677C85E-C5E3-4927-B3DF-684007A7B05D.htm#VLDBG00401 >`_
84
+ mechanism to speed up queries that are restricted to only subset of samples from a table. Consider a following example:
85
+
86
+ .. code-block :: bash
87
+
88
+ MacBook-Pro:multisample marek$ ls -ltr
89
+ total 2136
90
+ -rw-r--r-- 1 marek staff 364043 May 15 18:53 NA12877.slice.bam
91
+ -rw-r--r-- 1 marek staff 364043 May 15 18:53 NA12878.slice.bam
92
+ -rw-r--r-- 1 marek staff 364043 May 15 18:53 NA12879.slice.bam
93
+
94
+ MacBook-Pro:multisample marek$ pwd
95
+ /Users/marek/data/multisample
96
+
97
+
98
+ .. code-block :: scala
99
+
100
+ import org.apache.spark.sql.{SequilaSession, SparkSession}
101
+ val bamPath ="/Users/marek/data/multisample/*.bam"
102
+ val tableNameBAM = "reads"
103
+ val ss: SparkSession = SequilaSession(spark)
104
+ ss.sql(
105
+ s"""
106
+ |CREATE TABLE ${tableNameBAM}
107
+ |USING org.biodatageeks.datasources.BAM.BAMDataSource
108
+ |OPTIONS(path "${bamPath}")
109
+ |
110
+ """.stripMargin)
111
+
112
+ val query =
113
+ """
114
+ |SELECT sampleId,count(*) FROM reads where sampleId IN('NA12878','NA12879')
115
+ |GROUP BY sampleId order by sampleId
116
+ """.stripMargin
117
+ ss.sql(query)
118
+
119
+
120
+ If you run the above query you should get the information that SeQuiLa optimized the physical plan and will only read 2 BAM files
121
+ instead of 3 to answer your query:
122
+
123
+ .. code-block :: bash
124
+
125
+ WARN BAMRelation: Partition pruning detected,reading only files for samples: NA12878,NA12879
126
+
127
+
79
128
Using UDFs
80
129
##########
81
130
@@ -258,3 +307,63 @@ Parameter is set via coniguration:
258
307
spark.sqlContext.setConf("spark.biodatageeks.rangejoin.useJoinOrder", "true")
259
308
260
309
310
+ Coverage
311
+ ##########
312
+
313
+ In order to compute coverage for your sample you can run a set of queries as follows:
314
+
315
+ .. code-block :: scala
316
+
317
+ val tableNameBAM = "reads"
318
+ val bamPath = "/data/samples/*.bam"
319
+ ss.sql("CREATE DATABASE dna")
320
+ ss.sql("USE dna")
321
+ ss.sql(
322
+ s"""
323
+ |CREATE TABLE ${tableNameBAM}
324
+ |USING org.biodatageeks.datasources.BAM.BAMDataSource
325
+ |OPTIONS(path "${bamPath}")
326
+ |
327
+ """.stripMargin)
328
+ ss.sql(s"SELECT * FROM coverage('${tableNameBAM}')").show(5)
329
+
330
+ +--------+----------+--------+--------+
331
+ |sampleId|contigName|position|coverage|
332
+ +--------+----------+--------+--------+
333
+ | NA12878| chr1| 137| 1|
334
+ | NA12878| chr1| 138| 1|
335
+ | NA12878| chr1| 139| 1|
336
+ | NA12878| chr1| 140| 1|
337
+ | NA12878| chr1| 141| 1|
338
+ +--------+----------+--------+--------+
339
+
340
+ If you would like to do additional short reads prefiltering, you can create a temporary table and use it as an input to the coverage function, e.g.:
341
+
342
+ .. code-block :: scala
343
+
344
+ ss.sql(s"CREATE TABLE filtered_reads AS SELECT * FROM ${tableNameBAM} WHERE mapq > 10 AND start> 200")
345
+ ss.sql(s"SELECT * FROM coverage('filtered_reads')").show(5)
346
+
347
+ +--------+----------+--------+--------+
348
+ |sampleId|contigName|position|coverage|
349
+ +--------+----------+--------+--------+
350
+ | NA12878| chr1| 361| 1|
351
+ | NA12878| chr1| 362| 1|
352
+ | NA12878| chr1| 363| 1|
353
+ | NA12878| chr1| 364| 1|
354
+ | NA12878| chr1| 365| 1|
355
+ +--------+----------+--------+--------+
356
+
357
+ (Experimental WIP) If you are interested in coverage histograms using e.g. mapping quality you can use the following table valued function:
358
+
359
+ .. code-block :: scala
360
+
361
+ ss.sql(s"SELECT * FROM coverage_hist('${tableNameBAM}') WHERE position=20204").show()
362
+
363
+ +--------+----------+--------+------------------+-------------+
364
+ |sampleId|contigName|position| coverage|coverageTotal|
365
+ +--------+----------+--------+------------------+-------------+
366
+ | NA12878| chr1| 20204|[1017, 0, 2, 0, 0]| 1019|
367
+ +--------+----------+--------+------------------+-------------+
368
+
369
+
0 commit comments