Skip to content

Instantly share code, notes, and snippets.

@agaszmurlo
Last active May 7, 2019 21:14
Show Gist options
  • Save agaszmurlo/1fb02444af41f39975cf82555bc24cf2 to your computer and use it in GitHub Desktop.
Save agaszmurlo/1fb02444af41f39975cf82555bc24cf2 to your computer and use it in GitHub Desktop.
spark-shell -v --master=local[$cores] --driver-memory=12g --conf "spark.sql.catalogImplementation=in-memory" --packages org.biodatageeks:bdg-sequila_2.11:0.5.3-spark-2.4.0-SNAPSHOT --repositories http://repo.hortonworks.com/content/repositories/releases/,http://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}
// Wrap the shell's SparkSession in a SeQuiLa session and register SeQuiLa with it.
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
// BAM reader switches used for this benchmark run.
Seq(
  "spark.biodatageeks.bam.useGKLInflate" -> "true",
  "spark.biodatageeks.bam.useSparkBAM" -> "false"
).foreach { case (confKey, confValue) => ss.sqlContext.setConf(confKey, confValue) }
/* Register the exome (WES) and whole-genome (WGS) BAM files as tables. */
val exomeTableDdl = """
CREATE TABLE IF NOT EXISTS reads_exome
USING org.biodatageeks.datasources.BAM.BAMDataSource
OPTIONS(path '/data/granges/exome/NA12878.proper.wes.bam')"""
// NOTE(review): the WGS BAM is read from the exome directory -- confirm the path is intended.
val wgsTableDdl = """
CREATE TABLE IF NOT EXISTS reads_wgs
USING org.biodatageeks.datasources.BAM.BAMDataSource
OPTIONS(path '/data/granges/exome/NA12878.proper.wgs.bam')"""
// Run the statements in the original order: exome first, then WGS.
Seq(exomeTableDdl, wgsTableDdl).foreach(ddl => ss.sql(ddl))
/* Per-base coverage: repeat for cores = 1 ONLY, because this is compared against samtools only. */
// Exome sample: time the query plus a full count of the result.
spark.time(ss.sql("SELECT * FROM bdg_coverage('reads_exome','NA12878', 'bases')").count())
// Whole-genome sample: same measurement.
spark.time(ss.sql("SELECT * FROM bdg_coverage('reads_wgs','NA12878', 'bases')").count())
/* Store block-level coverage as tab-delimited BED output; repeat for cores = 1, 5, 10. */
// Each dataset writes to its own output path. Both writes use mode("overwrite"),
// so sharing a single path would let the WGS run delete the exome result.
// coalesce(1) collapses the result to one partition before writing.
spark.time {
  ss.sql(s"SELECT * FROM bdg_coverage('reads_exome','NA12878', 'blocks')")
    .coalesce(1)
    .write.mode("overwrite").option("delimiter", "\t").csv("/data/granges/exome/coverage.bed")
}
spark.time {
  ss.sql(s"SELECT * FROM bdg_coverage('reads_wgs','NA12878', 'blocks')")
    .coalesce(1)
    .write.mode("overwrite").option("delimiter", "\t").csv("/data/granges/exome/coverage_wgs.bed")
}
/*** NANOPORE **/
// Lower the console log level for this part of the session.
sc.setLogLevel("WARN")
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}
// Build and register a SeQuiLa session for the nanopore run. This re-binds `ss`,
// so the section can also be pasted into a fresh spark-shell on its own.
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
// Set on `ss`: no value named `session` is defined in this script, so the previous
// `session.sqlContext...` call could not compile.
ss.sqlContext.setConf(BDGInternalParams.ShowAllPositions, "true")
// Optional alternative read-alignment backend, left disabled:
//ss.sqlContext.setConf("spark.biodatageeks.readAligment.method", "disq")

// Register the guppy nanopore BAM as table `qreads`.
val qreadsDdl = """
CREATE TABLE IF NOT EXISTS qreads
USING org.biodatageeks.datasources.BAM.BAMDataSource
OPTIONS(path '/data/granges/nanopore/guppy.bam')"""
ss.sql(qreadsDdl)

// Disabled alternative input (chr21 subset). A local-file URI needs three slashes
// (file:///...); with only two, "data" would be parsed as the URI host.
// ss.sql("""
// CREATE TABLE IF NOT EXISTS qreads
// USING org.biodatageeks.datasources.BAM.BAMDataSource
// OPTIONS(path 'file:///data/work/nanopore_bam/NA12878.chr21.quality.bam')""")

// Quick sanity check, left disabled:
//ss.sql("select * from qreads limit 10").show
// Timed count variant, left disabled:
//spark.time{ ss.sql(s"SELECT contigName, start, coverage FROM bdg_coverage('qreads','NA12878', 'bases')").count }

// Per-base coverage of the nanopore reads, reduced to contig, position and coverage.
val cov = ss.sql("SELECT contigName, start, coverage FROM bdg_coverage('qreads','NA12878', 'bases')")
// Time the export of the coverage as tab-delimited output from a single partition.
spark.time {
  cov
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("delimiter", "\t")
    .csv("/data/granges/nanopore/guppy_cov.bed")
}
@mwiewior
Copy link

// Follow-up snippet from the gist discussion: registers table `qreads` from the local
// chr21 nanopore BAM, addressed with a three-slash `file:///` URI. Because of
// IF NOT EXISTS, the statement leaves an already existing `qreads` table in place.
ss.sql("""
  CREATE TABLE IF NOT EXISTS qreads 
  USING org.biodatageeks.datasources.BAM.BAMDataSource 
  OPTIONS(path 'file:///data/work/nanopore_bam/NA12878.chr21.quality.bam')""")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment