Created
March 23, 2016 20:24
-
-
Save thash/1c09cefda74b5e5aeb9f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// sudo yum update | |
// sudo yum install -y git | |
// sudo wget http://repos.fedorapeople.org/repos/dchen/apache-maven/epel-apache-maven.repo -O /etc/yum.repos.d/epel-apache-maven.repo | |
// sudo sed -i s/\$releasever/6/g /etc/yum.repos.d/epel-apache-maven.repo | |
// sudo yum install -y apache-maven | |
// git clone https://github.com/bigdatagenomics/adam.git | |
// cd adam | |
// git checkout adam-parent-0.15.0 | |
// export "MAVEN_OPTS=-Xmx512m -XX:MaxPermSize=128m" | |
// # long | |
// mvn clean package -DskipTests | |
// export ADAM_HOME=`pwd` | |
// alias adam-submit="$ADAM_HOME/bin/adam-submit" | |
// export SPARK_HOME=/usr/lib/spark | |
// cat >> ~/.bashrc | |
// export "MAVEN_OPTS=-Xmx512m -XX:MaxPermSize=128m" | |
// export ADAM_HOME=/home/hadoop/adam | |
// export SPARK_HOME=/usr/lib/spark | |
// alias adam-submit="$ADAM_HOME/bin/adam-submit" | |
// ^C | |
/////////////////////////////// | |
// | |
// $ adam-submit | |
// SPARK_HOME must be set for 'adam-submit' | |
// | |
// $ look inside adam-submit... | |
// # Find spark-submit script | |
// if [ -z "$SPARK_HOME" ]; then | |
// echo "SPARK_HOME must be set for 'adam-submit'" | |
// exit 1 | |
// else | |
// SPARK_SUBMIT="$SPARK_HOME"/bin/spark-submit | |
// fi | |
// [root@ip-172-31-28-194 ~]# find / -name "*spark*" | |
// # => たぶん /usr/lib/spark/ | |
// /home/hadoop/adam/bin/adam-submit: 行 64: /usr/lib/spark/bin/utils.sh: そのようなファイルやディレクトリはありません | |
// http://se.bunri-u.ac.jp/~yamamoto/hadoop5/SetupCDH5/spark/index.html | |
// spark-shell | |
// /usr/lib/spark/bin/spark-shell: 行 44: /usr/lib/spark/bin/utils.sh: そのようなファイルやディレクトリはありません | |
// というエラーが表示されるので, https://github.com/apache/spark/blob/master/bin/utils.sh のutils.shを/usr/lib/spark/bin/utils.shにする. | |
// # https://github.com/apache/spark/tree/master/bin => nai | |
// # https://github.com/apache/spark/tree/v1.2.0/bin => aru | |
// curl -O https://raw.githubusercontent.com/apache/spark/v1.2.0/bin/utils.sh | |
// chmod +x utils.sh | |
// sudo mv utils.sh /usr/lib/spark/bin/ | |
// # 1.2 前提らしいがいちおう動くわ | |
// authorized_keys に追加すれば aws emr ssh も動く | |
import org.apache.spark.rdd.RDD | |
import org.bdgenomics.adam.rdd.ADAMContext._ | |
import org.bdgenomics.formats.avro.AlignmentRecord | |
// Load the ADAM-format reads stored at this HDFS path as an RDD of AlignmentRecord.
val readsRDD: RDD[AlignmentRecord] =
  sc.adamLoad("/user/ds/genomics/reads/HG00103.adam")
// Fetch one record to inspect which fields an AlignmentRecord carries.
readsRDD.first()
// res0: org.bdgenomics.formats.avro.AlignmentRecord = {"contig": {"contigName": "1", | |
// "contigLength": 249250621, | |
// "contigMD5": "1b22b98cdeb4a9304cb5d48026a85128", | |
// "referenceURL": "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human", | |
// "assembly": null, | |
// "species": null}, | |
// "start": 9992, | |
// "oldPosition": null, | |
// "end": 10091, | |
// "mapq": 25, | |
// "readName": "SRR062643.12466352", | |
// "sequence": "CTCTTCCGATCTCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT", | |
// "qual": "##@@BA:36<FBGCBBD>AHHB@4DD@B;0DEF6A9EDC6>9CCC@9@IIH@I8IIC4@GH=HGHCIHHHGAGABEGAGG@EGAFHGFFEEE?DEFDDA.", | |
// "cigar": "1S99M", | |
// "oldCigar": null, | |
// "basesTrimmedFromStart": 0, | |
// "basesTrimmedFromEnd": 0, | |
// "readPaired": true, | |
// "properP... | |
readsRDD.count() | |
// res1: Long = 160397565 | |
// Distinct contig names across all reads, collected to the driver.
// The chain stays wrapped in parentheses so it can be pasted into the REPL line by line.
val uniq_chr = (readsRDD
  .map(record => record.contig.contigName.toString)
  .distinct()
  .collect())
// scala> uniq_chr | |
// res2: Array[String] = Array(GL000192.1, GL000194.1, GL000196.1, GL000231.1, GL000198.1, GL000210.1, GL000233.1, GL000212.1, GL000235.1, GL000214.1, GL000237.1, GL000216.1, GL000239.1, GL000218.1, 10, 11, 12, 13, 14, 15, 16, GL000240.1, 17, 18, MT, 19, GL000242.1, GL000221.1, GL000200.1, GL000244.1, GL000223.1, GL000246.1, GL000202.1, GL000225.1, GL000204.1, GL000248.1, GL000227.1, GL000206.1, 1, GL000229.1, 2, GL000208.1, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, hs37d5, GL000191.1, GL000193.1, GL000195.1, GL000230.1, GL000197.1, GL000199.1, GL000232.1, GL000211.1, GL000234.1, GL000213.1, GL000236.1, GL000215.1, GL000238.1, GL000217.1, GL000219.1, X, Y, GL000241.1, GL000220.1, GL000243.1, GL000222.1, GL000245.1, GL000201.1, GL000224.1, GL000247.1, GL000203.1, NC_007605, GL000226.1, GL000205.1, G... | |
// "すべてヒトの染色体に由来するもの" であること | |
uniq_chr.sorted.foreach(println) | |
// 1 | |
// 10 | |
// 11 | |
// 12 | |
// 13 | |
// 14 | |
// 15 | |
// 16 | |
// 17 | |
// 18 | |
// 19 | |
// 2 | |
// 20 | |
// 21 | |
// 22 | |
// 3 | |
// 4 | |
// 5 | |
// 6 | |
// 7 | |
// 8 | |
// 9 | |
// GL000191.1 | |
// ... | |
// GL000248.1 | |
// GL000249.1 | |
// MT | |
// NC_007605 | |
// X | |
// Y | |
// hs37d5 | |
// Reads on contig "7" whose alignment satisfies start <= 117149189 < end,
// collected to the driver. The three conditions are evaluated in the same
// order as separate filters would be (&& short-circuits).
val cftr_reads = (readsRDD
  .filter { read =>
    read.contig.contigName.toString == "7" &&
      read.start <= 117149189 &&
      read.end > 117149189
  }
  .collect())
cftr_reads.length // res6: Int = 9
// scala> cftr_reads | |
// res5: Array[org.bdgenomics.formats.avro.AlignmentRecord] = Array({"contig": {"contigName": "7", "contigLength": 159138663, "contigMD5": "618366e953d6aaad97dbe4777c29375e", "referenceURL": "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human", "assembly": null, "species": null}, "start": 117149104, "oldPosition": null, "end": 117149204, "mapq": 60, "readName": "SRR062642.24026612", "sequence": "TGGCTTCAAAGAAAAATCCTAAACTCATTAATGCCCTTCGGCGATGTTTTTTCTGGAGATTTATGTTCTATGGAATCTTTTTATATTTAGGGGTAAGGAT", "qual": "/LNNPNOPPPPPQQQQPQQLPQQPRQRPQPRQRRRRSQQHRHILLI?MLLHH?D7ICECFMMEEDKN@OCIBJIOIIKQQJJ?C@JIJD?<EEDCED?B>", "cigar": "100M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPa... | |
import org.bdgenomics.adam.predicates.ColumnReaderInput._ | |
import org.bdgenomics.adam.predicates.ADAMPredicate | |
import org.bdgenomics.adam.predicates.RecordCondition | |
import org.bdgenomics.adam.predicates.FieldCondition | |
/**
 * Load-time predicate selecting alignment records that cover position
 * 117149189 on contig "7".
 *
 * The conditions mirror the in-memory `cftr_reads` filter used in this
 * session:
 *  - the contig name is "7": the contig names printed for this data set
 *    have no "chr" prefix, so comparing against "chr7" would match nothing;
 *  - `start <= 117149189`;
 *  - `end > 117149189` (strict, same as the RDD filter; presumably `end` is
 *    an exclusive coordinate -- TODO confirm against the ADAM schema).
 */
class CftrLocusPredicate extends ADAMPredicate[AlignmentRecord] {
  override val recordCondition = RecordCondition[AlignmentRecord](
    FieldCondition(
      "contig.contigName", (x: String) => x == "7"),
    FieldCondition(
      "start", (x: Long) => x <= 117149189),
    FieldCondition(
      "end", (x: Long) => x > 117149189)
  )
}
// Load the same reads file, passing CftrLocusPredicate to adamLoad,
// then collect the matching records to the driver.
val cftr_reads2 = (sc
  .adamLoad[AlignmentRecord, CftrLocusPredicate](
    "/user/ds/genomics/reads/HG00103.adam",
    Some(classOf[CftrLocusPredicate]))
  .collect())
// ERROR............. | |
// org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path does not exist: hdfs://ip-172-31-28-194.ap-northeast-1.compute.internal:8020/user/ds/genomics/reads/HG00103 | |
// ------------- | |
// from README.md | |
```bash | |
hadoop fs -mkdir /user/ds/genomics/dnase | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001UVC/@@download/ENCFF001UVC.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/GM12878.DNase.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001UWQ/@@download/ENCFF001UWQ.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/K562.DNase.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001WEI/@@download/ENCFF001WEI.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/BJ.DNase.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001UVQ/@@download/ENCFF001UVQ.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/HEK293.DNase.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001SOM/@@download/ENCFF001SOM.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/H54.DNase.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001UVU/@@download/ENCFF001UVU.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/HepG2.DNase.narrowPeak | |
``` | |
GENCODE data: | |
```bash | |
curl -s -L "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_18/gencode.v18.annotation.gtf.gz" | gunzip | hadoop fs -put - /user/ds/genomics/gencode.v18.annotation.gtf | |
``` | |
ChIP-seq data for CTCF: | |
```bash | |
hadoop fs -mkdir /user/ds/genomics/chip-seq | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001VED/@@download/ENCFF001VED.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/GM12878.ChIP-seq.CTCF.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001VMZ/@@download/ENCFF001VMZ.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/K562.ChIP-seq.CTCF.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001XMU/@@download/ENCFF001XMU.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/BJ.ChIP-seq.CTCF.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001XQU/@@download/ENCFF001XQU.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/HEK293.ChIP-seq.CTCF.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001USC/@@download/ENCFF001USC.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/H54.ChIP-seq.CTCF.narrowPeak | |
curl -s -L "https://www.encodeproject.org/files/ENCFF001XRC/@@download/ENCFF001XRC.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/HepG2.ChIP-seq.CTCF.narrowPeak | |
``` | |
[hadoop@ip-172-31-28-194 dnase]$ hadoop fs -du -h /user/ds/genomics/ | |
15.9 G /user/ds/genomics/HG00103.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam | |
20.8 M /user/ds/genomics/chip-seq | |
41.1 M /user/ds/genomics/dnase | |
1.0 G /user/ds/genomics/gencode.v18.annotation.gtf | |
12.7 G /user/ds/genomics/reads | |
[hadoop@ip-172-31-28-194 ~]$ hadoop fs -du -h /user/ds/genomics/dnase/ | |
7.5 M /user/ds/genomics/dnase/BJ.DNase.narrowPeak | |
7.0 M /user/ds/genomics/dnase/GM12878.DNase.narrowPeak | |
7.4 M /user/ds/genomics/dnase/H54.DNase.narrowPeak | |
6.4 M /user/ds/genomics/dnase/HEK293.DNase.narrowPeak | |
6.5 M /user/ds/genomics/dnase/HepG2.DNase.narrowPeak | |
6.3 M /user/ds/genomics/dnase/K562.DNase.narrowPeak | |
[hadoop@ip-172-31-28-194 dnase]$ hadoop fs -du -h /user/ds/genomics/chip-seq | |
2.6 M /user/ds/genomics/chip-seq/BJ.ChIP-seq.CTCF.narrowPeak | |
4.7 M /user/ds/genomics/chip-seq/GM12878.ChIP-seq.CTCF.narrowPeak | |
3.6 M /user/ds/genomics/chip-seq/H54.ChIP-seq.CTCF.narrowPeak | |
2.1 M /user/ds/genomics/chip-seq/HEK293.ChIP-seq.CTCF.narrowPeak | |
2.6 M /user/ds/genomics/chip-seq/HepG2.ChIP-seq.CTCF.narrowPeak | |
5.1 M /user/ds/genomics/chip-seq/K562.ChIP-seq.CTCF.narrowPeak | |
phyloP data: | |
```bash | |
hadoop fs -mkdir /user/ds/genomics/phylop_text | |
for i in $(seq 1 22); do | |
echo "chr$i.phyloP46way.wigFix.gz" | |
curl -s -L "http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/phyloP46way/vertebrate/chr$i.phyloP46way.wigFix.gz" | gunzip | adam-submit wigfix2bed | hadoop fs -put - "/user/ds/genomics/phylop_text/chr$i.phyloP46way.wigFix" | |
done | |
curl -s -L "http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/phyloP46way/vertebrate/chrX.phyloP46way.wigFix.gz" | gunzip | adam-submit wigfix2bed | hadoop fs -put - /user/ds/genomics/phylop_text/chrX.phyloP46way.wigFix | |
curl -s -L "http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/phyloP46way/vertebrate/chrY.phyloP46way.wigFix.gz" | gunzip | adam-submit wigfix2bed | hadoop fs -put - /user/ds/genomics/phylop_text/chrY.phyloP46way.wigFix | |
``` | |
[hadoop@ip-172-31-28-194 dnase]$ hadoop fs -du -h /user/ds/genomics/phylop_text | |
2.6 K /user/ds/genomics/phylop_text/chr1.phyloP46way.wigFix | |
2.6 K /user/ds/genomics/phylop_text/chr10.phyloP46way.wigFix | |
2.6 K /user/ds/genomics/phylop_text/chr11.phyloP46way.wigFix | |
2.6 K /user/ds/genomics/phylop_text/chr12.phyloP46way.wigFix | |
2.6 K /user/ds/genomics/phylop_text/chr13.phyloP46way.wigFix | |
2.6 K /user/ds/genomics/phylop_text/chr14.phyloP46way.wigFix | |
2.6 K /user/ds/genomics/phylop_text/chr15.phyloP46way.wigFix | |
2.6 K /user/ds/genomics/phylop_text/chr16.phyloP46way.wigFix | |
2.6 K /user/ds/genomics/phylop_text/chr17.phyloP46way.wigFix | |
2.6 K /user/ds/genomics/phylop_text/chr18.phyloP46way.wigFix | |
... | |
adamBEDFeatureLoad | |
が見つからない | |
import org.bdgenomics.adam.rdd.ADAMContext | |
// Build an ADAMContext directly from the shell's SparkContext.
val ac = new ADAMContext(sc)
// The phyloP files were uploaded to /user/ds/genomics/phylop_text (see the
// `hadoop fs -put` / `hadoop fs -du` commands in this log), not under
// /user/hadoop, so load from that directory.
ac.adamLoad("/user/ds/genomics/phylop_text")
// save がわからん | |
scala> val bHg19Data = sc.broadcast( | |
| new TwoBitFile( | |
| new LocalFileByteAccess( | |
| new File("/user/ds/genomics/hg19.2bit")))) | |
<console>:51: error: not found: type TwoBitFile | |
new TwoBitFile( | |
^ | |
scala> import org.bdgenomics.adam.util.{TwoBitFile, SequenceUtils} | |
<console>:48: error: object TwoBitFile is not a member of package org.bdgenomics.adam.util | |
import org.bdgenomics.adam.util.{TwoBitFile, SequenceUtils} | |
^ | |
// ------------------------------- | |
// 1000 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment