TomConlin · June 17, 2021 18:17
diff --git a/FriendsOfEntropy b/FriendsOfEntropy

 gvcf is not just "gzipped" vcf  it is some "genomic" extension to the format

 # test
 vcftools --gzvcf  ./data/gvcf/IDTX_PCRfree_PI562985_ATTACTCG-ATAGAGGC_Sorghum_I549_L1.bwa_pcr_free.raw.snps.indels.g.vcf.gz \
 	--non-ref-ac-any 1 \
 	--recode \
 	--recode-INFO-all \
 	--stdout > test_vcftools_filtered


 ls ./data/gvcf/I*raw.snps.indels.g.vcf.gz |  
 	sed 's|.*[_.]\?\(PI[_]\?[0-9]\{4,7\}\)[_.]\?.*|\1|'|tr -d '_' | sort -u | wc -l

 364  cultivars? lines? strains?


 this "rio" breaks the name pastern of "PInnnnnnn"   ...(it is a [new?] reference strain)

 data/gvcf/IEDF_IEDE_merged_PCRfree_rio.bwa_pcr_free.raw.snps.indels.g.vcf.gz


 cultivars starts as 1.7T gzipped in ~364 files


 time \
 for gvcf in $(ls ./data/gvcf/I*raw.snps.indels.g.vcf.gz); do      
 	cultivar=$(sed 's|.*[_.]\?\(PI[_]\?[0-9]\{4,7\}\)[_.]\?.*|\1|' <(echo -n "$gvcf")|tr -d '_') ;  
 	vcftools --gzvcf "$gvcf" --non-ref-ac-any 1 --recode --recode-INFO-all --stdout >\
 		 "./data/Cultivar/$cultivar.vcf"; 
 done

 ~ 280G uncompressed    &  47G compressed

 compressing ... should have used a parallel compress 
 	tar -cf - Cultivar | pigz -p $(($(nproc)/2)) > cultivar.tar.gz


 the filtering step will have removed every call 
 where the cultivar agrees with the reference.

 leaving

 1,370,111,696 calls   is 3.764M call per cultivar on avg

 more precisely stats for calls per cultivar are

 cut -f2 cultivar_call_count.txt | ~/bin/sumstat.r 

       V1          
 Min.   :      43     only one close to this small likely needs prune/redo
 1st Qu.: 3207119  
 Median : 3755217  
 Mean   : 3774412  
 3rd Qu.: 4245765  
 Max.   :11101842 


 biggest question; 

 	Are all 364 cultivar lines relevant?  -- yes


 --------------------------------------------------------------
 for a variation.

 	is it within a gene  (& which one(s)) 
 		is it within an exon?
 			is it a silent substitution

 	is it in a regulatory region	up/down stream what size
 		can regions be generically characterized? (diminishing periodic? histome sized?)
 	is it in a known binding site (which)

 	does it change the gc content (in which direction)

 	
 need to do a massive crunches

 want maybe 30-40 features total
 so metrics that can be done per cultivar vcf and wind up as some concise value

 1) gc delta of genome
 2) total calls
 3) calls within genes
 4) calls upstream   2k
 5) calls downstream 1k		
 6)	0/1, 1/0, 1/1
 7) ...



 -------------------------------------------------------------------------
 Is the gene symbol|id  w/coordinates I got from OMA the correct assembly?
 	 Is there a preferred one?


 have a permission request for a cultivar mapping file on Goog. no response yet
 https://drive.google.com/file/d/18DI3BqVv7NCJdQM2tu9S69dZww8CXdCB/view


 can Ensembl Plant 49 references be used?  e.g.

 ##gff-version 3
 ##sequence-region   10 1 61233695
 #!genome-build Joint Genome Institute Sorghum_bicolor_NCBIv3
 #!genome-version Sorghum_bicolor_NCBIv3
 #!genome-date 2017-04
 #!genome-build-accession GCA_000003195.3
 #!genebuild-last-updated 2017-06

 Is ncbi's different/better? 
 ##gff-version 3
 #!gff-spec-version 1.21
 #!processor NCBI annotwriter
 #!genome-build Sorghum_bicolor_NCBIv3
 #!genome-build-accession NCBI_Assembly:GCF_000003195.3
 #!annotation-source NCBI Sorghum bicolor Annotation Release 101
 ##sequence-region NC_012870.2 1 80884392
 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=4558


 	

 should we limit to particular cohort of "suspected" genes?


 -------------------------------------------------------------------------------
 vcf call col
  1     2   3   4   5   6       7       8       9       10
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	IDYE_PI92270


 grep -v "^#"  data/Cultivar/PI92270.vcf | head -1 | tr '\t' '\n' | grep -n .
 1:Chr01
 2:104
 3:.
 4:C
 5:A,<NON_REF>
 6:475.77
 7:.
 8:BaseQRankSum=0.046;ClippingRankSum=0.000;DP=21;ExcessHet=3.0103;MLEAC=1,0;MLEAF=0.500,0.00;MQRankSum=3.829;RAW_MQ=54154.00;ReadPosRankSum=-1.038
 9:GT:AD:DP:GQ:PGT:PID:PL:SB
 10:0/1:8,11,0:19:99:0|1:104_C_A:504,0,169,527,208,735:8,0,5,6

 -----------------------------------------------------------------------------

 cut -f 1,2,5 data/Cultivar/PI[0-9]*.vcf | grep -v "^#" | more



 with straight cultivars ... say we want snp that occur in apx half the cultivars.


 with phylo trees  making  "blurred-lines" cluster of cultivars.... clustivars  
 	want snps that dominate within a clustivar 
 	and occur in apx. half the clustivars  maybe the middle half or middle third? 	


 # get a list of all the plain snps (no indels)  ... expect about a billion
 # chr location cultivar and non-ref base


 # trying the  bash history substitution trick to quote a string
 # awk -F'[/:,\t]' 'BEGIN{OSF="\t"}$7~/^A$|^C$|^T$|^G$/{print $4,$5,$7,substr($3,1,length($3)-4)}'
 !:q
 CMD="awk -F'[/:,\t]' 'BEGIN{OSF=\"\t\"}$7~/^A$|^C$|^T$|^G$/{print $4,$5,$7,substr($3,1,length($3)-4)}'"


 time \
 grep -v "^#"  data/Cultivar/PI[0-9]*.vcf |
    parallel -j16 "$CMD {} > chr_loc_snp_cul.tab"


 time \
 grep -v "^#"  data/Cultivar/PI[0-9]*.vcf |
    parallel -j24 awk -F'[/:,\t]' 'BEGIN{OSF="\t"}$7~/^A$|^C$|^T$|^G$/{print $4,$5,$7,substr($3,1,length($3)-4)}' {} > chr_loc_snp_cul.tab

 ...gaaaahhh too slow

 time \
 grep -v "^#"  data/Cultivar/PI[0-9]*.vcf |
 	awk -F '[/:,\t]' 'BEGIN{OSF="\t"}$7 ~ /^A$|^C$|^T$|^G$/{print $4,$5,$7,substr($3,1,length($3)-4)}' |
 	sort --parallel=6 -k1,2n,3,4  > chr_loc_snp_cul.tab

 real	42m49.090s


 wc -l < chr_loc_snp_cul.tab
 1,277,633,097 

 next pass, accumulate genomic locations with how many hits (from cultivars) it has
 to reject the too-sparse and the ubiquitous

 time cut -f 1,2,3  -d ' '  foo | uniq -c | sort --parallel=4 -nr > count_snp.txt

 real	4m35.117s

 look at the distribution  are there obvious cut offs?

 cut -c1-8 ~/fuse/monarch4/Projects/GenoPhenoEnvo/count_snp.txt| sumstat.r 
       V1        
 Min.   :  1.00  
 1st Qu.:  1.00  
 Median :  2.00  
 Mean   : 22.11  
 3rd Qu.: 13.00  
 Max.   :362.0


 ouch. 
 nothing anywhere near half. not shocking but hope does spring eternal


 # anything in the distribution of counts?
 cut -c1-8 count_snp.txt | uniq -c > count_of_counts.txt

 cut -c1-8 ~/fuse/monarch4/Projects/GenoPhenoEnvo/count_of_counts.txt| sumstat.r 
       V1          
 Min.   :    2,962  
 1st Qu.:   10,092  
 Median :   17,198  
 Mean   :  159,661  
 3rd Qu.:   47,859  
 Max.   :23709912 

 counts w/ between 10k & 50k dominate, skewed to the low side  not immediately helpful.


 clearly we can cut off all the single instance snps

 24M exist only once and ~5k exist everywhere which is zero info in either case

 maybe a different approach that looks  for distance from max information
 for *this* collection of ~360 cultivars how far is a snp from being included in half



 # lop the the top & bottom now as we will later anyway  
 awk '$1<330 && $1>30 {print $1<180?180-$1:$1-180,$2,$3,$4}' count_snp.txt |
 	sort -n > middling-snp_count.txt

 # anything in the distribution of middling counts?
 cut -f1 -d ' ' middling-snp_count.txt | uniq -c > count_of_middling-counts.txt

 cut -c1-8 ~/fuse/monarch4/Projects/GenoPhenoEnvo/count_of_middling-counts.txt|sumstat.r 

 Min.   : 16457  
 1st Qu.: 37663  
 Median : 46814  
 Mean   : 63162  
 3rd Qu.: 71216  
 Max.   :165330

 that is a smoother distribution


 cut -f1 -d ' ' ~/fuse/monarch4/Projects/GenoPhenoEnvo/middling-snp_count.txt | 
 	uniq -c |
 	~/bin/otsu.awk
 32

 that is a warm fuzzy number

 # how many snps are kept?
 awk '$1<=32{print}' middling-snp_count.txt | wc -l
 1,185,000

 # for how many chromosome or parts there of
 awk '$1<=32{print $2}' middling-snp_count.txt | sort -u | wc -l
 482


 So keeping *snps* which exist within 150-to-210 of the 360 lines
 results in about 1.1M (distinct) out of 1.4B snps flagged for future consideration.
 (to be specific I am not considering indels at this point)


 fgrep  -f <(cut -c3- middling-snp_count.txt)  foo  > middling-snp-all.txt

 # all individual discriminating snps 
 wc -l < middling-snp-all.txt

 59,451,572   ~60 M individual snps
 166,666 snps per cultivar is uniformly distributed


 # Distribution of individual snps over cultivars.

 cut -f4 -d ' ' middling-snp-all.txt | sort | uniq -c | sort -nr > 

 ohhh looks very nice  100k to 250k per cultivar


 cut -f4 -d ' ' ~/fuse/monarch4/Projects/GenoPhenoEnvo/middling-snp-all.txt | 
 	sort | uniq -c | cut -c1-8 | sumstat.r

 cut -f4 -d ' ' ~/fuse/monarch4/Projects/GenoPhenoEnvo/middling-snp-all.txt | sort | uniq -c | cut -c1-8 | sumstat.r 
       V1        
 Min.   :    13  
 1st Qu.:142376  
 Median :161990  
 Mean   :163778  
 3rd Qu.:187212  
 Max.   :248036  

 not perfectly uniform, but close enough.


 make a "--regions-file" to filter the per cultivar vcf 
 	to provide a filter for [bv]cftools


 cut -f 1,2 -d ' ' middling-snp-all.txt | tr ' ' '\t' | sort -u |
 	sort  -k1,1 -k2,2n  > middling-region.tsv

 wc -l < middling-region.tsv
 330,228


 Note: these 330k spots may have a base other than one that is "middling"
 	or have more than one viable base ... (any but ref) 


 for c in ./PI[0-9]*.vcf; do bgzip  $c ; done

 for f in ./PI[0-9]*.vcf.gz ; do tabix -p vcf  $f; done 





 # ended up filtering with tablix instead of [bv]cftools here ; seems even slower
 # bcftools --gvcf data/Cultivar/PI[0-9]*.vcf.gz	 --out data/Middling  --regions-file  middling-region.tsv 

 test case
 tabix -R middling-region.tsv -p vcf data/Cultivar/PI92270.vcf.gz
 results in  176,664  out of  3,884,467  calls kept  (removed 95%)



 ###  well there goes a whole day

 takes it down to 60M calls   that is 12G (uncompressed)  2.1G gzipped

 # [--cache|--offline|--database]

 ~/GitHub/Ensembl/ensembl-vep/vep --offline --species sorghum_bicolor -in Middling/PI303658.vcf


 tried snagging Kent's .vep/ cache but it is complaining
 so retrying with 


 cd $HOME/.vep
 curl -O ftp://ftp.ensemblgenomes.org/pub/plants/current/variation/vep/sorghum_bicolor_vep_49_Sorghum_bicolor_NCBIv3.tar.gz 
 tar xzf sorghum_bicolor_vep_49_Sorghum_bicolor_NCBIv3.tar.gz



 needs the EnsembleGenome version number (49) as it
 is different than the Ensembl build number (102)

 ~/GitHub/Ensembl/ensembl-vep/vep --offline --cache_version 49 --species sorghum_bicolor -i Middling/PI303658.vcf



 is having problems because  chr0N  does not have a leading capital 'C'
 
 for i in $(seq 1 9) ; do echo -e "$i\tChr0$i"; done \
 	>> ~/.vep/sorghum_bicolor/49_Sorghum_bicolor_NCBIv3/chr_synonyms.txt


 echo -e "10\tChr10" >> ~/.vep/sorghum_bicolor/49_Sorghum_bicolor_NCBIv3/chr_synonyms.txt


 lots of complaints of   "super_nnn" not found in annotation sources or synonyms

 but spot checks they do exist as values in 
 ~/.vep/sorghum_bicolor/49_Sorghum_bicolor_NCBIv3/chr_synonyms.txt

 I wonder if I should put the inverse in the dict... don't know.


 awk -F '\t' '$2 ~ /super_[0-9]+/{print $2 "\t" $1}' \
 	~/.vep/sorghum_bicolor/49_Sorghum_bicolor_NCBIv3/chr_synonyms.txt >> \
 	~/.vep/sorghum_bicolor/49_Sorghum_bicolor_NCBIv3/chr_synonyms.txt

 guess I will try it out ...

 still see the complaints so that was not the solution to all the:
 ...
 ...
 WARNING: Chromosome super_85 not found in annotation sources or synonyms on line 131356
 ...
 ...

 so maybe it is reporting the other way around; 
 that the contig is not im my sample.

 -----------------------------------------------------------

 ~/GitHub/Ensembl/ensembl-vep/vep \
 	--offline \
 	--cache_version 49 \
 	--species sorghum_bicolor \
 	--force_overwrite \
 	-i Middling/PI303658.vcf


 For this generic particular test case there are:
 	 ~130k calls and vep finds ~16.5k hit genes 

 (it is pretty fast,  about a minute. so maybe 6 hours compute for all of them)

 the cached annotation file may not be rich enough to capture things like 
 	regulatory regions / other features

 I do see up/down stream and don't know if those are count as gene hits or not
 but they might work as an approximate proxy for regulation


 SIFT and PolyPhen filters are not available (don't know enough to say why)

 running with  `--everything`  takes quite a bit longer ~ 300 seconds
 bloats the output from 30M to 50M

 some human genome metadata there  ... not seeing it as a win

 time \
 for f in ./data/Middling/PI[0-9]*.vcf ; do
 	g=${f##*/} 
 	# tabix -R middling-region.tsv -p vcf "$f" 
 	~/GitHub/Ensembl/ensembl-vep/vep \
 	--offline \
 	--cache_version 49 \
 	--species sorghum_bicolor \
 	--force_overwrite \
 	-i "$f" \
 	-o "./data/MidVep/$g"
 done 


 # real	369m23.404s    6-hrs & 10-min

 du -sh ./data/MidVep/
 14G  (added 2G of "stuff")


 What is in it? (my same generic particular sample in this case)

 grep -v "^#" ./data/MidVep/PI92270.vcf | cut -f7 | sort | uniq -c | sort -nr
  93733 intergenic_variant
  76547 upstream_gene_variant
  69342 downstream_gene_variant
  18395 intron_variant
   9639 coding_sequence_variant
   6163 3_prime_UTR_variant
   5399 5_prime_UTR_variant
     24 start_lost,coding_sequence_variant
     11 non_coding_transcript_exon_variant
      1 start_lost,coding_sequence_variant,5_prime_UTR_variant

 don't see any here but there could be premature-stops and read-through
 maybe that is part of what the missing SIFT / PolyPhen filters would provide 



 variants in coding sequences across all cultivars

 fgrep -c "coding_sequence_variant" ./data/MidVep/PI[0-9]*.vcf | cut -f2 -d ':' | sumstat.r 
       V1       
 Min.   :    0  
 1st Qu.: 8316  
 Median : 9142  
 Mean   : 9116  
 3rd Qu.:10126  
 Max.   :13256  

 very consistent,  a little less than 10k coding snps per cultivar



 Note:
 ```
 perl INSTALL.pl -a p -g list
 WARNING: DBD::mysql module not found. VEP can only run in offline (--offline) mode without DBD::mysql installed

 http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#requirements

 WARNING: The following plugins have not been found: list
 Available plugins: 
 AncestralAllele,Blosum62,CADD,CSN,Carol,Condel,Conservation,
 DisGeNET,Downstream,Draw,ExAC,ExACpLI,FATHMM,FATHMM_MKL,FlagLRG,FunMotifs,
 G2P,GO,GeneSplicer,Gwava,HGVSIntronOffset,LD,LOVD,LoF,LoFtool,LocalID,MPC,MTR,
 Mastermind,MaxEntScan,NearestExonJB,NearestGene,PON_P2,Phenotypes,PostGAP,
 ProteinSeqs,REVEL,ReferenceQuality,SameCodon,SingleLetterAA,SpliceAI,SpliceRegion,
 StructuralVariantOverlap,SubsetVCF,TSSDistance,dbNSFP,dbscSNV,gnomADc,miRNA,
 neXtProt,satMutMPRA

 ```

 SIFT is not immediately among them


 time \
 > ag -v "^#" ./data/MidVep/PI[0-9]*.vcf | cut -f7 | sort | uniq -c | sort -nr

 31,292,150 intergenic_variant
 25,978,416 upstream_gene_variant
 23,811,383 downstream_gene_variant
 8,458,950 intron_variant
 3,298,203 coding_sequence_variant
 2,205,214 3_prime_UTR_variant
 1,877,106 5_prime_UTR_variant
     8,639 non_coding_transcript_exon_variant
     8,638 start_lost,coding_sequence_variant
       786 coding_sequence_variant,intron_variant
       749 coding_sequence_variant,3_prime_UTR_variant
       392 coding_sequence_variant,3_prime_UTR_variant,intron_variant
       207 coding_sequence_variant,5_prime_UTR_variant
       135 start_lost,coding_sequence_variant,5_prime_UTR_variant
         2 coding_sequence_variant,5_prime_UTR_variant,intron_variant

 real	6m38.872s
 user	7m34.265s

 ---------------------------------------------------------------



 those 3.3M snps in coding regions look like the hottest properties

 how many genes?
 

 ag "coding_sequence_variant" ./data/MidVep/PI[0-9]*.vcf |
 	cut -f4 |
 	sort | 
 	uniq -c |
 	sort -nr > middling_genes.counts

 wc -l <  middling_genes.counts
 4455

 there are ~40k genes in the genome so  ~10% of the genes 

 there are about 21 genes which must lack sufficient representation across our set
 as there fewer calls than than half our count of cultivars  [1-60 calls]

 and there is one 
 	 119,773 SORBI_3001G353900
 with over twice the average that may want a closer look.


 other than that the counts of calls per gene  within a line look like:

 cut -c1-8  middling_genes.counts| sumstat.r 
       V1          
 Min.   :     1.0  
 1st Qu.:   181.0  
 Median :   350.0  
 Mean   :   742.8  
 3rd Qu.:   690.0  
 Max.   :119773.0  


 the main story is the majority *could* be genes with a small handful of snps 

 We could maybe take the step of learning over the 4k genes as well as the 300 cultivars
 it has the advantage of allowing more columns to consider whild maintaining a 10:1 shape.


 want to know about snps per gene and genes per line

 time\ 
 ag "coding_sequence_variant" ./data/MidVep/PI[0-9]*.vcf |
 	cut -f1,4 |
 	sed 's|data/MidVep/\(PI[0-9]*\).vcf.*\t\(.*\)|\1\t\2|' |
 	sort| uniq -c |
 	awk '{print $2 "\t" $3 "\t" $1}' > line_gene_calls.tab


 # calls per gene
 cut -f3 ./line_gene_calls.tab  | sumstat.r 
       V1         
 Min.   :  1.000  
 1st Qu.:  1.000  
 Median :  2.000  
 Mean   :  3.911  
 3rd Qu.:  3.000  
 Max.   :645.000



 # gene per line
 cut -f1 ./line_gene_calls.tab  |sort | uniq -c | cut -c1-8 | sumstat.r 
       V1      
 Min.   :  23  
 1st Qu.:2105  
 Median :2364  
 Mean   :2337  
 3rd Qu.:2611  
 Max.   :3433  


 So we get a couple of thousand genes per line ... 
 does not contradict the previous finding of ~4.5k genes in play which contained 
 snps specifically chosen to split the set of lines roughly in half.


 to recap 

 everything (1.7TB) 
 	-> variants (w/o ref ~50GB)
 		-> discriminating SNPs
 			-> genes-w/hi-entropy-snps (discriminating)
 				- cultivar clusters based on gene perturbations (snp count within gene)	

 I wonder if I can recover sets of genes-w/ds which discriminate these lines as well


 so for each of the genes, how many lines use them  
 	then look for the ones jolly in the middle 


 # lines per gene
 cut -f2 ./line_gene_calls.tab  |sort | uniq -c | cut -c1-8 | sumstat.r
       V1       
 Min.   :  1.0  
 1st Qu.:177.0  
 Median :183.0  
 Mean   :189.9  
 3rd Qu.:188.0  
 Max.   :351.0  

 kinda surprised it tracks so perfectly 

 still not out of the woods though

 the lines themselves will not be in any (biologically) meaningful sense "independent"
 so the sets discriminating genes/snps will also be ... clumpy 

 we can cluster the lines/genes... jaccard distance for a first pass perhaps

 ... bi-clustering seems like it might be a legit approach here (maybe).


 - omit gene SORBI_3001G353900  with its 120k calls across the board

 - omit the 21 genes which contain less than 60 call across the board


 awk '$1 > 100 && $1 < 100000{print $2}' middling_genes.counts > selected_genes.txt
 wc -l < selected_genes.txt 
 4431

 the only lines that look a bit weak are

 cut -f1 ./line_gene_calls.tab  |sort | uniq -c | sort -nr | tail -2
    706 PI606706
     23 PI564163

 but I can just leave them in now and they can fall out on their own later.

 cut -f1 ./line_gene_calls.tab  |sort | uniq -c | sort -nr |cut -c9- > selected_cultivar.txt
 wc -l < selected_cultivar.txt 
 362


 # made a quick sparse representation to full dataframe converter

 ./scripts/sp2df.awk -F'\t' line_gene_calls.tab > gene_cultivar_call.dataframe

 moved to R-lang/R-README

 Ishita was not liking the R formatted data (save file) 

 there is still maybe another basic question   something like   

 how many lines include a particular gene   ... 
 the stats above preclude finding a smoking gun there... 
 still looking for something that shows if the collection 
 must be unhelpfully lopsided in some way 


 sqlite3 gpe
 sqlite> .mode tabs
 sqlite> create table lgc(line text not null, gene text not null, cnt integer not null);
 sqlite> .import ./line_gene_calls.tab lgc
 sqlite> select count(*) from lgc;
 846,079

 sqlite> create index lgc_line_idx on lgc(line);
 sqlite> create index lgc_gene_idx on lgc(gene);


 sqlite> select line, sum(cnt) snps from lgc group by line order by snps;
 PI564163|53
 PI606706|3221
 PI561072|4647
 PI329435|4694
 ...

 PI455307|12919
 PI329466|12941
 PI569422|13053
 PI562897|13256


 I wonder if the cultivar PI564163 is a (nigh) reference genome.

 ----------------------------------------------------------------------------------------

 Tried  the venerable  BiCat
 https://academic.oup.com/bioinformatics/article/22/10/1282/237365

 .. sub optimal

 	3 of the 7 methods did not fail with Java exceptions
 	  - one method produced (ISA) produced a huge skew (one cluster won too much) 
 	  - Bicluster (OPSM) worked ... lots (48) overlapping clusters... maybe?
 	  - K-means seems to work  (not biclustering)


 to make a list of cultivars and a cluster id

 # awk -F' ' '/^P/{for(i=1;i<=NF;i++)print $i"\t" FNR}' K30-means.out > cultivar_clustK30.tab 
 # ... this cultivar_clustK30.tab  is not as expected. 
 # the k-means keeps a sub set of genes  but _every_ cultivar

 awk -F' ' '/^S/{for(i=1;i<=NF;i++)print $i"\t" FNR}' K30-means.out > gene_clustK30.tab


 awk -F' ' '/^P/{for(i=1;i<=NF;i++)print $i"\t" FNR}' OPSM_bicluster.out > cultivar_clustOPSM.tab 


 ### back in sql land
 .mode tabs

 drop table if exists K30;
 create table K30(gene text not null, clust integer not null);
 .import ./BiCat/gene_clustK30.tab K30



 create table OPSM(cultivar text not null, clust integer not null);
 .import ./BiCat/cultivar_clustOPSM.tab OPSM

 .mode tabs
 .output cultivar_k30cluster_score.tab
 select line as cultivar,clust, sum(cnt) score   
 from lgc join K30 on lgc.gene = k30.gene
 group by 1,2
 --limit 30
 ;
 .output


 select a.clust, lgc.gene, sum(lgc.cnt) num, 
 (select count(b.clust) from K30 b where b.cultivar = lgc.line) denom  
 from lgc join K30 a  on lgc.line = a.cultivar
 group by 1,2
 ;
 


 ----------------------------------------------------------------------------------------


 reformat cultivar_k30cluster_score.tab to be more dataframe-ish

 awk '{a[$1,$2]=$3;line[$1]++;c[$2]++}\
 	END{n=asorti(c);for(i=1;i<=n;i++)r=r "\t" c[i];print r;\
 	for(l in line){r=l;for(i=1;i<=n;i++)r=r "\t" a[l,c[i]];print r}}' \
 	cultivar_k30cluster_score.tab > cultivar_k30cluster_score.dataframe

 ########################################################################################

 	Function

 from docs at 
 	https://github.com/genophenoenvo/documentation

 in the Phenotype section we find:

 Genes relevant for the phenotypes of interest were found using Gramene and are here.
 https://docs.google.com/spreadsheets/d/1ugMisjghvSfa0W_TPhA-0_6C8A0X-gwOqPZbzqjJOrg/edit#gid=1246181510

 export & transform that to something useful


 hmmm ~300 gene-trait pairs is most likely insufficient. 
 Especially it they do not happen to fall into the 10% if the genome 
 selected as decidable.

 import as 'KG_gene_query-data.tsv' (11k)


 grep "Sorghum bicolor" KG_gene_query-data.tsv | wc -l
 64

 ahh ... most genes are not for our species 

 grep "Sorghum bicolor" KG_gene_query-data.tsv | cut -f 1 | sort | uniq -c | sort -nr
     32 days to flowering
     17 canopy and biomass
     15 days to flag leaf emergence


 60 genes have three phenotypes



 fgrep -n -f <(grep "Sorghum bicolor" KG_gene_query-data.tsv | cut -f 4 |sort -u) ../GitHub/genomic_data/FriendsOfEntropy/selected_genes.txt 

 472:SORBI_3004G207000
 596:SORBI_3008G136601
 975:SORBI_3001G177100
 1041:SORBI_3010G191800
 2652:SORBI_3003G046800
 2949:SORBI_3003G028300
 3267:SORBI_3009G228700



 Seven of the genes with function are in the set of 4.4k selected

 not getting warm fuzzies here 

 ##.###########################################################################

 2021 Feb:

 Asked to build a phylogenetic tree  of the cultivars based on my F.O.E. approach

 a tree could not really be ancestor based anymore as genetics 
 have been primaraly ignored throughout the filtering process.

 So it is better to just think of it as heirachiacal clustering.


 try generating (brute force) jaccard distance with several cross joins in sqlite.
 blows out remaing disk space on laptop. 


 cp /dev/null cultivar_gene_onehot.txt 

 for col in $(seq 2 363) ; do cut -f "$col" gene_cultivar_call.dataframe | tr '\n' ' '|
 	sed -n 's| [2-9][0-9]*| 1|gp' >> "cultivar_gene_onehot.txt"; echo >> "cultivar_gene_onehot.txt"; done


 pairwise reduce the onehot genes to their Jaccard distance
 
 scripts/onehot_jaccard.awk -F ' ' cultivar_gene_onehot.txt > cultivars_jdistance.txt 

 wc -l <cultivars_jdistance.txt 
 65,341

 should be something like ((362^2)-362)/2  which is ... 65,341


 tail cultivars_jdistance.txt
 358	359	PI656051	PI656056	1110	2248	0.493772
 358	360	PI656051	PI656065	1004	3034	0.330916
 358	361	PI656051	PI673843	1107	2892	0.38278
 358	362	PI656051	PI92270	923	3323	0.277761
 359	360	PI656056	PI656065	934	2926	0.319207
 359	361	PI656056	PI673843	1015	2806	0.361725
 359	362	PI656056	PI92270	857	3211	0.266895
 360	361	PI656065	PI673843	1386	3115	0.444944
 360	362	PI656065	PI92270	1288	3460	0.372254
 361	362	PI673843	PI92270	1225	3484	0.351607


 are there any pairs of cultivars with _no_ genes in common?

 fgrep -c "NAN" cultivars_jdistance.txt
 0
 no there are not.


 most similar?
 sort -k7,7nr cultivars_jdistance.txt | head
 92	93	PI329300	PI329301	2678	2738	0.978086
 205	353	PI540518	PI656023	1792	1877	0.954715
 91	92	PI329299	PI329300	2607	2738	0.952155
 203	204	PI535795	PI535796	1957	2056	0.951848
 91	93	PI329299	PI329301	2609	2742	0.951495
 315	316	PI620072	PI620157	2230	2350	0.948936
 182	262	PI524475	PI569459	2357	2503	0.94167
 257	277	PI569453	PI570087	2252	2411	0.934052
 185	186	PI526905	PI527045	1860	1998	0.930931
 43	48	PI156463	PI157033	1858	1998	0.92993


 least similar?
 sort -k7,7n cultivars_jdistance.txt | head
 137	236	PI329710	PI564163	6	3165	0.00189573
 236	310	PI564163	PI595699	8	2519	0.00317586
 195	236	PI534120	PI564163	8	2426	0.00329761
 57	236	PI181080	PI564163	8	2387	0.00335149
 126	236	PI329585	PI564163	9	2676	0.00336323
 208	236	PI552851	PI564163	8	2329	0.00343495
 110	236	PI329506	PI564163	9	2615	0.00344168
 179	236	PI521280	PI564163	8	2234	0.00358102
 145	236	PI330168	PI564163	10	2726	0.00366838
 79	236	PI266927	PI564163	9	2441	0.00368701


 summary stats on jaccard distances
 cut -f7 cultivars_jdistance.txt | sumstat.r 
       V1          
 Min.   :0.001896  
 1st Qu.:0.315296  
 Median :0.356688  
 Mean   :0.367557  
 3rd Qu.:0.402389  
 Max.   :0.978086  

 summary stats on size of paiwise union of gene sets per cultivar 
 cut -f6 cultivars_jdistance.txt | sumstat.r 
      V1      
 Min.   : 716  
 1st Qu.:3265  
 Median :3456  
 Mean   :3417  
 3rd Qu.:3632  
 Max.   :4239 

 summary stats on size of paiwise intersection of gene sets per cultivar 
 cut -f5 cultivars_jdistance.txt | sumstat.r 
      V1      
 Min.   :   6  
 1st Qu.:1071  
 Median :1235  
 Mean   :1258  
 3rd Qu.:1405  
 Max.   :2811  



 # Record an indexing of the cultivars for distance matrix and h-trees
 
 cut -f1 -d ' '  cultivar_gene_onehot.txt| grep -n . | tr ':' '\t' > index_cultivar.tsv


 cut -f1,2,7 cultivars_jdistance.txt > jaccard-dist-mat-sparse.tsv





 ######################################################################################

 old: (... everything old is new again)

 - consider cohorts that share snps and how well they correlate with the phylogenitic trees


 - tag each cultivar with a id of the clustivar phylogenitic cluster it belongs to
 	use Kents tassle distance matrices to cluster?