TomConlin · March 19, 2018 03:26 · pnrobinson · Mar 19, 2018
diff --git a/v2small_check_20180318.txt b/v2small_check_20180318.txt
 head -1 rare-diseases/annotated/v2smallFiles/DECIPHER-18.tab | tr '\t' '\n' | grep -n .
 1:#diseaseID            <- inconsistent capitalization
 2:diseaseName
 3:phenotypeId
 4:phenotypeName
 5:onsetId
 6:onsetName
 7:frequency
 8:sex
 9:negation
 10:modifier
 11:description
 12:publication
 13:evidence
 14:assignedBy
 15:dateCreated

 ########################

 # easier to survey the records in one pile
 grep -h -v "^#" rare-diseases/annotated/v2smallFiles/*.tab > v2small.tab

 # howmany rows
 wc -l < v2small.tab
 92,727  

 # right number of columns every time?
 awk -F'\t' 'NF!=15' v2small.tab
 # yep

 # disease identifier types 
 cut -f1  -d':' v2small.tab | sort | uniq -c | sort -nr
  92430 OMIM
    297 DECIPHER

 # files with the most rows
 cut -f1  v2small.tab |  uniq -c | sort -nr| head 
    131 OMIM:312870
    128 OMIM:180849
    108 OMIM:607872
    108 OMIM:194050
     99 OMIM:613406
     95 OMIM:270400
     94 OMIM:194190
     87 OMIM:601803
     87 OMIM:122470
     85 OMIM:305600

 #  check if any disease identifiers in more than one file?
 cut -f1  v2small.tab |  uniq -c | sort -nr| cut -c9- | uniq -c | sort -nr | head  
      1 OMIM:617537
      1 OMIM:617526
      1 OMIM:617506
      1 OMIM:617478
      1 OMIM:617468
      1 OMIM:617466
      1 OMIM:617460
      1 OMIM:617452
      1 OMIM:617450
      1 OMIM:617442
 # no

 # summary stats on rows per file
 cut -f1  v2small.tab |  uniq -c | sort -nr| cut -c1-8 | sumstat.r
       V1        
 Min.   :  1.00  
 1st Qu.:  4.00  
 Median :  8.00  
 Mean   : 12.61  
 3rd Qu.: 17.00  
 Max.   :131.00  
 [1] "sd :12.46"

 ###########################################
 # sumstat.r 
 #! /usr/bin/Rscript --vanilla
 	x <- read.csv('stdin', header = F); 
 	summary(x);
 	sprintf("sd :%.02f", sd(x[,1]));
 ###########################################





 ###########################################
 # diseaseID
 # howmany distinct disease identifiers
 cut -f1 v2small.tab | uniq | wc -l
 7351

 #####################################
 # diseaseName
 # howmany distinct disease ... "word thingies"(tm)
 cut -f2 v2small.tab | uniq | wc -l
 12378
 # not 1:1

 # sometimes appears to be a ';;' separated list
 # howmany are lists? 
 cut -f2 v2small.tab | sort -u | grep -c  ";;" 
 2437

 # How many list semicolons are followed by a space
 cut -f2 v2small.tab | sort -u | grep -c  ";; "
 4

 # How many list semicolons are followed by a letter
 cut -f2 v2small.tab | sort -u | egrep -c  ";;[A-Za-z]"
 2428

 # TODO standardise on space or not after semicolon in list

 # many are purely OMIM screaming caps
 cut -f2 v2small.tab | grep -c  "^[A-Z0-9; ]*$" | sort -u
 23143

 # some contain lowercase 
 cut -f2 v2small.tab | grep "[a-z]" | sort -u | wc -l
 375

 # a few contain no capitals at all; mostly cytogenetic locations
 cut -f2 v2small.tab | grep -v "[A-Z]" | sort -u

 12q14 microdeletion syndrome
 15q13.3 microdeletion syndrome
 15q24 recurrent microdeletion syndrome
 15q26 overgrowth syndrome
 16p11.2-p12.2 microdeletion syndrome
 17q21.31 recurrent microdeletion syndrome
 1p36 microdeletion syndrome
 1q21.1 recurrent microdeletion (susceptibility locus for neurodevelopmental disorders)
 1q21.1 recurrent microduplication (possible susceptibility locus for neurodevelopmental disorders)
 22q11.2 distal deletion syndrome
 22q11 duplication syndrome
 2p15-16.1 microdeletion syndrome
 2q33.1 deletion syndrome
 2q37 monosomy
 3q29 microdeletion syndrome
 7q11.23 duplication syndrome
 8p23.1 deletion syndrome
 8p23.1 duplication syndrome
 9q subtelomeric deletion syndrome


 ##########################################
 # phenotypeId
 # howmany?
 cut -f3 v2small.tab | wc -l
 92727
 # unique?
 cut -f3 v2small.tab | sort -u | wc -l
 6994

 # have correct curie prefix?
 cut -f3 v2small.tab | cut -f1 -d':' | uniq -c
  92727 HP

 ################################
 # phenotypeName
 cut -f4 v2small.tab  | wc -l
 92727
 cut -f4 v2small.tab  | sort -u | wc -l
 6994

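 # anything not simple words? any punctuation indicating lists?
 cut -f4 v2small.tab  | grep -v "[a-z A-Z]*"
 # nothing
 # (caveat: "[a-z A-Z]*" also matches the empty string, so every line matches
 #  and grep -v can never print anything; this check is vacuous.
 #  A real check would be:  cut -f4 v2small.tab | grep "[^a-zA-Z ]" )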
 ################################
 # onsetId
 cut -f5 v2small.tab | sort | uniq -c | sort -nr
  92194 
    134 HP:0003577
     97 HP:0003593
     87 HP:0011463
     74 HP:0003623
     50 HP:0003581
     45 HP:0003621
     33 HP:0003584
      8 HP:0011462
      3 HP:0003596
      1 HP:0011461
      1 HP:0003674

 #####################################
 # onsetName
 cut -f6 v2small.tab | sort | uniq -c | sort -nr
  92194 
    134 Congenital onset
     97 Infantile onset
     87 Childhood onset
     74 Neonatal onset
     50 Adult onset
     45 Juvenile onset
     33 Late onset
      8 Young adult onset
      3 Middle age onset
      1 Onset
      1 Fetal onset

 # total onsets
 cut -f6 v2small.tab | grep -cv "^$" 
 533

 # Note one: onset terms can be merged
 # Note two: terms should be comparable  
 #           before?  during?  after?

 ##########################################
 ##########################################
 # frequency  
 # sparse collection of:
 #   nothing 
 #	identifiers for ontology terms
 #	proper rationals
 #	percentages (including ranges of percentages)

 # easy: express percentages more uniformly
 # hard: recover the proper rational the percentage was derived from (curating pubs)
 #
 # mixing identifiers and rationals; 
 # I guess one sparse column is better than two
 # but what would really make it worthwhile is if the identifiers 
 # referenced a value (back in the ontology) which allowed them to be comparable
 # (even approximately) to the proper rationals (and percentages)
 #
 # Okay, we sort of have that, but uncomputably, buried in a string

 HP:0040283   "Occasional (29-5%)"

 Maybe something like statements along the lines of: 

 <HP:0040283>  <???:greater than or equal to>  0.05 . 
 <HP:0040283>  <???:less than or equal to>  0.29 .


 ----------------------------------------------------

 cut -f7 v2small.tab | sort | uniq -c | sort -nr | head
 84602 
   5087 HP:0040283
    682 HP:0040282
    179 HP:0040281
    116 2/2
     87 3/3
     74 HP:0040280
     70 7.5000%
     59 1/3

 # terms present are:
 cut -f7 v2small.tab | sort | uniq -c | sort -nr | grep "HP"
   5087 HP:0040283
    682 HP:0040282
    179 HP:0040281
     74 HP:0040280
     23 HP:0040284

 # more common percentages
 cut -f7 v2small.tab | sort | uniq -c | sort -nr | grep "%"
     70 7.5000%
     22 50.0000%
     20 50%
     16 30%
     15 25%
     13 20%
     12 10%
     11 15%
      9 35%
      9 33.0000%
      8 25.0000%
      8 100%

 # standardize 50% v.s. 50.0000% ,  25% v.s. 25.0000%  etc

 # most common rationals
 cut -f7 v2small.tab | sort | uniq -c | sort -nr | grep "/" | head 
    116 2/2
     87 3/3
     59 1/3
     51 1/2
     50 2/3
     40 1/1
     38 5/5
     34 4/4
     31 6/6
     28 2/4


 # As mentioned in the first big file survey I recommend
 # abandoning percentages for normalized Reals in 0.0 - 1.0


 # are all rationals proper?
 cut -f7 v2small.tab | awk -F'/' '$2>0{if($1>$2)print}'
 # of course they are. 

 ##################################
 # sex
 cut -f8 v2small.tab | sort | uniq -c | sort -nr
  92647 
     58 Male
     22 Female

 # Male and Female are unfortunate terms to use
 # because case-insensitive searches for "male" also return feMALEs
 # no I am not joking.

 ################################
 # negation
 cut -f9 v2small.tab | sort | uniq -c | sort -nr
  91939 
    788 NOT

 #################################
 # modifier
 cut -f10 v2small.tab | sort | uniq -c | sort -nr
  91950 
    307 HP:0012825
    194 HP:0012828
     53 HP:0003676
     42 HP:0025303
     35 HP:0012829
     27 HP:0012832
     26 HP:0012833
     23 HP:0031796
     16 HP:0031375
     15 HP:0012826
     12 HP:0012840
     11 HP:0012839
      8 HP:0012837
      5 HP:0011010
      1 HP:0030650
      1 HP:0012827
      1 HP:0003831

 ########################################
 # description
 # cut -f11 v2small.tab | sort | uniq -c | sort -nr | less
 # 60357  empty
 # OMIM screaming caps (mostly descriptive) 
 # and some other more random hint-like statements
 # including data that visually seems like it belongs in other columns


 frequency
 awk -F'\t' '/.*[0-9]* OF [0-9].*/ {print $7, substr($11,index($11,"("))}' v2small.tab
 (12 OF 62 PATIENTS)
 HP:0040283 (IN 1 OF 3 SIBLINGS)
 HP:0040283 (IN 1 OF 3 SIBLINGS)
 HP:0040283 (IN 1 OF 3 SIBLINGS)
 HP:0040283 (IN 1 OF 3 SIBLINGS)
 HP:0040282 (IN 2 OF 3 SIBLINGS)
 HP:0040282 (IN 2 OF 3 SIBLINGS)
 HP:0040282 (IN 2 OF 3 SIBLINGS)
 HP:0040282 (IN 2 OF 3 SIBLINGS)
 HP:0040282 (IN 2 OF 3 SIBLINGS)
 HP:0040282 (IN 2 OF 3 SIBLINGS)
 HP:0040282 (IN 2 OF 3 SIBLINGS) 
 13/23 (13 OF 23 PATIENTS)                note agreement w/PATIENTS here
 6/22 (6 OF 22)
 9/22 (9 OF 22)
 12/22 (12 OF 22)
 11/21 (11 OF 21)
 9/22 (9 OF 22)
 8/23 (8 OF 23)
 HP:0040282 (IN 2 OF 3 FAMILIES)
 HP:0040282 (IN 2 OF 3 FAMILIES)
 HP:0040282 (IN 1 OF 3 FAMILIES)
 HP:0040282 (IN 1 OF 3 FAMILIES)
 HP:0040282 (IN 2 OF 3 FAMILIES)
 (IN 2 OF 3 PATIENTS)                    note absence of frequency here
 (IN 2 OF 3 PATIENTS)
 HP:0040282 (IN 1 OF 4 PATIENTS)
 (IN 2 OF 6 PATIENTS)
 HP:0040282 (IN 1 OF 3 PATIENTS)          note switching to HP term here 
 HP:0040282 (IN 1 OF 3 PATIENTS)
 HP:0040282 (IN 1 OF 3 PATIENTS)
 HP:0040282 (IN 1 OF 3 PATIENTS)
 HP:0040282 (IN 1 OF 3 PATIENTS)
 HP:0040282 (IN 2 OF 3 PATIENTS)
 HP:0040282 (IN 1 OF 3 PATIENTS)
 (IN 2 OF 3 PATIENTS)
 (IN 2 OF 3 PATIENTS)
 HP:0040282 (IN 2 OF 3 PATIENTS)
 OMIM-CS:MOLECULAR BASIS > A CONTIGUOUS GENE SYNDROME CAUSED BY DELETION OF 180KB ENCOMPASSING
 HP:0040283 (IN 2 OF 9 PATIENTS)
 (IN 2 OF 4 PATIENTS FROM 1 FAMILY)
 HP:0040282 (IN 2 OF 7 PATIENTS)
 HP:0040282 (IN 2 OF 7 PATIENTS)
 HP:0040282 (IN 2 OF 7 PATIENTS)
 HP:0040282 (IN 2 OF 3 PATIENTS)
 HP:0040282 (IN 1 OF 3 PATIENTS)
 HP:0040282 (IN 2 OF 3 PATIENTS)
 HP:0040282 (IN 2 OF 3 PATIENTS)
 (IN 1 OF 2 SIBS)
 (IN 2 OF 4 PATIENTS)
 (IN 2 MEMBERS OF 1 FAMILY)
 (IN 1 OF 2 PATIENTS)
 (IN 1 OF 2 PATIENTS)
 (IN 2 OF 4 PATIENTS)
 (IN 2 OF 4 PATIENTS)
 (IN 1 OF 4 PATIENTS)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 	sex
 cut -f11 v2small.tab | grep -ci male 
 645
 # that is a lot more than the 80 explicitly specified

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 	negation

 awk -F'\t' '$11=="NOT"{print $9,$11}' v2small.tab 
 NOT NOT
 NOT NOT
 # redundant with the negation column

 # Are other NOTs captureable which aren't now?

 awk -F'\t' '($9!="NOT") && /.* NOT .*/{print $11}' v2small.tab 

 OMIM-CS:MISCELLANEOUS > NOT ALL PATIENTS HAVE A MYOPATHY
 OMIM-CS:MUSCLE, SOFT TISSUE > EPISODIC WEAKNESS MAY OR MAY NOT OCCUR INDEPENDENT OF MYOTONIA
 OMIM-CS:INHERITANCE > AUTOSOMAL RECESSIVE NOT EXCLUDED, BUT PROBABLY MULTIFACTORIAL
 OMIM-CS:INHERITANCE > AUTOSOMAL RECESSIVE NOT EXCLUDED, BUT PROBABLY MULTIFACTORIAL
 OMIM-CS:NEUROLOGIC_CENTRAL NERVOUS SYSTEM > DYSMYELINATION OF THE BRAIN, MYELIN IS NOT FORMED PROPERLY

 # none of those strike me as a cleanly negated phenotype. 

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 	onset

 cut -f11 v2small.tab | grep  -ic "onset" 
 392

 # That is smaller than the number of explicit onsets (533)
 # Maybe they are all covered.

 awk -F '\t' '($6=="") && $11 ~ /.* ONSET .*/' v2small.tab  | wc -l
 291
 
 # nope only 101 of them are. 
 # I could produce suggestions for the others if you like.


 #################################################################
 # publication
 # hmmm... plural but more like
 # citations  

 # can be lists (with a different separator than the previous list)
 # can be missing curie suffix  `OMIM:`  (sixty-nine like this) 
 # can be url
 # can be ISBN
 # can mix curie case 	`PMID:17918734;pmid:12687501`
 # can be spaced out   	`PMID:    17223397`
 # can be bare integer  	`12089525` 
 # can be folks   		`HPO:sdoelken`
 # there can be space after list separators (or not)


 # prefixes (comments from stale big file survey still applies here) 
 cut -f12 v2small.tab | cut -f1 -d":" |sort | uniq -c | sort -nr 
  86477 OMIM
   5041 PMID
    536 http
    425 ISBN-13
    223 DECIPHER
     15 HPO
      8 ISBN-10
      1 PMID12687501;PMID
      1 ISBN


 # howmany are lists:
 cut -f12 v2small.tab | grep -c ';'
 372

 # space after semicolon
 cut -f12 v2small.tab | grep -c '; '
 51

 # next item after semicolon
 cut -f12 v2small.tab | grep -c ';[^ ]'
 321

 # space after item before semicolon
 cut -f12 v2small.tab | grep -c ' ;'
 0

 ###############################################################
 # Note: 
 # web search on 'GO_evidence_code' returns correct code descriptions as top hit 
 # evidence
 cut -f13 v2small.tab | sort | uniq -c | sort -nr 
  43897 TAS
  42405 IEA
   6400 PCS
     25 ICE

 # total GO codes
 cut -f13 v2small.tab | grep -vc "^$"
 92727

 #####################################
 # assignedBy
 cut -f14 v2small.tab | sort | uniq -c | sort -nr 
  42088 HPO:skoehler
  36962 HPO:iea
  13523 HPO:probinson
     51 HPO:lccarmody
     49 HPO:sdoelken
     34 ZFIN:bruef; HPO:sdoelken
     13 HPO:curators
      6 PATOC:GVG; PATOC:PS
      1 HPO:nvasilevsky

 # comments from stale big file survey still apply here

 ##################################################
 # dateCreated
 cut -f15 v2small.tab | sort | uniq -c | sort -nr | head
  40213 2009-02-17
   7718 2017-07-13
   6532 2012-10-17
   2434 2015-12-30
   2185 2010-06-20
   1958 2010-06-19
   1145 2012-11-18
   1089 2010-06-18
   1014 2012-04-24
    942 2014-11-26


 # mature
 cut -f15 v2small.tab | sort -u | head
 2009-02-17
 2009-07-24
 2009-07-31
 2009-08-31
 2009-09-17
 2009-10-01
 2009-10-02
 2009-10-09
 2009-10-15
 2009-10-16

 # recent 
 cut -f15 v2small.tab | sort -u | tail
 2017-12-11
 2017-12-12
 2017-12-13
 2017-12-17
 2017-12-22
 2018-01-25
 2018-01-28
 2018-03-04
 2018-03-05
 2018-03-07
  

 # check the rest for impossible dates

 for date in $(cut -f15 v2small.tab | sort -u); do 
 	date --date=${date}; 
 done | grep invalid

 # the dates all look valid.
	head -1 rare-diseases/annotated/v2smallFiles/DECIPHER-18.tab | tr '\t' '\n' | grep -n .
	1:#diseaseID <- inconsistent capitalization
	2:diseaseName
	3:phenotypeId
	4:phenotypeName
	5:onsetId
	6:onsetName
	7:frequency
	8:sex
	9:negation
	10:modifier
	11:description
	12:publication
	13:evidence
	14:assignedBy
	15:dateCreated

	########################

	# easier to survey the records in one pile
	grep -h -v "^#" rare-diseases/annotated/v2smallFiles/*.tab > v2small.tab

	# howmany rows
	wc -l < v2small.tab
	92,727

	# right number of columns every time?
	awk -F'\t' 'NF!=15' v2small.tab
	# yep

	# disease identifier types
	cut -f1 -d':' v2small.tab \| sort \| uniq -c \| sort -nr
	92430 OMIM
	297 DECIPHER

	# files with the most rows
	cut -f1 v2small.tab \| uniq -c \| sort -nr\| head
	131 OMIM:312870
	128 OMIM:180849
	108 OMIM:607872
	108 OMIM:194050
	99 OMIM:613406
	95 OMIM:270400
	94 OMIM:194190
	87 OMIM:601803
	87 OMIM:122470
	85 OMIM:305600

	# check if any disease identifiers in more than one file?
	cut -f1 v2small.tab \| uniq -c \| sort -nr\| cut -c9- \| uniq -c \| sort -nr \| head
	1 OMIM:617537
	1 OMIM:617526
	1 OMIM:617506
	1 OMIM:617478
	1 OMIM:617468
	1 OMIM:617466
	1 OMIM:617460
	1 OMIM:617452
	1 OMIM:617450
	1 OMIM:617442
	# no

	# summary stats on rows per file
	cut -f1 v2small.tab \| uniq -c \| sort -nr\| cut -c1-8 \| sumstat.r
	V1
	Min. : 1.00
	1st Qu.: 4.00
	Median : 8.00
	Mean : 12.61
	3rd Qu.: 17.00
	Max. :131.00
	[1] "sd :12.46"

	###########################################
	# sumstat.r
	#! /usr/bin/Rscript --vanilla
	x <- read.csv('stdin', header = F);
	summary(x);
	sprintf("sd :%.02f", sd(x[,1]));
	###########################################





	###########################################
	# diseaseID
	# howmany distinct disease identifiers
	cut -f1 v2small.tab \| uniq \| wc -l
	7351

	#####################################
	# diseaseName
	# howmany distinct disease ... "word thingies"(tm)
	cut -f2 v2small.tab \| uniq \| wc -l
	12378
	# not 1:1

	# sometimes appears to be a ';;' separated list
	# howmany are lists?
	cut -f2 v2small.tab \| sort -u \| grep -c ";;"
	2437

	# How many list semicolons are followed by a space
	cut -f2 v2small.tab \| sort -u \| grep -c ";; "
	4

	# How many list semicolons are followed by a letter
	cut -f2 v2small.tab \| sort -u \| egrep -c ";;[A-Za-z]"
	2428

	# TODO standardise on space or not after semicolon in list

	# many are purely OMIM screaming caps
	cut -f2 v2small.tab \| grep -c "^[A-Z0-9; ]*$" \| sort -u
	23143

	# some contain lowercase
	cut -f2 v2small.tab \| grep "[a-z]" \| sort -u \| wc -l
	375

	# a few contain no capitals at all; mostly cytogenetic locations
	cut -f2 v2small.tab \| grep -v "[A-Z]" \| sort -u

	12q14 microdeletion syndrome
	15q13.3 microdeletion syndrome
	15q24 recurrent microdeletion syndrome
	15q26 overgrowth syndrome
	16p11.2-p12.2 microdeletion syndrome
	17q21.31 recurrent microdeletion syndrome
	1p36 microdeletion syndrome
	1q21.1 recurrent microdeletion (susceptibility locus for neurodevelopmental disorders)
	1q21.1 recurrent microduplication (possible susceptibility locus for neurodevelopmental disorders)
	22q11.2 distal deletion syndrome
	22q11 duplication syndrome
	2p15-16.1 microdeletion syndrome
	2q33.1 deletion syndrome
	2q37 monosomy
	3q29 microdeletion syndrome
	7q11.23 duplication syndrome
	8p23.1 deletion syndrome
	8p23.1 duplication syndrome
	9q subtelomeric deletion syndrome


	##########################################
	# phenotypeId
	# howmany?
	cut -f3 v2small.tab \| wc -l
	92727
	# unique?
	cut -f3 v2small.tab \| sort -u \| wc -l
	6994

	# have correct curie prefix?
	cut -f3 v2small.tab \| cut -f1 -d':' \| uniq -c
	92727 HP

	################################
	# phenotypeName
	cut -f4 v2small.tab \| wc -l
	92727
	cut -f4 v2small.tab \| sort -u \| wc -l
	6994

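	# anything not simple words? any punctuation indicating lists?
	cut -f4 v2small.tab | grep -v "[a-z A-Z]*"
	# nothing
	# (caveat: "[a-z A-Z]*" also matches the empty string, so every line matches
	#  and grep -v can never print anything; this check is vacuous.
	#  A real check would be:  cut -f4 v2small.tab | grep "[^a-zA-Z ]" )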
	################################
	# onsetId
	cut -f5 v2small.tab \| sort \| uniq -c \| sort -nr
	92194
	134 HP:0003577
	97 HP:0003593
	87 HP:0011463
	74 HP:0003623
	50 HP:0003581
	45 HP:0003621
	33 HP:0003584
	8 HP:0011462
	3 HP:0003596
	1 HP:0011461
	1 HP:0003674

	#####################################
	# onsetName
	cut -f6 v2small.tab \| sort \| uniq -c \| sort -nr
	92194
	134 Congenital onset
	97 Infantile onset
	87 Childhood onset
	74 Neonatal onset
	50 Adult onset
	45 Juvenile onset
	33 Late onset
	8 Young adult onset
	3 Middle age onset
	1 Onset
	1 Fetal onset

	# total onsets
	cut -f6 v2small.tab \| grep -cv "^$"
	533

	# Note one: onset terms can be merged
	# Note two: terms should be comparable
	# before? during? after?

	##########################################
	##########################################
	# frequency
	# sparse collection of:
	# nothing
	# identifiers for ontology terms
	# proper rationals
	# percentages (including ranges of percentages)

	# easy: express percentages more uniformly
	# hard: recover the proper rational the percentage was derived from (curating pubs)
	#
	# mixing identifiers and rationals;
	# I guess one sparse column is better than two
	# but what would really make it worthwhile is if the identifiers
	# referenced a value (back in the ontology) which allowed them to be comparable
	# (even approximately) to the proper rationals (and percentages)
	#
	# Okay, we sort of have that, but uncomputably, buried in a string

	HP:0040283 "Occasional (29-5%)"

	Maybe something like statements along the lines of:

	<HP:0040283> <???:greater than or equal to> 0.05 .
	<HP:0040283> <???:less than or equal to> 0.29 .


	----------------------------------------------------

	cut -f7 v2small.tab \| sort \| uniq -c \| sort -nr \| head
	84602
	5087 HP:0040283
	682 HP:0040282
	179 HP:0040281
	116 2/2
	87 3/3
	74 HP:0040280
	70 7.5000%
	59 1/3

	# terms present are:
	cut -f7 v2small.tab \| sort \| uniq -c \| sort -nr \| grep "HP"
	5087 HP:0040283
	682 HP:0040282
	179 HP:0040281
	74 HP:0040280
	23 HP:0040284

	# more common percentages
	cut -f7 v2small.tab \| sort \| uniq -c \| sort -nr \| grep "%"
	70 7.5000%
	22 50.0000%
	20 50%
	16 30%
	15 25%
	13 20%
	12 10%
	11 15%
	9 35%
	9 33.0000%
	8 25.0000%
	8 100%

	# standardize 50% v.s. 50.0000% , 25% v.s. 25.0000% etc

	# most common rationals
	cut -f7 v2small.tab \| sort \| uniq -c \| sort -nr \| grep "/" \| head
	116 2/2
	87 3/3
	59 1/3
	51 1/2
	50 2/3
	40 1/1
	38 5/5
	34 4/4
	31 6/6
	28 2/4


	# As mentioned in the first big file survey I recommend
	# abandoning percentages for normalized Reals in 0.0 - 1.0


	# are all rationals proper?
	cut -f7 v2small.tab \| awk -F'/' '$2>0{if($1>$2)print}'
	# of course they are.

	##################################
	# sex
	cut -f8 v2small.tab \| sort \| uniq -c \| sort -nr
	92647
	58 Male
	22 Female

	# Male and Female are unfortunate terms to use
	# because case-insensitive searches for "male" also return feMALEs
	# no I am not joking.

	################################
	# negation
	cut -f9 v2small.tab \| sort \| uniq -c \| sort -nr
	91939
	788 NOT

	#################################
	# modifier
	cut -f10 v2small.tab \| sort \| uniq -c \| sort -nr
	91950
	307 HP:0012825
	194 HP:0012828
	53 HP:0003676
	42 HP:0025303
	35 HP:0012829
	27 HP:0012832
	26 HP:0012833
	23 HP:0031796
	16 HP:0031375
	15 HP:0012826
	12 HP:0012840
	11 HP:0012839
	8 HP:0012837
	5 HP:0011010
	1 HP:0030650
	1 HP:0012827
	1 HP:0003831

	########################################
	# description
	# cut -f11 v2small.tab \| sort \| uniq -c \| sort -nr \| less
	# 60357 empty
	# OMIM screaming caps (mostly descriptive)
	# and some other more random hint-like statements
	# including data that visually seems like it belongs in other columns


	frequency
	awk -F'\t' '/.*[0-9]* OF [0-9].*/ {print $7, substr($11,index($11,"("))}' v2small.tab
	(12 OF 62 PATIENTS)
	HP:0040283 (IN 1 OF 3 SIBLINGS)
	HP:0040283 (IN 1 OF 3 SIBLINGS)
	HP:0040283 (IN 1 OF 3 SIBLINGS)
	HP:0040283 (IN 1 OF 3 SIBLINGS)
	HP:0040282 (IN 2 OF 3 SIBLINGS)
	HP:0040282 (IN 2 OF 3 SIBLINGS)
	HP:0040282 (IN 2 OF 3 SIBLINGS)
	HP:0040282 (IN 2 OF 3 SIBLINGS)
	HP:0040282 (IN 2 OF 3 SIBLINGS)
	HP:0040282 (IN 2 OF 3 SIBLINGS)
	HP:0040282 (IN 2 OF 3 SIBLINGS)
	13/23 (13 OF 23 PATIENTS) note agreement w/PATIENTS here
	6/22 (6 OF 22)
	9/22 (9 OF 22)
	12/22 (12 OF 22)
	11/21 (11 OF 21)
	9/22 (9 OF 22)
	8/23 (8 OF 23)
	HP:0040282 (IN 2 OF 3 FAMILIES)
	HP:0040282 (IN 2 OF 3 FAMILIES)
	HP:0040282 (IN 1 OF 3 FAMILIES)
	HP:0040282 (IN 1 OF 3 FAMILIES)
	HP:0040282 (IN 2 OF 3 FAMILIES)
	(IN 2 OF 3 PATIENTS) note absence of frequency here
	(IN 2 OF 3 PATIENTS)
	HP:0040282 (IN 1 OF 4 PATIENTS)
	(IN 2 OF 6 PATIENTS)
	HP:0040282 (IN 1 OF 3 PATIENTS) note switching to HP term here
	HP:0040282 (IN 1 OF 3 PATIENTS)
	HP:0040282 (IN 1 OF 3 PATIENTS)
	HP:0040282 (IN 1 OF 3 PATIENTS)
	HP:0040282 (IN 1 OF 3 PATIENTS)
	HP:0040282 (IN 2 OF 3 PATIENTS)
	HP:0040282 (IN 1 OF 3 PATIENTS)
	(IN 2 OF 3 PATIENTS)
	(IN 2 OF 3 PATIENTS)
	HP:0040282 (IN 2 OF 3 PATIENTS)
	OMIM-CS:MOLECULAR BASIS > A CONTIGUOUS GENE SYNDROME CAUSED BY DELETION OF 180KB ENCOMPASSING
	HP:0040283 (IN 2 OF 9 PATIENTS)
	(IN 2 OF 4 PATIENTS FROM 1 FAMILY)
	HP:0040282 (IN 2 OF 7 PATIENTS)
	HP:0040282 (IN 2 OF 7 PATIENTS)
	HP:0040282 (IN 2 OF 7 PATIENTS)
	HP:0040282 (IN 2 OF 3 PATIENTS)
	HP:0040282 (IN 1 OF 3 PATIENTS)
	HP:0040282 (IN 2 OF 3 PATIENTS)
	HP:0040282 (IN 2 OF 3 PATIENTS)
	(IN 1 OF 2 SIBS)
	(IN 2 OF 4 PATIENTS)
	(IN 2 MEMBERS OF 1 FAMILY)
	(IN 1 OF 2 PATIENTS)
	(IN 1 OF 2 PATIENTS)
	(IN 2 OF 4 PATIENTS)
	(IN 2 OF 4 PATIENTS)
	(IN 1 OF 4 PATIENTS)
	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

	sex
	cut -f11 v2small.tab \| grep -ci male
	645
	# that is a lot more than the 80 explicitly specified

	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	negation

	awk -F'\t' '$11=="NOT"{print $9,$11}' v2small.tab
	NOT NOT
	NOT NOT
	# redundant with the negation column

	# Are other NOTs captureable which aren't now?

	awk -F'\t' '($9!="NOT") && /.* NOT .*/{print $11}' v2small.tab

	OMIM-CS:MISCELLANEOUS > NOT ALL PATIENTS HAVE A MYOPATHY
	OMIM-CS:MUSCLE, SOFT TISSUE > EPISODIC WEAKNESS MAY OR MAY NOT OCCUR INDEPENDENT OF MYOTONIA
	OMIM-CS:INHERITANCE > AUTOSOMAL RECESSIVE NOT EXCLUDED, BUT PROBABLY MULTIFACTORIAL
	OMIM-CS:INHERITANCE > AUTOSOMAL RECESSIVE NOT EXCLUDED, BUT PROBABLY MULTIFACTORIAL
	OMIM-CS:NEUROLOGIC_CENTRAL NERVOUS SYSTEM > DYSMYELINATION OF THE BRAIN, MYELIN IS NOT FORMED PROPERLY

	# none of those strike me as a cleanly negated phenotype.

	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	onset

	cut -f11 v2small.tab \| grep -ic "onset"
	392

	# That is smaller than the number of explicit onsets (533)
	# Maybe they are all covered.

	awk -F '\t' '($6=="") && $11 ~ /.* ONSET .*/' v2small.tab \| wc -l
	291

	# nope only 101 of them are.
	# I could produce suggestions for the others if you like.


	#################################################################
	# publication
	# hmmm... plural but more like
	# citations

	# can be lists (with a different separator than the previous list)
	# can be missing curie suffix `OMIM:` (sixty-nine like this)
	# can be url
	# can be ISBN
	# can mix curie case `PMID:17918734;pmid:12687501`
	# can be spaced out `PMID: 17223397`
	# can be bare integer `12089525`
	# can be folks `HPO:sdoelken`
	# there can be space after list separators (or not)


	# prefixes (comments from stale big file survey still applies here)
	cut -f12 v2small.tab \| cut -f1 -d":" \|sort \| uniq -c \| sort -nr
	86477 OMIM
	5041 PMID
	536 http
	425 ISBN-13
	223 DECIPHER
	15 HPO
	8 ISBN-10
	1 PMID12687501;PMID
	1 ISBN


	# howmany are lists:
	cut -f12 v2small.tab \| grep -c ';'
	372

	# space after semicolon
	cut -f12 v2small.tab \| grep -c '; '
	51

	# next item after semicolon
	cut -f12 v2small.tab \| grep -c ';[^ ]'
	321

	# space after item before semicolon
	cut -f12 v2small.tab \| grep -c ' ;'
	0

	###############################################################
	# Note:
	# web search on 'GO_evidence_code' returns correct code descriptions as top hit
	# evidence
	cut -f13 v2small.tab \| sort \| uniq -c \| sort -nr
	43897 TAS
	42405 IEA
	6400 PCS
	25 ICE

	# total GO codes
	cut -f13 v2small.tab \| grep -vc "^$"
	92727

	#####################################
	# assignedBy
	cut -f14 v2small.tab \| sort \| uniq -c \| sort -nr
	42088 HPO:skoehler
	36962 HPO:iea
	13523 HPO:probinson
	51 HPO:lccarmody
	49 HPO:sdoelken
	34 ZFIN:bruef; HPO:sdoelken
	13 HPO:curators
	6 PATOC:GVG; PATOC:PS
	1 HPO:nvasilevsky

	# comments from stale big file survey still apply here

	##################################################
	# dateCreated
	cut -f15 v2small.tab \| sort \| uniq -c \| sort -nr \| head
	40213 2009-02-17
	7718 2017-07-13
	6532 2012-10-17
	2434 2015-12-30
	2185 2010-06-20
	1958 2010-06-19
	1145 2012-11-18
	1089 2010-06-18
	1014 2012-04-24
	942 2014-11-26


	# mature
	cut -f15 v2small.tab \| sort -u \| head
	2009-02-17
	2009-07-24
	2009-07-31
	2009-08-31
	2009-09-17
	2009-10-01
	2009-10-02
	2009-10-09
	2009-10-15
	2009-10-16

	# recent
	cut -f15 v2small.tab \| sort -u \| tail
	2017-12-11
	2017-12-12
	2017-12-13
	2017-12-17
	2017-12-22
	2018-01-25
	2018-01-28
	2018-03-04
	2018-03-05
	2018-03-07


	# check the rest for impossible dates

	for date in $(cut -f15 v2small.tab | sort -u); do
	date --date=${date};
	done | grep invalid

	# the dates all look valid.