TomConlin · March 18, 2018 18:30
diff --git a/v2 big file survey b/v2 big file survey
 big files are already a single file 
 give a short pathname for ease 
 and bring it to the root directory 
 so I don't make a mess in the data directories

 ln -s rare-diseases/annotated/v2bigfiles/phenotype.hpoa v2big.tab 

 Giving the file a non standard suffix which does not denote the format
 is not doing anyone a favor, phenotype is redundant in hpo 

 Perhaps some variant of
 	hpo_annotation_rare.tab  
 	hpo_annotation_common.tab 
 etc

 would tell people who are less familiar, what is in them.


 !!! Note the file appears to be truncated !!!

 tail -3 rare-diseases/annotated/v2bigfiles/phenotype.hpoa 
 ORPHA	183	Eosinophilic granulomatosis with polyangiitis		HP:0012819	ORPHA:183	TAS		HP:0040283			P	2018-03-12	ORPHA:orphadata
 ORPHA	183	Eosinophilic granulomatosis with polyangiitis		HP:0100582	ORPHA:183	TAS		HP:0040283			P	2018-03-12	ORPHA:orphadata
 ORPHA	183	Eosinophilic granulomatosis with polyangiitis		HP:0100584	ORP


 #######################################################################
 wc -l  < v2big.tab  
 103906

 # header check

 head -1  v2big.tab  | tr '\t' '\n' | grep -n .
 1:#DB
 2:DB_Object_ID
 3:DB_Name
 4:Qualifier
 5:HPO_ID
 6:DB_Reference
 7:Evidence
 8:Onset
 9:Frequency
 10:Sex
 11:Modifier
 12:Aspect
 13:Date_Created
 14:Assigned_By

 # columns between big and small files should be as similar as possible
 # and identical when the contents of the columns are the same.


 cut -f1 v2big.tab | sort | uniq -c | sort -nr
  92430 OMIM
  11179 ORPHA
    297 DECIPHER
      1 #DB

 # check for trailing spaces
 cut -f1 v2big.tab | grep " $"


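 # are column 2 local ids strictly numeric?
 cut -f2 v2big.tab | grep -v "^[1-9][0-9]*"
 # yep
 # (caveat: without a trailing '$' the pattern only checks the leading digit;
 #  grep -v '^[1-9][0-9]*$' is the strict test and should list only the header)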

 # all non-zero-padded integers although they are zero-padded when used
 # means searching column finds super strings when searching on substrings 

 # integers are not unique between DBs 
 # (not surprising, everyone loves excel row one)
 cut -f2 v2big.tab | sort -u | wc -l 
 7757
 cut -f1,2 v2big.tab | sort -u | wc -l 
 7771

 # any leading/trailing spaces that need to be trimmed?
 cut -f2 v2big.tab | grep "^ "
 cut -f2 v2big.tab | grep " $"
 # nope

 #########################################
 # third column 
 # this is the description/comment/name-ish thingies  

 #  third column has ';;' separated lists. how many?
 cut -f3 v2big.tab | grep  -c ';;'
 14359

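 # are the characters used most likely sane 7bit ASCII?
 cut -f3 v2big.tab | grep -v "[ -~]*"
 # yep
 # (caveat: "[ -~]*" also matches the empty string, so grep -v can never
 #  print a line here; LC_ALL=C grep -v '^[ -~]*$' is the strict test)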

 # third column trailing spaces 
 cut -f3 v2big.tab | egrep " $" | uniq

 GALLOWAY-MOWAT SYNDROME; GAMOS 
 CRANIOMETADIAPHYSEAL DYSPLASIA; CRMDD 

 # Any field a human could ever have entered
 # should always be confirmed as trimmed.

 ######################################
 # fourth column
 # negation

 cut -f4 v2big.tab | sort | uniq -c | sort -nr
 103118 
    788 NOT
      1 Qualifier

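 # there are no funny chars (anywhere in the file)
 grep -v "[ -~]*" v2big.tab
 # (caveat: "[ -~]*" matches the empty string, so grep -v prints nothing
 #  for any input; a strict test that still allows the tab delimiter is
 #  LC_ALL=C grep -c '[^[:print:][:blank:]]' v2big.tab, which should report 0)

 # and no leading or trailing spaces on the fourth column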

 #################################################
 # column five
 cut -f5 v2big.tab | sort | uniq -c | sort -nr | head
   3317 HP:0000007
   3083 HP:0000006
   1303 HP:0001249
   1046 HP:0001250
   1013 HP:0001263
    873 HP:0004322
    770 HP:0000252
    744 HP:0001290
    731 HP:0001252
    528 HP:0000316
 cut -f5 v2big.tab | sort | uniq -c | sort -nr | tail
      1 HP:0000245
      1 HP:0000223
      1 HP:0000197
      1 HP:0000178
      1 HP:0000177
      1 HP:0000095
      1 HP:0000080
      1 HP:0000045
      1 HP:0000036
      1 HP:0000022

 # Does the curie prefix look reasonable? 
 cut -f5 v2big.tab | cut -f1 -d ':' | uniq -c
      1 HPO_ID
 103906 HP
 # yep

 # distinct 
 cut -f5 v2big.tab | sort -u | wc -l
 7256

 cut -f5 v2big.tab | grep " $"
 # no trailing space

 ##############################################
 # column six

 cut -f6 v2big.tab | sort | uniq -c | sort -nr | head
    354 ISBN-13:978-0721606156
    187 ORPHA:904
    131 ORPHA:567
    130 OMIM:312870
    120 ORPHA:534
    116 ORPHA:550
    115 ORPHA:881
    112 ORPHA:800
    106 ORPHA:84
    101 ORPHA:744

 cut -f6 v2big.tab | sort | uniq -c | sort -nr | tail
      1 ISBN-13:978-0721606156;PMID:12687501
      1 ISBN-13:978-0721606156;http://www.ncbi.nlm.nih.gov/books/NBK1526/;PMID:16868563
      1 ISBN-13:978-0721606156;http://www.ncbi.nlm.nih.gov/books/NBK1191/
      1 ISBN-13:0-19-262896-8;http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=donnai
      1 http://www.ncbi.nlm.nih.gov/books/NBK1198/#gr_22q13_3.Genetic_Counseling
      1 http://www.ncbi.nlm.nih.gov/books/NBK1191/; PMID:17918734
      1 http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=mecp2-dup; PMID:17088400
      1 http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=donnai; PMID:12923867
      1 http://decipher.sanger.ac.uk/syndrome/35
      1 DB_Reference

 # has semicolon lists

 # spaces before semicolon?
 cut -f6 v2big.tab | grep " ;" | wc -l
 0

 # spaces after semicolon?
 cut -f6 v2big.tab | grep "; " | wc -l
 51

 # no space after semicolon?
 cut -f6 v2big.tab | grep ";[^ ]" | wc -l
 321

 # TODO: 
 # Standardize on space after semicolon or not, for ALL lists.
 # Consider identical list delimiter throughout.
 # Some lists already use ';;'  double semicolon
 # I would strongly prefer that to occasionally having a space or not.


 # what curie prefixes are used in the first list item
 cut -f6 v2big.tab | cut -f 1 -d ':' | sort | uniq -c | sort -nr
  86477 OMIM
  11178 ORPHA
   5041 PMID
    536 http
    425 ISBN-13
    223 DECIPHER
     15 HPO
      8 ISBN-10
      1 PMID12687501;PMID
      1 ORP
      1 ISBN
      1 DB_Reference

 # expect PMID12687501 should be PMID:12687501

 cut -f6 v2big.tab | grep '^ISBN:' 
 ISBN:3642035590
 https://books.google.com/books?vid=ISBN3642035590
 # looks legit

 awk -F'\t' '$6=="ORP"' v2big.tab 
 ORPHA	183	Eosinophilic granulomatosis with polyangiitis		HP:0100584	ORP

 #  ORP -> ORPHA:nnn ?


 cut -f6 v2big.tab | grep '^HPO:'
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken
 HPO:sdoelken

 # ... on account of I said so.
 # Minting a mini citation describing the sorts of things
 # a particular curator may feel comfortable asserting may
 # be cleaner here. It also provides context & a venue for feedback.


 #######################################################
 # column seven
 cut -f7 v2big.tab | sort | uniq -c | sort -nr 
  55075 TAS
  42405 IEA
   6400 PCS
     25 ICE
      1 Evidence
      1 

 # as noted before some form of the words "go evidence code"
 # as a header works wonders for comprehension   

 # I recall seeing a discussion to reabsorb ICE
 # which leaves the empty field as the most suspect 

 awk -F '\t' '$7==""' v2big.tab 
 ORPHA	183	Eosinophilic granulomatosis with polyangiitis		HP:0100584	ORP

 # it is that same row as before which is truncated at ORP

 #################################################################
 # column eight

 cut -f8 v2big.tab | sort | uniq -c | sort -nr | head
  103373 
    134 HP:0003577
     97 HP:0003593
     87 HP:0011463
     74 HP:0003623
     50 HP:0003581
     45 HP:0003621
     33 HP:0003584
      8 HP:0011462
      3 HP:0003596

 cut -f8 v2big.tab | sort | uniq -c | sort -nr | tail
     87 HP:0011463
     74 HP:0003623
     50 HP:0003581
     45 HP:0003621
     33 HP:0003584
      8 HP:0011462
      3 HP:0003596
      1 Onset
      1 HP:0011461
      1 HP:0003674

 # As mentioned in the small file notes
 # there are cases where the terms appear to be just created
 # based on whatever was found, 
 # without an attempt to coalesce equivalent terms.
 #
 # Directly associating each  onset term with 
 # comparable values or intervals in the ontology
 # seems critical to me 
 
 ####################################################
 # column nine  frequency

 everything from the small file notes holds here as well
 in particular the ontology terms should be 
 directly associated with values/intervals 
 which allow them to be comparable with the
 rationals and percentages. 

 I would also be happy to see the percentages expressed
 as simple normalized reals in the interval [0.0, 1.0]

 that is: don't multiply by 100.0 and append '%'
 we just have to undo that before we can use it
 and if a human wants to see it that way
 it is an interface rendering issue

 # all proper?
 cut -f9 v2big.tab | awk -F'/' '$2>0{if($1>$2)print}'
 # yep


 #######################################################
 # column ten


 cut -f10 v2big.tab | sort | uniq -c | sort -nr | head
 103826 
     58 Male
     22 Female
      1 Sex

 # no comment

 ########################################################
 # column eleven
 # modifier
 cut -f11 v2big.tab | sort | uniq -c | sort -nr | head
 103129 
    307 HP:0012825
    194 HP:0012828
     53 HP:0003676
     42 HP:0025303
     35 HP:0012829
     27 HP:0012832
     26 HP:0012833
     23 HP:0031796
     16 HP:0031375

 cut -f11 v2big.tab | sort | uniq -c | sort -nr | tail
     16 HP:0031375
     15 HP:0012826
     12 HP:0012840
     11 HP:0012839
      8 HP:0012837
      5 HP:0011010
      1 Modifier
      1 HP:0030650
      1 HP:0012827
      1 HP:0003831

 cut -f11 v2big.tab | cut -f1 -d ':' | sort | uniq -c | sort -nr 
 103129 
    777 HP
      1 Modifier

 cut -f11 v2big.tab | sort -u | wc -l
 19 
 # we see all of them as the last head is the first tail


 #########################################################
 # column twelve
 # aspect

 cut -f12 v2big.tab | cut -f1 -d ':' | sort | uniq -c | sort -nr 
  93888 P
   7605 I
   1774 C
    638 M
      1 Aspect
      1 

 # blank will be the truncated last row

 #####################################################
 # column thirteen

 cut -f13 v2big.tab | sort | uniq -c | sort -nr | head
  40213 2009-02-17
  11178 2018-03-12
   7718 2017-07-13
   6532 2012-10-17
   2434 2015-12-30
   2185 2010-06-20
   1958 2010-06-19
   1145 2012-11-18
   1089 2010-06-18
   1014 2012-04-24



 for date in $(cut -f13 v2big.tab | sort -u); do 
 	date --date=${date}; 
 done | grep invalid

 # the dates are looking good

 ##########################################################
 # column fourteen

 So is "HPO:iea" another person 
 or is it the same as the GO's "inferred from electronic annotation"? 

 ORPHA:orphadata is not electronic annotation?

 HPO:curators and HPO:probinson overlap


 This column strikes me as having room for improvement 
 on the identifier front. Even if its only purpose was 
 vanity metrics and not attribution we could and should do better. 

 cut -f14 v2big.tab | sort | uniq -c | sort -nr 
  42088 HPO:skoehler
  36962 HPO:iea
  13523 HPO:probinson
  11178 ORPHA:orphadata
     51 HPO:lccarmody
     49 HPO:sdoelken
     34 ZFIN:bruef; HPO:sdoelken
     13 HPO:curators
      6 PATOC:GVG; PATOC:PS
      1 HPO:nvasilevsky
      1 Assigned_By
      1 

 cut -f14 v2big.tab | grep -c "; " 
 40
 cut -f14 v2big.tab | grep -c " ;" 
 0
 cut -f14 v2big.tab | grep -c ";[^ ]" 
 0

 # forty lists, all only have a space after the semicolon


 #############################################################
	big files are already a single file
	give a short pathname for ease
	and bring it to the root directory
	so I don't make a mess in the data directories

	ln -s rare-diseases/annotated/v2bigfiles/phenotype.hpoa v2big.tab

	Giving the file a non standard suffix which does not denote the format
	is not doing anyone a favor, phenotype is redundant in hpo

	Perhaps some variant of
	hpo_annotation_rare.tab
	hpo_annotation_common.tab
	etc

	would tell people who are less familiar, what is in them.


	!!! Note the file appears to be truncated !!!

	tail -3 rare-diseases/annotated/v2bigfiles/phenotype.hpoa
	ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0012819 ORPHA:183 TAS HP:0040283 P 2018-03-12 ORPHA:orphadata
	ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0100582 ORPHA:183 TAS HP:0040283 P 2018-03-12 ORPHA:orphadata
	ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0100584 ORP


	#######################################################################
	wc -l < v2big.tab
	103906

	# header check

	head -1 v2big.tab | tr '\t' '\n' | grep -n .
	1:#DB
	2:DB_Object_ID
	3:DB_Name
	4:Qualifier
	5:HPO_ID
	6:DB_Reference
	7:Evidence
	8:Onset
	9:Frequency
	10:Sex
	11:Modifier
	12:Aspect
	13:Date_Created
	14:Assigned_By

	# columns between big and small files should be as similar as possible
	# and identical when the contents of the columns are the same.


	cut -f1 v2big.tab | sort | uniq -c | sort -nr
	92430 OMIM
	11179 ORPHA
	297 DECIPHER
	1 #DB

	# check for trailing spaces
	cut -f1 v2big.tab | grep " $"


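	# are column 2 local ids strictly numeric?
	cut -f2 v2big.tab | grep -v "^[1-9][0-9]*"
	# yep
	# (caveat: without a trailing '$' the pattern only checks the leading digit;
	#  grep -v '^[1-9][0-9]*$' is the strict test and should list only the header)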

	# all non-zero-padded integers although they are zero-padded when used
	# means searching column finds super strings when searching on substrings

	# integers are not unique between DBs
	# (not surprising, everyone loves excel row one)
	cut -f2 v2big.tab | sort -u | wc -l
	7757
	cut -f1,2 v2big.tab | sort -u | wc -l
	7771

	# any leading/trailing spaces that need to be trimmed?
	cut -f2 v2big.tab | grep "^ "
	cut -f2 v2big.tab | grep " $"
	# nope

	#########################################
	# third column
	# this is the description/comment/name-ish thingies

	# third column has ';;' separated lists. how many?
	cut -f3 v2big.tab | grep -c ';;'
	14359

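	# are the characters used most likely sane 7bit ASCII?
	cut -f3 v2big.tab | grep -v "[ -~]*"
	# yep
	# (caveat: "[ -~]*" also matches the empty string, so grep -v can never
	#  print a line here; LC_ALL=C grep -v '^[ -~]*$' is the strict test)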

	# third column trailing spaces
	cut -f3 v2big.tab | egrep " $" | uniq

	GALLOWAY-MOWAT SYNDROME; GAMOS
	CRANIOMETADIAPHYSEAL DYSPLASIA; CRMDD

	# Any field a human could ever have entered
	# should always be confirmed as trimmed.

	######################################
	# fourth column
	# negation

	cut -f4 v2big.tab | sort | uniq -c | sort -nr
	103118
	788 NOT
	1 Qualifier

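	# there are no funny chars (anywhere in the file)
	grep -v "[ -~]*" v2big.tab
	# (caveat: "[ -~]*" matches the empty string, so grep -v prints nothing
	#  for any input; a strict test that still allows the tab delimiter is
	#  LC_ALL=C grep -c '[^[:print:][:blank:]]' v2big.tab, which should report 0)

	# and no leading or trailing spaces on the fourth column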

	#################################################
	# column five
	cut -f5 v2big.tab | sort | uniq -c | sort -nr | head
	3317 HP:0000007
	3083 HP:0000006
	1303 HP:0001249
	1046 HP:0001250
	1013 HP:0001263
	873 HP:0004322
	770 HP:0000252
	744 HP:0001290
	731 HP:0001252
	528 HP:0000316
	cut -f5 v2big.tab | sort | uniq -c | sort -nr | tail
	1 HP:0000245
	1 HP:0000223
	1 HP:0000197
	1 HP:0000178
	1 HP:0000177
	1 HP:0000095
	1 HP:0000080
	1 HP:0000045
	1 HP:0000036
	1 HP:0000022

	# Does the curie prefix look reasonable?
	cut -f5 v2big.tab | cut -f1 -d ':' | uniq -c
	1 HPO_ID
	103906 HP
	# yep

	# distinct
	cut -f5 v2big.tab | sort -u | wc -l
	7256

	cut -f5 v2big.tab | grep " $"
	# no trailing space

	##############################################
	# column six

	cut -f6 v2big.tab | sort | uniq -c | sort -nr | head
	354 ISBN-13:978-0721606156
	187 ORPHA:904
	131 ORPHA:567
	130 OMIM:312870
	120 ORPHA:534
	116 ORPHA:550
	115 ORPHA:881
	112 ORPHA:800
	106 ORPHA:84
	101 ORPHA:744

	cut -f6 v2big.tab | sort | uniq -c | sort -nr | tail
	1 ISBN-13:978-0721606156;PMID:12687501
	1 ISBN-13:978-0721606156;http://www.ncbi.nlm.nih.gov/books/NBK1526/;PMID:16868563
	1 ISBN-13:978-0721606156;http://www.ncbi.nlm.nih.gov/books/NBK1191/
	1 ISBN-13:0-19-262896-8;http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=donnai
	1 http://www.ncbi.nlm.nih.gov/books/NBK1198/#gr_22q13_3.Genetic_Counseling
	1 http://www.ncbi.nlm.nih.gov/books/NBK1191/; PMID:17918734
	1 http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=mecp2-dup; PMID:17088400
	1 http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=donnai; PMID:12923867
	1 http://decipher.sanger.ac.uk/syndrome/35
	1 DB_Reference

	# has semicolon lists

	# spaces before semicolon?
	cut -f6 v2big.tab | grep " ;" | wc -l
	0

	# spaces after semicolon?
	cut -f6 v2big.tab | grep "; " | wc -l
	51

	# no space after semicolon?
	cut -f6 v2big.tab | grep ";[^ ]" | wc -l
	321

	# TODO:
	# Standardize on space after semicolon or not, for ALL lists.
	# Consider identical list delimiter throughout.
	# Some lists already use ';;' double semicolon
	# I would strongly prefer that to occasionally having a space or not.


	# what curie prefixes are used in the first list item
	cut -f6 v2big.tab | cut -f 1 -d ':' | sort | uniq -c | sort -nr
	86477 OMIM
	11178 ORPHA
	5041 PMID
	536 http
	425 ISBN-13
	223 DECIPHER
	15 HPO
	8 ISBN-10
	1 PMID12687501;PMID
	1 ORP
	1 ISBN
	1 DB_Reference

	# expect PMID12687501 should be PMID:12687501

	cut -f6 v2big.tab | grep '^ISBN:'
	ISBN:3642035590
	https://books.google.com/books?vid=ISBN3642035590
	# looks legit

	awk -F'\t' '$6=="ORP"' v2big.tab
	ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0100584 ORP

	# ORP -> ORPHA:nnn ?


	cut -f6 v2big.tab | grep '^HPO:'
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken
	HPO:sdoelken

	# ... on account of I said so.
	# Minting a mini citation describing the sorts of things
	# a particular curator may feel comfortable asserting may
	# be cleaner here. It also provides context & a venue for feedback.


	#######################################################
	# column seven
	cut -f7 v2big.tab | sort | uniq -c | sort -nr
	55075 TAS
	42405 IEA
	6400 PCS
	25 ICE
	1 Evidence
	1

	# as noted before some form of the words "go evidence code"
	# as a header works wonders for comprehension

	# I recall seeing a discussion to reabsorb ICE
	# which leaves the empty field as the most suspect

	awk -F '\t' '$7==""' v2big.tab
	ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0100584 ORP

	# it is that same row as before which is truncated at ORP

	#################################################################
	# column eight

	cut -f8 v2big.tab | sort | uniq -c | sort -nr | head
	103373
	134 HP:0003577
	97 HP:0003593
	87 HP:0011463
	74 HP:0003623
	50 HP:0003581
	45 HP:0003621
	33 HP:0003584
	8 HP:0011462
	3 HP:0003596

	cut -f8 v2big.tab | sort | uniq -c | sort -nr | tail
	87 HP:0011463
	74 HP:0003623
	50 HP:0003581
	45 HP:0003621
	33 HP:0003584
	8 HP:0011462
	3 HP:0003596
	1 Onset
	1 HP:0011461
	1 HP:0003674

	# As mentioned in the small file notes
	# there are cases where the terms appear to be just created
	# based on whatever was found,
	# without an attempt to coalesce equivalent terms.
	#
	# Directly associating each onset term with
	# comparable values or intervals in the ontology
	# seems critical to me

	####################################################
	# column nine frequency

	everything from the small file notes holds here as well
	in particular the ontology terms should be
	directly associated with values/intervals
	which allow them to be comparable with the
	rationals and percentages.

	I would also be happy to see the percentages expressed
	as simple normalized reals in the interval [0.0, 1.0]

	that is: don't multiply by 100.0 and append '%'
	we just have to undo that before we can use it
	and if a human wants to see it that way
	it is an interface rendering issue

	# all proper?
	cut -f9 v2big.tab | awk -F'/' '$2>0{if($1>$2)print}'
	# yep


	#######################################################
	# column ten


	cut -f10 v2big.tab | sort | uniq -c | sort -nr | head
	103826
	58 Male
	22 Female
	1 Sex

	# no comment

	########################################################
	# column eleven
	# modifier
	cut -f11 v2big.tab | sort | uniq -c | sort -nr | head
	103129
	307 HP:0012825
	194 HP:0012828
	53 HP:0003676
	42 HP:0025303
	35 HP:0012829
	27 HP:0012832
	26 HP:0012833
	23 HP:0031796
	16 HP:0031375

	cut -f11 v2big.tab | sort | uniq -c | sort -nr | tail
	16 HP:0031375
	15 HP:0012826
	12 HP:0012840
	11 HP:0012839
	8 HP:0012837
	5 HP:0011010
	1 Modifier
	1 HP:0030650
	1 HP:0012827
	1 HP:0003831

	cut -f11 v2big.tab | cut -f1 -d ':' | sort | uniq -c | sort -nr
	103129
	777 HP
	1 Modifier

	cut -f11 v2big.tab | sort -u | wc -l
	19
	# we see all of them as the last head is the first tail


	#########################################################
	# column twelve
	# aspect

	cut -f12 v2big.tab | cut -f1 -d ':' | sort | uniq -c | sort -nr
	93888 P
	7605 I
	1774 C
	638 M
	1 Aspect
	1

	# blank will be the truncated last row

	#####################################################
	# column thirteen

	cut -f13 v2big.tab | sort | uniq -c | sort -nr | head
	40213 2009-02-17
	11178 2018-03-12
	7718 2017-07-13
	6532 2012-10-17
	2434 2015-12-30
	2185 2010-06-20
	1958 2010-06-19
	1145 2012-11-18
	1089 2010-06-18
	1014 2012-04-24



	for date in $(cut -f13 v2big.tab | sort -u); do
		date --date=${date};
	done | grep invalid

	# the dates are looking good

	##########################################################
	# column fourteen

	So is "HPO:iea" another person
	or is it the same as the GO's "inferred from electronic annotation"?

	ORPHA:orphadata is not electronic annotation?

	HPO:curators and HPO:probinson overlap


	This column strikes me as having room for improvement
	on the identifier front. Even if its only purpose was
	vanity metrics and not attribution we could and should do better.

	cut -f14 v2big.tab | sort | uniq -c | sort -nr
	42088 HPO:skoehler
	36962 HPO:iea
	13523 HPO:probinson
	11178 ORPHA:orphadata
	51 HPO:lccarmody
	49 HPO:sdoelken
	34 ZFIN:bruef; HPO:sdoelken
	13 HPO:curators
	6 PATOC:GVG; PATOC:PS
	1 HPO:nvasilevsky
	1 Assigned_By
	1

	cut -f14 v2big.tab | grep -c "; "
	40
	cut -f14 v2big.tab | grep -c " ;"
	0
	cut -f14 v2big.tab | grep -c ";[^ ]"
	0

	# forty lists, all only have a space after the semicolon


	#############################################################