Created
March 18, 2018 18:30
-
-
Save TomConlin/4c6acd99a5f5de0c3f25ae50a3eec340 to your computer and use it in GitHub Desktop.
HPO rare disease annotation bigfile survey
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
big files are already a single file | |
give a short pathname for ease | |
and bring it to the root directory | |
so I don't make a mess in the data directories | |
ln -s rare-diseases/annotated/v2bigfiles/phenotype.hpoa v2big.tab | |
Giving the file a nonstandard suffix which does not denote the format | |
is not doing anyone a favor; "phenotype" is redundant in HPO. | |
Perhaps some variant of | |
hpo_annotation_rare.tab | |
hpo_annotation_common.tab | |
etc | |
would tell people who are less familiar, what is in them. | |
!!! Note the file appears to be truncated !!! | |
tail -3 rare-diseases/annotated/v2bigfiles/phenotype.hpoa | |
ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0012819 ORPHA:183 TAS HP:0040283 P 2018-03-12 ORPHA:orphadata | |
ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0100582 ORPHA:183 TAS HP:0040283 P 2018-03-12 ORPHA:orphadata | |
ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0100584 ORP | |
####################################################################### | |
wc -l < v2big.tab | |
103906 | |
# header check | |
head -1 v2big.tab | tr '\t' '\n' | grep -n . | |
1:#DB | |
2:DB_Object_ID | |
3:DB_Name | |
4:Qualifier | |
5:HPO_ID | |
6:DB_Reference | |
7:Evidence | |
8:Onset | |
9:Frequency | |
10:Sex | |
11:Modifier | |
12:Aspect | |
13:Date_Created | |
14:Assigned_By | |
# columns between big and small files should be as similar as possible | |
# and identical when the contents of the columns are the same. | |
cut -f1 v2big.tab | sort | uniq -c | sort -nr | |
92430 OMIM | |
11179 ORPHA | |
297 DECIPHER | |
1 #DB | |
# check for trailing spaces | |
cut -f1 v2big.tab | grep " $" | |
# are column 2 local ids strictly numeric? | |
cut -f2 v2big.tab | grep -v "^[1-9][0-9]*" | |
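# yep (caveat: the pattern has no '$' anchor, so it only confirms each value | |
# starts with a non-zero digit; '^[1-9][0-9]*$' would be the strict test) | |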
# all non-zero-padded integers although they are zero-padded when used | |
# means searching column finds super strings when searching on substrings | |
# integers are not unique between DBs | |
# (not surprising, everyone loves Excel row one) | |
cut -f2 v2big.tab | sort -u | wc -l | |
7757 | |
cut -f1,2 v2big.tab | sort -u | wc -l | |
7771 | |
# any leading/trailing spaces that need to be trimmed? | |
cut -f2 v2big.tab | grep "^ " | |
cut -f2 v2big.tab | grep " $" | |
# nope | |
######################################### | |
# third column | |
# this is the description/comment/name-ish thingies | |
# third column has ';;' separated lists. how many? | |
cut -f3 v2big.tab | grep -c ';;' | |
14359 | |
# are the characters used most likely sane 7bit ASCII? | |
cut -f3 v2big.tab | grep -v "[ -~]*" | |
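# yep (caveat: '[ -~]*' also matches the empty string, so every line matches | |
# and 'grep -v' can never print anything; '^[ -~]*$' would be the real test) | |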
# third column trailing spaces | |
cut -f3 v2big.tab | egrep " $" | uniq | |
GALLOWAY-MOWAT SYNDROME; GAMOS | |
CRANIOMETADIAPHYSEAL DYSPLASIA; CRMDD | |
# Any field a human could ever have entered | |
# should always be confirmed as trimmed. | |
###################################### | |
# fourth column | |
# negation | |
cut -f4 v2big.tab | sort | uniq -c | sort -nr | |
103118 | |
788 NOT | |
1 Qualifier | |
# there are no funny chars (anywhere in the file) | |
# (caveat: '[ -~]*' matches the empty string, so this 'grep -v' cannot fail) | |
grep -v "[ -~]*" v2big.tab | |
# and no leading or trailing spaces on the fourth column | |
################################################# | |
# column five | |
cut -f5 v2big.tab | sort | uniq -c | sort -nr | head | |
3317 HP:0000007 | |
3083 HP:0000006 | |
1303 HP:0001249 | |
1046 HP:0001250 | |
1013 HP:0001263 | |
873 HP:0004322 | |
770 HP:0000252 | |
744 HP:0001290 | |
731 HP:0001252 | |
528 HP:0000316 | |
cut -f5 v2big.tab | sort | uniq -c | sort -nr | tail | |
1 HP:0000245 | |
1 HP:0000223 | |
1 HP:0000197 | |
1 HP:0000178 | |
1 HP:0000177 | |
1 HP:0000095 | |
1 HP:0000080 | |
1 HP:0000045 | |
1 HP:0000036 | |
1 HP:0000022 | |
# Does the curie prefix look reasonable? | |
cut -f5 v2big.tab | cut -f1 -d ':' | uniq -c | |
1 HPO_ID | |
103906 HP | |
# yep | |
# distinct | |
cut -f5 v2big.tab | sort -u | wc -l | |
7256 | |
cut -f5 v2big.tab | grep " $" | |
# no trailing space | |
############################################## | |
# column six | |
cut -f6 v2big.tab | sort | uniq -c | sort -nr | head | |
354 ISBN-13:978-0721606156 | |
187 ORPHA:904 | |
131 ORPHA:567 | |
130 OMIM:312870 | |
120 ORPHA:534 | |
116 ORPHA:550 | |
115 ORPHA:881 | |
112 ORPHA:800 | |
106 ORPHA:84 | |
101 ORPHA:744 | |
cut -f6 v2big.tab | sort | uniq -c | sort -nr | tail | |
1 ISBN-13:978-0721606156;PMID:12687501 | |
1 ISBN-13:978-0721606156;http://www.ncbi.nlm.nih.gov/books/NBK1526/;PMID:16868563 | |
1 ISBN-13:978-0721606156;http://www.ncbi.nlm.nih.gov/books/NBK1191/ | |
1 ISBN-13:0-19-262896-8;http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=donnai | |
1 http://www.ncbi.nlm.nih.gov/books/NBK1198/#gr_22q13_3.Genetic_Counseling | |
1 http://www.ncbi.nlm.nih.gov/books/NBK1191/; PMID:17918734 | |
1 http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=mecp2-dup; PMID:17088400 | |
1 http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=donnai; PMID:12923867 | |
1 http://decipher.sanger.ac.uk/syndrome/35 | |
1 DB_Reference | |
# has semicolon lists | |
# spaces before semicolon? | |
cut -f6 v2big.tab | grep " ;" | wc -l | |
0 | |
# spaces after semicolon? | |
cut -f6 v2big.tab | grep "; " | wc -l | |
51 | |
# no space after semicolon? | |
cut -f6 v2big.tab | grep ";[^ ]" | wc -l | |
321 | |
# TODO: | |
# Standardize on space after semicolon or not, for ALL lists. | |
# Consider identical list delimiter throughout. | |
# Some lists already use ';;' double semicolon | |
# I would strongly prefer that to occasionally having a space or not. | |
# what curie prefixes are used in the first list item | |
cut -f6 v2big.tab | cut -f 1 -d ':' | sort | uniq -c | sort -nr | |
86477 OMIM | |
11178 ORPHA | |
5041 PMID | |
536 http | |
425 ISBN-13 | |
223 DECIPHER | |
15 HPO | |
8 ISBN-10 | |
1 PMID12687501;PMID | |
1 ORP | |
1 ISBN | |
1 DB_Reference | |
# expect PMID12687501 should be PMID:12687501 | |
cut -f6 v2big.tab | grep '^ISBN:' | |
ISBN:3642035590 | |
https://books.google.com/books?vid=ISBN3642035590 | |
# looks legit | |
awk -F'\t' '$6=="ORP"' v2big.tab | |
ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0100584 ORP | |
# ORP -> ORPHA:nnn ? | |
cut -f6 v2big.tab | grep '^HPO:' | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
HPO:sdoelken | |
# ... on account of I said so. | |
# Minting a mini citation describing the sorts of things | |
# a particular curator may feel comfortable asserting may | |
# be cleaner here. It also provides context & a venue for feedback. | |
####################################################### | |
# column seven | |
cut -f7 v2big.tab | sort | uniq -c | sort -nr | |
55075 TAS | |
42405 IEA | |
6400 PCS | |
25 ICE | |
1 Evidence | |
1 | |
# as noted before some form of the words "go evidence code" | |
# as a header works wonders for comprehension | |
# I recall seeing a discussion to reabsorb ICE | |
# which leaves the empty field as the most suspect | |
awk -F '\t' '$7==""' v2big.tab | |
ORPHA 183 Eosinophilic granulomatosis with polyangiitis HP:0100584 ORP | |
# it is that same row as before which is truncated at ORP | |
################################################################# | |
# column eight | |
cut -f8 v2big.tab | sort | uniq -c | sort -nr | head | |
103373 | |
134 HP:0003577 | |
97 HP:0003593 | |
87 HP:0011463 | |
74 HP:0003623 | |
50 HP:0003581 | |
45 HP:0003621 | |
33 HP:0003584 | |
8 HP:0011462 | |
3 HP:0003596 | |
cut -f8 v2big.tab | sort | uniq -c | sort -nr | tail | |
87 HP:0011463 | |
74 HP:0003623 | |
50 HP:0003581 | |
45 HP:0003621 | |
33 HP:0003584 | |
8 HP:0011462 | |
3 HP:0003596 | |
1 Onset | |
1 HP:0011461 | |
1 HP:0003674 | |
# As mentioned in the small file notes | |
# there are cases where the terms appear to have just been created | |
# based on whatever was found, | |
# without an attempt to coalesce equivalent terms. | |
# | |
# Directly associating each onset term with | |
# comparable values or intervals in the ontology | |
# seems critical to me | |
#################################################### | |
# column nine frequency | |
everything from the small file notes holds here as well | |
in particular the ontology terms should be | |
directly associated with values/intervals | |
which allow them to be comparable with the | |
rationals and percentages. | |
I would also be happy to see the percentages expressed | |
as simple normalized reals in the domain (0.0,1.0) | |
that is: don't multiply by 100.0 and append '%' | |
we just have to undo that before we can use it | |
and if a human wants to see it that way | |
it is an interface rendering issue | |
# all proper? | |
cut -f9 v2big.tab | awk -F'/' '$2>0{if($1>$2)print}' | |
# yep | |
####################################################### | |
# column ten | |
cut -f10 v2big.tab | sort | uniq -c | sort -nr | head | |
103826 | |
58 Male | |
22 Female | |
1 Sex | |
# no comment | |
######################################################## | |
# column eleven | |
# modifier | |
cut -f11 v2big.tab | sort | uniq -c | sort -nr | head | |
103129 | |
307 HP:0012825 | |
194 HP:0012828 | |
53 HP:0003676 | |
42 HP:0025303 | |
35 HP:0012829 | |
27 HP:0012832 | |
26 HP:0012833 | |
23 HP:0031796 | |
16 HP:0031375 | |
cut -f11 v2big.tab | sort | uniq -c | sort -nr | tail | |
16 HP:0031375 | |
15 HP:0012826 | |
12 HP:0012840 | |
11 HP:0012839 | |
8 HP:0012837 | |
5 HP:0011010 | |
1 Modifier | |
1 HP:0030650 | |
1 HP:0012827 | |
1 HP:0003831 | |
cut -f11 v2big.tab | cut -f1 -d ':' | sort | uniq -c | sort -nr | |
103129 | |
777 HP | |
1 Modifier | |
cut -f11 v2big.tab | sort -u | wc -l | |
19 | |
# we see all of them as the last head is the first tail | |
######################################################### | |
# column twelve | |
# aspect | |
cut -f12 v2big.tab | cut -f1 -d ':' | sort | uniq -c | sort -nr | |
93888 P | |
7605 I | |
1774 C | |
638 M | |
1 Aspect | |
1 | |
# blank will be the truncated last row | |
##################################################### | |
# column thirteen | |
cut -f13 v2big.tab | sort | uniq -c | sort -nr | head | |
40213 2009-02-17 | |
11178 2018-03-12 | |
7718 2017-07-13 | |
6532 2012-10-17 | |
2434 2015-12-30 | |
2185 2010-06-20 | |
1958 2010-06-19 | |
1145 2012-11-18 | |
1089 2010-06-18 | |
1014 2012-04-24 | |
for date in $(cut -f13 v2big.tab | sort -u); do | |
date --date=${date}; | |
done | grep invalid | |
# the dates are looking good | |
########################################################## | |
# column fourteen | |
So is "HPO:iea" another person | |
or is it the same as the GO's "inferred from electronic annotation"? | |
ORPHA:orphadata is not electronic annotation? | |
HPO:curators and HPO:probinson overlap | |
This column strikes me as having room for improvement | |
on the identifier front. Even if its only purpose was | |
vanity metrics and not attribution we could and should do better. | |
cut -f14 v2big.tab | sort | uniq -c | sort -nr | |
42088 HPO:skoehler | |
36962 HPO:iea | |
13523 HPO:probinson | |
11178 ORPHA:orphadata | |
51 HPO:lccarmody | |
49 HPO:sdoelken | |
34 ZFIN:bruef; HPO:sdoelken | |
13 HPO:curators | |
6 PATOC:GVG; PATOC:PS | |
1 HPO:nvasilevsky | |
1 Assigned_By | |
1 | |
cut -f14 v2big.tab | grep -c "; " | |
40 | |
cut -f14 v2big.tab | grep -c " ;" | |
0 | |
cut -f14 v2big.tab | grep -c ";[^ ]" | |
0 | |
# forty lists, all only have a space after the semicolon | |
############################################################# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment