Created
March 19, 2018 03:26
-
-
Save TomConlin/f71aa813a3092dabc9f9ca8eeb442a22 to your computer and use it in GitHub Desktop.
Third pass on the hpo rare disease annotation v2 small files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
head -1 rare-diseases/annotated/v2smallFiles/DECIPHER-18.tab | tr '\t' '\n' | grep -n . | |
1:#diseaseID <- inconsistent capitalization | |
2:diseaseName | |
3:phenotypeId | |
4:phenotypeName | |
5:onsetId | |
6:onsetName | |
7:frequency | |
8:sex | |
9:negation | |
10:modifier | |
11:description | |
12:publication | |
13:evidence | |
14:assignedBy | |
15:dateCreated | |
######################## | |
# easier to survey the records in one pile | |
grep -h -v "^#" rare-diseases/annotated/v2smallFiles/*.tab > v2small.tab | |
# howmany rows | |
wc -l < v2small.tab | |
92,727 | |
# right number of columns every time? | |
awk -F'\t' 'NF!=15' v2small.tab | |
# yep | |
# disease identifier types | |
cut -f1 -d':' v2small.tab | sort | uniq -c | sort -nr | |
92430 OMIM | |
297 DECIPHER | |
# files with the most rows | |
cut -f1 v2small.tab | uniq -c | sort -nr| head | |
131 OMIM:312870 | |
128 OMIM:180849 | |
108 OMIM:607872 | |
108 OMIM:194050 | |
99 OMIM:613406 | |
95 OMIM:270400 | |
94 OMIM:194190 | |
87 OMIM:601803 | |
87 OMIM:122470 | |
85 OMIM:305600 | |
# check if any disease identifiers in more than one file? | |
cut -f1 v2small.tab | uniq -c | sort -nr| cut -c9- | uniq -c | sort -nr | head | |
1 OMIM:617537 | |
1 OMIM:617526 | |
1 OMIM:617506 | |
1 OMIM:617478 | |
1 OMIM:617468 | |
1 OMIM:617466 | |
1 OMIM:617460 | |
1 OMIM:617452 | |
1 OMIM:617450 | |
1 OMIM:617442 | |
# no | |
# summary stats on rows per file | |
cut -f1 v2small.tab | uniq -c | sort -nr| cut -c1-8 | sumstat.r | |
V1 | |
Min. : 1.00 | |
1st Qu.: 4.00 | |
Median : 8.00 | |
Mean : 12.61 | |
3rd Qu.: 17.00 | |
Max. :131.00 | |
[1] "sd :12.46" | |
########################################### | |
# sumstat.r | |
#! /usr/bin/Rscript --vanilla | |
x <- read.csv('stdin', header = F); | |
summary(x); | |
sprintf("sd :%.02f", sd(x[,1])); | |
########################################### | |
########################################### | |
# diseaseID | |
# howmany distinct disease identifiers | |
cut -f1 v2small.tab | uniq | wc -l | |
7351 | |
##################################### | |
# diseaseName | |
# howmany distinct disease ... "word thingies"(tm) | |
cut -f2 v2small.tab | uniq | wc -l | |
12378 | |
# not 1:1 | |
# sometimes appears to be a ';;' separated list | |
# howmany are lists? | |
cut -f2 v2small.tab | sort -u | grep -c ";;" | |
2437 | |
# How many list semicolons are followed by a space | |
cut -f2 v2small.tab | sort -u | grep -c ";; " | |
4 | |
# How many list semicolons are followed by a letter | |
cut -f2 v2small.tab | sort -u | egrep -c ";;[A-Za-z]" | |
2428 | |
# TODO standardise on space or not after semicolon in list | |
# many are purely OMIM screaming caps | |
cut -f2 v2small.tab | grep -c "^[A-Z0-9; ]*$" | sort -u | |
23143 | |
# some contain lowercase | |
cut -f2 v2small.tab | grep "[a-z]" | sort -u | wc -l | |
375 | |
# a few are only non-caps, mostly cytogenetic locations | |
cut -f2 v2small.tab | grep -v "[A-Z]" | sort -u | |
12q14 microdeletion syndrome | |
15q13.3 microdeletion syndrome | |
15q24 recurrent microdeletion syndrome | |
15q26 overgrowth syndrome | |
16p11.2-p12.2 microdeletion syndrome | |
17q21.31 recurrent microdeletion syndrome | |
1p36 microdeletion syndrome | |
1q21.1 recurrent microdeletion (susceptibility locus for neurodevelopmental disorders) | |
1q21.1 recurrent microduplication (possible susceptibility locus for neurodevelopmental disorders) | |
22q11.2 distal deletion syndrome | |
22q11 duplication syndrome | |
2p15-16.1 microdeletion syndrome | |
2q33.1 deletion syndrome | |
2q37 monosomy | |
3q29 microdeletion syndrome | |
7q11.23 duplication syndrome | |
8p23.1 deletion syndrome | |
8p23.1 duplication syndrome | |
9q subtelomeric deletion syndrome | |
########################################## | |
# phenotypeId | |
# howmany? | |
cut -f3 v2small.tab | wc -l | |
92727 | |
# unique? | |
cut -f3 v2small.tab | sort -u | wc -l | |
6994 | |
# have correct curie prefix? | |
cut -f3 v2small.tab | cut -f1 -d':' | uniq -c | |
92727 HP | |
################################ | |
# phenotypeName | |
cut -f4 v2small.tab | wc -l | |
92727 | |
cut -f4 v2small.tab | sort -u | wc -l | |
6994 | |
# anything not simple words? any punctuation indicating lists? | |
cut -f4 v2small.tab | grep -v "[a-z A-Z]*" | |
# nothing | |
################################ | |
# onsetId | |
cut -f5 v2small.tab | sort | uniq -c | sort -nr | |
92194 | |
134 HP:0003577 | |
97 HP:0003593 | |
87 HP:0011463 | |
74 HP:0003623 | |
50 HP:0003581 | |
45 HP:0003621 | |
33 HP:0003584 | |
8 HP:0011462 | |
3 HP:0003596 | |
1 HP:0011461 | |
1 HP:0003674 | |
##################################### | |
# onsetName | |
cut -f6 v2small.tab | sort | uniq -c | sort -nr | |
92194 | |
134 Congenital onset | |
97 Infantile onset | |
87 Childhood onset | |
74 Neonatal onset | |
50 Adult onset | |
45 Juvenile onset | |
33 Late onset | |
8 Young adult onset | |
3 Middle age onset | |
1 Onset | |
1 Fetal onset | |
# total onsets | |
cut -f6 v2small.tab | grep -cv "^$" | |
533 | |
# Note one: onset terms can be merged | |
# Note two: terms should be comparable | |
# before? during? after? | |
########################################## | |
########################################## | |
# frequency | |
# sparse collection of: | |
# nothing | |
# identifiers for ontology terms | |
# proper rationals | |
# percentages (including ranges of percentages) | |
# easy: express percentages more uniformly | |
# hard: recover the proper rational the percentage was derived from (curating pubs) | |
# | |
# mixing identifiers and rationals; | |
# I guess one sparse column is better than two | |
# but what would really make it worthwhile is if the identifiers | |
# referenced a value (back in the ontology) which allowed them to be comparable | |
# (even approximately) to the proper rationals (and percentages) | |
# | |
# Okay we sort of have that but uncomputably in a string | |
HP:0040283 "Occasional (29-5%)" | |
Maybe something like statements along the lines of: | |
<HP:0040283> <???:greater than or equal to> 0.05 . | |
<HP:0040283> <???:less than or equal to> 0.29 . | |
---------------------------------------------------- | |
cut -f7 v2small.tab | sort | uniq -c | sort -nr | head | |
84602 | |
5087 HP:0040283 | |
682 HP:0040282 | |
179 HP:0040281 | |
116 2/2 | |
87 3/3 | |
74 HP:0040280 | |
70 7.5000% | |
59 1/3 | |
# terms present are: | |
cut -f7 v2small.tab | sort | uniq -c | sort -nr | grep "HP" | |
5087 HP:0040283 | |
682 HP:0040282 | |
179 HP:0040281 | |
74 HP:0040280 | |
23 HP:0040284 | |
# more common percentages | |
cut -f7 v2small.tab | sort | uniq -c | sort -nr | grep "%" | |
70 7.5000% | |
22 50.0000% | |
20 50% | |
16 30% | |
15 25% | |
13 20% | |
12 10% | |
11 15% | |
9 35% | |
9 33.0000% | |
8 25.0000% | |
8 100% | |
# standardize 50% v.s. 50.0000% , 25% v.s. 25.0000% etc | |
# most common rationals | |
cut -f7 v2small.tab | sort | uniq -c | sort -nr | grep "/" | head | |
116 2/2 | |
87 3/3 | |
59 1/3 | |
51 1/2 | |
50 2/3 | |
40 1/1 | |
38 5/5 | |
34 4/4 | |
31 6/6 | |
28 2/4 | |
# As mentioned in the first big file survey I recommend | |
# abandoning percentages for normalized Reals in 0.0 - 1.0 | |
# are all rationals proper? | |
cut -f7 v2small.tab | awk -F'/' '$2>0{if($1>$2)print}' | |
# of course they are. | |
################################## | |
# sex | |
cut -f8 v2small.tab | sort | uniq -c | sort -nr | |
92647 | |
58 Male | |
22 Female | |
# Male and Female are unfortunate terms to use | |
# because case-insensitive searches for male return feMALEs | |
# no I am not joking. | |
################################ | |
# negation | |
cut -f9 v2small.tab | sort | uniq -c | sort -nr | |
91939 | |
788 NO | |
################################# | |
# modifier | |
cut -f10 v2small.tab | sort | uniq -c | sort -nr | |
91950 | |
307 HP:0012825 | |
194 HP:0012828 | |
53 HP:0003676 | |
42 HP:0025303 | |
35 HP:0012829 | |
27 HP:0012832 | |
26 HP:0012833 | |
23 HP:0031796 | |
16 HP:0031375 | |
15 HP:0012826 | |
12 HP:0012840 | |
11 HP:0012839 | |
8 HP:0012837 | |
5 HP:0011010 | |
1 HP:0030650 | |
1 HP:0012827 | |
1 HP:0003831 | |
######################################## | |
# description | |
# cut -f11 v2small.tab | sort | uniq -c | sort -nr | less | |
# 60357 empty | |
# OMIM screaming caps (mostly descriptive) | |
# and some other more random hint-like statements | |
# including data that visually seems like it belongs in other columns | |
frequency | |
awk -F'\t' '/.*[0-9]* OF [0-9].*/ {print $7, substr($11,index($11,"("))}' v2small.tab | |
(12 OF 62 PATIENTS) | |
HP:0040283 (IN 1 OF 3 SIBLINGS) | |
HP:0040283 (IN 1 OF 3 SIBLINGS) | |
HP:0040283 (IN 1 OF 3 SIBLINGS) | |
HP:0040283 (IN 1 OF 3 SIBLINGS) | |
HP:0040282 (IN 2 OF 3 SIBLINGS) | |
HP:0040282 (IN 2 OF 3 SIBLINGS) | |
HP:0040282 (IN 2 OF 3 SIBLINGS) | |
HP:0040282 (IN 2 OF 3 SIBLINGS) | |
HP:0040282 (IN 2 OF 3 SIBLINGS) | |
HP:0040282 (IN 2 OF 3 SIBLINGS) | |
HP:0040282 (IN 2 OF 3 SIBLINGS) | |
13/23 (13 OF 23 PATIENTS) note agreement w/PATIENTS here | |
6/22 (6 OF 22) | |
9/22 (9 OF 22) | |
12/22 (12 OF 22) | |
11/21 (11 OF 21) | |
9/22 (9 OF 22) | |
8/23 (8 OF 23) | |
HP:0040282 (IN 2 OF 3 FAMILIES) | |
HP:0040282 (IN 2 OF 3 FAMILIES) | |
HP:0040282 (IN 1 OF 3 FAMILIES) | |
HP:0040282 (IN 1 OF 3 FAMILIES) | |
HP:0040282 (IN 2 OF 3 FAMILIES) | |
(IN 2 OF 3 PATIENTS) note absence of frequency here | |
(IN 2 OF 3 PATIENTS) | |
HP:0040282 (IN 1 OF 4 PATIENTS) | |
(IN 2 OF 6 PATIENTS) | |
HP:0040282 (IN 1 OF 3 PATIENTS) note switching to HP term here | |
HP:0040282 (IN 1 OF 3 PATIENTS) | |
HP:0040282 (IN 1 OF 3 PATIENTS) | |
HP:0040282 (IN 1 OF 3 PATIENTS) | |
HP:0040282 (IN 1 OF 3 PATIENTS) | |
HP:0040282 (IN 2 OF 3 PATIENTS) | |
HP:0040282 (IN 1 OF 3 PATIENTS) | |
(IN 2 OF 3 PATIENTS) | |
(IN 2 OF 3 PATIENTS) | |
HP:0040282 (IN 2 OF 3 PATIENTS) | |
OMIM-CS:MOLECULAR BASIS > A CONTIGUOUS GENE SYNDROME CAUSED BY DELETION OF 180KB ENCOMPASSING | |
HP:0040283 (IN 2 OF 9 PATIENTS) | |
(IN 2 OF 4 PATIENTS FROM 1 FAMILY) | |
HP:0040282 (IN 2 OF 7 PATIENTS) | |
HP:0040282 (IN 2 OF 7 PATIENTS) | |
HP:0040282 (IN 2 OF 7 PATIENTS) | |
HP:0040282 (IN 2 OF 3 PATIENTS) | |
HP:0040282 (IN 1 OF 3 PATIENTS) | |
HP:0040282 (IN 2 OF 3 PATIENTS) | |
HP:0040282 (IN 2 OF 3 PATIENTS) | |
(IN 1 OF 2 SIBS) | |
(IN 2 OF 4 PATIENTS) | |
(IN 2 MEMBERS OF 1 FAMILY) | |
(IN 1 OF 2 PATIENTS) | |
(IN 1 OF 2 PATIENTS) | |
(IN 2 OF 4 PATIENTS) | |
(IN 2 OF 4 PATIENTS) | |
(IN 1 OF 4 PATIENTS) | |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
sex | |
cut -f11 v2small.tab | grep -ci male | |
645 | |
# that is a lot more than the 80 explicitly specified | |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
negation | |
awk -F'\t' '$11=="NOT"{print $9,$11}' v2small.tab | |
NOT NOT | |
NOT NOT | |
# redundant with the negation column | |
# Are other NOTs capturable which aren't now? | |
awk -F'\t' '($9!="NOT") && /.* NOT .*/{print $11}' v2small.tab | |
OMIM-CS:MISCELLANEOUS > NOT ALL PATIENTS HAVE A MYOPATHY | |
OMIM-CS:MUSCLE, SOFT TISSUE > EPISODIC WEAKNESS MAY OR MAY NOT OCCUR INDEPENDENT OF MYOTONIA | |
OMIM-CS:INHERITANCE > AUTOSOMAL RECESSIVE NOT EXCLUDED, BUT PROBABLY MULTIFACTORIAL | |
OMIM-CS:INHERITANCE > AUTOSOMAL RECESSIVE NOT EXCLUDED, BUT PROBABLY MULTIFACTORIAL | |
OMIM-CS:NEUROLOGIC_CENTRAL NERVOUS SYSTEM > DYSMYELINATION OF THE BRAIN, MYELIN IS NOT FORMED PROPERLY | |
# none of those strike me as a cleanly negated phenotype. | |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
onset | |
cut -f11 v2small.tab | grep -ic "onset" | |
392 | |
# That is smaller than the number of explicit onsets (533) | |
# Maybe they are all covered. | |
awk -F '\t' '($6=="") && $11 ~ /.* ONSET .*/' v2small.tab | wc -l | |
291 | |
# nope only 101 of them are. | |
# I could produce suggestions for the others if you like. | |
################################################################# | |
# publication | |
# hmmm... plural but more like | |
# citations | |
# can be lists (with different separator than previous list) | |
# can be missing curie suffix `OMIM:` (sixty-nine like this) | |
# can be url | |
# can be ISBN | |
# can mix curie case `PMID:17918734;pmid:12687501` | |
# can be spaced out `PMID: 17223397` | |
# can be bare integer `12089525` | |
# can be folks `HPO:sdoelken` | |
# there can be space after list separators (or not) | |
# prefixes (comments from stale big file survey still apply here) | |
cut -f12 v2small.tab | cut -f1 -d":" |sort | uniq -c | sort -nr | |
86477 OMIM | |
5041 PMID | |
536 http | |
425 ISBN-13 | |
223 DECIPHER | |
15 HPO | |
8 ISBN-10 | |
1 PMID12687501;PMID | |
1 ISBN | |
# howmany are lists: | |
cut -f12 v2small.tab | grep -c ';' | |
372 | |
# space after semicolon | |
cut -f12 v2small.tab | grep -c '; ' | |
51 | |
# next item after semicolon | |
cut -f12 v2small.tab | grep -c ';[^ ]' | |
321 | |
# space after item before semicolon | |
cut -f12 v2small.tab | grep -c ' ;' | |
0 | |
############################################################### | |
# Note: | |
# web search on 'GO_evidence_code' returns correct code descriptions as top hit | |
# evidence | |
cut -f13 v2small.tab | sort | uniq -c | sort -nr | |
43897 TAS | |
42405 IEA | |
6400 PCS | |
25 ICE | |
# total GO codes | |
cut -f13 v2small.tab | grep -vc "^$" | |
92727 | |
##################################### | |
# assignedBy | |
cut -f14 v2small.tab | sort | uniq -c | sort -nr | |
42088 HPO:skoehler | |
36962 HPO:iea | |
13523 HPO:probinson | |
51 HPO:lccarmody | |
49 HPO:sdoelken | |
34 ZFIN:bruef; HPO:sdoelken | |
13 HPO:curators | |
6 PATOC:GVG; PATOC:PS | |
1 HPO:nvasilevsky | |
# comments from stale big file survey still apply here | |
################################################## | |
# dateCreated | |
cut -f15 v2small.tab | sort | uniq -c | sort -nr | head | |
40213 2009-02-17 | |
7718 2017-07-13 | |
6532 2012-10-17 | |
2434 2015-12-30 | |
2185 2010-06-20 | |
1958 2010-06-19 | |
1145 2012-11-18 | |
1089 2010-06-18 | |
1014 2012-04-24 | |
942 2014-11-26 | |
# mature | |
cut -f15 v2small.tab | sort -u | head | |
2009-02-17 | |
2009-07-24 | |
2009-07-31 | |
2009-08-31 | |
2009-09-17 | |
2009-10-01 | |
2009-10-02 | |
2009-10-09 | |
2009-10-15 | |
2009-10-16 | |
# recent | |
cut -f15 v2small.tab | sort -u | tail | |
2017-12-11 | |
2017-12-12 | |
2017-12-13 | |
2017-12-17 | |
2017-12-22 | |
2018-01-25 | |
2018-01-28 | |
2018-03-04 | |
2018-03-05 | |
2018-03-07 | |
# check the rest for impossible dates | |
for date in $(cut -f15 v2small.tab | sort -u); do | |
date --date=${date}; | |
done | grep invalid | |
# the dates all look valid. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Male and Female are unfortunate terms to use
because case-insensitive searches for male return feMALEs
no I am not joking.
=> Should we use LADY and GENTLEMAN ? :-0
=> I think I fixed at least nearly all of this prior to generating the new small files. Some of the issues (e.g., in the description fields) can be dealt with in the new small files, for now I have mainly just wanted to get syntactic uniformity. The descriptions will require manual curation, but this will now be a lot easier with the new PhenoteFX :-0 (and wait there's more...)