Created
April 8, 2023 02:37
-
-
Save mlin/e55025fb1fae42116e41f78baf26a462 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html lang="en">
<head> | |
<meta charset="utf-8" /> | |
<meta name="generator" content="pandoc" /> | |
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" /> | |
<meta name="viewport" content="width=device-width, initial-scale=1" /> | |
<title>Census datasets presence</title> | |
<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to | |
// be compatible with the behavior of Pandoc < 2.8). | |
// Once the DOM is parsed, strip every attribute from the heading that opens
// each pandoc section wrapper (div.section with a "level" class).
document.addEventListener('DOMContentLoaded', function () {
  var firstChildren = document.querySelectorAll("div.section[class*='level'] > :first-child");
  Array.prototype.forEach.call(firstChildren, function (node) {
    // Leave the node alone unless it really is a heading (h1-h6).
    if (!/^h[1-6]$/i.test(node.tagName)) return;
    var attrs = node.attributes;
    // Keep removing the first remaining attribute until the list is empty.
    while (attrs.length > 0) node.removeAttribute(attrs[0].name);
  });
});
</script> | |
<style type="text/css"> | |
/* Small helper classes (this page's generator meta tag names pandoc). */
code {
  white-space: pre-wrap;
}
span.smallcaps {
  font-variant: small-caps;
}
span.underline {
  text-decoration: underline;
}
/* Half-width inline blocks, aligned on their top edges. */
div.column {
  display: inline-block;
  vertical-align: top;
  width: 50%;
}
/* Hanging indent: the block is pushed in 1.5em and its first line pulled back. */
div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}
/* Task lists are rendered without list markers. */
ul.task-list {
  list-style: none;
}
</style> | |
<style type="text/css"> | |
/* Code keeps its whitespace verbatim and is not wrapped. */
code { white-space: pre; }
/* Highlighted blocks are allowed to overflow their box. */
.sourceCode { overflow: visible; }
</style> | |
<style type="text/css" data-origin="pandoc"> | |
/* Layout of highlighted code blocks. */
pre > code.sourceCode {
  white-space: pre;
  position: relative;
}
/* Direct child spans of the code element (one per source line in this page). */
pre > code.sourceCode > span {
  display: inline-block;
  line-height: 1.25;
}
/* Empty spans get an explicit height. */
pre > code.sourceCode > span:empty {
  height: 1.2em;
}
.sourceCode {
  overflow: visible;
}
code.sourceCode > span {
  color: inherit;
  text-decoration: inherit;
}
div.sourceCode {
  margin: 1em 0;
}
pre.sourceCode {
  margin: 0;
}
/* Screen only: the wrapping div uses automatic overflow handling. */
@media screen {
  div.sourceCode {
    overflow: auto;
  }
}
/* Print only: wrap long lines, with continuation lines indented by 5em. */
@media print {
  pre > code.sourceCode {
    white-space: pre-wrap;
  }
  pre > code.sourceCode > span {
    text-indent: -5em;
    padding-left: 5em;
  }
}
/* Line numbering for pre.numberSource blocks, driven by a CSS counter. */
pre.numberSource code {
  counter-reset: source-line 0;
}
pre.numberSource code > span {
  position: relative;
  left: -4em;
  counter-increment: source-line;
}
/* The number is generated content placed before each line's first anchor. */
pre.numberSource code > span > a:first-child::before {
  content: counter(source-line);
  position: relative;
  left: -1em;
  text-align: right;
  vertical-align: baseline;
  border: none;
  display: inline-block;
  /* Vendor-prefixed and standard switches that make the numbers unselectable. */
  -webkit-touch-callout: none;
  -webkit-user-select: none;
  -khtml-user-select: none;
  -moz-user-select: none;
  -ms-user-select: none;
  user-select: none;
  padding: 0 4px;
  width: 4em;
  color: #aaa;
}
pre.numberSource {
  margin-left: 3em;
  border-left: 1px solid #aaa;
  padding-left: 4px;
}
/* Empty rule retained from the original stylesheet. */
div.sourceCode {
}
@media screen {
  pre > code.sourceCode > span > a:first-child::before {
    text-decoration: underline;
  }
}
/* Syntax-highlighting colours, one rule per two-letter token class. */
code span.al {
  color: #ff0000;
  font-weight: bold;
}
code span.an {
  color: #60a0b0;
  font-weight: bold;
  font-style: italic;
}
code span.at {
  color: #7d9029;
}
code span.bn {
  color: #40a070;
}
code span.bu {
  color: #008000;
}
code span.cf {
  color: #007020;
  font-weight: bold;
}
code span.ch {
  color: #4070a0;
}
code span.cn {
  color: #880000;
}
code span.co {
  color: #60a0b0;
  font-style: italic;
}
code span.cv {
  color: #60a0b0;
  font-weight: bold;
  font-style: italic;
}
code span.do {
  color: #ba2121;
  font-style: italic;
}
code span.dt {
  color: #902000;
}
code span.dv {
  color: #40a070;
}
code span.er {
  color: #ff0000;
  font-weight: bold;
}
/* Empty rule retained from the original stylesheet. */
code span.ex {
}
code span.fl {
  color: #40a070;
}
code span.fu {
  color: #06287e;
}
code span.im {
  color: #008000;
  font-weight: bold;
}
code span.in {
  color: #60a0b0;
  font-weight: bold;
  font-style: italic;
}
code span.kw {
  color: #007020;
  font-weight: bold;
}
code span.op {
  color: #666666;
}
code span.ot {
  color: #007020;
}
code span.pp {
  color: #bc7a00;
}
code span.sc {
  color: #4070a0;
}
code span.ss {
  color: #bb6688;
}
code span.st {
  color: #4070a0;
}
code span.va {
  color: #19177c;
}
code span.vs {
  color: #4070a0;
}
code span.wa {
  color: #60a0b0;
  font-weight: bold;
  font-style: italic;
}
</style> | |
<script> | |
// apply pandoc div.sourceCode style to pre.sourceCode instead | |
// For stylesheets tagged data-origin="pandoc": turn any top-level
// `div.sourceCode` rule that sets a text or background colour into an
// equivalent `pre.sourceCode` rule at the same position.
(function () {
  var styleSheets = document.styleSheets;
  for (var s = 0; s < styleSheets.length; s++) {
    var sheet = styleSheets[s];
    if (sheet.ownerNode.dataset["origin"] !== "pandoc") continue;

    // If reading cssRules throws, skip this sheet.
    var ruleList;
    try { ruleList = sheet.cssRules; } catch (err) { continue; }

    for (var r = 0; r < ruleList.length; r++) {
      var current = ruleList[r];

      // Only plain style rules whose selector is exactly div.sourceCode qualify.
      var isDivSourceCode =
        current.type === current.STYLE_RULE && current.selectorText === "div.sourceCode";
      if (!isDivSourceCode) continue;

      // Rules that set neither color nor background-color are left untouched.
      var setsColour = current.style.color !== '' || current.style.backgroundColor !== '';
      if (!setsColour) continue;

      // Swap the rule in place: same declarations, new selector, same index.
      var declarations = current.style.cssText;
      sheet.deleteRule(r);
      sheet.insertRule('pre.sourceCode{' + declarations + '}', r);
    }
  }
})();
</script> | |
<style type="text/css">
/* Page frame: a centred column of at most 700px on a white background. */
body {
  /* Box */
  max-width: 700px;
  margin: 1em auto;
  padding-left: 2em;
  padding-right: 2em;
  overflow: visible;
  background-color: #fff;
  /* Typography */
  font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
  font-size: 14px;
  line-height: 1.35;
}
/* Table-of-contents box. */
#TOC {
  /* Box */
  clear: both;
  width: 400px;
  margin: 0 0 10px 10px;
  padding: 4px;
  border: 1px solid #ccc;
  border-radius: 5px;
  background-color: #f6f6f6;
  /* Typography */
  font-size: 13px;
  line-height: 1.3;
}
#TOC .toctitle {
  margin-left: 5px;
  font-size: 15px;
  font-weight: bold;
}
/* Lists inside the box: 40px of padding with a negative left margin. */
#TOC ul {
  margin-top: 5px;
  margin-bottom: 5px;
  margin-left: -1.5em;
  padding-left: 40px;
}
/* Nested lists use a larger negative left margin. */
#TOC ul ul {
  margin-left: -2em;
}
#TOC li {
  line-height: 16px;
}
/* Tables: centred, collapsed borders. */
table {
  margin: 1em auto;
  border-collapse: collapse;
  border-style: outset;
  border-width: 1px;
  border-color: #ddd;
}
table th {
  padding: 5px;
  border-style: inset;
  border-width: 2px;
}
table td {
  padding: 5px;
  border-style: inset;
  border-width: 1px;
  line-height: 18px;
}
/* Must follow the rules above: switches off the left and right borders. */
table, table th, table td {
  border-left-style: none;
  border-right-style: none;
}
/* Light grey background for the header and for rows with class "even". */
table thead, table tr.even {
  background-color: #f7f7f7;
}
/* Paragraph spacing. */
p { margin: 0.5em 0; }
/* Block quotes: light grey panel with a little inner padding. */
blockquote {
  padding: 0.25em 0.75em;
  background-color: #f6f6f6;
}
/* Horizontal rule: a single 1px grey line with 28px of space above and below. */
hr {
  /* Clear all four borders, then draw only the top edge. The previous leading
     `border-style: solid` was removed: the `border` shorthand reset it to
     `none` immediately afterwards, so it never had any effect. */
  border: none;
  border-top: 1px solid #777;
  margin: 28px 0;
}
/* Definition lists. */
dl { margin-left: 0; }
dl dd {
  margin-left: 13px;
  margin-bottom: 13px;
}
dl dt { font-weight: bold; }

/* Unordered lists: no top margin, hollow-circle markers outside the text. */
ul { margin-top: 0; }
ul li { list-style: circle outside; }
/* Nested lists have no bottom margin. */
ul ul { margin-bottom: 0; }
/* Shared look for block and inline code. */
pre, code {
  color: #333;
  background-color: #f7f7f7;
  border-radius: 3px;
  white-space: pre-wrap;
}
/* Block code: spacing around and inside the box. */
pre {
  margin: 5px 0 10px;
  padding: 10px;
  border-radius: 3px;
}
/* <pre> elements without any class attribute. */
pre:not([class]) {
  background-color: #f7f7f7;
}
/* Monospace stack, slightly smaller than the surrounding text. */
code {
  font-family: Consolas, Monaco, "Courier New", monospace;
  font-size: 85%;
}
/* Inline code directly inside paragraphs and list items. */
p > code, li > code {
  padding: 2px 0;
}
/* Figures are centred horizontally. */
div.figure { text-align: center; }
/* Images: white mat with a thin rounded grey frame. */
img {
  background-color: #FFFFFF;
  padding: 2px;
  /* The rule used to declare `border` twice (#DDDDDD, then #CCCCCC). The later
     declaration always won, so only that effective value is kept. */
  border: 1px solid #CCCCCC;
  border-radius: 3px;
  margin: 0 5px;
}
/* Headings. h2-h6 carry a bottom rule that gets thinner at deeper levels. */
h1 {
  margin-top: 0;
  font-size: 35px;
  line-height: 40px;
}
h2 {
  padding-top: 10px;
  padding-bottom: 2px;
  border-bottom: 4px solid #f7f7f7;
  font-size: 145%;
}
h3 {
  padding-top: 10px;
  border-bottom: 2px solid #f7f7f7;
  font-size: 120%;
}
h4 {
  margin-left: 8px;
  border-bottom: 1px solid #f7f7f7;
  font-size: 105%;
}
h5, h6 {
  border-bottom: 1px solid #ccc;
  font-size: 105%;
}
/* Links: blue and not underlined by default. */
a {
  color: #03d;
  text-decoration: none;
}
/* State colours; rule order is kept because equal-specificity rules cascade by order. */
a:hover { color: #66f; }
a:visited { color: #800080; }
a:visited:hover { color: #b0b; }
/* Links whose href starts with http: or https: are underlined. */
a[href^="http:"],
a[href^="https:"] {
  text-decoration: underline;
}
/* Token colours for spans that are direct children of a code element. */
code > span.kw {
  color: #555;
  font-weight: bold;
}
code > span.dt { color: #902000; }
code > span.dv { color: #40a070; }
/* These four token classes share one colour. */
code > span.bn,
code > span.fl,
code > span.ch,
code > span.st {
  color: #d14;
}
code > span.co {
  color: #888888;
  font-style: italic;
}
code > span.ot { color: #007020; }
code > span.al {
  color: #ff0000;
  font-weight: bold;
}
code > span.fu {
  color: #900;
  font-weight: bold;
}
code > span.er {
  color: #a61717;
  background-color: #e3d2d2;
}
</style> | |
</head> | |
<body> | |
<h1 class="title toc-ignore">Census datasets presence</h1> | |
<!-- | |
THIS VIGNETTE IS BASED ON: | |
https://github.com/chanzuckerberg/cellxgene-census/blob/main/api/python/notebooks/api_demo/census_dataset_presence.ipynb | |
--> | |
<p><em>Goal:</em> demonstrate basic use of the | |
<code>datasets_presence_matrix</code> array.</p> | |
<p>The presence matrix is a sparse array, indicating which features | |
(var) were present in each dataset. The array has dimensions | |
[n_datasets, n_var], and is stored in the SOMA Measurement | |
<code>varp</code> collection. The first dimension is indexed by the | |
<code>soma_joinid</code> in the <code>census_datasets</code> dataframe. | |
The second is indexed by the <code>soma_joinid</code> in the | |
<code>var</code> dataframe of the measurement.</p> | |
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a>census <span class="ot"><-</span> cellxgene.census<span class="sc">::</span><span class="fu">open_soma</span>()</span> | |
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Grab the experiment containing human data, and the measurement therein with RNA</span></span> | |
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a>human <span class="ot"><-</span> census<span class="sc">$</span><span class="fu">get</span>(<span class="st">"census_data"</span>)<span class="sc">$</span><span class="fu">get</span>(<span class="st">"homo_sapiens"</span>)</span> | |
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a>human_rna <span class="ot"><-</span> human<span class="sc">$</span>ms<span class="sc">$</span><span class="fu">get</span>(<span class="st">"RNA"</span>)</span> | |
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a></span> | |
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="co"># The census-wide datasets</span></span> | |
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a>datasets_df <span class="ot"><-</span> <span class="fu">as.data.frame</span>(census<span class="sc">$</span><span class="fu">get</span>(<span class="st">"census_info"</span>)<span class="sc">$</span><span class="fu">get</span>(<span class="st">"datasets"</span>)<span class="sc">$</span><span class="fu">read</span>())</span> | |
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(datasets_df)</span> | |
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> # A tibble: 522 × 8</span></span> | |
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> soma_joinid collection_id collection_name collection_doi dataset_id</span></span> | |
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> <int> <chr> <chr> <chr> <chr> </span></span> | |
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="co">#> 1 0 43d4bb39-21af-4d05-b97… Transcriptiona… 10.1016/j.cel… f512b8b6-…</span></span> | |
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> 2 1 d36ca85c-3e8b-444c-ba3… A molecular at… 10.1101/2022.… 90d4a63b-…</span></span> | |
<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a><span class="co">#> 3 2 d36ca85c-3e8b-444c-ba3… A molecular at… 10.1101/2022.… d1207c81-…</span></span> | |
<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a><span class="co">#> 4 3 2b02dff7-e427-4cdc-96f… Single-Cell An… 10.1016/j.cel… 36c867a7-…</span></span> | |
<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a><span class="co">#> 5 4 e9eec7f5-8519-42f6-99b… Humoral immuni… 10.1016/j.coi… 58b01044-…</span></span> | |
<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a><span class="co">#> 6 5 a72afd53-ab92-4511-88d… Single-cell at… 10.1038/s4159… 456e8b9b-…</span></span> | |
<span id="cb1-18"><a href="#cb1-18" aria-hidden="true" tabindex="-1"></a><span class="co">#> 7 6 e4c9ed14-e560-4900-a3b… A molecular si… 10.1038/s4158… d8da613f-…</span></span> | |
<span id="cb1-19"><a href="#cb1-19" aria-hidden="true" tabindex="-1"></a><span class="co">#> 8 7 4796c91c-9d8f-4692-be4… MSK SPECTRUM –… 10.1038/s4158… 97d9238c-…</span></span> | |
<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a><span class="co">#> 9 8 4796c91c-9d8f-4692-be4… MSK SPECTRUM –… 10.1038/s4158… e3a7e927-…</span></span> | |
<span id="cb1-21"><a href="#cb1-21" aria-hidden="true" tabindex="-1"></a><span class="co">#> 10 9 4796c91c-9d8f-4692-be4… MSK SPECTRUM –… 10.1038/s4158… 0caedec7-…</span></span> | |
<span id="cb1-22"><a href="#cb1-22" aria-hidden="true" tabindex="-1"></a><span class="co">#> # ℹ 512 more rows</span></span> | |
<span id="cb1-23"><a href="#cb1-23" aria-hidden="true" tabindex="-1"></a><span class="co">#> # ℹ 3 more variables: dataset_title <chr>, dataset_h5ad_path <chr>,</span></span> | |
<span id="cb1-24"><a href="#cb1-24" aria-hidden="true" tabindex="-1"></a><span class="co">#> # dataset_total_cell_count <int></span></span></code></pre></div> | |
<p>For convenience, read the entire presence matrix (for Homo sapiens) | |
into a <code>Matrix::sparseMatrix</code>. There is a convenience API | |
providing this capability:</p> | |
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>presence_matrix <span class="ot"><-</span> cellxgene.census<span class="sc">::</span><span class="fu">get_presence_matrix</span>(census, <span class="st">"Homo sapiens"</span>, <span class="st">"RNA"</span>)</span> | |
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(<span class="fu">dim</span>(presence_matrix))</span> | |
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 522 60664</span></span></code></pre></div> | |
<p>We also need the <code>var</code> dataframe, which is read into an R | |
data frame for convenient manipulation:</p> | |
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>var_df <span class="ot"><-</span> <span class="fu">as.data.frame</span>(human_rna<span class="sc">$</span>var<span class="sc">$</span><span class="fu">read</span>())</span> | |
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(var_df)</span> | |
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> # A tibble: 60,664 × 4</span></span> | |
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> soma_joinid feature_id feature_name feature_length</span></span> | |
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> <int> <chr> <chr> <int></span></span> | |
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> 1 0 ENSG00000238009 RP11-34P13.7 3726</span></span> | |
<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> 2 1 ENSG00000279457 WASH9P 1397</span></span> | |
<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> 3 2 ENSG00000228463 AP006222.1 8224</span></span> | |
<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> 4 3 ENSG00000237094 RP4-669L17.4 6204</span></span> | |
<span id="cb3-10"><a href="#cb3-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> 5 4 ENSG00000230021 RP11-206L10.17 5495</span></span> | |
<span id="cb3-11"><a href="#cb3-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> 6 5 ENSG00000237491 LINC01409 8413</span></span> | |
<span id="cb3-12"><a href="#cb3-12" aria-hidden="true" tabindex="-1"></a><span class="co">#> 7 6 ENSG00000177757 FAM87B 1947</span></span> | |
<span id="cb3-13"><a href="#cb3-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> 8 7 ENSG00000225880 LINC00115 1317</span></span> | |
<span id="cb3-14"><a href="#cb3-14" aria-hidden="true" tabindex="-1"></a><span class="co">#> 9 8 ENSG00000230368 FAM41C 1971</span></span> | |
<span id="cb3-15"><a href="#cb3-15" aria-hidden="true" tabindex="-1"></a><span class="co">#> 10 9 ENSG00000230699 RP11-54O7.1 3043</span></span> | |
<span id="cb3-16"><a href="#cb3-16" aria-hidden="true" tabindex="-1"></a><span class="co">#> # ℹ 60,654 more rows</span></span></code></pre></div> | |
<div id="is-a-feature-present-in-a-dataset" class="section level2"> | |
<h2>Is a feature present in a dataset?</h2> | |
<p><em>Goal:</em> test if a given feature is present in a given | |
dataset.</p> | |
<p><strong>Important:</strong> the (one-based) indexes in the sparse | |
presence matrix correspond to the (zero-based) <code>soma_joinid</code> | |
+ 1. In other words:</p> | |
<ul> | |
<li>the first dimension of the presence matrix is (one plus) the | |
dataset’s <code>soma_joinid</code> as stored in the | |
<code>census_datasets</code> dataframe.</li> | |
<li>the second dimension of the presence matrix is (one plus) the | |
feature’s <code>soma_joinid</code> as stored in the <code>var</code> | |
dataframe.</li> | |
</ul> | |
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>var_joinid <span class="ot"><-</span> var_df<span class="sc">$</span>soma_joinid[var_df<span class="sc">$</span>feature_id <span class="sc">==</span> <span class="st">"ENSG00000286096"</span>]</span> | |
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a>dataset_joinid <span class="ot"><-</span> datasets_df<span class="sc">$</span>soma_joinid[datasets_df<span class="sc">$</span>dataset_id <span class="sc">==</span> <span class="st">"97a17473-e2b1-4f31-a544-44a60773e2dd"</span>]</span> | |
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a>is_present <span class="ot"><-</span> presence_matrix[dataset_joinid <span class="sc">+</span> <span class="dv">1</span>, var_joinid <span class="sc">+</span> <span class="dv">1</span>]</span> | |
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="fu">cat</span>(<span class="fu">paste</span>(<span class="st">"Feature is"</span>, <span class="cf">if</span> (is_present) <span class="st">"present."</span> <span class="cf">else</span> <span class="st">"not present."</span>))</span> | |
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> Feature is present.</span></span></code></pre></div> | |
</div> | |
<div id="what-datasets-contain-a-feature" class="section level2"> | |
<h2>What datasets contain a feature?</h2> | |
<p><em>Goal:</em> look up all datasets that have a feature_id | |
present.</p> | |
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Grab the feature's soma_joinid from the var dataframe</span></span> | |
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>var_joinid <span class="ot"><-</span> var_df<span class="sc">$</span>soma_joinid[var_df<span class="sc">$</span>feature_id <span class="sc">==</span> <span class="st">"ENSG00000286096"</span>]</span> | |
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a></span> | |
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="co"># The presence matrix is indexed by the joinids of the dataset and var dataframes,</span></span> | |
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="co"># so slice out the feature of interest by its joinid.</span></span> | |
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a>dataset_joinids <span class="ot"><-</span> datasets_df<span class="sc">$</span>soma_joinid[presence_matrix[, var_joinid <span class="sc">+</span> <span class="dv">1</span>] <span class="sc">!=</span> <span class="dv">0</span>]</span> | |
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a></span> | |
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(datasets_df[dataset_joinids <span class="sc">+</span> <span class="dv">1</span>, ])</span> | |
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> # A tibble: 24 × 8</span></span> | |
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> soma_joinid collection_id collection_name collection_doi dataset_id</span></span> | |
<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> <int> <chr> <chr> <chr> <chr> </span></span> | |
<span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a><span class="co">#> 1 89 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 07b1d7c8-…</span></span> | |
<span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> 2 102 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 7c1c3d47-…</span></span> | |
<span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a><span class="co">#> 3 103 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 9372df2d-…</span></span> | |
<span id="cb5-15"><a href="#cb5-15" aria-hidden="true" tabindex="-1"></a><span class="co">#> 4 131 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… dd03ce70-…</span></span> | |
<span id="cb5-16"><a href="#cb5-16" aria-hidden="true" tabindex="-1"></a><span class="co">#> 5 145 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 7a0a8891-…</span></span> | |
<span id="cb5-17"><a href="#cb5-17" aria-hidden="true" tabindex="-1"></a><span class="co">#> 6 147 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… d2b5efc1-…</span></span> | |
<span id="cb5-18"><a href="#cb5-18" aria-hidden="true" tabindex="-1"></a><span class="co">#> 7 151 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… f8dda921-…</span></span> | |
<span id="cb5-19"><a href="#cb5-19" aria-hidden="true" tabindex="-1"></a><span class="co">#> 8 154 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 3a7f3ab4-…</span></span> | |
<span id="cb5-20"><a href="#cb5-20" aria-hidden="true" tabindex="-1"></a><span class="co">#> 9 156 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… bdb26abd-…</span></span> | |
<span id="cb5-21"><a href="#cb5-21" aria-hidden="true" tabindex="-1"></a><span class="co">#> 10 158 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 5e5ab909-…</span></span> | |
<span id="cb5-22"><a href="#cb5-22" aria-hidden="true" tabindex="-1"></a><span class="co">#> # ℹ 14 more rows</span></span> | |
<span id="cb5-23"><a href="#cb5-23" aria-hidden="true" tabindex="-1"></a><span class="co">#> # ℹ 3 more variables: dataset_title <chr>, dataset_h5ad_path <chr>,</span></span> | |
<span id="cb5-24"><a href="#cb5-24" aria-hidden="true" tabindex="-1"></a><span class="co">#> # dataset_total_cell_count <int></span></span></code></pre></div> | |
</div> | |
<div id="what-features-are-in-a-dataset" class="section level2"> | |
<h2>What features are in a dataset?</h2> | |
<p><em>Goal:</em> look up the features present in a given dataset.</p>
<p>This example also demonstrates the ability to do the query on | |
multiple datasets.</p> | |
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Slice the dataset(s) of interest, and get the joinid(s)</span></span> | |
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a>dataset_joinids <span class="ot"><-</span> datasets_df<span class="sc">$</span>soma_joinid[datasets_df<span class="sc">$</span>collection_id <span class="sc">==</span> <span class="st">"17481d16-ee44-49e5-bcf0-28c0780d8c4a"</span>]</span> | |
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a></span> | |
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Slice the presence matrix by the first dimension, i.e., by dataset</span></span> | |
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a>var_joinids <span class="ot"><-</span> var_df<span class="sc">$</span>soma_joinid[<span class="fu">which</span>(Matrix<span class="sc">::</span><span class="fu">colSums</span>(presence_matrix[dataset_joinids <span class="sc">+</span> <span class="dv">1</span>, ]) <span class="sc">></span> <span class="dv">0</span>)]</span> | |
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a></span> | |
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(var_df[var_joinids <span class="sc">+</span> <span class="dv">1</span>, ])</span> | |
<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> # A tibble: 27,211 × 4</span></span> | |
<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> soma_joinid feature_id feature_name feature_length</span></span> | |
<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> <int> <chr> <chr> <int></span></span> | |
<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> 1 0 ENSG00000238009 RP11-34P13.7 3726</span></span> | |
<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a><span class="co">#> 2 1 ENSG00000279457 WASH9P 1397</span></span> | |
<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> 3 2 ENSG00000228463 AP006222.1 8224</span></span> | |
<span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a><span class="co">#> 4 3 ENSG00000237094 RP4-669L17.4 6204</span></span> | |
<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a><span class="co">#> 5 4 ENSG00000230021 RP11-206L10.17 5495</span></span> | |
<span id="cb6-16"><a href="#cb6-16" aria-hidden="true" tabindex="-1"></a><span class="co">#> 6 5 ENSG00000237491 LINC01409 8413</span></span> | |
<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a><span class="co">#> 7 6 ENSG00000177757 FAM87B 1947</span></span> | |
<span id="cb6-18"><a href="#cb6-18" aria-hidden="true" tabindex="-1"></a><span class="co">#> 8 7 ENSG00000225880 LINC00115 1317</span></span> | |
<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a><span class="co">#> 9 8 ENSG00000230368 FAM41C 1971</span></span> | |
<span id="cb6-20"><a href="#cb6-20" aria-hidden="true" tabindex="-1"></a><span class="co">#> 10 9 ENSG00000230699 RP11-54O7.1 3043</span></span> | |
<span id="cb6-21"><a href="#cb6-21" aria-hidden="true" tabindex="-1"></a><span class="co">#> # ℹ 27,201 more rows</span></span></code></pre></div> | |
</div> | |
<!-- code folding --> | |
<!-- dynamically load mathjax for compatibility with self-contained --> | |
<script> | |
// Add a <script> element for the MathJax bundle to the document head at runtime.
(function () {
  var mathJaxTag = document.createElement("script");
  mathJaxTag.type = "text/javascript";
  mathJaxTag.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";

  var headElement = document.getElementsByTagName("head")[0];
  headElement.appendChild(mathJaxTag);
})();
</script> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment