Created
July 22, 2023 22:10
-
-
Save kdauria/420ca40aaa89cfc8156f0e1793c0ebd3 to your computer and use it in GitHub Desktop.
Read BAM with noodles that has a non-compliant platform tag
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::env; | |
use std::fs::File; | |
use std::io; | |
use std::io::Read; | |
use std::path::Path; | |
use std::process; | |
use byteorder::{LittleEndian, ReadBytesExt}; | |
use noodles::bam; | |
use noodles::bgzf::{self as bgzf, VirtualPosition}; | |
use noodles::sam; | |
/// Read a header of a BAM file, ignoring the PL tag in any read group | |
/// | |
/// This is because noodles follows SAM specification, but not everyone does | |
fn read_header(path: &Path) -> io::Result<(sam::Header, VirtualPosition)> { | |
let mut bgzf_reader = bgzf::Reader::new(File::open(path)?); | |
let mut magic_number = [0; 4]; | |
bgzf_reader.read_exact(&mut magic_number)?; | |
const EXPECTED_MAGIC_NUMBER: &[u8; 4] = b"BAM\x01"; | |
let is_valid_bam = &magic_number == EXPECTED_MAGIC_NUMBER; | |
if !is_valid_bam { | |
let error_message = "Invalid BAM file"; | |
return Err(io::Error::new(io::ErrorKind::InvalidData, error_message)); | |
} | |
let header_text_length = bgzf_reader.read_u32::<LittleEndian>()?; | |
let mut header_text_bytes = vec![0; header_text_length as usize]; | |
bgzf_reader.read_exact(&mut header_text_bytes)?; | |
let header_text = String::from_utf8_lossy(&header_text_bytes).into_owned(); | |
let header_text_without_platform_tags = remove_platform_tags(header_text.as_str()); | |
let header = header_text_without_platform_tags | |
.parse() | |
.map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "Invalid SAM header"))?; | |
// Move the file cursor to the end of the binary reference sequence dictionary | |
let n_ref = bgzf_reader.read_u32::<LittleEndian>().unwrap() as usize; | |
for _ in 0..n_ref { | |
let byte_count = bgzf_reader.read_u32::<LittleEndian>().unwrap() as usize; | |
let mut discard_buffer = vec![0; byte_count]; | |
bgzf_reader.read_exact(&mut discard_buffer).unwrap(); | |
bgzf_reader.read_u32::<LittleEndian>().unwrap(); | |
} | |
let records_position: VirtualPosition = bgzf_reader.virtual_position(); | |
Ok((header, records_position)) | |
} | |
fn remove_platform_tags(header: &str) -> String { | |
header | |
.lines() | |
.map(|line| { | |
if line.starts_with("@RG") { | |
let fields: Vec<&str> = line | |
.split('\t') | |
.filter(|field| !field.starts_with("PL:")) | |
.collect(); | |
fields.join("\t") | |
} else { | |
line.to_string() | |
} | |
}) | |
.collect::<Vec<String>>() | |
.join("\n") | |
} | |
fn main() -> io::Result<()> { | |
// Skip the first argument, which is the program name | |
let args: Vec<String> = env::args().skip(1).collect(); | |
if args.len() != 1 { | |
eprintln!("Usage: <program> <bam_file_path>"); | |
process::exit(1); | |
} | |
let path = Path::new(&args[0]); | |
let (header, records_position) = match read_header(&path) { | |
Ok((header, records_position)) => { | |
println!( | |
"Successfully read header. Records start at position: {:?}", | |
records_position | |
); | |
(header, records_position) | |
} | |
Err(e) => { | |
eprintln!("Failed to read header: {}", e); | |
process::exit(1); | |
} | |
}; | |
let mut reader = bam::reader::Builder.build_from_path(path)?; | |
reader.seek(records_position)?; | |
let mut read_counter = 0; | |
for _ in reader.records(&header) { | |
read_counter += 1; | |
} | |
println!("Number of reads: {}", read_counter); | |
Ok(()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment