Created
September 30, 2017 02:06
-
-
Save ConradStack/a71dd8f0bf560110e933b99333055f48 to your computer and use it in GitHub Desktop.
Extract sequences from a fasta file, preserving read name comments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a new FASTA file from an existing FASTA file and a list of sequence names.
# - Outputting the long_name does/did not seem to work properly in the faidx script that is packaged with pyfaidx.
from pyfaidx import *
def read_names(handle):
    """Return the sequence names found in *handle*, one name per line.

    *handle* is any iterable of text lines (for example an open file).
    Surrounding whitespace, including Windows-style line endings, is
    stripped and blank lines are skipped, so a trailing empty line in the
    names file does not turn into a lookup for an empty sequence name.
    """
    stripped = (line.strip() for line in handle)
    return [name for name in stripped if name]


def extract_records(fasta_path, names_path, out_path):
    """Copy the named records from one FASTA file into a new FASTA file.

    Args:
        fasta_path: Path of the FASTA file to read records from.
        names_path: Path of a newline-delimited list of sequence names.
            Names must not include any text after the first space (that
            trailing text is the "comment" part of the long_name).
        out_path: Path of the FASTA file to write; it is overwritten.

    Each output record consists of a header line built from the record's
    long_name (name + comments) followed by the whole sequence on a single
    line. Records are written in the order the names are listed.

    A name that is not present in the FASTA file makes the ``fa[name]``
    lookup fail; that exception is deliberately left to propagate.
    """
    # Use the Fasta object as a context manager so the underlying file
    # handle is released even if a lookup fails part-way through.
    with Fasta(fasta_path) as fa:
        with open(names_path) as names_file:
            names = read_names(names_file)

        with open(out_path, 'w') as outfile:
            for name in names:
                record = fa[name]
                # Header: the full original header line, not just the key.
                outfile.write('>' + record.long_name + '\n')
                # Full-length sequence on one line (no line wrapping).
                outfile.write(record[:].seq + '\n')


def main():
    """Run the extraction with the file names used by the original script."""
    extract_records('test.fa', 'test.read.names', 'test.files.fasta')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment