Created
June 12, 2023 21:26
-
-
Save rmrao/940fb8df30dcabda3f97d7633a9dd009 to your computer and use it in GitHub Desktop.
Count the number of sequences or the total number of amino acids in a fasta file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
count_nseqs() { | |
file="$1" | |
if [ -f "$file" ]; then | |
if [[ "$file" == *.gz ]]; then | |
count=$(zcat "$file" | grep -c "^>") | |
else | |
count=$(grep -c "^>" "$file") | |
fi | |
echo "$count" | |
else | |
echo "File does not exist: $file" >&2 | |
exit 1 | |
fi | |
} | |
count_ntoks() { | |
file="$1" | |
if [ -f "$file" ]; then | |
if [[ "$file" == *.gz ]]; then | |
sum=0 | |
zcat "$file" | awk '/^>/ { if (seq != "") { sum += length(seq) } seq = "" } /^>/ {next} { seq = seq $0 } END { sum += length(seq); print sum }' | |
else | |
sum=0 | |
awk '/^>/ { if (seq != "") { sum += length(seq) } seq = "" } /^>/ {next} { seq = seq $0 } END { sum += length(seq); print sum }' "$file" | |
fi | |
else | |
echo "File does not exist: $file" >&2 | |
exit 1 | |
fi | |
} | |
# Main script | |
if [ $# -eq 0 ]; then | |
echo "Usage: ./fasta_length.sh [-ntok] <fasta_file>" | |
exit 1 | |
fi | |
file_flag="" | |
fasta_file="" | |
# Parse command-line arguments | |
while [ "$1" != "" ]; do | |
case $1 in | |
-ntok ) file_flag="ntok" | |
;; | |
* ) fasta_file="$1" | |
esac | |
shift | |
done | |
if [ "$file_flag" == "ntok" ]; then | |
count_ntoks "$fasta_file" | |
else | |
count_nseqs "$fasta_file" | |
fi |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment