First post of 2016! Just a quick post about how I measure sequence total, sequence length distribution, and assembly quality (when it’s important). I wrote a python script that handles both fastas and fastqs to report a variety of metrics.
Here’s my code:
#!/usr/bin/python2.7
'''USAGE: fasta_stats.py seqFile
This script calculates various statistics about the provided fasta or fastq file.
'''
import sys
import os
# This function reads in fasta file, appends the length of each sequence to a list, and counts all Gs & Cs.
# It returns a sorted list of sequence lengths with the G+C total as the last element.
def readFasta(FastaFile):
tempseq = ''
GC_count = 0
N_count = 0
lenList = []
for line in FastaFile:
if line.startswith('>'):
lenList.append(len(tempseq))
GC_count = GC_count + tempseq.count('C') + tempseq.count('c') + tempseq.count('G') + tempseq.count('g')
N_count = N_count + tempseq.count('N') + tempseq.count('n')
tempseq = ''
continue
else:
tempseq = tempseq + line.strip()
lenList.append(len(tempseq))
lenList.remove(0)
lenList.sort()
lenList.append(GC_count)
lenList.append(N_count)
FastaFile.close()
return lenList
# This function reads in fastq files and returns all sequence lengths in a list
def readFastq(FastqFile):
GC_count = 0
N_count = 0
lenList = []
line_count = 3
for line in FastqFile:
line_count += 1
if line_count == 5:
seq = line.strip()
lenList.append(len(seq))
GC_count = GC_count + seq.count('C') + seq.count('c') + seq.count('G') + seq.count('g')
N_count = N_count + seq.count('N') + seq.count('n')
line_count = 1
continue
else:
continue
lenList.sort()
lenList.append(GC_count)
lenList.append(N_count)
FastqFile.close()
return lenList
# This function calculates and returns all the printed statistics.
def calcStats(lengths):
Ns = lengths[-1]
del lengths[-1]
GCs = lengths[-1] # extract saved GC count
del lengths[-1]
total_seq = len(lengths) # Total number of sequences
len_sum = sum(lengths) # Total number of residues
total_Mb = len_sum/1000000.00 # Total number of residues expressed in Megabases
GC_content = (float(GCs)/float(len_sum))*100 # GC content as a percent of total residues
# interquartile range
if total_seq >= 4:
median_len = lengths[int(round(total_seq/2))] # Median sequence length
q1_range = lengths[0:(int(total_seq/2)-1)]
q1 = q1_range[int(len(q1_range)/2)]
q3_range = lengths[(int(total_seq/2)+1):-1]
q3 = q3_range[int(len(q3_range)/2)]
iqr = int(q3 - q1)
else:
iqr = 'Too few sequences to calculate'
median_len = 'Too few sequences to calculate'
#n50 calculation loop
current_bases = 0
n50 = 0
n90 = 0
seqs_1000 = 0
seqs_5000 = 0
percent50_bases = int(round(len_sum*0.5))
percent90_bases = int(round(len_sum*0.1))
for object in lengths:
current_bases += object
if object > 1000:
seqs_1000 += 1
if object > 5000:
seqs_5000 += 1
if current_bases >= percent50_bases and n50 == 0:
n50 = object
if current_bases >= percent90_bases and n90 == 0:
n90 = object
if total_seq < 4:
n50 = 'Too few sequences to calculate'
n90 = 'Too few sequences to calculate'
l50 = 'Too few sequences to calculate'
else:
l50 = lengths.count(n50)
return(total_seq, total_Mb, n50, median_len, iqr, GC_content, n90, seqs_1000, seqs_5000, Ns, l50)
if os.stat(str(sys.argv[1])).st_size == 0:
print('ERROR: Empty input file.')
sys.exit()
fasta_suffix = ['fasta', 'fa', 'fna', 'faa', 'ffn', 'frn']
fastq_suffix = ['fastq', 'fq']
file_suffix = str(sys.argv[1]).split('.')[-1]
if file_suffix in fasta_suffix:
file_type = 'Fasta'
seq_lengths = readFasta(open(sys.argv[1], 'r'))
elif file_suffix in fastq_suffix:
file_type = 'Fastq'
seq_lengths = readFastq(open(sys.argv[1], 'r'))
else:
print('ERROR: Invalid file format provided.')
sys.exit()
stat_list = calcStats(seq_lengths)
output_string = """# Input file name: {filename}
# File type: {filetype}
# Total sequences: {total_seq}
# Total bases: {total_mb} Mb
# Sequence N50: {n50}
# Sequence L50: {l50}
# Sequence N90: {n90}
# Median sequence length: {median_len}
# Interquartile range: {iqr}
# Shortest sequence length: {short}
# Longest sequence length: {long}
# Sequences > 1 kb: {seqs_1000}
# Sequences > 5 kb: {seqs_5000}
# G-C content: {gc}%
# Ns included: {ns}
""".format(filename = str(sys.argv[1]).split('/')[-1], filetype = file_type, total_seq = stat_list[0], total_mb = "%.2f" % stat_list[1], n50 = stat_list[2], median_len = stat_list[3], iqr = stat_list[4], short = seq_lengths[0], long = seq_lengths[-1], gc = "%.2f" % stat_list[5], n90 = stat_list[6], seqs_1000 = stat_list[7], seqs_5000 = stat_list[8], ns = stat_list[9], l50 = stat_list[10])
print output_string
Output is printed to the terminal and appears like this:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Input file name: example.fasta
# File type: Fasta
# Total sequences: 2362551
# Total bases: 1462.60 Mb
# Sequence N50: 682
# Sequence L50: 2575
# Sequence N90: 394
# Median sequence length: 532
# Interquartile range: 331
# Shortest sequence length: 251
# Longest sequence length: 190893
# Sequences > 1 kb: 260467
# Sequences > 5 kb: 632
# G-C content: 42.76%
# Ns included: 0
Pretty straight forward, but I haven’t found any others like it that give all of these stats together. It makes read QC and evaluating assemblies a lot more convenient. So if you like it, give it a try! I’ve also made a Github repo for the scripts I use in this blog and linked it on the resources page.