First post of 2016! Just a quick post about how I measure sequence total, sequence length distribution, and assembly quality (when it’s important). I wrote a python script that handles both fastas and fastqs to report a variety of metrics.

Here’s my code:

#!/usr/bin/python2.7
'''USAGE: fasta_stats.py seqFile 
This script calculates various statistics about the provided fasta or fastq file.
'''
import sys
import os

# This function reads in fasta file, appends the length of each sequence to a list, and counts all Gs & Cs.
# It returns a sorted list of sequence lengths with the G+C total as the last element.
def readFasta(FastaFile):

	tempseq = ''
	GC_count = 0
	N_count = 0
	lenList = []

	for line in FastaFile: 

		if line.startswith('>'): 
			lenList.append(len(tempseq))
			GC_count = GC_count + tempseq.count('C') + tempseq.count('c') + tempseq.count('G') + tempseq.count('g')
			N_count = N_count + tempseq.count('N') + tempseq.count('n')
			tempseq = ''
			continue

		else:
			tempseq = tempseq + line.strip()

	lenList.append(len(tempseq))

	lenList.remove(0)
	lenList.sort()
	lenList.append(GC_count)
	lenList.append(N_count)

	FastaFile.close()

	return lenList


# This function reads in fastq files and returns all sequence lengths in a list
def readFastq(FastqFile):

	GC_count = 0
	N_count = 0
	lenList = []
	line_count = 3 

	for line in FastqFile: 
	
		line_count += 1
	
		if line_count == 5:
			seq = line.strip()
			lenList.append(len(seq))
			GC_count = GC_count + seq.count('C') + seq.count('c') + seq.count('G') + seq.count('g')
			N_count = N_count + seq.count('N') + seq.count('n')
			line_count = 1
			continue
	
		else:
			continue
	
	lenList.sort()
	lenList.append(GC_count)
	lenList.append(N_count)

	FastqFile.close()

	return lenList

	
# This function calculates and returns all the printed statistics.
def calcStats(lengths):

	Ns = lengths[-1]
	del lengths[-1]
	GCs = lengths[-1] # extract saved GC count
	del lengths[-1] 
                  	
	total_seq = len(lengths) # Total number of sequences
	len_sum = sum(lengths) # Total number of residues
	total_Mb = len_sum/1000000.00 # Total number of residues expressed in Megabases
	GC_content = (float(GCs)/float(len_sum))*100 # GC content as a percent of total residues

	# interquartile range
	if total_seq >= 4:
		median_len = lengths[int(round(total_seq/2))] # Median sequence length
		q1_range = lengths[0:(int(total_seq/2)-1)]
		q1 = q1_range[int(len(q1_range)/2)]
		q3_range = lengths[(int(total_seq/2)+1):-1]
		q3 = q3_range[int(len(q3_range)/2)]
		iqr = int(q3 - q1)
	else:
		iqr = 'Too few sequences to calculate'
		median_len = 'Too few sequences to calculate'

	#n50 calculation loop
	current_bases = 0
	n50 = 0
	n90 = 0
	seqs_1000 = 0
	seqs_5000 = 0
	percent50_bases = int(round(len_sum*0.5))
	percent90_bases = int(round(len_sum*0.1))

	for object in lengths:

		current_bases += object
	
		if object > 1000:
			seqs_1000 += 1
		if object > 5000:
			seqs_5000 += 1
		
		if current_bases >= percent50_bases and n50 == 0:
			n50 = object
		if current_bases >= percent90_bases and n90 == 0:
			n90 = object

	if total_seq < 4:
		n50 = 'Too few sequences to calculate'
		n90 = 'Too few sequences to calculate'
		l50 = 'Too few sequences to calculate'
	else:	
		l50 = lengths.count(n50)

	return(total_seq, total_Mb, n50, median_len, iqr, GC_content, n90, seqs_1000, seqs_5000, Ns, l50)


if os.stat(str(sys.argv[1])).st_size == 0:
	print('ERROR: Empty input file.')
	sys.exit()

fasta_suffix = ['fasta', 'fa', 'fna', 'faa', 'ffn', 'frn']
fastq_suffix = ['fastq', 'fq']
file_suffix = str(sys.argv[1]).split('.')[-1]

if file_suffix in fasta_suffix:
	file_type = 'Fasta'	
	seq_lengths = readFasta(open(sys.argv[1], 'r'))  

elif file_suffix in fastq_suffix:	
	file_type = 'Fastq'
	seq_lengths = readFastq(open(sys.argv[1], 'r'))

else:
	print('ERROR: Invalid file format provided.')
	sys.exit()

stat_list = calcStats(seq_lengths)	

output_string = """# Input file name: {filename}
# File type: {filetype}
# Total sequences: {total_seq}
# Total bases: {total_mb} Mb
# Sequence N50: {n50}
# Sequence L50: {l50}
# Sequence N90: {n90}
# Median sequence length: {median_len}
# Interquartile range: {iqr}
# Shortest sequence length: {short}
# Longest sequence length: {long}
# Sequences > 1 kb: {seqs_1000}
# Sequences > 5 kb: {seqs_5000}
# G-C content: {gc}%
# Ns included: {ns}
""".format(filename = str(sys.argv[1]).split('/')[-1], filetype = file_type, total_seq = stat_list[0], total_mb = "%.2f" % stat_list[1], n50 = stat_list[2], median_len = stat_list[3], iqr = stat_list[4], short = seq_lengths[0], long = seq_lengths[-1], gc = "%.2f" % stat_list[5], n90 = stat_list[6], seqs_1000 = stat_list[7], seqs_5000 = stat_list[8], ns = stat_list[9], l50 = stat_list[10])

print output_string

Output is printed to the terminal and appears like this:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Input file name: example.fasta
# File type: Fasta
# Total sequences: 2362551
# Total bases: 1462.60 Mb
# Sequence N50: 682
# Sequence L50: 2575
# Sequence N90: 394
# Median sequence length: 532
# Interquartile range: 331
# Shortest sequence length: 251
# Longest sequence length: 190893
# Sequences > 1 kb: 260467
# Sequences > 5 kb: 632
# G-C content: 42.76%
# Ns included: 0

Pretty straight forward, but I haven’t found any others like it that give all of these stats together. It makes read QC and evaluating assemblies a lot more convenient. So if you like it, give it a try! I’ve also made a Github repo for the scripts I use in this blog and linked it on the resources page.

Published Again!

After a long review process, the final publication from my dissertation work is finally published! In it were able to show that not only ...… Continue reading

Major Changes!

Published on August 17, 2017