First post of 2016! Just a quick post about how I measure total sequence count, sequence length distribution, and assembly quality (when it’s important). I wrote a python script that handles both fastas and fastqs to report a variety of metrics.

Here’s my code:

#!/usr/bin/python2.7
'''USAGE: fasta_stats.py seqFile 
This script calculates various statistics about the provided fasta or fastq file.
'''
import sys
import os

# This function reads in a fasta file, appends the length of each sequence to a list, and counts all Gs & Cs and Ns.
# It returns a sorted list of sequence lengths with the G+C total and the N total appended as the final two elements.
def readFasta(FastaFile):
	'''Read a fasta file and collect per-sequence statistics.

	Returns a sorted list of sequence lengths with two extra elements
	appended: the total G+C count, then the total N count.
	Zero-length records (e.g. two consecutive header lines) are skipped.
	Closes FastaFile before returning.
	'''
	GC_count = 0
	N_count = 0
	lenList = []
	tempseq = ''

	for line in FastaFile:

		if line.startswith('>'):
			# A new header finishes the previous record (if any).
			if tempseq:
				lenList.append(len(tempseq))
				GC_count = GC_count + tempseq.count('C') + tempseq.count('c') + tempseq.count('G') + tempseq.count('g')
				N_count = N_count + tempseq.count('N') + tempseq.count('n')
			tempseq = ''

		else:
			tempseq = tempseq + line.strip()

	# BUGFIX: the original appended the final record's length but never
	# added its G/C/N counts, and its lenList.remove(0) raised ValueError
	# when the file did not begin with a '>' header (no leading 0 present).
	if tempseq:
		lenList.append(len(tempseq))
		GC_count = GC_count + tempseq.count('C') + tempseq.count('c') + tempseq.count('G') + tempseq.count('g')
		N_count = N_count + tempseq.count('N') + tempseq.count('n')

	lenList.sort()
	lenList.append(GC_count)
	lenList.append(N_count)

	FastaFile.close()

	return lenList


# This function reads in fastq files and returns all sequence lengths in a list
def readFastq(FastqFile):
	'''Read a fastq file and collect per-sequence statistics.

	Returns a sorted list of sequence lengths with two extra elements
	appended: the total G+C count, then the total N count.
	Closes FastqFile before returning.
	'''
	gc_total = 0
	n_total = 0
	lengths = []

	# Fastq records are 4 lines each; the sequence is the second line,
	# i.e. the line at offset 1 within each group of 4.
	for position, raw in enumerate(FastqFile):
		if position % 4 != 1:
			continue
		seq = raw.strip()
		lengths.append(len(seq))
		gc_total += seq.count('C') + seq.count('c') + seq.count('G') + seq.count('g')
		n_total += seq.count('N') + seq.count('n')

	lengths.sort()
	lengths.append(gc_total)
	lengths.append(n_total)

	FastqFile.close()

	return lengths

	
# This function calculates and returns all the printed statistics.
def calcStats(lengths):
	'''Calculate summary statistics for a list of sequence lengths.

	`lengths` is the list returned by readFasta/readFastq: sorted sequence
	lengths with the G+C total and N total appended as the last two items.
	NOTE: this function pops those two trailing items off, deliberately
	mutating the caller's list so the caller can afterwards index the
	shortest ([0]) and longest ([-1]) sequence lengths directly.

	Returns a tuple: (total_seq, total_Mb, n50, median_len, iqr,
	GC_content, n90, seqs_1000, seqs_5000, Ns, l50).
	'''
	Ns = lengths.pop() # N count was appended last
	GCs = lengths.pop() # extract saved GC count

	total_seq = len(lengths) # Total number of sequences
	len_sum = sum(lengths) # Total number of residues
	total_Mb = len_sum/1000000.00 # Total number of residues expressed in Megabases
	# Guard against division by zero when there are no residues at all.
	GC_content = (float(GCs)/float(len_sum))*100 if len_sum else 0.0

	# Median and interquartile range (approximate, index-based).
	if total_seq >= 4:
		mid = total_seq // 2
		median_len = lengths[mid] # Median sequence length
		q1_range = lengths[0:mid - 1]
		q1 = q1_range[len(q1_range) // 2]
		# BUGFIX: the original sliced [mid+1:-1], which dropped the longest
		# sequence and raised IndexError when total_seq was exactly 4.
		q3_range = lengths[mid + 1:]
		q3 = q3_range[len(q3_range) // 2]
		iqr = int(q3 - q1)
	else:
		iqr = 'Too few sequences to calculate'
		median_len = 'Too few sequences to calculate'

	# N50/N90 scan. `lengths` is sorted ascending, so accumulating 50% of
	# the bases from the short end lands on the same contig as accumulating
	# 50% from the long end (N50), and accumulating 10% from the short end
	# corresponds to 90% from the long end (N90).
	current_bases = 0
	n50 = 0
	n90 = 0
	seqs_1000 = 0
	seqs_5000 = 0
	percent50_bases = int(round(len_sum*0.5))
	percent90_bases = int(round(len_sum*0.1))

	for seq_len in lengths:

		current_bases += seq_len

		if seq_len > 1000:
			seqs_1000 += 1
		if seq_len > 5000:
			seqs_5000 += 1

		if current_bases >= percent50_bases and n50 == 0:
			n50 = seq_len
		if current_bases >= percent90_bases and n90 == 0:
			n90 = seq_len

	if total_seq < 4:
		n50 = 'Too few sequences to calculate'
		n90 = 'Too few sequences to calculate'
		l50 = 'Too few sequences to calculate'
	else:
		# BUGFIX: L50 is the number of longest sequences whose combined
		# length covers 50% of the total bases; the original returned the
		# count of sequences whose length happened to equal N50.
		covered = 0
		l50 = 0
		for seq_len in reversed(lengths):
			covered += seq_len
			l50 += 1
			if covered >= percent50_bases:
				break

	return(total_seq, total_Mb, n50, median_len, iqr, GC_content, n90, seqs_1000, seqs_5000, Ns, l50)


# ---- Script entry point: validate input, dispatch on extension, print report ----

# Refuse empty input before doing any parsing.
if os.stat(str(sys.argv[1])).st_size == 0:
	print('ERROR: Empty input file.')
	sys.exit()

# Dispatch on the file extension to the appropriate parser.
fasta_suffix = ['fasta', 'fa', 'fna', 'faa', 'ffn', 'frn']
fastq_suffix = ['fastq', 'fq']
file_suffix = str(sys.argv[1]).split('.')[-1]

if file_suffix in fasta_suffix:
	file_type = 'Fasta'
	seq_lengths = readFasta(open(sys.argv[1], 'r'))

elif file_suffix in fastq_suffix:
	file_type = 'Fastq'
	seq_lengths = readFastq(open(sys.argv[1], 'r'))

else:
	print('ERROR: Invalid file format provided.')
	sys.exit()

# calcStats pops the GC and N totals off seq_lengths, so afterwards
# seq_lengths[0] / seq_lengths[-1] are the shortest / longest lengths.
stat_list = calcStats(seq_lengths)

# BUGFIX: the first line of the report previously contained the literal
# text "(unknown)" instead of the {filename} placeholder, so the
# filename= argument passed to .format() was silently ignored.
output_string = """# Input file name: {filename}
# File type: {filetype}
# Total sequences: {total_seq}
# Total bases: {total_mb} Mb
# Sequence N50: {n50}
# Sequence L50: {l50}
# Sequence N90: {n90}
# Median sequence length: {median_len}
# Interquartile range: {iqr}
# Shortest sequence length: {short}
# Longest sequence length: {long}
# Sequences > 1 kb: {seqs_1000}
# Sequences > 5 kb: {seqs_5000}
# G-C content: {gc}%
# Ns included: {ns}
""".format(filename = str(sys.argv[1]).split('/')[-1], filetype = file_type, total_seq = stat_list[0], total_mb = "%.2f" % stat_list[1], n50 = stat_list[2], median_len = stat_list[3], iqr = stat_list[4], short = seq_lengths[0], long = seq_lengths[-1], gc = "%.2f" % stat_list[5], n90 = stat_list[6], seqs_1000 = stat_list[7], seqs_5000 = stat_list[8], ns = stat_list[9], l50 = stat_list[10])

print(output_string)

Output is printed to the terminal and appears like this:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Input file name: example.fasta
# File type: Fasta
# Total sequences: 2362551
# Total bases: 1462.60 Mb
# Sequence N50: 682
# Sequence L50: 2575
# Sequence N90: 394
# Median sequence length: 532
# Interquartile range: 331
# Shortest sequence length: 251
# Longest sequence length: 190893
# Sequences > 1 kb: 260467
# Sequences > 5 kb: 632
# G-C content: 42.76%
# Ns included: 0

Pretty straightforward, but I haven’t found any others like it that give all of these stats together. It makes read QC and evaluating assemblies a lot more convenient. So if you like it, give it a try! I’ve also made a GitHub repo for the scripts I use in this blog and linked it on the resources page.

After all of the filtering is done, I’m now finally ready to subsample the read files. Also, since the awk one-liners for subsampling aren’t working, I wrote my own in Python.

Below are the totals of reads per file. The goal is to subsample the groups of reads (paired and orphan) to the lowest acceptable value among them.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
Condition 1 - Plus
read 1:  164655029
read 2:  164655029
orphan:  14957436

Condition 1 - Minus
read 1:  147430303
read 2:  147430303
orphan:  14500000

Condition 2 - Plus
read 1:  113675146
read 2:  113675146
orphan:  11505508

Condition 2 - Minus
read 1:  110752263
read 2:  110752263
orphan:  12084021

Condition 3 - Plus
read 1:  126954804
read 2:  126954804
orphan:  16455611

Condition 3 - Minus
read 1:  115484599
read 2:  115484599
orphan:  11895524

Negative Control - Plus
read 1:  138811772
read 2:  138811772
orphan:  11936178

Positive Control - Plus
read 1:  170018546
read 2:  170018546
orphan:  19394415

This is great news. It looks like I still have >100 million paired-end reads and >10 million orphaned reads left per sample after subsampling. I’ll post some rarefaction curves when I get them.

The awk commands I found are not working properly, so I wrote this python script to subsample both the interleaved pair-end and single-end reads. Note that argparse ships with the standard library in Python 2.7+; only on older Pythons do you need to install it separately.

#!/usr/bin/env python
'''USAGE: subsample_fasta.py file.fasta --size n --total n --pair y/n
Randomly subsamples a fasta file to the given size, handles both single and paired-end read data
'''

# It's important to know the correct number of sequences per file for this script to run properly,
# please run seq_stats.py prior to running subsample_fasta.py

import sys
import random
import argparse


# ---- Command-line interface ----
parser = argparse.ArgumentParser(description='Subsample reads from a fasta file.')
parser.add_argument('newfile')
parser.add_argument('--size', default=0, help='Number of sequences to subsample (# of pairs for paired-end)')
parser.add_argument('--total', default=0, help='Total number of sequences in the fasta file')
# BUGFIX: help text previously read 'pired-end'
parser.add_argument('--pair', default='n', help='Indicates if the file is interleaved, paired-end reads (y/n)')
args = parser.parse_args()

# Refuse to run without an explicit subsample size and total sequence count.
if int(args.size) == 0 or int(args.total) == 0:
	print('ERROR: File or subsample value are of size 0')
	sys.exit()


# Check if the subsample is the same as or larger than the total sequence count,
# then build the set of 1-based sequence positions to pick from the fasta file.
print('Creating subsampling distribution...')

if args.pair == 'n':
	# Single-end: every sequence position is a candidate.
	candidates = range(1, int(args.total) + 1)
	if int(args.size) >= int(args.total):
		sample_list = set(candidates)
		print('Subsample size is greater than or equal to the total number of sequences.  Using all sequences.')
	else:
		# BUGFIX: the original sampled int(args.size) + 1 positions,
		# returning one more sequence than requested. (Sorting before
		# building the set was also a no-op and is dropped.)
		sample_list = set(random.sample(candidates, int(args.size)))
elif args.pair == 'y':
	# Paired-end interleaved: candidates are the forward-read positions
	# (odd indices); each picked pair also keeps the following reverse read.
	candidates = range(1, int(args.total) + 1, 2)
	if int(args.size) >= int(args.total):
		sample_list = set(candidates)
		print('Subsample size is greater than or equal to the total number of sequences.  Using all sequences.')
	else:
		# BUGFIX: same off-by-one as above (size + 1 pairs were drawn).
		forward_picks = random.sample(candidates, int(args.size))
		reverse_picks = [index + 1 for index in forward_picks]
		sample_list = set(forward_picks + reverse_picks)
else:
	print('ERROR: Invalid input file type')
	sys.exit()


# Derive output and log file names from the input name.
# BUGFIX: the original used rstrip('fasta'), which strips any trailing run
# of the characters f/a/s/t (e.g. 'myfasta' -> 'my'), not the extension.
# rpartition keeps everything up to the last dot instead.
stem, dot, _ext = str(args.newfile).rpartition('.')
name_prefix = stem + '.' if dot else str(args.newfile) + '.'
outfile_str = name_prefix + 'pick.fasta'
logfile_str = name_prefix + 'pick.log'
# (The output fasta itself is opened once, inside the main loop below;
# the original opened it here as well, truncating it twice.)

# Label the input as pe or se
if args.pair == 'y':
	file_type = 'Paired-end'
else:
	file_type = 'Single-end'

# Create logfile recording the subsampling parameters.
log_str = '''Input file name: {infile}
Input file type:  {type}
Output file name: {outfile}
Total sequences: {total}
Subsample size: {size}
'''.format(infile=str(args.newfile), type=file_type, outfile=outfile_str, total=str(args.total), size=str(int(args.size)))
logfile = open(logfile_str, 'w')
logfile.write(log_str)
logfile.close()

# Stream through the fasta exactly once, copying the picked sequences.
print('Writing subsampled fasta file...')
with open(args.newfile, 'r') as infile:

	outfile = open(outfile_str, 'w')

	seq_count = 0  # headers seen so far (1-based sequence position)
	include = 0    # 1 while we are inside a picked record
	iteration = 0  # counter for periodic progress reporting

	for line in infile:

		line = line.strip()

		# BUGFIX: blank lines made line[0] raise IndexError; skip them.
		if not line:
			continue

		if line[0] == '>':

			include = 0
			seq_count += 1
			iteration += 1

			# Report progress every 10,000 sequences.
			if iteration == 10000:
				iteration = 0
				print(str(seq_count) + ' of ' + str(args.total))

			if seq_count in sample_list:
				# Discard so membership tests shrink as we go.
				sample_list.discard(seq_count)
				outfile.write(line)
				outfile.write('\n')
				include = 1

		elif include == 1:
			# Sequence line belonging to a picked record.
			outfile.write(line)
			outfile.write('\n')

outfile.close()
print('Done.')

I streamlined the process as much as possible by making it only necessary to loop through the large fastas a single time. It also prints a logfile so you can remember what the heck you did in a month.

After all of the filtering and tweaking the subsampling algorithm, the jobs are finally running. As soon as they finish, I’ll map the residual reads to the C. difficile 630 genome and get some useable data.