Latest Updates

I ran into a case where I needed to subsample separate forward and reverse read files equally. I hit a snag when I realized that Python doesn’t copy an object on assignment; the new name just refers to the same object. I figured out a workaround using deepcopy, but an even better solution was to use izip (a part of itertools). That way I can loop through both read files and sample them simultaneously.

I am still running some tests, but here is the updated code:

#!/usr/bin/env python
'''USAGE: subsample_fasta.py --reads file.fasta --size n --total n --paired y/n
       subsample_fasta.py --forward fwd.fasta --reverse rev.fasta --size n --total n --paired y
Randomly subsamples a fasta file to the given size, handles both single and paired-end read data
'''

# It's important to know the correct number of sequences per file for this script to run properly,
# please run seq_stats.py prior to running subsample_fasta.py

import sys
import random
import argparse
from itertools import izip


# Command-line interface. The string 'none' is the "not supplied" sentinel for the three
# input-file options; the rest of the script compares against it to pick the input mode.
parser = argparse.ArgumentParser(description='Subsample reads from a fasta file.')
parser.add_argument('--reads', default='none', help='Single-end or interleaved paired-end read file')
parser.add_argument('--forward', default='none', help='Forward paired-end read.')
parser.add_argument('--reverse', default='none', help='Reverse paired-end read.')
# type=int lets argparse reject non-numeric values with a usage error instead of a later
# int() call failing with a traceback; existing int(args.size) calls keep working on ints
parser.add_argument('--size', type=int, default=0, help='Number of sequences to subsample (# of pairs for paired-end)')
parser.add_argument('--total', type=int, default=0, help='Total number of sequences in the fasta file')
parser.add_argument('--paired', default='n', help='Indicates if the file(s) are paired-end reads')
args = parser.parse_args()

# Refuse to run without a subsample size, a sequence total and at least one usable input.
# sys.exit(message) writes the message to stderr and exits with status 1, so calling
# pipelines can detect the failure (a bare sys.exit() reports success).
if int(args.size) == 0 or int(args.total) == 0:
	sys.exit('ERROR: File or subsample value are of size 0')
elif args.reads == 'none':
	# Without --reads, both mate files are required
	if args.forward == 'none' or args.reverse == 'none':
		sys.exit('ERROR: Input file(s) not provided')
	
	
# Build sample_list, the set of 1-based sequence positions to keep. When the requested
# subsample is at least as large as the input, every position is kept.
print('Creating subsampling distribution...')

total_seqs = int(args.total)
subsample_size = int(args.size)
split_mates = args.forward != 'none' and args.reverse != 'none'
interleaved = args.forward == 'none' and args.reverse == 'none'
use_all_msg = 'Subsample size is greater than or equal to the total number of sequences.  Using all sequences.'

if args.paired not in ['y', 'n']:
	sys.exit('ERROR: Invalid input file type')
elif args.paired == 'y' and not (split_mates or interleaved):
	# Exactly one of --forward/--reverse was given, so there is no mate file to pair with
	sys.exit('ERROR: Input file(s) not provided')
elif args.paired == 'y' and interleaved:
	# Interleaved file: mates sit at positions (1, 2), (3, 4), ... and --size counts pairs,
	# so the population to draw from is the number of complete pairs, not --total
	pair_total = total_seqs // 2
	if subsample_size >= pair_total:
		# Keep forward AND reverse positions, not just the odd (forward) ones
		sample_list = set(range(1, total_seqs + 1))
		print(use_all_msg)
	else:
		forward_positions = random.sample(range(1, 2 * pair_total, 2), subsample_size)
		sample_list = set(forward_positions)
		# Each reverse mate directly follows its forward mate
		sample_list.update(position + 1 for position in forward_positions)
else:
	# Single-end reads, or paired reads split across two files (one position selects the
	# record at that position in both files)
	if subsample_size >= total_seqs:
		sample_list = set(range(1, total_seqs + 1))
		print(use_all_msg)
	else:
		sample_list = set(random.sample(range(1, total_seqs + 1), subsample_size))
	

# Label the input as pe or se
if args.paired == 'y':
	file_type = 'Paired-end'
else:
	file_type = 'Single-end'


def _pick_name(path, ending):
	'''Return the output name derived from path.

	A trailing '.fasta' has its 'fasta' part replaced by ending ('x.fasta' -> 'x.' + ending);
	any other name gets '.' + ending appended.
	'''
	# str.rstrip('fasta') strips any trailing run of the characters f/a/s/t rather than the
	# extension, which mangles names such as 'seqs.fna' or 'data'
	if path.endswith('.fasta'):
		return path[:-len('fasta')] + ending
	return path + '.' + ending


if args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
	# Name and open output files (they are closed after the sequences are written)
	outfile_str1 = _pick_name(str(args.forward), 'pick.fasta')
	outfile1 = open(outfile_str1, 'w')
	outfile_str2 = _pick_name(str(args.reverse), 'pick.fasta')
	outfile2 = open(outfile_str2, 'w')

	# The log is named after the forward read file; --reads is unset in this mode and
	# would otherwise yield a log called 'nonepick.log'
	logfile_str = _pick_name(str(args.forward), 'pick.log')

	log_str = '''Input file names : {infile1} {infile2}
Input file types:  {type}
Output file names: {outfile1} {outfile2}
Total sequences: {total}
Subsample size: {size}
'''.format(infile1=str(args.forward), infile2=str(args.reverse), type=file_type, outfile1=outfile_str1, outfile2=outfile_str2, total=str(args.total), size=str(int(args.size)))
else:
	# Name and open output file (it is closed after the sequences are written)
	outfile_str = _pick_name(str(args.reads), 'pick.fasta')
	outfile = open(outfile_str, 'w')

	logfile_str = _pick_name(str(args.reads), 'pick.log')
	log_str = '''Input file name: {infile}
Input file type:  {type}
Output file name: {outfile}
Total sequences: {total}
Subsample size: {size}
'''.format(infile=str(args.reads), type=file_type, outfile=outfile_str, total=str(args.total), size=str(int(args.size)))

# Create logfile of subsampling
with open(logfile_str, 'w') as logfile:
	logfile.write(log_str)

# Open and begin looping through fasta file and picking sequences
print('Writing subsampled fasta file...')

if args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':

	seq_count = 0
	include = 0

	# The inputs are opened in a with-block so both handles are released even on error.
	# NOTE: records are matched line by line and keyed on the forward file's header lines,
	# so both files must use the same number of lines per record.
	with open(args.forward, 'r') as forward_in, open(args.reverse, 'r') as reverse_in:
		for line_1, line_2 in izip(forward_in, reverse_in):

			if line_1[0] == '>':
				seq_count += 1

				# Progress report every 10000 records
				if seq_count % 10000 == 0:
					print(str(seq_count) + ' of ' + str(args.total))

				if seq_count in sample_list:
					sample_list.discard(seq_count)
					include = 1
				elif not sample_list:
					# Every selected record has been written; no need to read further
					break
				else:
					include = 0

			# Header and sequence lines of a selected record are copied to both outputs
			if include == 1:
				outfile1.write(line_1)
				outfile2.write(line_2)

	outfile1.close()
	outfile2.close()

else:

	seq_count = 0
	include = 0

	with open(args.reads, 'r') as infile:
		for line in infile:

			if line[0] == '>':
				seq_count += 1

				# Progress report every 10000 records
				if seq_count % 10000 == 0:
					print(str(seq_count) + ' of ' + str(args.total))

				if seq_count in sample_list:
					sample_list.discard(seq_count)
					include = 1
				elif not sample_list:
					# Every selected record has been written; no need to read further
					break
				else:
					include = 0

			# Header and sequence lines of a selected record are copied to the output
			if include == 1:
				outfile.write(line)

	outfile.close()


print('Done.')

Hopefully, the alignments I’m using the subsampled reads for are done soon so I can finally talk about something else in the next post.

First post of 2016! Just a quick post about how I measure sequence total, sequence length distribution, and assembly quality (when it’s important). I wrote a python script that handles both fastas and fastqs to report a variety of metrics.

Here’s my code:

#!/usr/bin/python2.7
'''USAGE: fasta_stats.py seqFile 
This script calculates various statistics about the provided fasta or fastq file.
'''
import sys
import os

def _addFastaRecord(chunks, lenList, counts):
	'''Record the length of one sequence (given as its list of lines) and add its bases to counts.'''
	seq = ''.join(chunks)
	lenList.append(len(seq))
	counts['GC'] += seq.count('C') + seq.count('c') + seq.count('G') + seq.count('g')
	counts['N'] += seq.count('N') + seq.count('n')


def readFasta(FastaFile):
	'''Collect sequence lengths and base counts from an open fasta file.

	FastaFile is an iterable of lines with a close() method; it is closed before returning.
	Lines that appear before the first '>' header are ignored. A header with no sequence
	lines counts as a sequence of length 0.

	Returns a list holding the sequence lengths in ascending order, followed by the total
	G+C count and then the total N count as the final two elements.
	'''
	lenList = []
	counts = {'GC': 0, 'N': 0}
	chunks = None # lines of the record being read; None until the first header is seen

	try:
		for line in FastaFile:
			if line.startswith('>'):
				# A new header finishes the previous record, if there was one
				if chunks is not None:
					_addFastaRecord(chunks, lenList, counts)
				chunks = []
			elif chunks is not None:
				chunks.append(line.strip())

		# The final record has no following header, so it is tallied here; without this
		# its G, C and N bases are missing from the totals
		if chunks is not None:
			_addFastaRecord(chunks, lenList, counts)
	finally:
		FastaFile.close()

	lenList.sort()
	lenList.append(counts['GC'])
	lenList.append(counts['N'])

	return lenList


def readFastq(FastqFile):
	'''Collect sequence lengths and base counts from an open fastq file.

	Every fourth line, starting with the second line of the file, is treated as a sequence.
	FastqFile is closed before returning.

	Returns a list holding the sequence lengths in ascending order, followed by the total
	G+C count and then the total N count as the final two elements.
	'''
	seq_lengths = []
	gc_total = 0
	n_total = 0

	for position, raw_line in enumerate(FastqFile):

		# Positions 1, 5, 9, ... (0-based) are the sequence lines
		if position % 4 != 1:
			continue

		read = raw_line.strip()
		seq_lengths.append(len(read))
		gc_total += sum(read.count(base) for base in 'CcGg')
		n_total += sum(read.count(base) for base in 'Nn')

	seq_lengths.sort()
	seq_lengths.extend([gc_total, n_total])

	FastqFile.close()

	return seq_lengths

	
def calcStats(lengths):
	'''Calculate the summary statistics for a list of sequence lengths.

	lengths holds the sequence lengths in ascending order followed by the G+C count and the
	N count. The two trailing counts are REMOVED from the list in place, so afterwards it
	contains only sequence lengths; the reporting code relies on this when it reads the
	shortest and longest sequence from the same list.

	Returns the tuple (total_seq, total_Mb, n50, median_len, iqr, GC_content, n90,
	seqs_1000, seqs_5000, Ns, l50). With fewer than 4 sequences n50, median_len, iqr, n90
	and l50 are the string 'Too few sequences to calculate'.
	'''
	Ns = lengths.pop()
	GCs = lengths.pop() # extract saved GC count

	total_seq = len(lengths) # Total number of sequences
	len_sum = sum(lengths) # Total number of residues
	total_Mb = len_sum/1000000.00 # Total number of residues expressed in Megabases
	# GC content as a percent of total residues; 0.0 when there are no residues at all
	GC_content = (float(GCs)/float(len_sum))*100 if len_sum else 0.0

	seqs_1000 = sum(1 for seq_len in lengths if seq_len > 1000)
	seqs_5000 = sum(1 for seq_len in lengths if seq_len > 5000)

	if total_seq < 4:
		too_few = 'Too few sequences to calculate'
		return (total_seq, total_Mb, too_few, too_few, too_few, GC_content, too_few, seqs_1000, seqs_5000, Ns, too_few)

	# Median and quartiles are taken as the upper-middle element of the whole list and of
	# its lower / upper halves (the middle element is left out of both halves when the
	# count is odd). The halves are never empty here because total_seq >= 4.
	half = total_seq // 2
	median_len = lengths[half]
	lower_half = lengths[:half]
	upper_half = lengths[total_seq - half:]
	q1 = lower_half[len(lower_half) // 2]
	q3 = upper_half[len(upper_half) // 2]
	iqr = int(q3 - q1)

	# N50 / N90: walking from the longest sequence down, the length at which the running
	# total first covers 50% / 90% of all residues. L50 is how many sequences that took.
	# Integer comparisons avoid floating point rounding at the thresholds.
	n50 = None
	n90 = None
	l50 = None
	covered = 0

	for rank, seq_len in enumerate(reversed(lengths), 1):

		covered += seq_len

		if n50 is None and 2 * covered >= len_sum:
			n50 = seq_len
			l50 = rank
		if 10 * covered >= 9 * len_sum:
			# 90% is always reached after 50%, so nothing remains to be found
			n90 = seq_len
			break

	return (total_seq, total_Mb, n50, median_len, iqr, GC_content, n90, seqs_1000, seqs_5000, Ns, l50)


# Script body: validate the input path, read it with the reader matching its extension,
# then print the report. sys.exit(message) writes to stderr and exits with status 1.
if len(sys.argv) < 2:
	sys.exit('USAGE: fasta_stats.py seqFile')

seq_path = str(sys.argv[1])

if os.stat(seq_path).st_size == 0:
	sys.exit('ERROR: Empty input file.')

fasta_suffix = ['fasta', 'fa', 'fna', 'faa', 'ffn', 'frn']
fastq_suffix = ['fastq', 'fq']
# Lower-cased so that names such as 'reads.FASTQ' are recognised too
file_suffix = seq_path.split('.')[-1].lower()

if file_suffix in fasta_suffix:
	file_type = 'Fasta'
	seq_lengths = readFasta(open(seq_path, 'r'))

elif file_suffix in fastq_suffix:
	file_type = 'Fastq'
	seq_lengths = readFastq(open(seq_path, 'r'))

else:
	sys.exit('ERROR: Invalid file format provided.')

# Both readers return the sequence lengths followed by two counters, so two or fewer
# items (or lengths that sum to zero) means there is nothing to compute statistics on
if len(seq_lengths) <= 2 or sum(seq_lengths[:-2]) == 0:
	sys.exit('ERROR: No sequence data found in input file.')

# calcStats strips the two trailing counters from seq_lengths in place, which is why
# seq_lengths[0] and seq_lengths[-1] below are the shortest and longest sequence
stat_list = calcStats(seq_lengths)

output_string = """# Input file name: {filename}
# File type: {filetype}
# Total sequences: {total_seq}
# Total bases: {total_mb} Mb
# Sequence N50: {n50}
# Sequence L50: {l50}
# Sequence N90: {n90}
# Median sequence length: {median_len}
# Interquartile range: {iqr}
# Shortest sequence length: {short}
# Longest sequence length: {long}
# Sequences > 1 kb: {seqs_1000}
# Sequences > 5 kb: {seqs_5000}
# G-C content: {gc}%
# Ns included: {ns}
""".format(
	filename = os.path.basename(seq_path),
	filetype = file_type,
	total_seq = stat_list[0],
	total_mb = "%.2f" % stat_list[1],
	n50 = stat_list[2],
	median_len = stat_list[3],
	iqr = stat_list[4],
	short = seq_lengths[0],
	long = seq_lengths[-1],
	gc = "%.2f" % stat_list[5],
	n90 = stat_list[6],
	seqs_1000 = stat_list[7],
	seqs_5000 = stat_list[8],
	ns = stat_list[9],
	l50 = stat_list[10])

print(output_string)

Output is printed to the terminal and appears like this:

# Input file name: example.fasta
# File type: Fasta
# Total sequences: 2362551
# Total bases: 1462.60 Mb
# Sequence N50: 682
# Sequence L50: 2575
# Sequence N90: 394
# Median sequence length: 532
# Interquartile range: 331
# Shortest sequence length: 251
# Longest sequence length: 190893
# Sequences > 1 kb: 260467
# Sequences > 5 kb: 632
# G-C content: 42.76%
# Ns included: 0

Pretty straightforward, but I haven’t found any other scripts that report all of these stats together. It makes read QC and evaluating assemblies a lot more convenient. So if you like it, give it a try! I’ve also made a GitHub repo for the scripts I use in this blog and linked it on the resources page.

Matt Jenior PhD

Computational Systems Biologist

Latest Updates

Adding Features to Subsampling Script

Checking Sequence File Metrics