I ran in to an instance that I needed to subsample separate forward and reverse read files equally. I ran in to an issue when I realized that python doesn’t duplicate variables during assignment. I figured out a workaround using deepcopy, but an even better solution was to use izip (a part of itertools). In this way I can loop through both reads and sample them simultaneously.

I am still running some tests, but here is the updated code:

#!/usr/bin/env python
'''USAGE: subsample_fasta.py file.fasta --size n --total n' --pair y/n
Randomly subsamples a fasta file to the given size, handles both single and paired-end read data
'''

# It's important to know the correct number of sequences per file for this script to run properly,
# please run seq_stats.py prior to running subsample_fasta.py

import sys
import random
import argparse
from itertools import izip


parser = argparse.ArgumentParser(description='Subsample reads from a fasta file.')
parser.add_argument('--reads', default='none', help='Single-end or interleaved paire-end read file')
parser.add_argument('--forward', default='none', help='Forward paired-end read.')
parser.add_argument('--reverse', default='none', help='Reverse paired-end read.')
parser.add_argument('--size', default=0, help='Number of sequences to subsample (# of pairs for paired-end)')
parser.add_argument('--total', default=0, help='Total number of sequences in the fasta file')
parser.add_argument('--paired', default='n', help='Indicates if the file(s) are paired-end reads')
args = parser.parse_args()

if int(args.size) == 0 or int(args.total) == 0:
	print('ERROR: File or subsample value are of size 0')
	sys.exit()
elif args.reads == 'none':
	if args.forward == 'none' or args.reverse == 'none':
		print('ERROR: Input file(s) not provided')
		sys.exit()
	
	
# Check if the subsample is that same or larger than the total sequence count, and then
# generates a random list of positions to pick from fasta file
print 'Creating subsampling distribution...'

if args.paired == 'n':
	sample_list = range(1, int(args.total) + 1)
	if int(args.size) >= int(args.total):
		sample_list = set(sample_list)
		print 'Subsample size is greater than or equal to the total number of sequences.  Using all sequences.'
	else:
		sample_list = set(sorted(random.sample(sample_list, int(args.size))))
elif args.paired == 'y' and args.forward == 'none' and args.reverse == 'none':
	sample_list = range(1, int(args.total) + 1, 2)
	if int(args.size) >= int(args.total):
		sample_list = set(sample_list)
		print 'Subsample size is greater than or equal to the total number of sequences.  Using all sequences.'
	else:
		sample_list_forward = sorted(random.sample(sample_list, int(args.size)))
		sample_list_reverse = []

		# These steps add the pair to each of the 
		for index in sample_list_forward: sample_list_reverse.append(index + 1)
		sample_list = set(sorted(sample_list_forward + sample_list_reverse))
elif args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
	sample_list = range(1, int(args.total) + 1)
	if int(args.size) >= int(args.total):
		sample_list = set(sample_list)
		print 'Subsample size is greater than or equal to the total number of sequences.  Using all sequences.'
	else:
		sample_list = set(sorted(random.sample(sample_list, int(args.size))))		
elif args.paired not in ['y', 'n']:
	print('ERROR: Invalid input file type')
	sys.exit()
	

# Label the input as pe or se
if args.paired == 'y':
	file_type = 'Paired-end'
else:
	file_type = 'Single-end'

if args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
	# Name and open output file
	outfile_str1 = str(args.forward).rstrip('fasta') + 'pick.fasta' 
	outfile1 = open(outfile_str1, 'w')
	outfile_str2 = str(args.reverse).rstrip('fasta') + 'pick.fasta' 
	outfile2 = open(outfile_str2, 'w')

	# Create logfile of subsampling
	logfile_str = str(args.reads).rstrip('fasta') + 'pick.log' 
	logfile = open(logfile_str, 'w')

	log_str = '''Input file names : {infile1} {infile2}
Input file types:  {type}
Output file names: {outfile1} {outfile2}
Total sequences: {total}
Subsample size: {size}
'''.format(infile1=str(args.forward), infile2=str(args.reverse), type=file_type, outfile1=outfile_str1, outfile2=outfile_str2, total=str(args.total), size=str(int(args.size)))
	logfile.write(log_str)
	logfile.close()
else:
	# Name and open output file
	outfile_str = str(args.reads).rstrip('fasta') + 'pick.fasta' 
	outfile = open(outfile_str, 'w')

	# Create logfile of subsampling
	logfile_str = str(args.reads).rstrip('fasta') + 'pick.log' 
	logfile = open(logfile_str, 'w')
	log_str = '''Input file name: {infile}
Input file type:  {type}
Output file name: {outfile}
Total sequences: {total}
Subsample size: {size}
'''.format(infile=str(args.reads), type=file_type, outfile=outfile_str, total=str(args.total), size=str(int(args.size)))
	logfile.write(log_str)
	logfile.close()

# Open and begin looping through fasta file and picking sequences
print 'Writing subsampled fasta file...'

if args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
	
	seq_count = 0
	include = 0
	iteration = 0
	
	for line_1, line_2 in izip(open(args.forward, 'r'), open(args.reverse, 'r')):
	
		if line_1[0] == '>':
		
			include = 0
			seq_count += 1
			iteration += 1
		
			if iteration == 10000:
				iteration = 0
				progress_str = str(seq_count) + ' of ' + str(args.total)
				print(progress_str)

			if seq_count in sample_list:
				sample_list.discard(seq_count)
				outfile1.write(line_1)
				outfile2.write(line_2)
				include = 1
				continue
	
		elif line_1[0] != '>' and include == 1: 
				outfile1.write(line_1)
				outfile2.write(line_2)
				continue
					
	outfile1.close()
	outfile2.close()
	

else:
	with open(args.reads, 'r') as infile:
		
		seq_count = 0
		include = 0
		iteration = 0
	
		for line in infile:
				
			if line[0] == '>':
			
				include = 0
				seq_count += 1
				iteration += 1
			
				if iteration == 10000:
					iteration = 0
					progress_str = str(seq_count) + ' of ' + str(args.total)
					print(progress_str)

				if seq_count in sample_list:
					sample_list.discard(seq_count)
					outfile.write(line)
					include = 1
					continue
		
			elif line[0] != '>' and include == 1: 
					outfile.write(line)
					continue
						
	outfile.close()			


print 'Done.'

Hopefully, the alignments I’m using the subsampled reads for are done soon so I can finally talk about something else in the next post.

Review Now Online!

We were recently asked to contribute a review to Current Opinion in Microbiology on computational approaches that have been used to under...… Continue reading

UVA Postdoc Symposium

Published on November 23, 2021

Targeting K. pneumoniae Metabolism

Published on November 02, 2021