Adding Features to Subsampling Script

I ran in to an instance that I needed to subsample separate forward and reverse read files equally. I ran in to an issue when I realized that python doesn’t duplicate variables during assignment. I figured out a workaround using deepcopy, but an even better solution was to use izip (a part of itertools). In this way I can loop through both reads and sample them simultaneously.

I am still running some tests, but here is the updated code:

#!/usr/bin/env python
'''USAGE: subsample_fasta.py file.fasta --size n --total n' --pair y/n
Randomly subsamples a fasta file to the given size, handles both single and paired-end read data
'''

# It's important to know the correct number of sequences per file for this script to run properly,
# please run seq_stats.py prior to running subsample_fasta.py

import sys
import random
import argparse
from itertools import izip


parser = argparse.ArgumentParser(description='Subsample reads from a fasta file.')
parser.add_argument('--reads', default='none', help='Single-end or interleaved paire-end read file')
parser.add_argument('--forward', default='none', help='Forward paired-end read.')
parser.add_argument('--reverse', default='none', help='Reverse paired-end read.')
parser.add_argument('--size', default=0, help='Number of sequences to subsample (# of pairs for paired-end)')
parser.add_argument('--total', default=0, help='Total number of sequences in the fasta file')
parser.add_argument('--paired', default='n', help='Indicates if the file(s) are paired-end reads')
args = parser.parse_args()

if int(args.size) == 0 or int(args.total) == 0:
	print('ERROR: File or subsample value are of size 0')
	sys.exit()
elif args.reads == 'none':
	if args.forward == 'none' or args.reverse == 'none':
		print('ERROR: Input file(s) not provided')
		sys.exit()
	
	
# Check if the subsample is that same or larger than the total sequence count, and then
# generates a random list of positions to pick from fasta file
print 'Creating subsampling distribution...'

if args.paired == 'n':
	sample_list = range(1, int(args.total) + 1)
	if int(args.size) >= int(args.total):
		sample_list = set(sample_list)
		print 'Subsample size is greater than or equal to the total number of sequences.  Using all sequences.'
	else:
		sample_list = set(sorted(random.sample(sample_list, int(args.size))))
elif args.paired == 'y' and args.forward == 'none' and args.reverse == 'none':
	sample_list = range(1, int(args.total) + 1, 2)
	if int(args.size) >= int(args.total):
		sample_list = set(sample_list)
		print 'Subsample size is greater than or equal to the total number of sequences.  Using all sequences.'
	else:
		sample_list_forward = sorted(random.sample(sample_list, int(args.size)))
		sample_list_reverse = []

		# These steps add the pair to each of the 
		for index in sample_list_forward: sample_list_reverse.append(index + 1)
		sample_list = set(sorted(sample_list_forward + sample_list_reverse))
elif args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
	sample_list = range(1, int(args.total) + 1)
	if int(args.size) >= int(args.total):
		sample_list = set(sample_list)
		print 'Subsample size is greater than or equal to the total number of sequences.  Using all sequences.'
	else:
		sample_list = set(sorted(random.sample(sample_list, int(args.size))))		
elif args.paired not in ['y', 'n']:
	print('ERROR: Invalid input file type')
	sys.exit()
	

# Label the input as pe or se
if args.paired == 'y':
	file_type = 'Paired-end'
else:
	file_type = 'Single-end'

if args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
	# Name and open output file
	outfile_str1 = str(args.forward).rstrip('fasta') + 'pick.fasta' 
	outfile1 = open(outfile_str1, 'w')
	outfile_str2 = str(args.reverse).rstrip('fasta') + 'pick.fasta' 
	outfile2 = open(outfile_str2, 'w')

	# Create logfile of subsampling
	logfile_str = str(args.reads).rstrip('fasta') + 'pick.log' 
	logfile = open(logfile_str, 'w')

	log_str = '''Input file names : {infile1} {infile2}
Input file types:  {type}
Output file names: {outfile1} {outfile2}
Total sequences: {total}
Subsample size: {size}
'''.format(infile1=str(args.forward), infile2=str(args.reverse), type=file_type, outfile1=outfile_str1, outfile2=outfile_str2, total=str(args.total), size=str(int(args.size)))
	logfile.write(log_str)
	logfile.close()
else:
	# Name and open output file
	outfile_str = str(args.reads).rstrip('fasta') + 'pick.fasta' 
	outfile = open(outfile_str, 'w')

	# Create logfile of subsampling
	logfile_str = str(args.reads).rstrip('fasta') + 'pick.log' 
	logfile = open(logfile_str, 'w')
	log_str = '''Input file name: {infile}
Input file type:  {type}
Output file name: {outfile}
Total sequences: {total}
Subsample size: {size}
'''.format(infile=str(args.reads), type=file_type, outfile=outfile_str, total=str(args.total), size=str(int(args.size)))
	logfile.write(log_str)
	logfile.close()

# Open and begin looping through fasta file and picking sequences
print 'Writing subsampled fasta file...'

if args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
	
	seq_count = 0
	include = 0
	iteration = 0
	
	for line_1, line_2 in izip(open(args.forward, 'r'), open(args.reverse, 'r')):
	
		if line_1[0] == '>':
		
			include = 0
			seq_count += 1
			iteration += 1
		
			if iteration == 10000:
				iteration = 0
				progress_str = str(seq_count) + ' of ' + str(args.total)
				print(progress_str)

			if seq_count in sample_list:
				sample_list.discard(seq_count)
				outfile1.write(line_1)
				outfile2.write(line_2)
				include = 1
				continue
	
		elif line_1[0] != '>' and include == 1: 
				outfile1.write(line_1)
				outfile2.write(line_2)
				continue
					
	outfile1.close()
	outfile2.close()
	

else:
	with open(args.reads, 'r') as infile:
		
		seq_count = 0
		include = 0
		iteration = 0
	
		for line in infile:
				
			if line[0] == '>':
			
				include = 0
				seq_count += 1
				iteration += 1
			
				if iteration == 10000:
					iteration = 0
					progress_str = str(seq_count) + ' of ' + str(args.total)
					print(progress_str)

				if seq_count in sample_list:
					sample_list.discard(seq_count)
					outfile.write(line)
					include = 1
					continue
		
			elif line[0] != '>' and include == 1: 
					outfile.write(line)
					continue
						
	outfile.close()			


print 'Done.'

Hopefully, the alignments I’m using the subsampled reads for are done soon so I can finally talk about something else in the next post.

Adding Features to Subsampling Script

January 04, 2016

Review Now Online!

UVA Postdoc Symposium

Targeting K. pneumoniae Metabolism