I ran in to an instance that I needed to subsample separate forward and reverse read files equally. I ran in to an issue when I realized that python doesn’t duplicate variables during assignment. I figured out a workaround using deepcopy, but an even better solution was to use izip (a part of itertools). In this way I can loop through both reads and sample them simultaneously.
I am still running some tests, but here is the updated code:
#!/usr/bin/env python
'''USAGE: subsample_fasta.py file.fasta --size n --total n' --pair y/n
Randomly subsamples a fasta file to the given size, handles both single and paired-end read data
'''
# It's important to know the correct number of sequences per file for this script to run properly,
# please run seq_stats.py prior to running subsample_fasta.py
import sys
import random
import argparse
from itertools import izip
parser = argparse.ArgumentParser(description='Subsample reads from a fasta file.')
parser.add_argument('--reads', default='none', help='Single-end or interleaved paire-end read file')
parser.add_argument('--forward', default='none', help='Forward paired-end read.')
parser.add_argument('--reverse', default='none', help='Reverse paired-end read.')
parser.add_argument('--size', default=0, help='Number of sequences to subsample (# of pairs for paired-end)')
parser.add_argument('--total', default=0, help='Total number of sequences in the fasta file')
parser.add_argument('--paired', default='n', help='Indicates if the file(s) are paired-end reads')
args = parser.parse_args()
if int(args.size) == 0 or int(args.total) == 0:
print('ERROR: File or subsample value are of size 0')
sys.exit()
elif args.reads == 'none':
if args.forward == 'none' or args.reverse == 'none':
print('ERROR: Input file(s) not provided')
sys.exit()
# Check if the subsample is that same or larger than the total sequence count, and then
# generates a random list of positions to pick from fasta file
print 'Creating subsampling distribution...'
if args.paired == 'n':
sample_list = range(1, int(args.total) + 1)
if int(args.size) >= int(args.total):
sample_list = set(sample_list)
print 'Subsample size is greater than or equal to the total number of sequences. Using all sequences.'
else:
sample_list = set(sorted(random.sample(sample_list, int(args.size))))
elif args.paired == 'y' and args.forward == 'none' and args.reverse == 'none':
sample_list = range(1, int(args.total) + 1, 2)
if int(args.size) >= int(args.total):
sample_list = set(sample_list)
print 'Subsample size is greater than or equal to the total number of sequences. Using all sequences.'
else:
sample_list_forward = sorted(random.sample(sample_list, int(args.size)))
sample_list_reverse = []
# These steps add the pair to each of the
for index in sample_list_forward: sample_list_reverse.append(index + 1)
sample_list = set(sorted(sample_list_forward + sample_list_reverse))
elif args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
sample_list = range(1, int(args.total) + 1)
if int(args.size) >= int(args.total):
sample_list = set(sample_list)
print 'Subsample size is greater than or equal to the total number of sequences. Using all sequences.'
else:
sample_list = set(sorted(random.sample(sample_list, int(args.size))))
elif args.paired not in ['y', 'n']:
print('ERROR: Invalid input file type')
sys.exit()
# Label the input as pe or se
if args.paired == 'y':
file_type = 'Paired-end'
else:
file_type = 'Single-end'
if args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
# Name and open output file
outfile_str1 = str(args.forward).rstrip('fasta') + 'pick.fasta'
outfile1 = open(outfile_str1, 'w')
outfile_str2 = str(args.reverse).rstrip('fasta') + 'pick.fasta'
outfile2 = open(outfile_str2, 'w')
# Create logfile of subsampling
logfile_str = str(args.reads).rstrip('fasta') + 'pick.log'
logfile = open(logfile_str, 'w')
log_str = '''Input file names : {infile1} {infile2}
Input file types: {type}
Output file names: {outfile1} {outfile2}
Total sequences: {total}
Subsample size: {size}
'''.format(infile1=str(args.forward), infile2=str(args.reverse), type=file_type, outfile1=outfile_str1, outfile2=outfile_str2, total=str(args.total), size=str(int(args.size)))
logfile.write(log_str)
logfile.close()
else:
# Name and open output file
outfile_str = str(args.reads).rstrip('fasta') + 'pick.fasta'
outfile = open(outfile_str, 'w')
# Create logfile of subsampling
logfile_str = str(args.reads).rstrip('fasta') + 'pick.log'
logfile = open(logfile_str, 'w')
log_str = '''Input file name: {infile}
Input file type: {type}
Output file name: {outfile}
Total sequences: {total}
Subsample size: {size}
'''.format(infile=str(args.reads), type=file_type, outfile=outfile_str, total=str(args.total), size=str(int(args.size)))
logfile.write(log_str)
logfile.close()
# Open and begin looping through fasta file and picking sequences
print 'Writing subsampled fasta file...'
if args.paired == 'y' and args.forward != 'none' and args.reverse != 'none':
seq_count = 0
include = 0
iteration = 0
for line_1, line_2 in izip(open(args.forward, 'r'), open(args.reverse, 'r')):
if line_1[0] == '>':
include = 0
seq_count += 1
iteration += 1
if iteration == 10000:
iteration = 0
progress_str = str(seq_count) + ' of ' + str(args.total)
print(progress_str)
if seq_count in sample_list:
sample_list.discard(seq_count)
outfile1.write(line_1)
outfile2.write(line_2)
include = 1
continue
elif line_1[0] != '>' and include == 1:
outfile1.write(line_1)
outfile2.write(line_2)
continue
outfile1.close()
outfile2.close()
else:
with open(args.reads, 'r') as infile:
seq_count = 0
include = 0
iteration = 0
for line in infile:
if line[0] == '>':
include = 0
seq_count += 1
iteration += 1
if iteration == 10000:
iteration = 0
progress_str = str(seq_count) + ' of ' + str(args.total)
print(progress_str)
if seq_count in sample_list:
sample_list.discard(seq_count)
outfile.write(line)
include = 1
continue
elif line[0] != '>' and include == 1:
outfile.write(line)
continue
outfile.close()
print 'Done.'
Hopefully, the alignments I’m using the subsampled reads for are done soon so I can finally talk about something else in the next post.