JohnLonginotto · August 30, 2016 22:59
diff --git a/gistfile1.py b/gistfile1.py

 import sys


 def collect_read_ids(handle, prefix="@HWI"):
 	"""Return the set of read-ID (header) lines found in a FASTQ stream.

 	handle: any iterable of text lines (an open file works).
 	prefix: text every header line starts with; "@HWI" matches the
 		instrument name used in the reads this script was written for.

 	Lines are stored unmodified (trailing newline included) so they can be
 	compared directly against the lines of another FASTQ file.

 	A set is used rather than a list because membership tests on a set are
 	hash-based and take roughly constant time, whereas a list has to be
 	scanned item by item. A set also silently drops duplicates. If memory
 	ever became a problem, a Bloom filter would be a more compact (but
 	probabilistic) alternative.
 	"""
 	# This is the simplest approach. A quicker one would be to take every
 	# 4th line once the first header has been found, which avoids testing
 	# the prefix on every single line.
 	return set(line for line in handle if line.startswith(prefix))


 def select_records(handle, wanted, lines_per_record=4):
 	"""Yield every line of each record whose header line is in `wanted`.

 	handle: any iterable of text lines, starting on a record boundary.
 	wanted: container of header lines to keep (ideally a set).
 	lines_per_record: number of lines per record; 4 for standard FASTQ.

 	Only the first line of each record is looked up, so a quality line that
 	happens to look like a header is never mistaken for one. The complete
 	record is yielded, not just its header, so the result is valid FASTQ.
 	"""
 	keep = False
 	for number, line in enumerate(handle):
 		if number % lines_per_record == 0:
 			# First line of a record: decide once for the whole record.
 			keep = line in wanted
 		if keep:
 			yield line


 def main(argv=None):
 	"""Filter the index FASTQ down to the reads present in the reads FASTQ.

 	argv: argument list in sys.argv layout (program name, reads path,
 		index path); defaults to sys.argv. Extra arguments are ignored.

 	Writes the matching records to "filtered_index.fastq" in the current
 	directory. Exits with a usage message if a path is missing.
 	"""
 	argv = sys.argv if argv is None else argv
 	if len(argv) < 3:
 		sys.exit("usage: python gistfile1.py <reads.fastq> <index.fastq>")
 	reads = argv[1]
 	index = argv[2]

 	# Get all read ids:
 	with open(reads, "r") as f:
 		all_readIDs = collect_read_ids(f)

 	if not all_readIDs:
 		print("List is Empty")
 	else:
 		# It's always a good idea to look at a few items in the set too,
 		# just to be sure:
 		for idx, item in enumerate(all_readIDs):
 			# The stored IDs keep their newline; strip it so the preview
 			# is not double-spaced.
 			print(item.rstrip("\r\n"))
 			if idx == 10:
 				break

 	# Opening both files in one "with" saves a level of indentation.
 	with open(index, "r") as f2, open("filtered_index.fastq", "w") as f3:
 		f3.writelines(select_records(f2, all_readIDs))

 	print("All done!")


 if __name__ == "__main__":
 	main()

	import sys


	def collect_read_ids(handle, prefix="@HWI"):
		"""Return the set of read-ID (header) lines found in a FASTQ stream.

		handle: any iterable of text lines (an open file works).
		prefix: text every header line starts with; "@HWI" matches the
			instrument name used in the reads this script was written for.

		Lines are stored unmodified (trailing newline included) so they can be
		compared directly against the lines of another FASTQ file.

		A set is used rather than a list because membership tests on a set are
		hash-based and take roughly constant time, whereas a list has to be
		scanned item by item. A set also silently drops duplicates. If memory
		ever became a problem, a Bloom filter would be a more compact (but
		probabilistic) alternative.
		"""
		# This is the simplest approach. A quicker one would be to take every
		# 4th line once the first header has been found, which avoids testing
		# the prefix on every single line.
		return set(line for line in handle if line.startswith(prefix))


	def select_records(handle, wanted, lines_per_record=4):
		"""Yield every line of each record whose header line is in `wanted`.

		handle: any iterable of text lines, starting on a record boundary.
		wanted: container of header lines to keep (ideally a set).
		lines_per_record: number of lines per record; 4 for standard FASTQ.

		Only the first line of each record is looked up, so a quality line that
		happens to look like a header is never mistaken for one. The complete
		record is yielded, not just its header, so the result is valid FASTQ.
		"""
		keep = False
		for number, line in enumerate(handle):
			if number % lines_per_record == 0:
				# First line of a record: decide once for the whole record.
				keep = line in wanted
			if keep:
				yield line


	def main(argv=None):
		"""Filter the index FASTQ down to the reads present in the reads FASTQ.

		argv: argument list in sys.argv layout (program name, reads path,
			index path); defaults to sys.argv. Extra arguments are ignored.

		Writes the matching records to "filtered_index.fastq" in the current
		directory. Exits with a usage message if a path is missing.
		"""
		argv = sys.argv if argv is None else argv
		if len(argv) < 3:
			sys.exit("usage: python gistfile1.py <reads.fastq> <index.fastq>")
		reads = argv[1]
		index = argv[2]

		# Get all read ids:
		with open(reads, "r") as f:
			all_readIDs = collect_read_ids(f)

		if not all_readIDs:
			print("List is Empty")
		else:
			# It's always a good idea to look at a few items in the set too,
			# just to be sure:
			for idx, item in enumerate(all_readIDs):
				# The stored IDs keep their newline; strip it so the preview
				# is not double-spaced.
				print(item.rstrip("\r\n"))
				if idx == 10:
					break

		# Opening both files in one "with" saves a level of indentation.
		with open(index, "r") as f2, open("filtered_index.fastq", "w") as f3:
			f3.writelines(select_records(f2, all_readIDs))

		print("All done!")


	if __name__ == "__main__":
		main()