Last active
April 29, 2017 12:48
-
-
Save arafsheikh/080c662f188fce5041faef8ba7c3ee9e to your computer and use it in GitHub Desktop.
Realtime Ideone scraper: Outputs all links in realtime which match the given STDIN
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import queue
import signal
import threading
import time
import urllib
import urllib.request

from bs4 import BeautifulSoup
# Page that is polled for recently submitted codes.
URL = "http://ideone.com/recent"
# Once this many processed IDs are stored, cleanup() trims the list.
MAX_PROCESSED = 200 # Max number of IDs to store before cleanup()
# Interval, in seconds, passed to threading.Timer between fetches of URL.
DELAY = 20 # Sleep between fetching recent codes webpage to reduce load
class SetQueue(queue.Queue):
    """
    A queue with only unique items.

    Items are kept in a set, so putting an item that is already queued is a
    no-op and items come back out in arbitrary (not FIFO) order.

    The storage is created through the ``_init`` hook instead of overriding
    ``__init__``, so ``queue.Queue.__init__`` still runs and sets up the
    lock and condition variables.  That makes the public, locking API
    (``put``/``get``/``qsize``/``empty``) usable in addition to the raw
    underscore hooks.
    """

    def _init(self, maxsize):
        # Called by queue.Queue.__init__ to create the backing container.
        self.queue = set()

    def _qsize(self):
        # Hook used by qsize()/empty()/full() and by the blocking logic.
        return len(self.queue)

    def _put(self, item):
        # Adding to a set silently ignores duplicates.
        self.queue.add(item)

    def _get(self):
        # set.pop() removes and returns an arbitrary element.
        return self.queue.pop()

    def _empty(self):
        """Return True when no items are queued (no locking is performed)."""
        return not self.queue

    def _len(self):
        """Return the number of queued items (no locking is performed)."""
        return len(self.queue)
def fetchRecents():
    """
    Fetch the recent-codes page and queue every ID not yet processed.

    The function re-arms itself with a ``threading.Timer`` so it runs again
    every ``DELAY`` seconds.  IDs already present in ``processed`` are
    skipped; the rest are added to the global queue ``q``.
    """
    # Re-arm first so that a failure below does not stop future polling.
    threading.Timer(DELAY, fetchRecents).start()

    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(URL) as response:
            soup = BeautifulSoup(response.read(), "lxml")
    except OSError as err:
        # URLError/HTTPError and socket errors are all OSError subclasses.
        # Skip this round; the timer above will retry.
        print("Could not fetch " + URL + ": " + str(err))
        return

    # NOTE(review): the class filter has a leading space (" header");
    # confirm it matches the page markup.
    for source in soup.find_all("div", " header"):
        link = source.find("a")
        if link is None:
            # No anchor in this entry; nothing to extract.
            continue
        # Drop the first character of the link text (presumably a "#"
        # prefix in front of the ID -- verify against the page).
        _id = link.text[1:]
        if _id not in processed:
            q._put(_id)
def process(match_input):
    """
    Drain the queue, checking the stdin of every queued submission.

    match_input: text to look for, already lowercased and stripped of
        newlines and spaces.  When it is empty, every stdin that is found
        is printed instead, followed by a separator line.

    Each ID whose page was fetched is appended to ``processed``.  Pages
    that cannot be fetched are reported and left unrecorded.  Pages that
    lack the expected stdin markup are recorded without a comparison.
    """
    handled = 0
    while not q._empty():
        _id = q._get()
        handled += 1
        page_url = "http://ideone.com/" + _id

        try:
            # The context manager closes the HTTP response after reading.
            with urllib.request.urlopen(page_url) as response:
                soup = BeautifulSoup(response.read(), "lxml")
        except OSError as err:
            # URLError/HTTPError and socket errors are OSError subclasses.
            # The ID is not recorded in `processed`, so fetchRecents() may
            # queue it again later.
            print("Could not fetch " + page_url + ": " + str(err))
            continue

        # The stdin text is read from the third <div> inside #view_stdin.
        # Guard against pages where that structure is absent.
        stdin_box = soup.find("div", id="view_stdin")
        stdin_parts = stdin_box.find_all("div") if stdin_box is not None else []
        if len(stdin_parts) > 2:
            stdin = stdin_parts[2].text
            if match_input:
                # Normalize the same way the caller normalized match_input.
                stdin = stdin.lower().replace("\n", "").replace(" ", "")
                if stdin == match_input:
                    print("Result found at: http://ideone.com/" + _id)
            else:
                print(stdin)
                print("-" * 30)

        processed.append(_id)

    # Only announce completion when something was actually taken from the
    # queue, so repeated calls on an empty queue stay silent.
    if handled:
        print("Done processing the queue, waiting for the next batch of IDs")
def cleanup():
    """Trim ``processed`` to its 50 newest IDs once it reaches MAX_PROCESSED."""
    if len(processed) < MAX_PROCESSED:
        return
    # Keep the most recent 50 entries so IDs that are still listed on the
    # recent page are not mistaken for unprocessed ones.
    processed[:] = processed[-50:]
def print_stats_and_exit(signal, frame):
    """
    Signal handler: report progress counters and terminate immediately.

    signal, frame: the arguments the interpreter passes to a signal
        handler; neither is used here.
    """
    done_count = len(processed)
    print("\nTotal records processed: ", done_count)
    pending_count = q._len()
    print("Records yet to process: ", pending_count)
    # os._exit ends the process at once, without waiting for timer threads.
    os._exit(0)
# State used by fetchRecents(), process() and cleanup():
# IDs already handled, and IDs waiting to be handled.
processed = []
q = SetQueue()

# On SIGINT (Ctrl-C), print the counters and exit.
signal.signal(signal.SIGINT, print_stats_and_exit)
if __name__ == "__main__":
    # Seconds to wait between polls of the queue so the main loop does not
    # spin at full CPU while waiting for the next batch of IDs.
    POLL_INTERVAL = 1

    print("Enter stdin to match (Ctrl-Y to end input): ")
    lines = []
    try:
        # Read lines until one consists solely of Ctrl-Y ("\x19").
        for line in iter(input, "\x19"):
            lines.append(line)
    except EOFError:
        # End of input (e.g. Ctrl-D or piped stdin) also ends the prompt
        # instead of aborting the program.
        pass

    # Normalize: lowercase, and drop newlines and spaces, so the comparison
    # in process() ignores case and spacing.
    match_input = "\n".join(lines)
    match_input = match_input.lower().replace("\n", "").replace(" ", "")
    if not match_input:
        print("No input to match. Program will run in debug mode.\n")

    # Fetch the first batch now; fetchRecents() re-schedules itself.
    fetchRecents()
    while True:
        process(match_input)
        cleanup()
        time.sleep(POLL_INTERVAL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment