|
import requests |
|
from bs4 import BeautifulSoup as bs |
|
import re |
|
import csv |
|
|
|
# Stack Overflow tags to scrape; one CSV dataset file is produced per tag.
TAGS=["python"]

# Upper bound on listing pages fetched per tag (each page holds 50 questions).
MAX_PAGES_PER_TAGS = 20 #int

# Output files are named FILENAME_PREFIX + <tag> + ".csv".
FILENAME_PREFIX = "Dataset_"
|
|
|
# def writeToCSV(fileName): |
|
# global FILENAME_PREFIX |
|
# with open(FILENAME_PREFIX + fileName+'.csv', 'wb') as csvfile: |
|
# spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) |
|
# spamwriter.writerow(['Spam'] * 5 + ['Baked Beans']) |
|
# spamwriter.writerow(['Spam', 'Lovely Spam', 'Wonderful Spam']) |
|
|
|
# Module-level stop signal: extractUrl() sets this to False when a page
# yields no question summaries (i.e. pagination for the tag is exhausted).
flag = True
|
def get_data(url):
    """Fetch *url* over HTTP and return the raw response body (bytes).

    A timeout is set because ``requests.get`` has no default timeout:
    without one, a stalled server would hang the scraper indefinitely.
    """
    data = requests.get(url, timeout=30)

    return data.content
|
|
|
|
|
def extractUrl(url, csvWriter):
    """Scrape one question-listing page and write one CSV row per question.

    Each row is: [post time, author reputation, author name, vote count,
    accepted-answer count, view count].  The literal string "None" fills
    any field that could not be extracted, so every row has six columns.
    Sets the module-level ``flag`` to False when the page contains no
    question summaries (we ran past the last page for this tag).
    """
    global flag

    soup = bs(get_data(url), "lxml")
    summaries = soup.findAll("div", {"class":"question-summary"})
    if not summaries:
        # Empty page: tell the driver loop to stop paginating this tag.
        flag = False
        return

    for question in summaries:
        row = []
        # Hoist the two sub-trees every field lives under, instead of
        # re-running the same .find() for each column.
        summary = question.find("div", {"class":"summary"})
        stats = question.find("div", {"class":"statscontainer"})

        ## Time posted (exact timestamp lives in the span's title attribute)
        try:
            row.append(summary.find("span", {"class":"relativetime"})["title"].encode("utf-8"))
        except Exception:
            row.append("None")

        ## Rating/Reputation
        try:
            details = summary.findAll("div", {"class":"user-details"})
            if len(details) == 1:
                user = summary.find("div", {"class":"user-details"})
                # Abbreviated scores (e.g. "10.5k") carry the exact value in
                # the title attribute; prefer that over the display text.
                short_rep = user.find("span", {"class":"reputation-score", "title":re.compile(r"reputation(\s)score(\s?)")})
                exact_rep = user.find("span", {"class":"reputation-score", "title":re.compile(r"reputation(\s)score(\s){1}(.+)")})
                if exact_rep:
                    row.append(exact_rep["title"][len("reputation score "):])
                elif short_rep:
                    row.append(short_rep.string.encode("utf-8"))
                else:
                    row.append("None")
            elif len(details) == 2:
                # Two user-details blocks: question edited by a second user;
                # don't guess which reputation applies.
                row.append("None")
            else:
                # Previously nothing was appended here, silently shifting
                # every later column of this row; keep the row aligned.
                row.append("None")
        except Exception:
            row.append("None")

        ## Username and link
        try:
            row.append(summary.find("div", {"class":"user-details"}).find("a").string.encode("utf-8"))
        except Exception:
            # No anchor: anonymous/deleted author or a community-wiki post.
            info = summary.find("div", {"class":re.compile(r"user-info(\s?)")}).find("div", {"class":"user-details"})
            if info.find("span", {"class":"community-wiki"}):
                row.append("community-wiki")
            else:
                row.append(info.get_text().strip().encode("utf-8"))

        ## Votes
        try:
            row.append(stats.find("span", {"class": re.compile(r"vote-count-post(\s*)")}).string.encode("utf-8"))
        except Exception:
            print("Error")
            row.append("None")

        ## Answers accepted
        try:
            row.append(stats.find("div", {"class":"status"}).find("strong").string.encode("utf-8"))
        except Exception:
            print("Error")
            row.append("None")

        ## Views — title attribute reads "<count> views"; drop the trailing
        ## " views" (6 characters) to keep just the number.
        try:
            row.append(stats.find("div", {"class":re.compile(r"(views(\s))")})["title"].encode("utf-8")[:-6])
        except Exception:
            print("Error")
            row.append("None")

        print(row)
        csvWriter.writerow(row)
|
|
|
|
|
# Driver: fetch up to MAX_PAGES_PER_TAGS listing pages for each tag and
# stream the extracted rows into one CSV file per tag.
for tag_i in TAGS:
    # Reset the stop signal per tag.  Previously, once one tag exhausted
    # its pages (flag -> False), every subsequent tag was silently skipped
    # because flag was never set back to True.
    flag = True

    # 'wb' is kept deliberately: the rows contain utf-8 encoded byte
    # strings (Python 2 style); the with-block guarantees the file is
    # closed even if a request or parse step raises.
    with open(FILENAME_PREFIX + tag_i + '.csv', 'wb') as csvfile:
        csv_writer = csv.writer(csvfile, lineterminator="\n")
        pageNo = 1
        while flag and pageNo <= MAX_PAGES_PER_TAGS:
            url = "https://stackoverflow.com/questions/tagged/"+tag_i.lower()+"?page="+str(pageNo)+"&sort=frequent&pageSize=50"
            pageNo += 1
            extractUrl(url, csv_writer)