Created
August 11, 2023 17:43
-
-
Save lobstrio/9ca72017c01e27323007a9e9b37a2537 to your computer and use it in GitHub Desktop.
🧙 Scrape all topics from the famous French GrowthHacking.fr forum — 'scraping' category only!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
GrowthHacking.fr Forum Scraper

This script scrapes data from the GrowthHacking.fr forum, specifically from the "Scraping" category.
It retrieves information about forum topics and saves it to a CSV file.

Usage:
1. Install the required library using the following command:
   $ pip install requests
2. Run this script using the following command:
   $ python growthhackingfr_scraper.py

Note: Make sure you have Python installed on your system.

Author: Sasha Bouloudnine
Date: 11/08/2023
---
Required Library:
- requests: Used for making HTTP requests to the forum API.
""" | |
import requests | |
import time | |
import csv | |
import random | |
# Topic attributes copied from every forum topic; the same list, in the same
# order, is used as the CSV header.
FIELDNAMES = [
    # identity
    'id', 'title', 'fancy_title', 'slug',
    # counters
    'posts_count', 'reply_count', 'highest_post_number',
    'image_url',
    # activity
    'created_at', 'last_posted_at', 'bumped', 'bumped_at',
    'archetype',
    # per-topic flags
    'unseen', 'pinned', 'unpinned', 'visible', 'closed', 'archived',
    'bookmarked', 'liked',
    # engagement
    'views', 'like_count', 'has_summary',
    'last_poster_username',
    'category_id',
    'pinned_globally',
    'featured_link',
]
def scrap_growthhackingforum(start_page=0, timeout=30, max_pause=1.0):
    """Collect the topics of the GrowthHacking.fr "Scraping" category.

    Pages of the category's ``latest.json`` listing are requested one after
    another until a page comes back with an empty topic list.

    Args:
        start_page: Index of the first page to request. Discourse listings
            are 0-indexed, so the default of 0 includes the newest topics
            (starting at 1 would skip the first page).
        timeout: Seconds to wait for each HTTP response before giving up.
        max_pause: Upper bound, in seconds, of the random pause taken between
            two page requests. Use 0 to disable the pause.

    Returns:
        A list of dicts, one per topic, keyed by ``FIELDNAMES``. A field that
        is absent from a topic is stored as ``None``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On network failure or timeout.
        KeyError: If the response lacks ``topic_list`` / ``topics``.
    """
    url = 'https://www.growthhacking.fr/c/scraping/8/l/latest.json'

    # Headers copied from a browser request to the forum.
    # NOTE(review): the CSRF token comes from a captured session and is
    # presumably not needed for a read-only GET -- confirm before removing.
    headers = {
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'Discourse-Present': 'true',
        'X-CSRF-Token': 'GE3UrIV9vAoQodpEWcjAnl-zDWKL7XfLD4NrTqvZBiU4XqqFAf2s9-a3e0HFTh9c4Vsu_G9B5uHTAJbQZ4ymTw',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'Discourse-Logged-In': 'true',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.growthhacking.fr/c/scraping/8',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua-platform': '"macOS"',
    }

    DATA = []
    page = start_page

    # One session for the whole crawl: the connection is reused across pages
    # and released when the block exits, even on error.
    with requests.Session() as session:
        session.headers.update(headers)
        while True:
            print('> accessing page %s' % page)
            response = session.get(
                url,
                params={'ascending': 'false', 'page': page},
                timeout=timeout,
            )
            # Explicit status check: an `assert` would vanish under `python -O`.
            response.raise_for_status()

            topics = response.json()['topic_list']['topics']
            if not topics:
                # An empty page marks the end of the listing.
                break

            # .get() keeps the row even when a topic omits one of the fields.
            DATA.extend({k: t.get(k) for k in FIELDNAMES} for t in topics)
            page += 1

            if max_pause > 0:
                # Small random pause so the forum is not hit in a tight loop.
                time.sleep(random.uniform(0, max_pause))

    return DATA
def write_data(DATA, fieldnames=None, path=None):
    """Write topic rows to a CSV file.

    Args:
        DATA: Iterable of dicts, one per row, keyed by the column names.
        fieldnames: Column names, in output order. Defaults to the
            module-level ``FIELDNAMES``.
        path: Destination file. Defaults to
            ``results_growthhackingscraping_<timestamp>.csv`` in the current
            directory, where the timestamp is ``time.time()`` without its dot.

    Raises:
        ValueError: If a row contains a key that is not in ``fieldnames``
            (``csv.DictWriter`` default behaviour).
        OSError: If the file cannot be opened for writing.
    """
    if fieldnames is None:
        fieldnames = FIELDNAMES
    if path is None:
        TIMESTAMP = str(time.time()).replace('.', '')
        path = 'results_growthhackingscraping_%s.csv' % TIMESTAMP

    print('> writing data')
    # newline='' lets the csv module control line endings (otherwise Windows
    # gets a blank line after every row); utf-8 keeps accented titles intact
    # regardless of the platform's default encoding.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(DATA)
    print('done/cool')
if __name__ == '__main__':
    # Script entry point: scrape the category, then dump the rows to CSV.
    DATA = scrap_growthhackingforum()
    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would let an empty result produce a header-only file.
    if not DATA:
        raise SystemExit('no topics were scraped; nothing to write')
    write_data(DATA)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment