Skip to content

Instantly share code, notes, and snippets.

@markrittman
Created August 2, 2024 23:48
Show Gist options
  • Save markrittman/00d756f31485c75797010e24d0e8c703 to your computer and use it in GitHub Desktop.
Save markrittman/00d756f31485c75797010e24d0e8c703 to your computer and use it in GitHub Desktop.
Scrape Squarespace blog posts and save to CSV file
!pip install requests beautifulsoup4
import sys
import subprocess
import time
import random
# Install required packages
def install(package):
    """Install *package* with pip, run through the current Python interpreter.

    The command is executed with ``subprocess.check_call``, so a failing pip
    invocation surfaces as an exception instead of being ignored.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)
print("Installing required packages...")
# urllib3 is listed explicitly for better HTTPS support.
for required_package in ('requests', 'beautifulsoup4', 'urllib3'):
    install(required_package)
print("Packages installed successfully.")
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Set up a retry strategy
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a session whose HTTP and HTTPS adapters retry failed requests.

    Args:
        retries: Used for the total, read and connect retry limits.
        backoff_factor: Forwarded to ``Retry`` as its back-off factor.
        status_forcelist: HTTP status codes forwarded to ``Retry``.
        session: Session to configure; a new ``requests.Session`` is created
            when this is falsy.

    Returns:
        The configured session (the one passed in, or the new one).
    """
    if not session:
        session = requests.Session()

    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # The same adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session
def scrape_squarespace_blog(url, session):
    """Fetch one blog post page and extract its main fields.

    Args:
        url: Address of the blog post to download.
        session: Object providing ``get(url, timeout=...)``, e.g. the session
            returned by ``requests_retry_session``.

    Returns:
        A dict with the keys ``url``, ``title``, ``author``, ``date`` and
        ``content``. A field whose element is missing from the page holds a
        "... not found" placeholder string. Returns ``None`` (after printing
        the error) when the request fails with a ``RequestException``.
    """
    # Only the network call can raise RequestException, so keep the try
    # body limited to it; parsing errors are not silently masked.
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while scraping {url}: {e}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title
    title_element = soup.find('h1', class_='entry-title')
    title = title_element.text.strip() if title_element else "Title not found"

    # Extract the author
    author_element = soup.find('a', class_='blog-author-name')
    author = author_element.text.strip() if author_element else "Author not found"

    # Extract the date. Use .get() so a <time> tag without a datetime
    # attribute yields the placeholder instead of raising KeyError.
    date_element = soup.find('time', class_='dt-published')
    if date_element:
        date = date_element.get('datetime', 'Date not found')
    else:
        date = 'Date not found'

    # Extract the main content: paragraph and sub-heading text joined by spaces
    content_div = soup.find('div', class_='blog-item-content')
    if content_div:
        text_tags = content_div.find_all(['p', 'h2', 'h3', 'h4', 'h5', 'h6'])
        content = ' '.join(tag.text for tag in text_tags)
    else:
        content = "Content not found"

    return {
        'url': url,
        'title': title,
        'author': author,
        'date': date,
        'content': content,
    }
# List of URLs to scrape
# Every entry must be unique: each URL is fetched once and becomes one CSV row.
urls = [
    'https://www.rittmananalytics.com/blog/2023/2/18/kpi-dashboards-and-balanced-scorecards-using-looker-dbt-and-google-bigquery',
    'https://www.rittmananalytics.com/blog/2023/2/19/the-dbt-semantic-layer-data-orchestration-and-the-modern-enterprise-data-stack',
    'https://www.rittmananalytics.com/blog/2023/2/19/modern-data-stack-healthcheck-service-from-rittman-analytics',
    'https://www.rittmananalytics.com/blog/2023/3/1/behavioural-analytics-querying-fast-and-slow',
    'https://www.rittmananalytics.com/blog/2019/12/18/previewing-the-new-looker-dashboard-experience-and-whats-coming-in-looker-7',
    'https://www.rittmananalytics.com/blog/2019/01/21/2019-1-21-looker-london-meetup-on-thursday-february-7th-2019-registration-now-open',
    'https://www.rittmananalytics.com/blog/2018/11/27/2018-11-27-mjr-analytics-sessions-at-ukoug-tech18-liverpool-acc-3rd-5th-december-2018',
    'https://www.rittmananalytics.com/blog/2018/11/14/2018-11-14-slides-and-forbescom-article-from-data-warehouse-like-a-tech-startup-with-oracle-autonomous-data-warehouse-cloud',
    'https://www.rittmananalytics.com/blog/2018/11/12/event-level-digital-analytics-using-google-analytics-fivetran-bigquery-andnbsplooker',
    'https://www.rittmananalytics.com/blog/2018/10/28/nlnhb0kr2emevhbm2qee7jcjot1qtx',
    'https://www.rittmananalytics.com/blog/2023/10/29/coalesce-2023-and-the-new-dbt-cloud-cli',
    'https://www.rittmananalytics.com/blog/2023/9/7/bringing-dbt-and-analytics-engineering-to-oracle-autonomous-data-warehouse',
    'https://www.rittmananalytics.com/blog/2021/5/30/extending-the-dbt-test-pipeline-to-downstream-looker-content-using-spectacles-dbtcloud-andnbspgithub',
    'https://www.rittmananalytics.com/blog/2024/7/5/ai-powered-conversational-data-analyst-chatbot',
    'https://www.rittmananalytics.com/blog/2024/6/17/automated-insights-gemini-1-5-flash',
    'https://www.rittmananalytics.com/blog/2024/2/5/generative-ai-comes-to-looker-via-vertex-ai-and-bigquery-bqml',
    'https://www.rittmananalytics.com/blog/2024/1/14/automate-your-contacts-list-segmentation-using-google-bigquery-vertex-ai-and-the-mlgeneratetext-function',
    'https://www.rittmananalytics.com/blog/2023/5/11/ra-assistant-our-gpt-35-turbo-powered-modern-data-stack-chatbot',
    'https://www.rittmananalytics.com/blog/2023/3/26/chatgpt-large-language-models-and-the-future-of-dbt-and-analytics-consulting',
    'https://www.rittmananalytics.com/blog/2016/09/27/2016-09-27-drill-to-detail-ep-2-the-future-of-sql-on-hadoop-with-special-guest-dan-mcclary',
    'https://www.rittmananalytics.com/blog/2023/10/12/rittman-analytics-and-coalesce-2023-san-diego-well-be-there',
    'https://www.rittmananalytics.com/blog/2023/6/18/wednesday-webinar-series-how-rittman-analytics-builds-modern-data-stacks-using-cube-preset-and-dagster-wednesday-28th-june-2023',
    'https://www.rittmananalytics.com/blog/2023/5/23/upcoming-30-minute-webinar-how-rittman-analytics-automates-dbt-looker-and-cubedev-project-delivery-presented-by-lewis-baker',
    'https://www.rittmananalytics.com/blog/2023/2/20/cube-semantic-layer-webinar',
    'https://www.rittmananalytics.com/blog/2024/7/10/making-smart-buildings-smarter-for-facility-solutions-group-with-embeddable',
    'https://www.rittmananalytics.com/blog/2024/4/25/dynamic-data-model-definition-in-cube-using-python-and-jinja',
    'https://www.rittmananalytics.com/blog/2023/2/24/building-up-a-semantic-layer-with-dbt-metrics-cube-and-droughty',
    'https://www.rittmananalytics.com/blog/2022/12/21/customer-first-order-segmentation-using-looker-and-google-bigquery',
    'https://www.rittmananalytics.com/blog/2024/4/30/rittman-analytics-achieves-the-data-analytics-partner-specialization-in-google-cloud-partner-advantage',
    'https://www.rittmananalytics.com/blog/2024/3/17/data-lineage-for-your-google-bigquery-dbt-and-cloud-composer-data-pipelines-using-dataplex-and-data-catalog',
    'https://www.rittmananalytics.com/blog/2023/09/28/google-cloud-cortex-framework',
    'https://www.rittmananalytics.com/blog/2021/7/25/event-based-analytics-and-bigquery-export-comes-to-google-analytics-4-how-does-it-worknbsp-and-whats-thenbspcatch',
    'https://www.rittmananalytics.com/blog/2021/1/09/new-121-release-of-ra-data-warehouse-for-dbt-fivetran-bigquery-segment-and-now-snowflake-dw',
    'https://www.rittmananalytics.com/blog/2020/5/20/happy-10th-birthday-google-bigquery-our-preferred-cloud-data-warehousing-platform',
    'https://www.rittmananalytics.com/blog/2020/3/6/bigquery-ios-dbt-numerics',
    'https://www.rittmananalytics.com/blog/2019/4/14/supermetrics-google-bigquery-and-data-pipelines-for-digital-marketers',
    'https://www.rittmananalytics.com/blog/2016/11/19/2016-11-19-google-bigquery-and-why-big-data-is-about-to-have-its-gmail-moment',
    'https://www.rittmananalytics.com/blog/2024/4/4/the-rittman-analytics-guide-to-modernising-your-data-stack-innbsp2024',
    'https://www.rittmananalytics.com/blog/2024/3/4/looker-benchmarking-financial-analytics',
    'https://www.rittmananalytics.com/blog/2023/5/25/london-to-brighton-bike-ride-2023-fundraising-for-dementia-uk',
    'https://www.rittmananalytics.com/blog/2023/4/6/building-a-mobile-friendly-kpi-dashboard-using-looker-studio-integration-with-looker-universal-semantic-model',
    'https://www.rittmananalytics.com/blog/2022/5/5/presenting-on-dbt-amp-firebolt-at-the-budapest-dbt-meetup-tuesday-10th-may-2022',
    'https://www.rittmananalytics.com/blog/2022/2/1/lightdash-looker-and-dbt-as-the-bi-tool-metrics-layer',
    'https://www.rittmananalytics.com/blog/2021/12/16/using-looker-to-analyze-and-visualise-your-customer-concentration',
    'https://www.rittmananalytics.com/blog/2021/11/21/adding-forecasting-to-your-looker-reports-and-dashboards',
    'https://www.rittmananalytics.com/blog/2021/6/20/rfm-analysis-and-customer-segmentation-using-looker-dbt-and-google-bigquery',
    'https://www.rittmananalytics.com/blog/2021/2/22/customer-cohorting-retention-curves-and-predictive-lifetime-value-using-looker-and-google-bigquery',
    'https://www.rittmananalytics.com/blog/2020/1/7/forecasting-hubspot-deal-revenue-and-resourcing-needs-using-dbt-and-looker',
    'https://www.rittmananalytics.com/blog/2023/12/8/how-rittman-analytics-does-web-marketing-anaytics',
    'https://www.rittmananalytics.com/blog/2023/5/9/building-your-own-ga4-rules-based-marketing-attribution-models-using-google-bigquery-andnbsplooker',
    'https://www.rittmananalytics.com/blog/2022/12/02/improving-wordpress-search-keyword-performance-using-looker-google-search-console-and-fivetran',
    'https://www.rittmananalytics.com/blog/2023/2/18/medium-squarespace-or-githubnbsp-content-marketing-value-analytics-using-looker-dbt-and-segment',
    'https://www.rittmananalytics.com/blog/2022/5/30/stitching-identity-across-the-customer-journey-using-segment-google-bigquery-and-looker',
    'https://www.rittmananalytics.com/blog/2022/2/20/rudderstack-snowplow-and-open-source-cdp-alternatives-to-segment',
    'https://www.rittmananalytics.com/blog/2021/2/15/customer-360-degree-analysis-and-hightouch',
    'https://www.rittmananalytics.com/blog/2020/9/19/ad-spend-and-campaign-roi-analytics-using-segment-looker-dbt-and-googlenbspbigquery',
    'https://www.rittmananalytics.com/blog/2020/7/16/connecting-intercom-to-segment-personas-for-more-relevant-and-cost-effective-customer-services-agents',
    'https://www.rittmananalytics.com/blog/2020/2/8/multichannel-attribution-bigquery-dbt-looker-segment',
    'https://www.rittmananalytics.com/blog/2019/11/6/e058gepqwiyyx4mop3eb06mn9ckjkp',
    'https://www.rittmananalytics.com/blog/2019/5/22/0r1fgtifyovghse903ha3vwdwwbp7j',
    'https://www.rittmananalytics.com/blog/2022/4/25/analyzing-the-hacker-news-public-dataset-using-firebolt-data-warehouse-and-looker',
    'https://www.rittmananalytics.com/blog/2020/9/21/newlookforlooker7',
    'https://www.rittmananalytics.com/blog/2020/6/4/drill-to-detail-ep82-looker-development-automated-testing-and-spectacles-with-special-guest-josh-temple',
    'https://www.rittmananalytics.com/blog/2020/6/3/column-level-data-profiling-for-google-bigquery-datasets-using-dbt',
    'https://www.rittmananalytics.com/blog/2020/4/26/coronavirus-ncf',
    'https://www.rittmananalytics.com/blog/2020/1/3/modelling-slowly-changing-dimensions-type-23-and-6-using-dbt-and-looker',
    'https://www.rittmananalytics.com/blog/2019/8/19/hubspot-data-actions-harvest-analytical-workflows-and-looker-data-platform',
    'https://www.rittmananalytics.com/blog/2019/7/18/new-features-in-looker-616-conditional-alerts-beta-content-curation-beta-and-lookml-ide-folders',
    'https://www.rittmananalytics.com/blog/2019/7/7/news-rittman-analytics-is-now-a-uk-consulting-partner-for-dbt-data-build-tool',
    'https://www.rittmananalytics.com/blog/2019/07/01/news-on-the-second-london-looker-developer-meetup-10th-july-2019-at-gocardless-london',
    'https://www.rittmananalytics.com/blog/2019/6/12/rittman-analytics-is-now-a-segment-certified-implementation-partner',
    'https://www.rittmananalytics.com/blog/2019/6/10/continuous-integration-feature-branches-and-automated-build-tests-using-dbtcloud',
    'https://www.rittmananalytics.com/blog/2023/11/12/how-rittman-analytics-builds-data-stacks-for-growth-stage-businesses-using-cube-dagster-and-preset',
    'https://www.rittmananalytics.com/blog/2022/12/06/10-ways-your-modern-data-stack-project-can-fail',
    'https://www.rittmananalytics.com/blog/2022/5/30/how-rittman-analytics-does-analytics-part-2-building-our-modern-data-stack-using-dbt-google-bigquery-looker-segment-and-rudderstack',
    'https://www.rittmananalytics.com/blog/2021/3/12/customer-data-warehouses-are-the-new-customer-data-platform',
    'https://www.rittmananalytics.com/blog/2021/1/17/deduplicating-dbt-saas-data-warehousing',
    'https://www.rittmananalytics.com/blog/2020/5/27/introducing-the-ra-warehouse-dbt-framework-how-rittman-analytics-does-data-centralization',
    'https://www.rittmananalytics.com/blog/2019/5/7/how-rittman-analytics',
    'https://www.rittmananalytics.com/blog/2024/6/25/oracle-and-google-cloud-partnership',
    'https://www.rittmananalytics.com/blog/2018/11/17/2018-11-17-five-thoughts-about-thomas-kurians-move-to-google-cloud-platform',
    'https://www.rittmananalytics.com/blog/2016/09/29/2016-09-29-oracles-big-data-platform-goes-cloud-becomes-elastic-and-suddenly-looks-very-interesting',
    'https://www.rittmananalytics.com/blog/2016/09/25/2016-09-25-new-oracle-magazine-article-on-oracle-big-data-spatial-graph-for-social-network-analysis',
    'https://www.rittmananalytics.com/blog/2016/09/25/2016-09-25-obiee12c-pushing-up-daisies-or-more-relevant-than-ever-in-the-world-of-bimodal-it',
    'https://www.rittmananalytics.com/blog/2024/4/23/data-analytics-project-planning-checklistthe-definitive-guide-to-planning-your-data-analytics-initiative',
    'https://www.rittmananalytics.com/blog/2023/12/29/end-of-year-special-best-of-the-rittman-analytics-blog-2023-pdf-ebook',
    'https://www.rittmananalytics.com/blog/2019/12/15/segmentcdpemailtracking',
    'https://www.rittmananalytics.com/blog/2019/8/4/financial-reporting-in-looker-using-g-accon-for-xero-bigquery-and-dbt',
    'https://www.rittmananalytics.com/blog/2019/10/7/presenting-on-oracle-autonomous-data-warehouse-cloud-and-looker-at-the-uk-oracle-user-group-analytics-modernisation-forum-8th-october-2019',
    'https://www.rittmananalytics.com/blog/2019/5/27/drill-to-detail-ep66-etl-incorta-and-the-death-of-the-star-schema-with-special-guest-matthew-halliday',
    'https://www.rittmananalytics.com/blog/2019/4/5/2019-4-5-join-us-at-looker-join-2019-london-on-april-9th-the-brewery-52-chiswell-street',
    'https://www.rittmananalytics.com/blog/2019/04/01/2019-4-1-released-today-the-mome-project-multiple-olap-machine-emulator',
    'https://www.rittmananalytics.com/blog/2019/3/11/mjr-analytics-is-now-rittman-analytics-and-an-update-on-our-first-six-months',
    'https://www.rittmananalytics.com/blog/2019/03/05/2019-3-5-drill-to-detail-podcast-returns-with-ep60-a-deeper-look-into-looker-with-special-guest-lloyd-tabb',
    'https://www.rittmananalytics.com/blog/2018/11/11/2018-11-11-digital-analytics-bi-and-big-data-meetup-in-copenhagen-22111967',
    'https://www.rittmananalytics.com/blog/2018/10/22/2018-10-22-mjr-analytics-presenting-at-oracle-openworld-2018-san-francisco',
    'https://www.rittmananalytics.com/blog/2018/09/17/2018-09-17-introducing-mjr-analytics-and-how-two-years-go-so-fast-when-youre-learning-something-new',
    'https://www.rittmananalytics.com/blog/2018/08/27/2018-08-27-date-partitioning-and-table-clustering-in-google-bigquery-and-looker-pdts',
    'https://www.rittmananalytics.com/blog/2018/06/02/2018-06-02-oracle-big-data-cloud-event-hub-and-analytics-cloud-data-lake-edition-pt-3',
    'https://www.rittmananalytics.com/blog/2018/05/28/2018-05-28-oracle-big-data-cloud-event-hub-and-analytics-cloud-data-lake-edition-pt-2',
    'https://www.rittmananalytics.com/blog/2018/05/10/2018-05-10-oracle-big-data-cloud-event-hub-and-analytics-cloud-data-lake-edition-pt-1',
    'https://www.rittmananalytics.com/blog/2018/05/01/2018-05-02-using-looker-data-actions-to-make-monzo-spend-analysis-more-interactive-and-actionable',
    'https://www.rittmananalytics.com/blog/2018/04/27/2018-04-27-updates-to-oracle-analytics-cloud-oracle-biee-12c-and-oracle-dv-desktop',
    'https://www.rittmananalytics.com/blog/2018/04/21/2018-04-21-connecting-looker-to-oracle-autonomous-data-warehouse-cloud',
    'https://www.rittmananalytics.com/blog/2018/04/16/2018-04-16-first-impressions-of-oracle-autonomous-data-warehouse-cloud',
    'https://www.rittmananalytics.com/blog/2018/04/03/2018-04-03-timeline-charts-derived-tables-and-analytic-functions-in-looker-5',
    'https://www.rittmananalytics.com/blog/2018/03/05/2018-03-05-the-drill-to-detail-podcast-50th-episode-special-and-top-10-episodes-by-download',
    'https://www.rittmananalytics.com/blog/2018/02/05/2018-02-05-monzo-bigquery-looker-and-fintech-the-other-london-tech-startup-scene',
    'https://www.rittmananalytics.com/blog/2017/12/31/2017-12-31-oracle-analytics-cloud-and-the-welcome-return-of-the-enterprise-bi-platform',
    'https://www.rittmananalytics.com/blog/2017/12/03/2017-12-03-ukoug-tech17-and-the-incredible-world-of-ecommerce-analytics-machine-learning-and-11-marketing',
    'https://www.rittmananalytics.com/blog/2017/11/24/2017-11-24-query-federation-comes-to-looker-5-with-new-data-merge-feature',
    'https://www.rittmananalytics.com/blog/2017/11/24/2017-11-24-google-cloud-dataprep-spreadsheet-style-data-wrangling-powered-by-google-cloud-dataflow',
    'https://www.rittmananalytics.com/blog/2017/10/21/2017-10-21-druid-imply-and-looker-5-bring-olap-analysis-to-bigquerys-data-warehouse',
    'https://www.rittmananalytics.com/blog/2017/07/31/2017-07-31-using-google-bigquery-google-cloud-natural-language-api-and-looker-to-work-out-exactly-how-much',
    'https://www.rittmananalytics.com/blog/2017/06/10/2017-06-10-google-bigquery-large-table-joins-and-how-nested-repeated-values-and-the-capacitor-storage-format',
    'https://www.rittmananalytics.com/blog/2017/05/29/2017-05-29-analytic-views-oracle-database-12-2',
    'https://www.rittmananalytics.com/blog/2017/05/01/2017-05-01-what-bi-development-looks-like-with-bigquery-google-cloud-apis-looker-and-fluentd-courtesy-of',
    'https://www.rittmananalytics.com/blog/2017/02/23/2017-02-23-slides-from-my-new-world-hadoop-architectures-what-problems-they-really-solve-for-oracle-dbas',
    'https://www.rittmananalytics.com/blog/2017/02/21/2017-02-21-bigquery-looker-and-big-datas-rediscovery-of-data-warehousing-and-semantic-models-at-google',
    'https://www.rittmananalytics.com/blog/2017/02/03/2017-02-03-graph-analysis-in-the-how-a-tweet-went-viral-conference-presentation',
    'https://www.rittmananalytics.com/blog/2017/01/24/2017-01-24-qubits-journey-to-petabyte-scale-machine-learning-and-analytics-on-google-cloud-platform-and',
    'https://www.rittmananalytics.com/blog/2016/12/31/2016-12-31-drill-to-detail-podcast-looking-back-at-2016-and-whats-new-and-planned-for-2017',
    'https://www.rittmananalytics.com/blog/2016/12/02/2016-12-02-data-lakes-at-google-scale-the-end-of-meaningless-customer-experiences-and-ukoug-tech16-in',
    'https://www.rittmananalytics.com/blog/2016/10/30/2016-10-30-data-capital-competitive-strategy-and-the-economics-of-big-data-drill-to-detail-podcast-ep-6',
    'https://www.rittmananalytics.com/blog/2016/10/17/2016-10-17-slides-from-the-story-behind-the-11hr-cup-of-tea-wifi-kettles-how-it-was-all-about-data',
    'https://www.rittmananalytics.com/blog/2016/10/16/2016-10-16-interested-in-oracle-big-data-ml-and-next-gen-analytics-in-the-enterprise',
    'https://www.rittmananalytics.com/blog/2016/10/13/2016-10-13-the-ikettle-the-eleven-hour-struggle-to-make-a-cup-of-tea-and-why-it-was-all-about-data',
    'https://www.rittmananalytics.com/blog/2016/10/05/2016-10-05-drill-to-detail-ep',
    'https://www.rittmananalytics.com/blog/2016/09/25/2016-09-25-presenting-second-in-the-gluent-new-world-webinar-series-on-sql-on-hadoop-concepts-and',
    'https://www.rittmananalytics.com/blog/2016/09/25/2016-09-25-last-stop-budapest-and-five-new-bi-and-analytics-technologies-coming-soon-for-hadoop',
    'https://www.rittmananalytics.com/blog/2016/09/25/2016-09-25-from-lots-of-reports-with-some-data-analysis-to-massive-data-analysis-with-some-reporting',
    'https://www.rittmananalytics.com/blog/2016/09/24/2016-09-24-building-predictive-analytics-models-against-wearables-smart-home-and-smartphone-app-data-heres',
    'https://www.rittmananalytics.com/blog/2016/09/24/2016-09-24-podcast-episode-1-now-live-drill-to-detail-with-mark-rittman-with-special-guest-stewart-bryson',
    # Add more URLs here
]
# Create a session with retry logic
session = requests_retry_session()

# Scrape each URL and store the results
results = []
max_retries = 3

# dict.fromkeys() drops repeated URLs while keeping their original order,
# so a post listed twice is fetched and stored only once.
for url in dict.fromkeys(urls):
    print(f"Scraping {url}...")
    for attempt in range(1, max_retries + 1):
        error = None
        try:
            result = scrape_squarespace_blog(url, session)
        except Exception as e:  # script boundary: report it, then retry
            result = None
            error = e

        if result:
            results.append(result)
            print("Done!")
            break

        # scrape_squarespace_blog() returns None when the request failed, so
        # an empty result is a failed attempt just like a raised exception:
        # wait a little before the next try instead of retrying immediately.
        if attempt < max_retries:
            wait_time = random.uniform(1, 3)
            if error is not None:
                print(f"An error occurred: {error}. Retrying in {wait_time:.2f} seconds...")
            else:
                print(f"No data returned for {url}. Retrying in {wait_time:.2f} seconds...")
            time.sleep(wait_time)
    else:
        # Reached only when no attempt hit `break`, i.e. every attempt failed.
        print(f"Failed to scrape {url} after {max_retries} attempts.")
# Save results to a CSV file
if not results:
    print("No results were successfully scraped.")
else:
    # The output file name embeds the current local date and time.
    csv_filename = f'scraped_blogs_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    fieldnames = ['url', 'title', 'author', 'date', 'content']
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    print(f"Results saved to {csv_filename}")

    # Display the first result
    print("\nFirst scraped article:")
    for key, value in results[0].items():
        # Print only the first 500 characters of the content field.
        shown = f"{value[:500]}..." if key == 'content' else value
        print(f"{key}: {shown}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment