Skip to content

Instantly share code, notes, and snippets.

@lordjabez
Last active August 16, 2024 13:44
Show Gist options
  • Save lordjabez/cc72fa8f4e11ef4f39f1ddefa486ae5f to your computer and use it in GitHub Desktop.
Extract all posts from a WordPress blog using the API
#!/usr/bin/env python3
import datetime
import sys
import cachier
import feedparser
import pandas
import requests
blog_domain = sys.argv[1]
@cachier.cachier(stale_after=datetime.timedelta(days=1))
def get_posts(blog_domain, page):
    """Fetch one page of posts from the blog's WordPress REST API.

    Args:
        blog_domain: Hostname of the blog (no scheme), e.g. 'example.com'.
        page: 1-based page number to request (10 posts per page).

    Returns:
        The decoded JSON list of post objects for that page, or an empty
        list when the page is past the last one.

    Raises:
        requests.HTTPError: for any error response other than the
            end-of-pagination 400 described below.
    """
    print(blog_domain, page)
    api_url = f'https://{blog_domain}/wp-json/wp/v2/posts'
    params = {'per_page': 10, 'page': page}
    # timeout added so a stalled server cannot hang the whole crawl.
    response = requests.get(api_url, params=params, timeout=30)
    # The WP REST API answers HTTP 400 (rest_post_invalid_page_number) when
    # the requested page exceeds the total; treat that as "no more posts"
    # so callers paginating until an empty page terminate instead of crashing.
    if response.status_code == 400:
        return []
    response.raise_for_status()
    return response.json()
def download_all_posts(blog_domain):
    """Collect every post on the blog by walking the paginated REST API.

    Keeps requesting successive pages via get_posts() until a page comes
    back empty, then returns all posts accumulated so far as one list.
    """
    collected = []
    page_number = 1
    while posts_on_page := get_posts(blog_domain, page_number):
        collected.extend(posts_on_page)
        page_number += 1
    return collected
posts = download_all_posts(blog_domain)
def create_post(entry):
    """Flatten a feedparser entry into a plain dict of the fields we keep.

    Pulls 'id', 'link', and 'title' straight through and unwraps the first
    content element's 'value' into a flat 'content' string.
    """
    return {
        'id': entry['id'],
        'link': entry['link'],
        'title': entry['title'],
        'content': entry['content'][0]['value'],
    }
@cachier.cachier()
def download_posts(blog_domain):
    """Download every post via the site's paged RSS2 feed.

    Walks ?feed=rss2&paged=N until a page yields no entries, flattening
    each entry with create_post(), and returns the result as a
    pandas.DataFrame with columns id/link/title/content.
    """
    print(f'Downloading posts from {blog_domain}')
    rows = []
    page = 1
    while True:
        feed_url = f'https://{blog_domain}?feed=rss2&paged={page}'
        page_entries = feedparser.parse(feed_url)['entries']
        if not page_entries:
            break
        for entry in page_entries:
            rows.append(create_post(entry))
        page += 1
    return pandas.DataFrame(rows)
# Download everything via the RSS path and persist it before printing titles.
posts = download_posts(blog_domain)
# Bug fix: content_filename was never defined anywhere (NameError at runtime);
# derive a per-blog output filename from the domain.
content_filename = f'{blog_domain}-posts.csv'
posts.to_csv(content_filename)
# Bug fix: iterating a DataFrame yields its column labels, not rows, so the
# original `for post in posts: print(post['title']['rendered'])` failed.
# The RSS-derived 'title' column also holds plain strings — the 'rendered'
# key belongs to the REST API response shape, not create_post()'s output.
for title in posts['title']:
    print(title)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment