Created
April 18, 2024 15:35
-
-
Save JiveDig/d8d620c7df6e7863c653eedbf1c4ab2d to your computer and use it in GitHub Desktop.
A PHP class for WordPress that imports posts (or pages, or any custom post type) from a sitemap as basic HTML, including the images found in the content.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php

// Prevent direct file access.
defined( 'ABSPATH' ) || die;

/**
 * WP-CLI to delete all content.
 *
 * wp post delete $(wp post list --post_type=post --format=ids) --force
 * wp post delete $(wp post list --post_type=page --format=ids) --force
 * wp post delete $(wp post list --post_type=attachment --format=ids) --force
 * wp term delete category $(wp term list category --field=term_id)
 * wp term delete post_tag $(wp term list post_tag --field=term_id)
 */

/**
 * Instantiate the class.
 *
 * The class is declared further down in this file; PHP resolves the name
 * because the declaration is unconditional and at the top level.
 *
 * @since 0.1.0
 */
new Mai_CLI_Sitemap_Importer;

/**
 * Registers the `wp maisitemap` command once WP-CLI has initialised.
 *
 * The class name is passed as a string to WP_CLI::add_command().
 *
 * @since 0.1.0
 *
 * @link https://docs.wpvip.com/how-tos/write-custom-wp-cli-commands/
 * @link https://webdevstudios.com/2019/10/08/making-wp-cli-commands/
 */
add_action( 'cli_init', function() {
	WP_CLI::add_command( 'maisitemap', 'Mai_CLI_Sitemap_Importer' );
});
/**
 * WP-CLI command that imports the pages listed in an XML sitemap as WordPress posts.
 *
 * Each sitemap URL is fetched, the element matched by an XPath expression is
 * reduced to basic HTML, a post is created from it, and the images referenced
 * in the content are copied into the media library.
 *
 * @version 0.1.0
 */
class Mai_CLI_Sitemap_Importer {

	/**
	 * Tags that survive the content clean-up; every other tag is stripped.
	 *
	 * @var string[]
	 */
	private const ALLOWED_TAGS = [
		'p',
		'a',
		'ul',
		'ol',
		'li',
		'h1',
		'h2',
		'h3',
		'h4',
		'h5',
		'h6',
		'blockquote',
		'img',
		'figure',
		'figcaption',
		'iframe',
		'video',
		'audio',
		'source',
		'pre',
		'code',
		'br',
		'hr',
		'em',
		'strong',
	];

	/**
	 * Extensions kept when naming a downloaded image; anything else falls back to "jpg".
	 *
	 * @var string[]
	 */
	private const IMAGE_EXTENSIONS = [ 'jpg', 'jpeg', 'png', 'gif', 'webp' ];

	/**
	 * Gets environment.
	 *
	 * Usage: wp maisitemap get_environment
	 *
	 * @return void
	 */
	public function get_environment() {
		WP_CLI::log( sprintf( 'Environment: %s', wp_get_environment_type() ) );
	}

	/**
	 * Import pages from sitemap.
	 *
	 * Usage:
	 * wp maisitemap import --sitemap_url=https://example.com/sitemap.xml
	 * wp maisitemap import --sitemap_url=https://www.example.com/sitemap.xml --xpath="(//div[@class='page-content'])[1]" --post_type=page
	 *
	 * Supported keyed args: sitemap_url (required), xpath (default //main),
	 * post_type (default post), post_status (default publish).
	 *
	 * A failure to fetch or parse the sitemap aborts the command with an error.
	 * A failure on a single URL is reported as a warning and the import moves
	 * on to the next URL.
	 *
	 * @since 0.1.0
	 *
	 * @param array $args       Standard command args (unused).
	 * @param array $assoc_args Keyed args, see above.
	 *
	 * @return void
	 */
	public function import( $args, $assoc_args ) {
		// Parse args.
		$assoc_args = wp_parse_args(
			$assoc_args,
			[
				'sitemap_url' => '',       // Required. The sitemap URL.
				'xpath'       => '//main', // The XPath to the content.
				'post_type'   => 'post',
				'post_status' => 'publish',
			]
		);

		// Bail if no URL.
		if ( ! $assoc_args['sitemap_url'] ) {
			WP_CLI::error( 'Please provide a sitemap URL.' );
			return;
		}

		// Get sitemap data.
		$response = wp_remote_get( $assoc_args['sitemap_url'] );

		// Bail if the request itself failed.
		if ( is_wp_error( $response ) ) {
			WP_CLI::error( $response->get_error_message() );
			return;
		}

		// Bail if the server did not answer with 200.
		$code = wp_remote_retrieve_response_code( $response );

		if ( 200 !== $code ) {
			WP_CLI::error( trim( $code . ' ' . wp_remote_retrieve_response_message( $response ) ) );
			return;
		}

		// Parse the XML quietly, then put libxml back the way we found it.
		$body     = wp_remote_retrieve_body( $response );
		$previous = libxml_use_internal_errors( true );
		$xml      = simplexml_load_string( $body );
		libxml_clear_errors();
		libxml_use_internal_errors( $previous );

		// Strict check: a valid element without children is falsy but is not a parse failure.
		if ( false === $xml ) {
			WP_CLI::error( 'Could not parse XML.' );
			return;
		}

		// Bail if no URLs.
		if ( ! count( $xml->url ) ) {
			WP_CLI::error( 'No URLs found in sitemap.' );
			return;
		}

		// Import every URL; one bad page must not stop the rest.
		foreach ( $xml->url as $entry ) {
			$url = trim( (string) $entry->loc );

			if ( '' === $url ) {
				continue;
			}

			$this->import_url( $url, $assoc_args );
		}

		WP_CLI::success( 'Import complete.' );
	}

	/**
	 * Downloads a remote file and inserts it into the WP Media Library.
	 *
	 * If an attachment already carries `$ref_key` meta equal to `$ref_uri`, its
	 * ID is returned and nothing is downloaded.
	 *
	 * NOTE(review): the method has no `private` keyword so existing external
	 * callers keep working, which means WP-CLI also lists it as a subcommand.
	 * Consider making it private once callers are confirmed.
	 *
	 * @see https://developer.wordpress.org/reference/functions/media_handle_sideload/
	 *
	 * @param string $ref_uri   The reference URI stored on the attachment for later lookups.
	 * @param string $ref_key   The meta key the reference URI is stored under.
	 * @param string $image_url HTTP URL address of the remote file.
	 * @param int    $post_id   The post ID the media is associated with.
	 *
	 * @return int|WP_Error The attachment ID, 0 when the file could not be
	 *                      downloaded or written, or a WP_Error when the
	 *                      sideload itself fails.
	 */
	public function upload_image( $ref_uri, $ref_key, $image_url, $post_id ) {
		// require_once is a no-op when these are already loaded.
		require_once ABSPATH . 'wp-admin/includes/media.php';
		require_once ABSPATH . 'wp-admin/includes/file.php';
		require_once ABSPATH . 'wp-admin/includes/image.php';

		// Reuse an attachment that was imported from the same reference before.
		$existing_ids = get_posts(
			[
				'post_type'      => 'attachment',
				'post_status'    => 'any',
				'posts_per_page' => 1,
				'meta_key'       => $ref_key,
				'meta_value'     => $ref_uri,
				'meta_compare'   => '=',
				'fields'         => 'ids',
			]
		);

		if ( ! empty( $existing_ids[0] ) ) {
			return $existing_ids[0];
		}

		// Download through the WP HTTP API, like the page requests in this class.
		$response = wp_remote_get( $image_url, [ 'timeout' => 60 ] );

		if ( is_wp_error( $response ) || 200 !== wp_remote_retrieve_response_code( $response ) ) {
			return 0;
		}

		$contents = wp_remote_retrieve_body( $response );

		if ( '' === $contents ) {
			return 0;
		}

		// Hash the URL for a collision-free name; keep a known image extension when the URL has one.
		$filename = md5( $image_url ) . '.' . $this->get_image_extension( $image_url );

		// Write straight to a temp file and hand that to the sideloader.
		$tmp = wp_tempnam( $filename );

		if ( ! $tmp ) {
			return 0;
		}

		if ( false === file_put_contents( $tmp, $contents ) ) {
			wp_delete_file( $tmp );
			return 0;
		}

		// Add the image to the media library.
		$image_id = media_handle_sideload(
			[
				'name'     => $filename,
				'tmp_name' => $tmp,
			],
			$post_id
		);

		// Make sure the temp file is gone, whatever the outcome.
		if ( file_exists( $tmp ) ) {
			wp_delete_file( $tmp );
		}

		// Hand the error back so the caller can report it.
		if ( is_wp_error( $image_id ) ) {
			return $image_id;
		}

		// Set the reference url for possible reference later.
		update_post_meta( $image_id, $ref_key, $ref_uri );

		return $image_id;
	}

	/**
	 * Imports a single page: fetch, extract, create the post, then import its images.
	 *
	 * @param string $url        The page URL taken from the sitemap.
	 * @param array  $assoc_args Parsed command args (xpath, post_type, post_status).
	 *
	 * @return void
	 */
	private function import_url( $url, $assoc_args ) {
		// Fetch HTML content.
		$response = wp_remote_get( $url );

		if ( is_wp_error( $response ) ) {
			WP_CLI::warning( sprintf( 'Skipping %s: %s', $url, $response->get_error_message() ) );
			return;
		}

		$code = wp_remote_retrieve_response_code( $response );

		if ( 200 !== $code ) {
			WP_CLI::warning( sprintf( 'Skipping %s: HTTP %s', $url, $code ) );
			return;
		}

		$html = wp_remote_retrieve_body( $response );

		// DOMDocument::loadHTML() throws on an empty string in PHP 8.
		if ( '' === trim( $html ) ) {
			WP_CLI::warning( sprintf( 'Skipping %s: empty response body.', $url ) );
			return;
		}

		// Load the markup quietly, then restore the previous libxml state.
		$dom      = new DOMDocument();
		$previous = libxml_use_internal_errors( true );
		$loaded   = $dom->loadHTML( $html );
		libxml_clear_errors();
		libxml_use_internal_errors( $previous );

		if ( ! $loaded ) {
			WP_CLI::warning( sprintf( 'Skipping %s: could not parse HTML.', $url ) );
			return;
		}

		$xpath = new DOMXPath( $dom );

		// The slug is the URL path without surrounding slashes (empty for the site root).
		$slug = trim( (string) wp_parse_url( $url, PHP_URL_PATH ), '/' );

		// Title comes from the first h1, falling back to a humanised slug.
		$headings = $xpath->query( '(//h1)[1]' );
		$title    = ( $headings && $headings->length ) ? trim( $headings->item( 0 )->nodeValue ) : '';

		if ( ! $title ) {
			$title = ucwords( str_replace( '-', ' ', $slug ) );
		}

		// Get main content. query() returns false for an invalid expression.
		$nodes = $xpath->query( $assoc_args['xpath'] );

		if ( ! $nodes || ! $nodes->length ) {
			WP_CLI::warning( sprintf( 'Skipping %s: nothing matched the XPath.', $url ) );
			return;
		}

		$content = $this->clean_content( (string) $dom->saveHTML( $nodes->item( 0 ) ) );

		// Insert without content first; the images need a post ID to attach to.
		// wp_insert_post() expects slashed data, and only returns a WP_Error when asked to.
		$post_id = wp_insert_post(
			[
				'post_title'  => wp_slash( $title ),
				'post_name'   => $slug,
				'post_status' => $assoc_args['post_status'],
				'post_type'   => $assoc_args['post_type'],
			],
			true
		);

		if ( is_wp_error( $post_id ) ) {
			WP_CLI::warning( sprintf( 'Skipping %s: %s', $url, $post_id->get_error_message() ) );
			return;
		}

		if ( ! $post_id ) {
			WP_CLI::warning( sprintf( 'Skipping %s: the post could not be created.', $url ) );
			return;
		}

		// Import the images and point the content at the local copies.
		$content = $this->import_images( $content, $post_id, $url );

		// Update the post content.
		$updated = wp_update_post(
			[
				'ID'           => $post_id,
				'post_content' => wp_slash( $content ),
			],
			true
		);

		if ( is_wp_error( $updated ) ) {
			WP_CLI::warning( sprintf( 'Content not saved for %s: %s', $url, $updated->get_error_message() ) );
			return;
		}

		// Log post inserted.
		WP_CLI::line( sprintf( '%s inserted: %s', ucwords( $assoc_args['post_type'] ), get_permalink( $post_id ) ) );
	}

	/**
	 * Reduces an HTML fragment to basic markup.
	 *
	 * Drops script and style elements with their contents, strips every tag not
	 * in ALLOWED_TAGS, and removes id, class and style attributes from the rest.
	 *
	 * @param string $content The raw HTML fragment.
	 *
	 * @return string The cleaned HTML.
	 */
	private function clean_content( $content ) {
		// strip_tags() would keep the text inside these, so remove them whole first.
		$without_code = preg_replace(
			[
				'/<script\b[^>]*>.*?<\/script>/is',
				'/<style\b[^>]*>.*?<\/style>/is',
			],
			'',
			$content
		);

		// preg_replace() returns null on failure; keep the input in that case.
		if ( null !== $without_code ) {
			$content = $without_code;
		}

		// Strip tags.
		$content = strip_tags( $content, self::ALLOWED_TAGS );

		// One pass over every remaining tag instead of one pass per tag name.
		$processor = new WP_HTML_Tag_Processor( $content );

		while ( $processor->next_tag() ) {
			if ( ! in_array( strtolower( (string) $processor->get_tag() ), self::ALLOWED_TAGS, true ) ) {
				continue;
			}

			$processor->remove_attribute( 'id' );
			$processor->remove_attribute( 'class' );
			$processor->remove_attribute( 'style' );
		}

		return $processor->get_updated_html();
	}

	/**
	 * Imports every image referenced in the content and rewrites its src.
	 *
	 * The first image that imports successfully becomes the featured image.
	 *
	 * @param string $content  The cleaned post content.
	 * @param int    $post_id  The post the images are attached to.
	 * @param string $page_url The URL the content came from, used to resolve relative srcs.
	 *
	 * @return string The content with image URLs pointing at the media library.
	 */
	private function import_images( $content, $post_id, $page_url ) {
		$processor    = new WP_HTML_Tag_Processor( $content );
		$has_featured = false;

		while ( $processor->next_tag( [ 'tag_name' => 'img' ] ) ) {
			$src = $processor->get_attribute( 'src' );

			// get_attribute() gives null when missing and true for a value-less attribute.
			if ( ! is_string( $src ) ) {
				continue;
			}

			$src = $this->get_absolute_url( $src, $page_url );

			if ( '' === $src ) {
				continue;
			}

			// Maybe upload the image.
			$image_id = $this->upload_image( $src, 'ref_url', $src, $post_id );

			if ( is_wp_error( $image_id ) ) {
				WP_CLI::warning( sprintf( 'Image not imported: %s (%s)', $src, $image_id->get_error_message() ) );
				continue;
			}

			if ( ! $image_id ) {
				WP_CLI::warning( sprintf( 'Image not imported: %s', $src ) );
				continue;
			}

			$image_url = wp_get_attachment_image_url( $image_id, 'large' );

			// Passing false to set_attribute() would remove the src altogether.
			if ( ! $image_url ) {
				continue;
			}

			$processor->set_attribute( 'src', $image_url );

			// A leftover srcset would keep loading the image from the source site.
			$processor->remove_attribute( 'srcset' );
			$processor->remove_attribute( 'sizes' );

			// If first image, set as featured image.
			if ( ! $has_featured ) {
				set_post_thumbnail( $post_id, $image_id );
				$has_featured = true;
			}

			WP_CLI::line( sprintf( 'Image imported: %s', $image_url ) );
		}

		return $processor->get_updated_html();
	}

	/**
	 * Turns an image src into an absolute URL.
	 *
	 * Protocol-relative srcs get the page's scheme. Host-less srcs are resolved
	 * against the root of the page's host. Data URIs yield an empty string
	 * because there is nothing to download.
	 *
	 * @param string $src      The src attribute value.
	 * @param string $page_url The URL of the page the image was found on.
	 *
	 * @return string The absolute URL, or an empty string when none can be built.
	 */
	private function get_absolute_url( $src, $page_url ) {
		$src = trim( $src );

		if ( '' === $src || 0 === stripos( $src, 'data:' ) ) {
			return '';
		}

		$page   = wp_parse_url( $page_url );
		$page   = is_array( $page ) ? $page : [];
		$scheme = empty( $page['scheme'] ) ? 'https' : $page['scheme'];

		// Protocol-relative: //cdn.example.com/image.jpg.
		if ( 0 === strpos( $src, '//' ) ) {
			return $scheme . ':' . $src;
		}

		// Already absolute.
		if ( wp_parse_url( $src, PHP_URL_HOST ) ) {
			return $src;
		}

		if ( empty( $page['host'] ) ) {
			return '';
		}

		$origin = $scheme . '://' . $page['host'];

		if ( ! empty( $page['port'] ) ) {
			$origin .= ':' . $page['port'];
		}

		return trailingslashit( $origin ) . ltrim( $src, '/' );
	}

	/**
	 * Picks the file extension for a downloaded image.
	 *
	 * Only extensions listed in IMAGE_EXTENSIONS are taken from the URL, so a
	 * URL ending in something like ".php" can never decide the local file name.
	 *
	 * @param string $image_url The remote image URL.
	 *
	 * @return string The extension without a leading dot.
	 */
	private function get_image_extension( $image_url ) {
		$path      = (string) wp_parse_url( $image_url, PHP_URL_PATH );
		$extension = strtolower( pathinfo( $path, PATHINFO_EXTENSION ) );

		return in_array( $extension, self::IMAGE_EXTENSIONS, true ) ? $extension : 'jpg';
	}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment