hook_update_N() function to convert <img> tags to Media tokens (Drupal)
/**
 * Converts <img> tags in body fields to Media tokens where possible.
 *
 * Scans the node and comment body tables (data and revision), and for every
 * <img> whose 'src' points at a managed public file on this site, replaces
 * the tag with a Media token of the form [[{...json...}]].
 *
 * Limitations:
 * - Only 'public' files are recognised; private files would need additional
 *   URL matching.
 * - The image style => view mode map and the CSS class handling below are
 *   specific to the site this was written for (it used panopoly_images).
 *   Adjust both to match your own site.
 *
 * Rename this function to MODULE_update_N() before use.
 */
function hook_update_N() {
  // A cache to help us when we encounter the same 'src' multiple times.
  $src_info_cache = array();

  // We work directly in the database for performance reasons.
  $tables = array(
    'field_data_body' => 'body_value',
    'field_revision_body' => 'body_value',
    'field_data_comment_body' => 'comment_body_value',
    'field_revision_comment_body' => 'comment_body_value',
  );

  foreach ($tables as $table => $column) {
    if (!db_table_exists($table)) {
      continue;
    }

    $result = db_select($table, 'd')
      ->fields('d', array('entity_type', 'entity_id', 'revision_id', 'language', 'delta', $column))
      ->execute();

    foreach ($result as $row) {
      // Small performance hack to skip field items without any images in
      // them. Case-insensitive, because the HTML parser lower-cases tag names
      // and would therefore still find an <IMG> tag.
      if (stripos($row->$column, '<img') === FALSE) {
        continue;
      }

      // filter_dom_load() wraps the fragment in a full document that declares
      // UTF-8, so multi-byte characters survive the round trip, and it
      // suppresses the warnings libxml emits for "tag soup".
      $dom = filter_dom_load($row->$column);
      $body = $dom->getElementsByTagName('body')->item(0);
      if (!$body) {
        continue;
      }
      $changed = FALSE;

      // Loop through all <img> tags. The XPath result is a static list, so it
      // is safe to replace nodes while iterating over it.
      $xpath = new DOMXPath($dom);
      foreach ($xpath->query('//img') as $img_tag) {
        $src = $img_tag->getAttribute('src');

        // Attempt to determine if the image is on this site, and find its
        // underlying FID and image style.
        if (isset($src_info_cache[$src])) {
          $src_info = $src_info_cache[$src];
        }
        else {
          $src_info = array(
            'fid' => NULL,
            'filename' => NULL,
            'style' => 'panopoly_image_original',
          );

          // NOTE: This only works for 'public' files - we'd need to make a
          // few changes to support private images too.
          if (preg_match('/sites\/[^\/]+\/files\/((?:styles\/)([^\/]+)(?:\/public\/))?(.*)$/', $src, $matches)) {
            // Group 3 always participates in the match, so the number of
            // groups cannot tell styled and unstyled URLs apart; check the
            // style group itself.
            if (!empty($matches[2])) {
              $src_info['style'] = $matches[2];
            }

            // Remove any GET arguments or fragment from the filename, then
            // undo URL encoding: file URIs are stored decoded in
            // {file_managed}, while URLs in markup are usually encoded.
            $filename = preg_replace('/[?#].*$/s', '', $matches[3]);
            $filename = rawurldecode($filename);

            if ($filename !== '') {
              $src_info['filename'] = $filename;

              // Finally, do the actual FID lookup based on filename.
              $src_info['fid'] = db_select('file_managed', 'f')
                ->fields('f', array('fid'))
                ->condition('f.uri', 'public://' . $filename)
                ->execute()
                ->fetchField();
            }
          }

          $src_info_cache[$src] = $src_info;
        }

        // Only images that refer to a managed file on this site are replaced
        // with a Media token; everything else is left untouched.
        if (empty($src_info['fid'])) {
          continue;
        }

        $media_info = array(
          'type' => 'media',
          'fid' => $src_info['fid'],
          'fields' => array(),
          'attributes' => array('style' => ''),
        );

        // Translate the style into a File view mode.
        // NOTE: The site this was written for already used panopoly_images -
        // your site probably has its own set of image styles to convert, so
        // feel free to change this!
        switch ($src_info['style']) {
          case 'panopoly_image_featured':
          case 'panopoly_image_full':
          case 'panopoly_image_original':
            $view_mode = 'default';
            break;

          case 'panopoly_image_half':
          case 'panopoly_image_quarter':
          case 'panopoly_image_square':
          case 'panopoly_image_thumbnail':
            $view_mode = 'teaser';
            break;

          default:
            // If we can't work it out, we just use default.
            $view_mode = 'default';
            break;
        }
        $media_info['view_mode'] = $media_info['fields']['format'] = $view_mode;

        // Transfer simple attributes from the 'img' tag.
        foreach (array('width', 'height', 'style') as $attr) {
          if ($img_tag->hasAttribute($attr)) {
            $media_info['attributes'][$attr] = $img_tag->getAttribute($attr);
          }
        }

        // Transfer the special alt/title attributes, which Media also stores
        // as file field values. A missing attribute is transferred as ''.
        foreach (array('alt', 'title') as $attr) {
          $value = $img_tag->getAttribute($attr);
          $media_info['attributes'][$attr] = $value;
          $media_info['fields']["field_file_image_{$attr}_text[und][0][value]"] = $value;
        }

        // Process the original 'class' attribute into the media token.
        // NOTE: These classes are specific to the site this code was
        // originally written for! Feel free to remove this section or replace
        // it with something that makes sense for your site.
        if ($img_tag->hasAttribute('class')) {
          // Builds an array like array('class_name' => 1) for each class on
          // the old <img> tag. Split on any whitespace, not just spaces.
          $classes = preg_split('/\s+/', $img_tag->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY);
          $class_index = array_fill_keys($classes, 1);

          // Map some old classes to new values.
          if (isset($class_index['img-no-border'])) {
            $media_info['attributes']['class'] = 'img-no-border';
          }

          $extra_style = '';
          if (isset($class_index['image-center'])) {
            $extra_style = 'display: block; margin-left: auto; margin-right: auto';
          }
          elseif (isset($class_index['image-right'])) {
            $extra_style = 'float: right';
          }
          elseif (isset($class_index['image-left'])) {
            $extra_style = 'float: left';
          }

          if ($extra_style !== '') {
            // Separate the new declarations from any inline style copied from
            // the <img> tag, so the two do not run together.
            $style = rtrim($media_info['attributes']['style'], "; \t\r\n");
            $media_info['attributes']['style'] = ($style === '' ? '' : $style . '; ') . $extra_style;
          }
        }

        // drupal_json_encode() hex-escapes <, >, &, ' and " inside values, so
        // the token is not altered when the text node is serialized as HTML.
        $media_token = $dom->createTextNode('[[' . drupal_json_encode($media_info) . ']]');
        $img_tag->parentNode->replaceChild($media_token, $img_tag);
        $changed = TRUE;
      }

      // If any changes were made, then we serialize the DOM and update the
      // field value.
      if ($changed) {
        // There is no variant of saveHTML() that will only do the innerHTML,
        // it always includes the outer wrapper, so we have to loop over the
        // children of <body>.
        $html = '';
        foreach ($body->childNodes as $child) {
          $html .= $dom->saveHTML($child);
        }

        db_update($table)
          ->fields(array($column => $html))
          ->condition('entity_type', $row->entity_type)
          ->condition('entity_id', $row->entity_id)
          ->condition('revision_id', $row->revision_id)
          ->condition('language', $row->language)
          ->condition('delta', $row->delta)
          ->execute();
      }
    }
  }

  // Since we made the changes directly in the database, we need to clear
  // the filter, field and page caches manually.
  foreach (array('filter', 'field', 'page') as $cache) {
    cache_clear_all('*', "cache_{$cache}", TRUE);
  }
}
