You are here

html_to_text.inc in Mail System 6.2

Same filename and directory in other branches
  1. 8.2 html_to_text.inc
  2. 7.3 html_to_text.inc
  3. 7.2 html_to_text.inc

Copy of drupal_html_to_text improvements from issue #299138.

File

html_to_text.inc
View source
<?php

/**
 * @file
 * Copy of drupal_html_to_text improvements from issue #299138.
 */

/**
 * Soft-wraps plain text for mail as format=flowed (RFC 3676).
 *
 * Wrapping follows the delsp=yes convention, but text without any space
 * character (as in non-spaced languages) is only broken when absolutely
 * necessary, to avoid compatibility issues.
 *
 * The line ending is deliberately read from
 * variable_get('mail_line_endings', MAIL_LINE_ENDINGS) instead of being
 * hard-coded as "\r\n". It is read once and cached for later calls.
 *
 * @param $text
 *   The plain text to process.
 * @param array $options
 *   (optional) An array containing one or more of the following keys:
 *   - indent: A string to indent the text with. Only '>' characters are
 *     repeated on subsequent wrapped lines. Others are replaced by spaces.
 *   - max: The maximum length at which to wrap each line. Defaults to 80.
 *   - stuff: Whether to space-stuff special lines.  Defaults to TRUE.
 *   - hard: Whether to enforce the maximum line length even if no convenient
 *     space character is available.  Defaults to FALSE.
 *   - pad: A string to use for padding short lines to 'max' characters.  If
 *     more than one character, only the last will be repeated.
 *   - break: The line break sequence to insert.  Defaults to the mail line
 *     ending, preceded by a single space when $text contains at least one
 *     space character.
 *
 * @return
 *   The wrapped, indented and (optionally) space-stuffed text, with its lines
 *   joined by the mail line ending.
 *
 * @see drupal_mail()
 */
function mailsystem_wrap_mail($text, array $options = array()) {
  static $defaults;
  if (!isset($defaults)) {
    $defaults = array(
      'indent' => '',
      'pad' => '',
      'pad_repeat' => '',
      'max' => 80,
      'stuff' => TRUE,
      'hard' => FALSE,
      'eol' => variable_get('mail_line_endings', MAIL_LINE_ENDINGS),
    );
  }
  $eol = $defaults['eol'];
  $options += $defaults;

  if (!isset($options['break'])) {
    // A soft-wrap space is only emitted when $text has a space to begin with.
    $soft_space = (strpos($text, ' ') === FALSE) ? '' : ' ';
    $options['break'] = $soft_space . $eol;
  }

  // The indent is part of every line, so it reduces the room left for text.
  $options['wrap'] = $options['max'] - drupal_strlen($options['indent']);
  if ($options['pad']) {
    $options['pad_repeat'] = drupal_substr($options['pad'], -1, 1);
  }

  // Every line after the first one receives the 'clean' indent.
  $options['clean'] = _mailsystem_html_to_text_clean($options['indent']);

  // First pass: insert the break sequence into each over-long source line.
  $wrapped = explode($eol, $text);
  foreach ($wrapped as $index => &$piece) {
    _mailsystem_wrap_mail_line($piece, $index, $options);
  }
  unset($piece);

  // The inserted breaks contain line endings, so re-split to get one array
  // entry per output line.
  $lines = explode($eol, implode($eol, $wrapped));

  // Second pass: indentation, padding and space-stuffing.
  foreach ($lines as $index => &$piece) {
    _mailsystem_indent_mail_line($piece, $index, $options);
  }
  unset($piece);

  return implode($eol, $lines);
}

/**
 * Transform an HTML string into plain text, preserving the structure of the
 * markup. Useful for preparing the body of a node to be sent by e-mail.
 *
 * The output will be suitable for use as 'format=flowed; delsp=yes' text
 * (RFC 3676) and can be passed directly to drupal_mail() for sending.
 *
 * We deliberately use variable_get('mail_line_endings', MAIL_LINE_ENDINGS)
 * rather than "\r\n".
 *
 * This function provides suitable alternatives for the following tags:
 *
 * <a> <address> <b> <blockquote> <br /> <caption> <cite> <dd> <div> <dl> <dt>
 * <em> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <ol> <p> <pre> <strong>
 * <table> <tbody> <td> <tfoot> <thead> <tr> <u> <ul>
 *
 * The following tag attributes are supported:
 * - <a href=...>: Hyperlink destination urls.
 * - <li value=...>: Ordered list item numbers.
 * - <ol start=...>: Ordered list start number.
 *
 * Hyperlink destinations are collected as numbered footnotes and appended to
 * the end of the text, one per line, in the form "[n] url".
 *
 * @param $string
 *   The string to be transformed.
 * @param $allowed_tags
 *   (optional) If supplied, a list of tags that will be transformed. If
 *   omitted, all supported tags are transformed.
 *
 * @return
 *   The transformed string.
 *
 * @see drupal_mail()
 */
function mailsystem_html_to_text($string, $allowed_tags = NULL) {
  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);

  // Cache list of supported tags. Only tags that have their own case in
  // _mailsystem_html_to_text() belong here; <tbody>, <thead>, <tfoot> and <th>
  // are handled structurally by _mailsystem_html_to_text_table().
  static $supported_tags;
  if (!isset($supported_tags)) {
    $supported_tags = array(
      'a',
      'address',
      'b',
      'blockquote',
      'br',
      // Without 'caption' the dedicated caption case in the recursive helper
      // is never reached and a caption is not placed on a line of its own.
      'caption',
      'cite',
      'dd',
      'div',
      'dl',
      'dt',
      'em',
      'h1',
      'h2',
      'h3',
      'h4',
      'h5',
      'h6',
      'hr',
      'i',
      'li',
      'ol',
      'p',
      'pre',
      'strong',
      'table',
      'td',
      'tr',
      'u',
      'ul',
    );
  }

  // Make sure only supported tags are kept.
  $allowed_tags = isset($allowed_tags) ? array_intersect($supported_tags, $allowed_tags) : $supported_tags;

  // Parse $string into a DOM tree.
  $dom = filter_dom_load($string);
  $notes = array();

  // Recursively convert the DOM tree into plain text. $notes is filled with
  // footnote numbers keyed by url along the way.
  $text = _mailsystem_html_to_text($dom->documentElement, $allowed_tags, $notes);

  // Hard-wrap at 1000 characters (including the line break sequence)
  // and space-stuff special lines.
  $text = mailsystem_wrap_mail($text, array(
    'max' => 1000 - strlen($eol),
    'hard' => TRUE,
  ));

  // Change non-breaking spaces back to regular spaces, and trim line breaks.
  // chr(160) is the non-breaking space character.
  // NOTE(review): chr(160) is the single byte 0xA0, while filter_dom_load()
  // declares the markup as UTF-8. The same byte can presumably occur inside
  // multi-byte UTF-8 characters, so this replacement may alter them - TODO
  // confirm.
  $text = str_replace(chr(160), ' ', trim($text, $eol));

  // Add footnotes;
  if ($notes) {

    // Add a blank line before the footnote list.
    $text .= $eol;
    foreach ($notes as $url => $note) {
      $text .= $eol . '[' . $note . '] ' . $url;
    }
  }
  return $text;
}

/**
 * Helper function for mailsystem_html_to_text().
 *
 * Recursively converts $node to text, wrapping and indenting as necessary.
 *
 * @param $node
 *   The source DOMNode.
 * @param $allowed_tags
 *   A list of tags that will be transformed.  Elements with any other tag are
 *   not formatted themselves, but their children are still converted.
 * @param array &$notes
 *   A writeable array of footnote reference numbers, keyed by their
 *   respective hyperlink destination urls.
 * @param $line_length
 *   The maximum length of a line, for wrapping.  Defaults to 80 characters.
 * @param array $parents
 *   The list of ancestor tags, from nearest to most distant.  Defaults to an
 *   empty array().
 * @param $count
 *   The number to use for the next list item within an ordered list.  Defaults
 *   to 1.  Passed by reference: each ordered list item advances it.
 *
 * @return
 *   The plain text rendering of $node and all of its descendants.
 */
function _mailsystem_html_to_text(DOMNode $node, array $allowed_tags, array &$notes, $line_length = 80, array $parents = array(), &$count = NULL) {
  if (!isset($count)) {
    $count = 1;
  }
  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
  if ($node->nodeType === XML_TEXT_NODE) {

    // For text nodes, we just copy the text content.
    $text = $node->textContent;

    // Convert line breaks and trim trailing spaces.
    $text = preg_replace('/ *\\r?\\n/', $eol, $text);
    if (in_array('pre', $parents)) {

      // Within <pre> tags, all spaces become non-breaking.
      // chr(160) is the non-breaking space character.
      $text = str_replace(' ', chr(160), $text);
    }
    else {

      // Outside <pre> tags, collapse whitespace.
      $text = preg_replace('/[[:space:]]+/', ' ', $text);
    }
    return $text;
  }

  // Non-text node.  $text collects what precedes the children, $child_text
  // the rendered children, and $prefix / $suffix / $indent / $pad describe how
  // the children are decorated.
  $tag = '';
  $text = '';
  $child_text = '';
  $child_count = 1;
  $indent = '';
  $prefix = '';
  $suffix = '';
  $pad = '';
  if (isset($node->tagName) && in_array($node->tagName, $allowed_tags)) {
    $tag = $node->tagName;
    switch ($tag) {

      // Turn links with valid hrefs into footnotes.
      case 'a':
        $test = !empty($node->attributes);
        $test = $test && ($href = $node->attributes->getNamedItem('href'));

        // Strip a leading base path before handing the path to url().  The
        // base path is quoted so that regular expression metacharacters in it
        // are matched literally.
        $test = $test && ($url = url(preg_replace('|^' . preg_quote(base_path(), '|') . '|', '', $href->nodeValue), array(
          'absolute' => TRUE,
        )));
        $test = $test && valid_url($url);
        if ($test) {

          // Only add links that have not already been added.
          if (isset($notes[$url])) {
            $note = $notes[$url];
          }
          else {
            $note = count($notes) + 1;
            $notes[$url] = $note;
          }
          $suffix = ' [' . $note . ']';
        }
        break;

      // Generic block-level tags.
      case 'address':
      case 'caption':
      case 'div':
      case 'p':
      case 'pre':

        // Start on a new line except as the first child of a list item.
        if (!isset($parents[0]) || $parents[0] !== 'li' || !$node->isSameNode($node->parentNode->firstChild)) {
          $text = $eol;
        }
        $suffix = $eol;
        break;

      // Forced line break.
      case 'br':
        $text = $eol;
        break;

      // Boldface by wrapping with "*" characters.
      case 'b':
      case 'strong':
        $prefix = '*';
        $suffix = '*';
        break;

      // Italicize by wrapping with "/" characters.
      case 'cite':
      case 'em':
      case 'i':
        $prefix = '/';
        $suffix = '/';
        break;

      // Underline by wrapping with "_" characters.
      case 'u':
        $prefix = '_';
        $suffix = '_';
        break;

      // Blockquotes are indented by "> " at each level.
      case 'blockquote':
        $text = $eol;

        // chr(160) is the non-breaking space character.
        $indent = '>' . chr(160);
        $suffix = $eol;
        break;

      // Dictionary definitions are indented by four spaces.
      case 'dd':

        // chr(160) is the non-breaking space character.
        $indent = chr(160) . chr(160) . chr(160) . chr(160);
        $suffix = $eol;
        break;

      // Dictionary list.
      case 'dl':

        // Start on a new line except as the first child of a list item.
        if (!isset($parents[0]) || $parents[0] !== 'li' || !$node->isSameNode($node->parentNode->firstChild)) {
          $text = $eol;
        }
        $suffix = $eol;
        break;

      // Dictionary term.
      case 'dt':
        $suffix = $eol;
        break;

      // Header level 1 is prefixed by eight "=" characters.
      case 'h1':
        $text = "{$eol}{$eol}";

        // chr(160) is the non-breaking space character.
        $indent = '========' . chr(160);
        $pad = chr(160) . '=';
        $suffix = $eol;
        break;

      // Header level 2 is prefixed by six "-" characters.
      case 'h2':
        $text = "{$eol}{$eol}";

        // chr(160) is the non-breaking space character.
        $indent = '------' . chr(160);
        $pad = chr(160) . '-';
        $suffix = $eol;
        break;

      // Header level 3 is prefixed by four "." characters and a space.
      case 'h3':
        $text = "{$eol}{$eol}";

        // chr(160) is the non-breaking space character.
        $indent = '....' . chr(160);
        $suffix = $eol;
        break;

      // Header level 4 is prefixed by three "." characters and a space.
      case 'h4':
        $text = "{$eol}{$eol}";

        // chr(160) is the non-breaking space character.
        $indent = '...' . chr(160);
        $suffix = $eol;
        break;

      // Header level 5 is prefixed by two "." characters and a space.
      case 'h5':
        $text = "{$eol}{$eol}";

        // chr(160) is the non-breaking space character.
        $indent = '..' . chr(160);
        $suffix = $eol;
        break;

      // Header level 6 is prefixed by one "." character and a space.
      case 'h6':
        $text = "{$eol}{$eol}";

        // chr(160) is the non-breaking space character.
        $indent = '.' . chr(160);
        $suffix = $eol;
        break;

      // Horizontal rulers become a line of "-" characters.
      case 'hr':
        $text = $eol;

        // A single "-" is padded with further "-" up to the line length.
        $child_text = '-';
        $pad = '-';
        $suffix = $eol;
        break;

      // List items are treated differently depending on the parent tag.
      case 'li':

        // Ordered list item.
        if (reset($parents) === 'ol') {

          // Check the value attribute.
          $test = !empty($node->attributes);
          $test = $test && ($value = $node->attributes->getNamedItem('value'));
          if ($test) {
            $count = $value->nodeValue;
          }

          // Single-digit numbers get one extra leading space.
          // chr(160) is the non-breaking space character.
          $indent = ($count < 10 ? chr(160) : '') . chr(160) . "{$count})" . chr(160);

          // $count is a reference shared with the sibling items.
          $count++;
        }
        else {

          // chr(160) is the non-breaking space character.
          $indent = chr(160) . '*' . chr(160);
        }
        $suffix = $eol;
        break;

      // Ordered lists.
      case 'ol':

        // Start on a new line except as the first child of a list item.
        if (!isset($parents[0]) || $parents[0] !== 'li' || !$node->isSameNode($node->parentNode->firstChild)) {
          $text = $eol;
        }

        // Check the start attribute.
        $test = !empty($node->attributes);
        $test = $test && ($value = $node->attributes->getNamedItem('start'));
        if ($test) {
          $child_count = $value->nodeValue;
        }
        break;

      // Tables require special handling.
      case 'table':
        return _mailsystem_html_to_text_table($node, $allowed_tags, $notes, $line_length);

      // Separate adjacent table cells by two non-breaking spaces.
      case 'td':
        if (!empty($node->nextSibling)) {

          // chr(160) is the non-breaking space character.
          $suffix = chr(160) . chr(160);
        }
        break;

      // End each table row with a newline.
      case 'tr':
        $suffix = $eol;
        break;

      // Unordered lists.
      case 'ul':

        // Start on a new line except as the first child of a list item.
        if (!isset($parents[0]) || $parents[0] !== 'li' || !$node->isSameNode($node->parentNode->firstChild)) {
          $text = $eol;
        }
        break;
      default:

        // Coder review complains if there is no default case.
        break;
    }

    // Only add allowed tags to the $parents array.
    array_unshift($parents, $tag);
  }

  // Copy each child node to output.  Children get a line length reduced by
  // this node's indent, and share $child_count as their list item counter.
  if ($node->hasChildNodes()) {
    foreach ($node->childNodes as $child) {
      $child_text .= _mailsystem_html_to_text($child, $allowed_tags, $notes, $line_length - drupal_strlen($indent), $parents, $child_count);
    }
  }

  // We only add prefix and suffix if the child nodes were non-empty.
  if ($child_text > '') {

    // We capitalize the contents of h1 and h2 tags.
    if ($tag === 'h1' || $tag === 'h2') {
      $child_text = drupal_strtoupper($child_text);
    }

    // Don't add a newline to an existing newline.
    if ($suffix === $eol && drupal_substr($child_text, -drupal_strlen($eol)) === $eol) {
      $suffix = '';
    }

    // Trim spaces around newlines except with <pre> or inline tags.
    if (!in_array($tag, array(
      'a',
      'b',
      'cite',
      'em',
      'i',
      'pre',
      'strong',
      'u',
    ))) {
      $child_text = preg_replace('/ *' . $eol . ' */', $eol, $child_text);
    }

    // Soft-wrap at effective line length, but don't space-stuff.
    $child_text = mailsystem_wrap_mail($prefix . $child_text, array(
      // chr(160) is the non-breaking space character.
      'break' => chr(160) . $eol,
      'indent' => $indent,
      'max' => $line_length,
      'pad' => $pad,
      'stuff' => FALSE,
    )) . $suffix;
    if ($tag === 'pre') {

      // Perform RFC-3676 soft-wrapping.  Non-breaking spaces are turned into
      // regular spaces for the wrap and restored afterwards.
      // chr(160) is the non-breaking space character.
      $child_text = str_replace(chr(160), ' ', $child_text);
      $child_text = mailsystem_wrap_mail($child_text, array(
        'max' => $line_length,
        'stuff' => FALSE,
      ));

      // chr(160) is the non-breaking space character.
      $child_text = str_replace(' ', chr(160), $child_text);
    }
    $text .= $child_text;
  }
  return $text;
}

/**
 * Helper function for _mailsystem_html_to_text().
 *
 * Renders a <table> DOM Node into plain text.  Attributes such as rowspan,
 * colspan, padding, border, etc. are ignored.
 *
 * Rows are emitted in the order thead, tbody, tfoot.  Each cell is rendered
 * with _mailsystem_html_to_text() and word-wrapped; the per-column wrap widths
 * are found by a binary search that tries to make the whole table as close to
 * $table_width as possible.
 *
 * @param DOMNode $node
 *   The DOMNode corresponding to the <table> tag and its contents.
 * @param $allowed_tags
 *   The list of allowed tags passed to _mailsystem_html_to_text().
 * @param array &$notes
 *   A writeable array of footnote reference numbers, keyed by their
 *   respective hyperlink destination urls.
 * @param $table_width
 *   The desired maximum table width, after word-wrapping each table cell.
 *
 * @return
 *   A plain text representation of the table.
 *
 * @see _mailsystem_html_to_text()
 */
function _mailsystem_html_to_text_table(DOMNode $node, $allowed_tags = NULL, array &$notes = array(), $table_width = 80) {
  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
  $header = array();
  $footer = array();
  $body = array();
  $text = $eol;

  // Walk the descendants of the table in document order without recursion,
  // collecting the caption and the rows.  Rows are not descended into, so the
  // rows of a table nested inside a cell are not collected here.
  $current = $node;
  while (TRUE) {
    if (isset($current->tagName)) {
      switch ($current->tagName) {
        case 'caption':

          // The table caption is added first, on a line of its own, so that
          // it never runs into the first row separator.
          $caption = trim(_mailsystem_html_to_text($current, $allowed_tags, $notes, $table_width), $eol);
          if ($caption > '') {
            $text = $eol . $caption . $eol;
          }
          break;
        case 'tr':
          switch ($current->parentNode->tagName) {
            case 'thead':
              $header[] = $current;
              break;
            case 'tfoot':
              $footer[] = $current;
              break;
            default:

              // Either 'tbody' or 'table'
              $body[] = $current;
              break;
          }
          break;
        default:
          if ($current->hasChildNodes()) {
            $current = $current->firstChild;
            continue 2;
          }
      }
    }

    // An empty <table> has nothing to visit.  Stop here so that the walk does
    // not continue with the siblings that follow the table.
    if ($current->isSameNode($node)) {
      break;
    }

    // Advance to the next sibling, climbing up as needed, and finish once the
    // climb arrives back at the table itself.
    do {
      if ($current->nextSibling) {
        $current = $current->nextSibling;
        continue 2;
      }
      $current = $current->parentNode;
    } while ($current && !$current->isSameNode($node));
    break;
  }

  // Merge the thead, tbody, and tfoot sections together.
  if ($rows = array_merge($header, $body, $footer)) {
    $num_rows = count($rows);

    // First just count the number of columns.
    $num_cols = 0;
    foreach ($rows as $row) {
      $row_cols = 0;
      foreach ($row->childNodes as $cell) {
        if (isset($cell->tagName) && in_array($cell->tagName, array(
          'td',
          'th',
        ))) {
          $row_cols++;
        }
      }
      $num_cols = max($num_cols, $row_cols);
    }

    // If any columns were found, calculate each column height and width.
    if ($num_cols) {

      // Set up a binary search for best wrap width for each column.  One
      // character per column plus one is reserved for the "|" borders.
      $max = max($table_width - $num_cols - 1, 1);
      $max_wraps = array_fill(0, $num_cols, $max);
      $try = max(intval(($table_width - 1) / $num_cols - 1), 1);
      $try_wraps = array_fill(0, $num_cols, $try);
      $min_wraps = array_fill(0, $num_cols, 1);

      // Wrap-width combinations that have already been rendered.  Because all
      // columns are adjusted at once, the search can bounce between a slightly
      // too wide and a slightly too narrow combination; this lets it stop.
      $tried = array();

      // Start searching...
      $change = FALSE;
      do {
        $change = FALSE;
        $state = implode(',', $try_wraps);
        $repeated = isset($tried[$state]);
        $tried[$state] = TRUE;
        $widths = array_fill(0, $num_cols, 0);
        $heights = array_fill(0, $num_rows, 0);
        $table = array_fill(0, $num_rows, array_fill(0, $num_cols, ''));
        $breaks = array_fill(0, $num_cols, FALSE);
        foreach ($rows as $i => $row) {
          $j = 0;
          foreach ($row->childNodes as $cell) {
            if (!isset($cell->tagName) || !in_array($cell->tagName, array(
              'td',
              'th',
            ))) {

              // Skip text nodes.
              continue;
            }

            // Render the cell contents.
            $cell = _mailsystem_html_to_text($cell, $allowed_tags, $notes, $try_wraps[$j]);

            // Trim leading line-breaks and trailing whitespace.
            // chr(160) is the non-breaking space character.
            $cell = rtrim(ltrim($cell, $eol), ' ' . $eol . chr(160));
            $table[$i][$j] = $cell;
            if ($cell > '') {

              // Split the cell into lines.
              $lines = explode($eol, $cell);

              // The row height is the maximum number of lines among all the
              // cells in that row.
              $heights[$i] = max($heights[$i], count($lines));
              foreach ($lines as $line) {
                $this_width = drupal_strlen($line);

                // The column width is the maximum line width among all the
                // lines in that column.
                if ($this_width > $widths[$j]) {
                  $widths[$j] = $this_width;

                  // If the longest line in a column contains at least one
                  // space character, then the table can be made narrower.
                  $breaks[$j] = strpos($line, ' ') !== FALSE;
                }
              }
            }
            $j++;
          }
        }

        // Calculate the total table width;
        $this_width = array_sum($widths) + $num_cols + 1;

        // A combination that was rendered before and fits is good enough;
        // searching on from here could only repeat earlier steps.
        if ($repeated && $this_width <= $table_width) {
          break;
        }
        if ($this_width > $table_width) {

          // Wider than desired.
          if (!in_array(TRUE, $breaks)) {

            // If there are no more break points, then the table is already as
            // narrow as it can get, so we're done.
            break;
          }
          foreach ($try_wraps as $i => $wrap) {
            $max_wraps[$i] = min($max_wraps[$i], $wrap);
            if ($breaks[$i]) {
              $new_wrap = intval(($min_wraps[$i] + $max_wraps[$i]) / 2);
              $new_wrap = min($new_wrap, $widths[$i] - 1);
              $new_wrap = max($new_wrap, $min_wraps[$i]);
            }
            else {

              // There's no point in trying to make the column narrower than
              // the widest un-wrappable line in the column.  An empty column
              // still keeps a wrap width of at least one character.
              $min_wraps[$i] = max($widths[$i], 1);
              $new_wrap = $min_wraps[$i];
            }
            if ($try_wraps[$i] > $new_wrap) {
              $try_wraps[$i] = $new_wrap;
              $change = TRUE;
            }
          }
        }
        elseif ($this_width < $table_width) {

          // Narrower than desired.
          foreach ($try_wraps as $i => $wrap) {
            if ($min_wraps[$i] < $wrap) {
              $min_wraps[$i] = $wrap;
            }
            $new_wrap = intval(($min_wraps[$i] + $max_wraps[$i]) / 2);
            $new_wrap = max($new_wrap, $widths[$i] + 1);
            $new_wrap = min($new_wrap, $max_wraps[$i]);
            if ($try_wraps[$i] < $new_wrap) {
              $try_wraps[$i] = $new_wrap;
              $change = TRUE;
            }
          }
        }
      } while ($change);

      // Pad each cell to column width and line height.
      for ($i = 0; $i < $num_rows; $i++) {
        if ($heights[$i]) {
          for ($j = 0; $j < $num_cols; $j++) {
            $cell = $table[$i][$j];

            // Pad each cell to the maximum number of lines in that row.
            $lines = array_pad(explode($eol, $cell), $heights[$i], '');
            foreach ($lines as $k => $line) {

              // Pad each line to the maximum width in that column.
              $repeat = $widths[$j] - drupal_strlen($line);
              if ($repeat > 0) {

                // chr(160) is the non-breaking space character.
                $lines[$k] .= str_repeat(chr(160), $repeat);
              }
            }
            $table[$i][$j] = $lines;
          }
        }
      }

      // Generate the row separator line.
      $separator = '+';
      for ($i = 0; $i < $num_cols; $i++) {
        $separator .= str_repeat('-', $widths[$i]) . '+';
      }
      $separator .= $eol;
      for ($i = 0; $i < $num_rows; $i++) {
        $text .= $separator;
        if (!$heights[$i]) {
          continue;
        }
        $row = $table[$i];

        // For each row, iterate first by lines within the row.
        for ($k = 0; $k < $heights[$i]; $k++) {

          // Add a vertical-bar at the beginning of each row line.
          $row_line = '|';
          $trimmed = '';

          // Within each row line, iterate by cells within that line.
          for ($j = 0; $j < $num_cols; $j++) {

            // Add a vertical bar at the end of each cell line.
            $row_line .= $row[$j][$k] . '|';

            // chr(160) is the non-breaking space character.
            $trimmed .= trim($row[$j][$k], ' ' . $eol . chr(160));
          }
          if ($trimmed > '') {

            // Only print rows that are non-empty.
            $text .= $row_line . $eol;
          }
        }
      }

      // Final output ends with a row separator.
      $text .= $separator;
    }
  }

  // Make sure formatted table content doesn't line-wrap.
  // chr(160) is the non-breaking space character.
  return str_replace(' ', chr(160), $text);
}

/**
 * Helper function for the line loop in mailsystem_wrap_mail().
 *
 * Inserts $values['break'] sequences to break up $line into parts of no more
 * than $values['wrap'] characters. Only breaks at space characters, unless
 * $values['hard'] is TRUE.
 *
 * @param $line
 *   The line to wrap, modified in place.
 * @param $key
 *   The array key of $line. Unused; present for array_walk() compatibility.
 * @param $values
 *   An array with the keys 'wrap' (maximum width), 'break' (sequence to
 *   insert) and 'hard' (whether to cut words longer than the width).
 */
function _mailsystem_wrap_mail_line(&$line, $key, $values) {
  $width = $values['wrap'];

  // The width is 'max' minus the indent length, so a long indent can push it
  // to zero or below.  wordwrap() cannot force a cut at a width of zero, so
  // hard wrapping falls back to one character per line in that case.
  if ($values['hard'] && $width < 1) {
    $width = 1;
  }
  $line = wordwrap($line, $width, $values['break'], $values['hard']);
}

/**
 * Helper function for the line loop in mailsystem_wrap_mail().
 *
 * Empty lines are left untouched.  For any other line:
 *
 * - If $values['pad'] is non-empty, $values['indent'] is added at the start
 *   of the line and $values['pad'] at the end, followed by as many copies of
 *   $values['pad_repeat'] as are needed to reach $values['max'] characters.
 *   Nothing is appended when the line has no room left for the padding.
 * - If $values['pad'] is empty, $values['indent'] is added at the start of
 *   the first line (key 0), and $values['clean'] at the start of every other
 *   line.
 *
 * If $values['stuff'] is true, then an extra space character is added at the
 * start of any line beginning with a space, a non-breaking space, a '>', or
 * the text 'From'.
 *
 * @param $line
 *   The line to decorate, modified in place.
 * @param $key
 *   The position of $line within the text; 0 marks the first line.
 * @param $values
 *   The wrapping options prepared by mailsystem_wrap_mail().
 *
 * @see http://www.ietf.org/rfc/rfc3676.txt
 */
function _mailsystem_indent_mail_line(&$line, $key, $values) {
  if ($line != '') {
    if (!$values['pad']) {

      // Unpadded text: full indent on the first line, 'clean' indent after.
      $leader = ($key === 0) ? $values['indent'] : $values['clean'];
      $line = $leader . $line;
    }
    else {

      // Padded text: every line gets the full indent and is filled to 'max'.
      $line = $values['indent'] . $line;
      $fill = $values['max'] - drupal_strlen($line) - drupal_strlen($values['pad']);
      if ($fill >= 0) {
        $line .= $values['pad'] . str_repeat($values['pad_repeat'], $fill);
      }
    }
    if ($values['stuff']) {

      // chr(160) is the non-breaking space character.
      $line = preg_replace('/^(' . chr(160) . '| |>|From)/', ' $1', $line);
    }
  }
}

/**
 * Helper function for mailsystem_wrap_mail().
 *
 * Builds the indentation used on continuation lines: quotation markers ('>')
 * are kept, everything else is replaced by a non-breaking space character.
 *
 * @param $indent
 *   The indentation string of the first line.
 *
 * @return
 *   The indentation string with every non-'>' character replaced.
 */
function _mailsystem_html_to_text_clean($indent) {

  // chr(160) is the non-breaking space character.
  $nbsp = chr(160);
  $cleaned = preg_replace('/[^>]/', $nbsp, $indent);
  return $cleaned;
}

/**
 * Loads an HTML snippet into a DOM document.
 *
 * The snippet is treated as the body of a partial (X)HTML document: it is
 * wrapped in a complete XHTML 1.0 Strict skeleton that declares a UTF-8
 * charset, and the result is parsed into a full DOMDocument.  Use
 * filter_dom_serialize() to turn the DOMDocument back into a snippet.
 *
 * @param $text
 *   The partial (X)HTML snippet to load. Invalid mark-up
 *   will be corrected on import.
 * @return
 *   A DOMDocument that represents the loaded (X)HTML snippet.
 */
function filter_dom_load($text) {
  $document_start = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>';
  $document_end = '</body></html>';

  $dom_document = new DOMDocument();

  // Ignore warnings during HTML soup loading.
  @$dom_document->loadHTML($document_start . $text . $document_end);

  return $dom_document;
}

/**
 * Serializes the body of a DOM document back into an (X)HTML snippet.
 *
 * Inline <script> and <style> elements below the body first have their CDATA
 * sections wrapped in comments, then every child of the body is serialized
 * as XML.  Finally a space is inserted before the "/>" of self-closing tags
 * that have no attributes, so that the snippet stays compatible with HTML
 * user agents.
 *
 * @param $dom_document
 *   A DOMDocument object to serialize, only the tags below
 *   the first <body> node will be converted.
 * @return
 *   A valid (X)HTML snippet, as a string.
 */
function filter_dom_serialize($dom_document) {
  $body_node = $dom_document->getElementsByTagName('body')->item(0);

  // Scripts use the helper's default comment markers.
  $scripts = $body_node->getElementsByTagName('script');
  foreach ($scripts as $script) {
    filter_dom_serialize_escape_cdata_element($dom_document, $script);
  }

  // Style sheets need CSS block comments instead.
  $styles = $body_node->getElementsByTagName('style');
  foreach ($styles as $style) {
    filter_dom_serialize_escape_cdata_element($dom_document, $style, '/*', '*/');
  }

  // Serialize the children of the body one by one; the body tag itself is
  // not part of the snippet.
  $pieces = array();
  foreach ($body_node->childNodes as $child_node) {
    $pieces[] = $dom_document->saveXML($child_node);
  }
  $body_content = implode('', $pieces);

  return preg_replace('|<([^> ]*)/>|i', '<$1 />', $body_content);
}

/**
 * Adds comments around the <!CDATA section in a dom element.
 *
 * DOMDocument::loadHTML in filter_dom_load() makes CDATA sections from the
 * contents of inline script and style tags.  This can cause HTML 4 browsers to
 * throw exceptions.
 *
 * This function attempts to solve the problem by creating a DocumentFragment
 * and imitating the behavior in drupal_get_js(), commenting the CDATA tag.
 * Each CDATA child is replaced by a commented version that is appended to the
 * end of the element.
 *
 * @param $dom_document
 *   The DOMDocument containing the $dom_element.
 * @param $dom_element
 *   The element potentially containing a CDATA node.
 * @param $comment_start
 *   String to use as a comment start marker to escape the CDATA declaration.
 * @param $comment_end
 *   String to use as a comment end marker to escape the CDATA declaration.
 */
function filter_dom_serialize_escape_cdata_element($dom_document, $dom_element, $comment_start = '//', $comment_end = '') {

  // Take a snapshot of the children first: the loop below appends and removes
  // children, and the node list of the element changes along with it.
  $children = array();
  foreach ($dom_element->childNodes as $child) {
    $children[] = $child;
  }

  foreach ($children as $node) {
    if ($node instanceof DOMCdataSection) {

      // See drupal_get_js().  This code is more or less duplicated there.
      $embed_prefix = "\n<!--{$comment_start}--><![CDATA[{$comment_start} ><!--{$comment_end}\n";
      $embed_suffix = "\n{$comment_start}--><!]]>{$comment_end}\n";

      // A literal "]]>" inside the data would end the CDATA section early and
      // make the fragment unparsable, losing the content.  Split it across
      // two adjacent CDATA sections instead.
      $data = str_replace(']]>', ']]]]><![CDATA[>', $node->data);

      $fragment = $dom_document->createDocumentFragment();
      $fragment->appendXML($embed_prefix . $data . $embed_suffix);
      $dom_element->appendChild($fragment);
      $dom_element->removeChild($node);
    }
  }
}

Functions

Namesort descending Description
filter_dom_load Parses an HTML snippet and returns it as a DOM object.
filter_dom_serialize Converts a DOM object back to an HTML snippet.
filter_dom_serialize_escape_cdata_element Adds comments around the <!CDATA section in a dom element.
mailsystem_html_to_text Transform an HTML string into plain text, preserving the structure of the markup. Useful for preparing the body of a node to be sent by e-mail.
mailsystem_wrap_mail Perform format=flowed soft wrapping for mail (RFC 3676).
_mailsystem_html_to_text Helper function for drupal_html_to_text().
_mailsystem_html_to_text_clean Helper function for drupal_wrap_mail() and drupal_html_to_text().
_mailsystem_html_to_text_table Helper function for _mailsystem_html_to_text().
_mailsystem_indent_mail_line Helper function for array_walk in drupal_wrap_mail().
_mailsystem_wrap_mail_line Helper function for array_walk in drupal_wrap_mail().