You are here

function _mailsystem_html_to_text in Mail System 7.3

Same name and namespace in other branches
  1. 8.2 html_to_text.inc \_mailsystem_html_to_text()
  2. 6.2 html_to_text.inc \_mailsystem_html_to_text()
  3. 7.2 html_to_text.inc \_mailsystem_html_to_text()

Helper function for mailsystem_html_to_text().

Recursively converts $node to text, wrapping and indenting as necessary.

Parameters

DOMNode $node: The source DOMNode.

array $allowed_tags: A list of tags that will be transformed.

array &$notes: A writable array of footnote reference numbers, keyed by their respective hyperlink destination URLs.

int $line_length: The maximum length of a line, for wrapping. Defaults to 80 characters.

array $parents: The list of ancestor tags, from nearest to most distant. Defaults to an empty array().

int $count: The number to use for the next list item within an ordered list. Defaults to 1.

Return value

string The converted text.

2 calls to _mailsystem_html_to_text()
mailsystem_html_to_text in ./html_to_text.inc
Transforms an HTML string into plain text, preserving the structure of the markup.
_mailsystem_html_to_text_table in ./html_to_text.inc
Helper function for _mailsystem_html_to_text().

File

./html_to_text.inc, line 166
Copy of drupal_html_to_text improvements from issue #299138.

Code

/**
 * Helper function for mailsystem_html_to_text().
 *
 * Recursively converts $node to text, wrapping and indenting as necessary.
 *
 * @param DOMNode $node
 *   The source DOMNode.
 * @param array $allowed_tags
 *   A list of tags that will be transformed.
 * @param array &$notes
 *   A writable array of footnote reference numbers, keyed by their
 *   respective hyperlink destination URLs.
 * @param int $line_length
 *   The maximum length of a line, for wrapping. Defaults to 80 characters.
 * @param array $parents
 *   The list of ancestor tags, from nearest to most distant. Defaults to an
 *   empty array().
 * @param int &$count
 *   The number to use for the next list item within an ordered list.
 *   Defaults to 1.
 *
 * @return string
 *   The converted text.
 */
function _mailsystem_html_to_text(DOMNode $node, array $allowed_tags, array &$notes, $line_length = 80, array $parents = array(), &$count = NULL) {
  if (!isset($count)) {
    $count = 1;
  }
  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);

  // chr(160) is the non-breaking space character.
  $nbsp = chr(160);

  if ($node->nodeType === XML_TEXT_NODE) {

    // For text nodes, we just copy the text content.
    $text = $node->textContent;

    // Convert line breaks and trim trailing spaces.
    $text = preg_replace('/ *\\r?\\n/', $eol, $text);
    if (in_array('pre', $parents, TRUE)) {

      // Within <pre> tags, all spaces become non-breaking.
      $text = str_replace(' ', $nbsp, $text);
    }
    else {

      // Outside <pre> tags, collapse whitespace.
      $text = preg_replace('/[[:space:]]+/', ' ', $text);
    }
    return $text;
  }

  // Non-text node.
  $tag = '';
  $text = '';
  $child_text = '';
  $child_count = 1;
  $indent = '';
  $prefix = '';
  $suffix = '';
  $pad = '';
  if (isset($node->tagName) && in_array($node->tagName, $allowed_tags)) {
    $tag = $node->tagName;

    // TRUE when this element is the very first child node of a list item
    // that is also its nearest allowed ancestor. Block-level elements in that
    // position stay on the same line as the list item's bullet or number.
    // $parents does not yet include $tag at this point.
    $first_in_li = isset($parents[0]) && $parents[0] === 'li' && $node->isSameNode($node->parentNode->firstChild);

    switch ($tag) {

      // Turn links with valid hrefs into footnotes.
      case 'a':
        $test = !empty($node->attributes);
        $test = $test && ($href = $node->attributes->getNamedItem('href'));

        // Strip the site base path from the href before rebuilding it as an
        // absolute URL. The base path is literal text, so it must be quoted
        // before being embedded in the pattern.
        $test = $test && ($url = url(preg_replace('|^' . preg_quote(base_path(), '|') . '|', '', $href->nodeValue), array(
          'absolute' => TRUE,
        )));
        $test = $test && valid_url($url);
        if ($test) {

          // Only add links that have not already been added.
          if (isset($notes[$url])) {
            $note = $notes[$url];
          }
          else {
            $note = count($notes) + 1;
            $notes[$url] = $note;
          }
          $suffix = ' [' . $note . ']';
        }
        break;

      // Generic block-level tags.
      case 'address':
      case 'caption':
      case 'div':
      case 'p':
      case 'pre':

        // Start on a new line except as the first child of a list item.
        if (!$first_in_li) {
          $text = $eol;
        }
        $suffix = $eol;
        break;

      // Forced line break.
      case 'br':
        $text = $eol;
        break;

      // Boldface by wrapping with "*" characters.
      case 'b':
      case 'strong':
        $prefix = '*';
        $suffix = '*';
        break;

      // Italicize by wrapping with "/" characters.
      case 'cite':
      case 'em':
      case 'i':
        $prefix = '/';
        $suffix = '/';
        break;

      // Underline by wrapping with "_" characters.
      case 'u':
        $prefix = '_';
        $suffix = '_';
        break;

      // Blockquotes are indented by "> " at each level.
      case 'blockquote':
        $text = $eol;
        $indent = '>' . $nbsp;
        $suffix = $eol;
        break;

      // Dictionary definitions are indented by four spaces.
      case 'dd':
        $indent = $nbsp . $nbsp . $nbsp . $nbsp;
        $suffix = $eol;
        break;

      // Dictionary list.
      case 'dl':

        // Start on a new line except as the first child of a list item.
        if (!$first_in_li) {
          $text = $eol;
        }
        $suffix = $eol;
        break;

      // Dictionary term.
      case 'dt':
        $suffix = $eol;
        break;

      // Header level 1 is prefixed by eight "=" characters.
      case 'h1':
        $text = "{$eol}{$eol}";
        $indent = '========' . $nbsp;
        $pad = $nbsp . '=';
        $suffix = $eol;
        break;

      // Header level 2 is prefixed by six "-" characters.
      case 'h2':
        $text = "{$eol}{$eol}";
        $indent = '------' . $nbsp;
        $pad = $nbsp . '-';
        $suffix = $eol;
        break;

      // Header level 3 is prefixed by four "." characters and a space.
      case 'h3':
        $text = "{$eol}{$eol}";
        $indent = '....' . $nbsp;
        $suffix = $eol;
        break;

      // Header level 4 is prefixed by three "." characters and a space.
      case 'h4':
        $text = "{$eol}{$eol}";
        $indent = '...' . $nbsp;
        $suffix = $eol;
        break;

      // Header level 5 is prefixed by two "." characters and a space.
      case 'h5':
        $text = "{$eol}{$eol}";
        $indent = '..' . $nbsp;
        $suffix = $eol;
        break;

      // Header level 6 is prefixed by one "." character and a space.
      case 'h6':
        $text = "{$eol}{$eol}";
        $indent = '.' . $nbsp;
        $suffix = $eol;
        break;

      // Horizontal rulers become a line of "-" characters.
      case 'hr':
        $text = $eol;
        $child_text = '-';
        $pad = '-';
        $suffix = $eol;
        break;

      // List items are treated differently depending on the parent tag.
      case 'li':

        // Ordered list item.
        if (reset($parents) === 'ol') {

          // Check the value attribute. Only a numeric value may override the
          // running counter; it is cast to an integer so that the comparison
          // and increment below are always arithmetic.
          $test = !empty($node->attributes);
          $test = $test && ($value = $node->attributes->getNamedItem('value'));
          if ($test && is_numeric($value->nodeValue)) {
            $count = (int) $value->nodeValue;
          }

          // Single-digit numbers get one extra leading space so that they
          // line up with two-digit numbers.
          $indent = ($count < 10 ? $nbsp : '') . $nbsp . "{$count})" . $nbsp;
          $count++;
        }
        else {
          $indent = $nbsp . '*' . $nbsp;
        }
        $suffix = $eol;
        break;

      // Ordered lists.
      case 'ol':

        // Start on a new line except as the first child of a list item.
        if (!$first_in_li) {
          $text = $eol;
        }

        // Check the start attribute. Only a numeric value is accepted as the
        // number of the first list item.
        $test = !empty($node->attributes);
        $test = $test && ($value = $node->attributes->getNamedItem('start'));
        if ($test && is_numeric($value->nodeValue)) {
          $child_count = (int) $value->nodeValue;
        }
        break;

      // Tables require special handling.
      case 'table':
        return _mailsystem_html_to_text_table($node, $allowed_tags, $notes, $line_length);

      // Separate adjacent table cells by two non-breaking spaces.
      case 'td':
        if (!empty($node->nextSibling)) {
          $suffix = $nbsp . $nbsp;
        }
        break;

      // End each table row with a newline.
      case 'tr':
        $suffix = $eol;
        break;

      // Unordered lists.
      case 'ul':

        // Start on a new line except as the first child of a list item.
        if (!$first_in_li) {
          $text = $eol;
        }
        break;

      default:

        // Coder review complains if there is no default case.
        break;
    }

    // Only add allowed tags to the $parents array.
    array_unshift($parents, $tag);
  }

  // Copy each child node to output. $child_count is passed by reference so
  // that consecutive list items share one running counter.
  if ($node->hasChildNodes()) {
    foreach ($node->childNodes as $child) {
      $child_text .= _mailsystem_html_to_text($child, $allowed_tags, $notes, $line_length - drupal_strlen($indent), $parents, $child_count);
    }
  }

  // We only add prefix and suffix if the child nodes were non-empty.
  if ($child_text !== '') {

    // We capitalize the contents of h1 and h2 tags.
    if ($tag === 'h1' || $tag === 'h2') {
      $child_text = drupal_strtoupper($child_text);
    }

    // Don't add a newline to an existing newline.
    if ($suffix === $eol && drupal_substr($child_text, -drupal_strlen($eol)) === $eol) {
      $suffix = '';
    }

    // Trim spaces around newlines except with <pre> or inline tags.
    $keep_spaces = array(
      'a',
      'b',
      'cite',
      'em',
      'i',
      'pre',
      'strong',
      'u',
    );
    if (!in_array($tag, $keep_spaces, TRUE)) {
      $child_text = preg_replace('/ *' . preg_quote($eol, '/') . ' */', $eol, $child_text);
    }

    // Soft-wrap at effective line length, but don't space-stuff.
    $child_text = mailsystem_wrap_mail($prefix . $child_text, array(
      'break' => $nbsp . $eol,
      'indent' => $indent,
      'max' => $line_length,
      'pad' => $pad,
      'stuff' => FALSE,
    )) . $suffix;
    if ($tag === 'pre') {

      // Perform RFC-3676 soft-wrapping: temporarily turn the non-breaking
      // spaces back into ordinary spaces, wrap, and then restore them.
      $child_text = str_replace($nbsp, ' ', $child_text);
      $child_text = mailsystem_wrap_mail($child_text, array(
        'max' => $line_length,
        'stuff' => FALSE,
      ));
      $child_text = str_replace(' ', $nbsp, $child_text);
    }
    $text .= $child_text;
  }
  return $text;
}