You are here

unicode-conversion.php in Typogrify 5

Same filename and directory in other branches
  1. 6 unicode-conversion.php
  2. 7 unicode-conversion.php

File

unicode-conversion.php
View source
<?php

// We rely on some functions from SmartyPants. If it has not been loaded
// already, load the copy that sits in the same directory as this file.
if (!function_exists('SmartyPants')) {
  require_once dirname(__FILE__) . '/smartypants.php';
}

// Also, we need some regex code from SmartyPants.
// This is a delimiter-less regex fragment that matches the opening or closing
// tag of an element whose text content must not be converted (pre, code, kbd,
// script, math). Capture group 1 is "/" for a closing tag and empty for an
// opening tag.
// NOTE(review): the explicit "global" declarations in this file are presumably
// there so the variables end up in the global scope even when the file is
// included from inside a function -- confirm against the including code.
global $sp_tags_to_skip;
$sp_tags_to_skip = '<(/?)(?:pre|code|kbd|script|math)[\\s>]';

// Letter sequences mapped to the HTML entity of the matching ligature.
// See http://www.unicode.org/charts/PDF/UFB00.pdf
global $ligature_map;
$ligature_map = array(
  "ffi" => "&#xfb03;", // U+FB03 LATIN SMALL LIGATURE FFI
  "ffl" => "&#xfb04;", // U+FB04 LATIN SMALL LIGATURE FFL
  "ff" => "&#xfb00;", // U+FB00 LATIN SMALL LIGATURE FF
  "fi" => "&#xfb01;", // U+FB01 LATIN SMALL LIGATURE FI
  "fl" => "&#xfb02;", // U+FB02 LATIN SMALL LIGATURE FL
  'ij' => '&#x0133;', // U+0133 LATIN SMALL LIGATURE IJ
  'IJ' => '&#x0132;', // U+0132 LATIN CAPITAL LIGATURE IJ
  "st" => "&#xfb06;", // U+FB06 LATIN SMALL LIGATURE ST
  "ss" => "&szlig;", // U+00DF LATIN SMALL LETTER SHARP S
);

// ASCII punctuation sequences mapped to typographic punctuation entities.
// See http://www.unicode.org/charts/PDF/U2000.pdf
global $punctuation_map;
$punctuation_map = array(
  "..." => "&#x2026;", // U+2026 HORIZONTAL ELLIPSIS
  ".." => "&#x2025;", // U+2025 TWO DOT LEADER
  ". . ." => "&#x2026;", // U+2026 HORIZONTAL ELLIPSIS
  "---" => "&mdash;", // U+2014 EM DASH
  "--" => "&ndash;", // U+2013 EN DASH
);

// ASCII arrow sequences mapped to arrow entities.
// See http://www.unicode.org/charts/PDF/U2190.pdf
global $arrow_map;
$arrow_map = array(
  "->>" => "&#x21a0;", // U+21A0 RIGHTWARDS TWO HEADED ARROW
  "<<-" => "&#x219e;", // U+219E LEFTWARDS TWO HEADED ARROW
  "->|" => "&#x21e5;", // U+21E5 RIGHTWARDS ARROW TO BAR
  "|<-" => "&#x21e4;", // U+21E4 LEFTWARDS ARROW TO BAR
  "<->" => "&#x2194;", // U+2194 LEFT RIGHT ARROW
  "->" => "&#x2192;", // U+2192 RIGHTWARDS ARROW
  "<-" => "&#x2190;", // U+2190 LEFTWARDS ARROW
  "<=>" => "&#x21d4;", // U+21D4 LEFT RIGHT DOUBLE ARROW
  "=>" => "&#x21d2;", // U+21D2 RIGHTWARDS DOUBLE ARROW
  "<=" => "&#x21d0;", // U+21D0 LEFTWARDS DOUBLE ARROW
);

// Declare a global array of ASCII to unicode mappings. convert_characters()
// looks up its replacement strings in this array.
global $unicode_map;

// Combine the three tables into the single lookup array. The tables have no
// keys in common, so array_merge() does not overwrite any entry.
$unicode_map = array_merge($ligature_map, $arrow_map, $punctuation_map);
/**
 * Replaces selected ASCII sequences in HTML text with unicode entities.
 *
 * The text is split into tag and text tokens with _TokenizeHTML(). Tags are
 * copied through untouched, and so is any text that follows an opening tag
 * matched by the global $sp_tags_to_skip pattern until a matching closing tag
 * is seen. All other text is passed through ProcessEscapes() and then through
 * str_replace().
 *
 * @param string $text
 *   The text to be parsed.
 * @param array $characters_to_convert
 *   List of ASCII sequences to convert. Each entry has to be a key of the
 *   global $unicode_map; entries without a mapping are ignored instead of
 *   being replaced by nothing. Entries are applied in the order given, so
 *   longer sequences should come before their prefixes ("ffi" before "ff").
 *
 * @return string
 *   The converted text, or $text unchanged when there is nothing to convert.
 */
function convert_characters($text, $characters_to_convert) {
  // Nothing (or nothing usable) was requested: do nothing.
  if (empty($characters_to_convert) || !is_array($characters_to_convert)) {
    return $text;
  }

  // Get the ASCII to unicode mappings and the tags-to-skip regex.
  global $unicode_map, $sp_tags_to_skip;

  // Build the parallel search/replace lists. Sequences that have no mapping
  // are dropped here: passing them on with an empty replacement would make
  // str_replace() delete them from the text.
  $ascii_strings = array();
  $unicode_strings = array();
  foreach ($characters_to_convert as $ascii_string) {
    if (is_string($ascii_string) && isset($unicode_map[$ascii_string])) {
      $ascii_strings[] = $ascii_string;
      $unicode_strings[] = $unicode_map[$ascii_string];
    }
  }
  if (count($ascii_strings) < 1) {
    return $text;
  }

  // The pattern does not change between tokens, so build it only once.
  $skip_pattern = "@{$sp_tags_to_skip}@";
  $result = '';

  // Keep track of when we're inside one of the tags to skip (<pre>, <code>,
  // ...). The flag is a simple on/off switch, not a nesting counter.
  $in_pre = FALSE;
  foreach (_TokenizeHTML($text) as $cur_token) {
    if ($cur_token[0] == "tag") {
      // Don't mess with the markup itself.
      $result .= $cur_token[1];

      // Capture group 1 is "/" on a closing tag: an opening tag switches
      // conversion off, a closing tag switches it back on.
      if (preg_match($skip_pattern, $cur_token[1], $matches)) {
        $in_pre = !(isset($matches[1]) && $matches[1] == '/');
      }
      continue;
    }

    $t = $cur_token[1];
    if (!$in_pre) {
      // ProcessEscapes() is not defined in this file; presumably it is the
      // SmartyPants helper for backslash escapes -- confirm.
      $t = ProcessEscapes($t);
      $t = str_replace($ascii_strings, $unicode_strings, $t);
    }
    $result .= $t;
  }
  return $result;
}

// _TokenizeHTML is shared between PHP SmartyPants and PHP Markdown.
// We're borrowing it for Typogrify.module, too.
// We only define it if it is not already defined.
if (!function_exists('_TokenizeHTML')) {
  /**
   * Splits a string of HTML into tag tokens and text tokens.
   *
   * Regular expression derived from the _tokenize() subroutine in
   * Brad Choate's MTRegex plugin.
   * <http://www.bradchoate.com/past/mtregex.php>
   *
   * @param string $str
   *   String containing HTML markup.
   *
   * @return array
   *   The tokens comprising the input string, in order. Each token is either
   *   a tag (possibly with nested tags contained therein, such as
   *   <a href="<MTFoo>">), an HTML comment, a processing instruction, or a
   *   run of text between tags. Each element of the array is a two-element
   *   array; the first is either 'tag' or 'text'; the second is the actual
   *   value. Empty runs of text produce no token.
   */
  function _TokenizeHTML($str) {
    // Three alternatives: HTML comments, processing instructions (<? ... ?>),
    // and ordinary tags whose quoted attribute values may contain ">".
    $match = '(?s:<!(?:--.*?--\\s*)+>)|' . '(?s:<\\?.*?\\?>)|' . '(?:<[/!$]?[-a-zA-Z0-9:]+\\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)';

    // With PREG_SPLIT_DELIM_CAPTURE and a single capturing group, the result
    // alternates: even positions hold text, odd positions hold the tags.
    $parts = preg_split("{({$match})}", $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === FALSE) {
      // The regex engine failed (for example a backtrack limit was hit).
      // Hand the input back as a single text token rather than losing it.
      return ($str === '' || $str === NULL) ? array() : array(array('text', $str));
    }

    $tokens = array();
    foreach ($parts as $position => $part) {
      if ($position % 2 === 1) {
        $tokens[] = array(
          'tag',
          $part,
        );
      }
      elseif ($part !== '') {
        // Empty text runs (start or end of input, or between two adjacent
        // tags) are skipped so they are not reported as empty tags.
        $tokens[] = array(
          'text',
          $part,
        );
      }
    }
    return $tokens;
  }
}