You are here

Utf8.php in Extensible BBCode 8.3

Same filename and directory in other branches
  1. 4.0.x src/Utf8.php

Namespace

Drupal\xbbcode

File

src/Utf8.php
View source
<?php

namespace Drupal\xbbcode;

use function chr;
use function ord;
use function strlen;

/**
 * Implementation of UTF-8 character utilities.
 *
 * @package Drupal\xbbcode
 */
class Utf8 {

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found.
   *
   * @see \Drupal\Core\Transliteration\PhpTransliteration::ordUTF8()
   */
  public static function ord($character) : int {
    $first_byte = ord($character[0]);
    if (($first_byte & 0x80) === 0) {

      // Single-byte form: 0xxxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) === 0xc0) {

      // Two-byte form: 110xxxxx 10xxxxxx.
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) === 0xe0) {

      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      return (($first_byte & 0xf) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) === 0xf0) {

      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      return (($first_byte & 0x7) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms are not legal.
    return -1;
  }

  /**
   * Encodes a numeric code point to a UTF-8 string.
   *
   * @param int $code
   *   A single unicode code point (an integer between 0 ... 0x10ffff).
   *
   * @return string
   *   The UTF-8 string, or an empty string for invalid code points.
   */
  public static function chr($code) : string {

    // Code point must be non-negative.
    if ($code < 0) {
      return '';
    }

    // Single byte (0xxxxxxx).
    if ($code < 0x80) {
      return chr($code);
    }

    // Two bytes (110xxxxx 10xxxxxx).
    if ($code < 0x800) {
      return chr(0xc0 | $code >> 6) . chr(0x80 | 0x3f & $code);
    }

    // Three bytes (1110xxxx 10xxxxxx 10xxxxxx).
    if ($code < 0x10000) {
      return chr(0xe0 | $code >> 12) . chr(0x80 | 0x3f & $code >> 6) . chr(0x80 | 0x3f & $code);
    }

    // Four bytes (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
    if ($code < 0x110000) {
      return chr(0xf0 | $code >> 18) . chr(0x80 | 0x3f & $code >> 12) . chr(0x80 | 0x3f & $code >> 6) . chr(0x80 | 0x3f & $code);
    }

    // Code point must be less than or equal to 0x10ffff.
    return '';
  }

  /**
   * Escape specified characters in a UTF8 string to \uXXXX and \UXXXXXXXX.
   *
   * (This resembles the escape sequences of json_encode, but uses a single
   * eight-digit hex code for non-BMP instead of a UTF16 surrogate pair.)
   *
   * Existing sequences will get an extra backslash. Backslashes before
   * existing and new sequences are doubled for distinction. Other backslashes
   * are left unchanged.
   *
   * @param string $string
   *   The string to encode.
   * @param string $characters
   *   A valid character group (without []) to match.
   *   Without a group, all non-ASCII characters are escaped.
   *
   * @return string
   *   The encoded string.
   */
  public static function encode($string, $characters = NULL) : string {
    $characters = $characters ?: '^\\x00-\\x7f';

    // Escape existing \uXXXX and \UXXXXXXXX sequences.
    // This is done by doubling the number of backslashes preceding them.
    $string = preg_replace('/(\\\\+)(u[\\da-fA-F]{4}|U[\\da-fA-F]{8})/', '$1$0', $string);

    // Encode all blacklisted characters (or all non-ASCII characters).
    // Double any backslashes preceding them.
    return preg_replace_callback('/(\\\\*)([' . $characters . '])/u', static function ($match) {
      $code = self::ord($match[2]);
      $sequence = sprintf($code < 0x10000 ? '\\u%04x' : '\\U%08x', $code);
      return $match[1] . $match[1] . $sequence;
    }, $string);
  }

  /**
   * Decode \uXXXX and \UXXXXXXXX sequences in a UTF8 string.
   *
   * @param string $string
   *   The string to decode.
   *
   * @return string
   *   The decoded string.
   */
  public static function decode($string) : string {

    // Decode sequences with an odd number of backslashes.
    $string = (string) preg_replace_callback('/(?<!\\\\)((?:\\\\\\\\)*)\\\\(u[\\da-fA-F]{4}|U[\\da-fA-F]{8})/', static function ($match) {
      $prefix = str_repeat('\\', strlen($match[1]) / 2);
      $code = substr($match[2], 1);
      return $prefix . self::chr(hexdec($code));
    }, $string);

    // Remove backslashes from escaped escape sequences.
    return preg_replace_callback('/(\\\\+)(u[\\da-fA-F]{4}|U[\\da-fA-F]{8})/', static function ($match) {
      return substr($match[1], strlen($match[1]) / 2) . $match[2];
    }, $string);
  }

}

Classes

Namesort descending Description
Utf8 Implementation of UTF-8 character utilities.