You are here

function ARC_ntriples_serializer::str2unicode_nfc in Taxonomy import/export via XML 6

Same name and namespace in other branches
  1. 5.2 arc/ARC_ntriples_serializer.php \ARC_ntriples_serializer::str2unicode_nfc()
  2. 5 arc/ARC_ntriples_serializer.php \ARC_ntriples_serializer::str2unicode_nfc()
  3. 6.2 arc/ARC_ntriples_serializer.php \ARC_ntriples_serializer::str2unicode_nfc()
1 call to ARC_ntriples_serializer::str2unicode_nfc()
ARC_ntriples_serializer::get_ntriples in arc/ARC_ntriples_serializer.php

File

arc/ARC_ntriples_serializer.php, line 38

Class

ARC_ntriples_serializer

Code

/**
 * Escapes a string for use inside an N-Triples string literal.
 *
 * The input is first turned into a list of Unicode code points:
 * - If the input is well-formed UTF-8 it is decoded as UTF-8.
 * - Otherwise every byte is taken as one ISO-8859-1 character.
 *
 * Each code point is then written according to the N-Triples string rules
 * (see http://www.w3.org/TR/rdf-testcases/#ntrip_strings):
 * - tab, line feed, carriage return, double quote and backslash become
 *   \t, \n, \r, \" and \\ respectively;
 * - the remaining printable ASCII characters (#x20-#x7E) are copied as is;
 * - everything else up to #xFFFF becomes \uHHHH, and code points from
 *   #x10000 to #x10FFFF become \UHHHHHHHH (upper-case hex digits).
 *
 * Despite the name, no Unicode normalization (NFC) is performed here.
 *
 * @param string $str
 *   The raw literal value, either UTF-8 or ISO-8859-1 encoded.
 *
 * @return string
 *   The escaped, pure-ASCII representation of $str.
 */
function str2unicode_nfc($str) {
  $str = (string) $str;
  $result = "";

  /* detect encoding: the /u modifier makes PCRE reject malformed UTF-8 */
  $chars = false;
  if (preg_match('//u', $str) === 1) {
    $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
  }

  /* collect unicode dec nrs */
  $code_points = array();
  if (is_array($chars)) {
    foreach ($chars as $utf8_char) {
      $bytes = strlen($utf8_char);
      if ($bytes == 1) {

        /* 0####### (0-127) */
        $code_points[] = ord($utf8_char);
      }
      elseif ($bytes == 2) {

        /* 110##### 10###### = 192+x 128+x */
        $code_points[] = (ord($utf8_char[0]) - 192) * 64 + (ord($utf8_char[1]) - 128);
      }
      elseif ($bytes == 3) {

        /* 1110#### 10###### 10###### = 224+x 128+x 128+x */
        $code_points[] = (ord($utf8_char[0]) - 224) * 4096 + (ord($utf8_char[1]) - 128) * 64 + (ord($utf8_char[2]) - 128);
      }
      else {

        /* 11110### 10###### 10###### 10###### = 240+x 128+x 128+x 128+x */
        $code_points[] = (ord($utf8_char[0]) - 240) * 262144 + (ord($utf8_char[1]) - 128) * 4096 + (ord($utf8_char[2]) - 128) * 64 + (ord($utf8_char[3]) - 128);
      }
    }
  }
  else {

    /* not UTF-8: in ISO-8859-1 the byte value is the code point */
    for ($i = 0, $i_max = strlen($str); $i < $i_max; $i++) {
      $code_points[] = ord($str[$i]);
    }
  }

  /* result (see http://www.w3.org/TR/rdf-testcases/#ntrip_strings) */
  foreach ($code_points as $nr) {
    if ($nr == 9) {

      /* #x9 (9) */
      $result .= '\\t';
    }
    elseif ($nr == 10) {

      /* #xA (10) */
      $result .= '\\n';
    }
    elseif ($nr == 13) {

      /* #xD (13) */
      $result .= '\\r';
    }
    elseif ($nr == 34) {

      /* #x22 (34) */
      $result .= '\\"';
    }
    elseif ($nr == 92) {

      /* #x5C (92): must come out as two backslashes */
      $result .= '\\\\';
    }
    elseif ($nr >= 32 && $nr < 127) {

      /* remaining printable ASCII, #x20-#x7E (32-126) */
      $result .= chr($nr);
    }
    elseif ($nr < 65536) {

      /* #x0-#x1F control chars and #x7F-#xFFFF (127-65535) */
      $result .= "\\u" . sprintf("%04X", $nr);
    }
    elseif ($nr < 1114112) {

      /* #x10000-#x10FFFF (65536-1114111) */
      $result .= "\\U" . sprintf("%08X", $nr);
    }
    else {

      /* other chars are not defined => ignore */
    }
  }
  return $result;
}