You are here

function remove_invalid_sequences in Lingotek Translation 7.7

Replaces invalid UTF-8 byte sequences in a string with a replacement string (pass an empty replacement to remove them)

Parameters

string $element: The string to be checked.

string $replacement: The string substituted for each invalid sequence.

Return value

string The new string

1 call to remove_invalid_sequences()
remove_invalid_xml_characters in ./lingotek.util.inc
Replaces invalid XML characters with the unicode replacement character

File

./lingotek.util.inc, line 551
Utility functions.

Code

/**
 * Replaces invalid UTF-8 byte sequences in a string.
 *
 * The string is scanned byte by byte. Well-formed UTF-8 sequences (per the
 * Unicode "well-formed UTF-8 byte sequences" table: no overlong forms, no
 * surrogates, nothing above U+10FFFF) are left untouched. Every ill-formed
 * run - a stray continuation byte, an impossible lead byte, or a lead byte
 * followed by too few valid continuation bytes - is replaced by one copy of
 * $replacement. Only the bytes that actually belong to the broken sequence
 * are replaced; the byte that broke the sequence is kept and examined on its
 * own, so valid characters following a bad byte are never lost.
 *
 * @param string $element
 *   The string to be checked.
 * @param string $replacement
 *   The string substituted for each invalid sequence. It is inserted verbatim
 *   and is not itself validated. May be empty to simply drop invalid bytes.
 *
 * @return string
 *   The new string.
 */
function remove_invalid_sequences($element, $replacement) {
  $replacement_length = strlen($replacement);
  $length = strlen($element);
  $offset = 0;

  while ($offset < $length) {
    $lead = ord(substr($element, $offset, 1));

    // Plain ASCII is always valid.
    if ($lead <= 0x7f) {
      ++$offset;
      continue;
    }

    // Work out how many continuation bytes the lead byte requires and the
    // allowed range of the FIRST continuation byte. The narrowed ranges for
    // 0xE0/0xF0 reject overlong encodings, 0xED rejects UTF-16 surrogates and
    // 0xF4 rejects code points above U+10FFFF.
    $min = 0x80;
    $max = 0xbf;
    if ($lead >= 0xc2 && $lead <= 0xdf) {
      $expected = 1;
    }
    elseif ($lead >= 0xe0 && $lead <= 0xef) {
      $expected = 2;
      if ($lead === 0xe0) {
        $min = 0xa0;
      }
      elseif ($lead === 0xed) {
        $max = 0x9f;
      }
    }
    elseif ($lead >= 0xf0 && $lead <= 0xf4) {
      $expected = 3;
      if ($lead === 0xf0) {
        $min = 0x90;
      }
      elseif ($lead === 0xf4) {
        $max = 0x8f;
      }
    }
    else {
      // 0x80-0xC1 and 0xF5-0xFF can never start a sequence.
      $expected = 0;
    }

    // Count how many of the required continuation bytes are present and valid.
    $matched = 0;
    while ($matched < $expected && $offset + $matched + 1 < $length) {
      $byte = ord(substr($element, $offset + $matched + 1, 1));
      if ($byte < $min || $byte > $max) {
        break;
      }
      ++$matched;
      // Only the first continuation byte has a restricted range.
      $min = 0x80;
      $max = 0xbf;
    }

    if ($expected > 0 && $matched === $expected) {
      // Complete, well-formed multi-byte character.
      $offset += $expected + 1;
      continue;
    }

    // Replace the lead byte plus the continuation bytes that were valid so
    // far. The offending byte (if any) is left in place and re-examined.
    $invalid_length = $matched + 1;
    $element = substr_replace($element, $replacement, $offset, $invalid_length);
    // Resume scanning right after the inserted replacement.
    $offset += $replacement_length;
    $length += $replacement_length - $invalid_length;
  }

  return $element;
}