function remove_invalid_sequences in Lingotek Translation 7.7
Removes invalid UTF-8 sequences from a string
Parameters
string $element: The string to be checked.
string $replacement: The string substituted for each invalid sequence.
Return value
string The new string
1 call to remove_invalid_sequences()
- remove_invalid_xml_characters in ./lingotek.util.inc - Replaces invalid XML characters with the Unicode replacement character
File
- ./lingotek.util.inc, line 551 - Utility functions.
Code
/**
 * Removes invalid UTF-8 sequences from a string.
 *
 * Scans the input byte by byte. Well-formed UTF-8 sequences are copied to the
 * output unchanged. Each ill-formed sequence (a lead byte plus any valid
 * continuation bytes that followed it before the sequence broke, or a single
 * stray byte) is replaced by one copy of $replacement. The byte that broke a
 * sequence is never discarded: scanning resumes at that byte, so a valid
 * character that follows a truncated sequence is preserved.
 *
 * @param string $element
 *   The string to be checked. Non-string values are cast to string.
 * @param string $replacement
 *   The string substituted for each invalid sequence. It is copied verbatim
 *   and is not itself validated. May be empty to simply drop invalid bytes.
 *
 * @return string
 *   The new string.
 */
function remove_invalid_sequences($element, $replacement) {
  $element = (string) $element;
  $replacement = (string) $replacement;
  $length = strlen($element);
  $result = '';
  $offset = 0;

  while ($offset < $length) {
    $lead = ord($element[$offset]);

    // Single-byte (ASCII) characters are always valid.
    if ($lead <= 0x7f) {
      $result .= $element[$offset];
      ++$offset;
      continue;
    }

    // Work out how many continuation bytes the lead byte requires and which
    // range the FIRST continuation byte must fall in. All later continuation
    // bytes must be in 0x80..0xbf.
    $second_min = 0x80;
    $second_max = 0xbf;
    if ($lead >= 0xc2 && $lead <= 0xdf) {
      $needed = 1;
    }
    elseif ($lead === 0xe0) {
      // Second byte below 0xa0 would be an overlong encoding.
      $needed = 2;
      $second_min = 0xa0;
    }
    elseif ($lead === 0xed) {
      // Second byte above 0x9f would encode a UTF-16 surrogate.
      $needed = 2;
      $second_max = 0x9f;
    }
    elseif ($lead >= 0xe1 && $lead <= 0xef) {
      $needed = 2;
    }
    elseif ($lead === 0xf0) {
      // Second byte below 0x90 would be an overlong encoding.
      $needed = 3;
      $second_min = 0x90;
    }
    elseif ($lead >= 0xf1 && $lead <= 0xf3) {
      $needed = 3;
    }
    elseif ($lead === 0xf4) {
      // Second byte above 0x8f would encode a code point beyond U+10FFFF.
      $needed = 3;
      $second_max = 0x8f;
    }
    else {
      // 0x80..0xc1 and 0xf5..0xff can never start a UTF-8 sequence.
      $needed = 0;
    }

    // Number of bytes belonging to this (valid or invalid) sequence.
    $consumed = 1;
    $valid = $needed > 0;
    for ($i = 1; $i <= $needed; ++$i) {
      if ($offset + $i >= $length) {
        // The string ended in the middle of a sequence.
        $valid = FALSE;
        break;
      }
      $byte = ord($element[$offset + $i]);
      $min = ($i === 1) ? $second_min : 0x80;
      $max = ($i === 1) ? $second_max : 0xbf;
      if ($byte < $min || $byte > $max) {
        // Leave the offending byte in place so it is examined on its own in
        // the next iteration instead of being swallowed.
        $valid = FALSE;
        break;
      }
      ++$consumed;
    }

    $result .= $valid ? substr($element, $offset, $consumed) : $replacement;
    $offset += $consumed;
  }

  return $result;
}