You are here

strip_utf8mb4.module in Strip 4-byte UTF8 7

Allows users to strip 4-byte UTF-8 characters from text: overly long 2-byte sequences and characters above U+10000 are replaced, as are overly long 3-byte sequences and UTF-16 surrogates.

File

strip_utf8mb4.module
View source
<?php

/**
 * @file
 * Allows users to strip 4-byte UTF-8 characters from text: overly long 2-byte
 * sequences and characters above U+10000 are replaced, as are overly long
 * 3-byte sequences and UTF-16 surrogates.
 */

/**
 * Implements hook_menu().
 *
 * Registers the module's configuration page at
 * admin/config/content/strip_utf8mb4. The page is rendered through
 * drupal_get_form() using the 'strip_utf8mb4_configuration_form' form builder
 * (declared here as living in strip_utf8mb4.admin.inc) and is restricted to
 * users with the 'administer site configuration' permission.
 *
 * @return array
 *   Menu item definitions keyed by path.
 */
function strip_utf8mb4_menu() {
  $items = array();
  $items['admin/config/content/strip_utf8mb4'] = array(
    'type' => MENU_NORMAL_ITEM,
    // Title and description are deliberately NOT passed through t(): the menu
    // system translates them itself when the item is rendered. Translating
    // here would store the string in whatever language was active during the
    // menu rebuild and translate it a second time on output.
    'title' => 'Strip 4-byte UTF8',
    'description' => 'Configure text fields to reject overly long 2 byte sequences, as well as characters above U+10000, reject overly long 3 byte sequences and UTF-16.',
    'page callback' => 'drupal_get_form',
    'page arguments' => array(
      'strip_utf8mb4_configuration_form',
    ),
    'access arguments' => array(
      'administer site configuration',
    ),
    'file' => 'strip_utf8mb4.admin.inc',
  );
  return $items;
}

/**
 * Implements hook_webform_submission_presave().
 *
 * Filters the submitted values of every webform component whose type is
 * enabled for stripping (see _webform_strip_utf8mb4_for()), replacing
 * unsupported byte sequences with the configured replacement string.
 *
 * @param object $node
 *   The webform node; its $node->webform['components'] list supplies the
 *   component types, which the submission itself does not carry.
 * @param object $submission
 *   The submission about to be saved. Values under
 *   $submission->data[$cid]['value'] are modified in place.
 */
function strip_utf8mb4_webform_submission_presave($node, &$submission) {
  // The replacement string is the same for every component, so read it once.
  $replace_text = variable_get('strip_utf8mb4_replace_string', '--');

  // The submission has no data about the type of the values, so we go through
  // the components saved in the node.
  foreach ($node->webform['components'] as $cid => $component) {
    if (!isset($component['type']) || !_webform_strip_utf8mb4_for($component['type'])) {
      continue;
    }

    // Test for the value BEFORE touching it. Binding a reference to a missing
    // array element would create it as NULL and add an empty entry to the
    // submission for every component that was not submitted.
    if (!isset($submission->data[$cid]['value'])) {
      continue;
    }

    $component_value = $submission->data[$cid]['value'];
    if (is_array($component_value)) {
      // Write back by key instead of iterating by reference, so no dangling
      // reference to the last element is left behind.
      foreach ($component_value as $delta => $value) {
        $submission->data[$cid]['value'][$delta] = _strip_utf8mb4_for_text_fields($value, $replace_text);
      }
    }
    elseif (is_string($component_value) && $component_value != '') {
      $submission->data[$cid]['value'] = _strip_utf8mb4_for_text_fields($component_value, $replace_text);
    }
  }
}

/**
 * Implements hook_field_attach_presave().
 *
 * For every field instance on the entity's bundle whose widget type is enabled
 * (see _strip_utf8mb4_for()), reads the field value through an entity metadata
 * wrapper, replaces unsupported byte sequences with the configured replacement
 * string and writes the result back. When 'core_title' is enabled, the
 * entity's title property is filtered as well.
 *
 * @param string $entity_type
 *   The type of the entity being saved.
 * @param object $entity
 *   The entity being saved; modified in place.
 */
function strip_utf8mb4_field_attach_presave($entity_type, $entity) {
  list(, , $bundle) = entity_extract_ids($entity_type, $entity);

  // The replacement string does not change between fields, so read it once.
  $replace_text = variable_get('strip_utf8mb4_replace_string', '--');

  // The wrapper is built lazily, and only once, the first time a field
  // actually needs filtering.
  $entity_wrapper = NULL;

  foreach (field_info_instances($entity_type, $bundle) as $instance) {
    if (!isset($instance['widget']['type']) || !_strip_utf8mb4_for($instance['widget']['type'])) {
      continue;
    }

    if ($entity_wrapper === NULL) {
      $entity_wrapper = entity_metadata_wrapper($entity_type, $entity);
    }

    // Grab the field name from the instance.
    $field_name = $instance['field_name'];

    // Get the text field values.
    $text_field_data = $entity_wrapper->{$field_name}->value();

    // If we do have data in the field.
    if (is_array($text_field_data) && count($text_field_data) > 0) {

      // A 'value' key means a single item; otherwise every element of the
      // array is filtered on its own.
      if (isset($text_field_data['value'])) {
        $text_field_data['value'] = _strip_utf8mb4_for_text_fields($text_field_data['value'], $replace_text);
      }
      else {
        foreach ($text_field_data as $item_key => $item) {
          $text_field_data[$item_key] = _strip_utf8mb4_for_text_fields($item, $replace_text);
        }
      }

      // Filter the field summary too, when there is one.
      if (isset($text_field_data['summary'])) {
        $text_field_data['summary'] = _strip_utf8mb4_for_text_fields($text_field_data['summary'], $replace_text);
      }

      // Save the filtered field data in the entity object.
      $entity_wrapper->{$field_name}->set($text_field_data);
    }
    elseif (is_string($text_field_data) && $text_field_data != '') {
      // Save the filtered field data in the entity object.
      $entity_wrapper->{$field_name} = _strip_utf8mb4_for_text_fields($text_field_data, $replace_text);
    }
  }

  // If we want to strip unsupported sequences from the entity's title.
  if (isset($entity->title) && _strip_utf8mb4_for('core_title')) {
    $entity->title = _strip_utf8mb4_for_text_fields($entity->title, $replace_text);
  }
}

/**
 * Tells whether stripping is enabled for a given text field widget type.
 *
 * The enabled types are read from the
 * 'strip_utf8mb4_for_text_field_widget_types' variable. When that variable is
 * not set, the three text widgets and the 'core_title' pseudo-type are used.
 *
 * @param string $field_widget_type
 *   The widget type (or 'core_title') to look up.
 *
 * @return bool
 *   TRUE when the type is in the enabled list (strict comparison), FALSE
 *   otherwise.
 */
function _strip_utf8mb4_for($field_widget_type) {
  // Types that are filtered when nothing has been configured yet.
  $default_widget_types = array(
    'text_textfield',
    'text_textarea',
    'text_textarea_with_summary',
    'core_title',
  );

  $enabled_widget_types = variable_get('strip_utf8mb4_for_text_field_widget_types', $default_widget_types);

  return in_array($field_widget_type, $enabled_widget_types, TRUE);
}

/**
 * Tells whether stripping is enabled for a given webform component type.
 *
 * The enabled types are read from the
 * 'webform_strip_utf8mb4_for_component_types' variable. When that variable is
 * not set, 'textfield' and 'textarea' are used.
 *
 * @param string $component_type
 *   The webform component type to look up.
 *
 * @return bool
 *   TRUE when the type is in the enabled list (strict comparison), FALSE
 *   otherwise.
 */
function _webform_strip_utf8mb4_for($component_type) {
  // Component types that are filtered when nothing has been configured yet.
  $default_component_types = array(
    'textfield',
    'textarea',
  );

  $enabled_component_types = variable_get('webform_strip_utf8mb4_for_component_types', $default_component_types);

  return in_array($component_type, $enabled_component_types, TRUE);
}

/**
 * Returns the text with unsupported byte sequences replaced.
 *
 * Two byte-wise (non-/u) regular expression passes are applied in order:
 *  1. selected ASCII control bytes, stray continuation bytes, overly long
 *     2-byte sequences, lead bytes 0xF0-0xFF (characters above U+10000) and
 *     malformed 2- and 3-byte sequences;
 *  2. overly long 3-byte sequences and UTF-16 surrogates.
 *
 * When at least one replacement was made, a warning message is shown to the
 * user (at most once per request, because repeats are suppressed).
 *
 * @param string $text_data
 *   The text to filter.
 * @param string $replace_text
 *   The string put in place of every unsupported sequence.
 *
 * @return string
 *   The filtered text. If a pass fails with a PCRE error, the text as it was
 *   before that pass is kept rather than returning NULL.
 */
function _strip_utf8mb4_for_text_fields($text_data, $replace_text = '') {
  $patterns = array(
    // Strip overly long 2 byte sequences, as well as characters
    // above U+10000.
    '/[\\x00-\\x08\\x10\\x0B\\x0C\\x0E-\\x19\\x7F]' . '|[\\x00-\\x7F][\\x80-\\xBF]+' . '|([\\xC0\\xC1]|[\\xF0-\\xFF])[\\x80-\\xBF]*' . '|[\\xC2-\\xDF]((?![\\x80-\\xBF])|[\\x80-\\xBF]{2,})' . '|[\\xE0-\\xEF](([\\x80-\\xBF](?![\\x80-\\xBF]))|(?![\\x80-\\xBF]{2})|[\\x80-\\xBF]{3,})/S',
    // Strip overly long 3 byte sequences and UTF-16 surrogates.
    '/\\xE0[\\x80-\\x9F][\\x80-\\xBF]' . '|\\xED[\\xA0-\\xBF][\\x80-\\xBF]/S',
  );

  $processed_text_data = $text_data;
  $replacements_done = 0;

  foreach ($patterns as $pattern) {
    $count = 0;
    $result = preg_replace($pattern, $replace_text, $processed_text_data, -1, $count);

    // preg_replace() returns NULL when PCRE fails (for example when the
    // backtrack or JIT stack limit is hit on a very long text). Passing that
    // NULL on would silently wipe the stored value, so the failed pass is
    // skipped and the text from before it is kept.
    if ($result !== NULL) {
      $processed_text_data = $result;
      $replacements_done += $count;
    }
  }

  if ($replacements_done > 0) {
    // '@' escapes the configured replacement string before it is placed in
    // the message, which is rendered as HTML; '!' would insert it raw.
    $message = t('Unsupported characters in your text were replaced with "@replacement"', array(
      '@replacement' => $replace_text,
    ));
    drupal_set_message($message, 'warning', FALSE);
  }
  return $processed_text_data;
}

Functions

Namesort descending Description
strip_utf8mb4_field_attach_presave Implements hook_field_attach_presave().
strip_utf8mb4_menu Implements hook_menu().
strip_utf8mb4_webform_submission_presave Implements hook_webform_submission_presave().
_strip_utf8mb4_for Return TRUE if the text field widget type is enabled from the configurations.
_strip_utf8mb4_for_text_fields Returns the processed text in which the unsupported (non-UTF-8 or 4-byte) characters have been replaced.
_webform_strip_utf8mb4_for Return TRUE if the component type is enabled from the configurations.