You are here

transliteration.module in Transliteration 6.3

Converts non-Latin text to US-ASCII and sanitizes file names.

@author Stefan M. Kudwien (http://drupal.org/user/48898)

File

transliteration.module
View source
<?php

/**
 * @file
 * Converts non-Latin text to US-ASCII and sanitizes file names.
 *
 * @author Stefan M. Kudwien (http://drupal.org/user/48898)
 */

/**
 * Implements hook_menu().
 *
 * Declares two local tasks below admin/settings/file-system:
 * - "Settings": the default local task, using the system module's path as
 *   its file path.
 * - "Transliteration": shows the transliteration_retroactive form through
 *   drupal_get_form(), names transliteration.admin.inc as its include file,
 *   and requires the 'administer site configuration' permission.
 *
 * @return
 *   Array of menu item definitions keyed by path.
 */
function transliteration_menu() {
  $base_path = 'admin/settings/file-system';
  $menu = array();

  // Default tab, sorted before the transliteration tab (weight -10 vs. 10).
  $menu[$base_path . '/settings'] = array(
    'title' => 'Settings',
    'file path' => drupal_get_path('module', 'system'),
    'weight' => -10,
    'type' => MENU_DEFAULT_LOCAL_TASK,
  );

  // Tab for converting the names of already existing files.
  $menu[$base_path . '/transliteration'] = array(
    'title' => 'Transliteration',
    'description' => 'Convert existing file names to US-ASCII.',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('transliteration_retroactive'),
    'access arguments' => array('administer site configuration'),
    'file' => 'transliteration.admin.inc',
    'weight' => 10,
    'type' => MENU_LOCAL_TASK,
  );

  return $menu;
}

/**
 * Implements hook_form_FORM_ID_alter().
 *
 * Adds a "Transliteration" item to the file system configuration form. The
 * item holds two checkboxes, backed by the variables
 * transliteration_file_uploads and transliteration_file_lowercase, which both
 * default to TRUE.
 */
function transliteration_form_system_file_system_settings_alter(&$form, &$form_state) {
  // Toggle for sanitizing file names while they are being uploaded.
  $uploads_checkbox = array(
    '#type' => 'checkbox',
    '#title' => t('Transliterate file names during upload.'),
    '#description' => t('Enable to convert file names to US-ASCII character set for cross-platform compatibility.'),
    '#default_value' => variable_get('transliteration_file_uploads', TRUE),
  );

  // Toggle for lowercasing the transliterated names.
  $lowercase_checkbox = array(
    '#type' => 'checkbox',
    '#title' => t('Lowercase transliterated file names.'),
    '#default_value' => variable_get('transliteration_file_lowercase', TRUE),
    '#description' => t('This is a recommended setting to prevent issues with case-insensitive file systems. It has no effect if transliteration has been disabled.'),
  );

  // An empty-valued item element acts as the titled container.
  $form['transliteration'] = array(
    '#type' => 'item',
    '#title' => t('Transliteration'),
    '#value' => '',
    'transliteration_file_uploads' => $uploads_checkbox,
    'transliteration_file_lowercase' => $lowercase_checkbox,
  );

  // Give the buttons an explicit weight; presumably this keeps them below the
  // settings added above.
  $form['buttons']['#weight'] = 1;
}

/**
 * Implements hook_form_FORM_ID_alter().
 *
 * Adds a "Transliteration" fieldset to the search settings form. It contains
 * one checkbox backed by the transliteration_search variable, which defaults
 * to TRUE.
 */
function transliteration_form_search_admin_settings_alter(&$form, &$form_state) {
  // Toggle for transliterating both indexed text and search keywords.
  $search_checkbox = array(
    '#type' => 'checkbox',
    '#title' => t('Transliterate search index and searched strings.'),
    '#description' => t('Enable to allow searching and indexing using US-ASCII character set, i.e. to treat accented and unaccented letters the same.'),
    '#default_value' => variable_get('transliteration_search', TRUE),
  );

  $form['transliteration'] = array(
    '#type' => 'fieldset',
    '#title' => t('Transliteration'),
    'transliteration_search' => $search_checkbox,
  );

  // Give the buttons an explicit weight; presumably this keeps them below the
  // fieldset added above.
  $form['buttons']['#weight'] = 1;
}

/**
 * Transliterates and sanitizes a file name.
 *
 * A single name is processed in this order:
 * - it is transliterated, with characters that have no equivalent replaced
 *   by an empty string,
 * - space characters become underscores,
 * - every character other than 0-9, A-Z, a-z, "_", "." and "-" is removed,
 * - runs of repeated "_", "." or "-" collapse to a single character,
 * - the result is lowercased unless the transliteration_file_lowercase
 *   variable is disabled.
 *
 * An array of names is processed element by element, recursively, keeping
 * its keys.
 *
 * @param $filename
 *   A file name, or an array of file names.
 * @param $source_langcode
 *   Optional ISO 639 language code that denotes the language of the input and
 *   is used to apply language-specific variations. If the source language is
 *   not known at the time of transliteration, it is recommended to set this
 *   argument to the site default language to produce consistent results.
 *   Otherwise the current display language will be used.
 * @return
 *   Sanitized file name, or array of sanitized file names.
 *
 * @see language_default()
 */
function transliteration_clean_filename($filename, $source_langcode = NULL) {
  if (!is_array($filename)) {
    $clean = transliteration_get($filename, '', $source_langcode);

    // Spaces turn into underscores before the unsafe-character filter runs;
    // any other white space is dropped by that filter.
    $clean = str_replace(' ', '_', $clean);
    $clean = preg_replace('![^0-9A-Za-z_.-]!', '', $clean);

    // Collapse repeated underscores, dots and dashes.
    $clean = preg_replace('/(_)_+|(\\.)\\.+|(-)-+/', '\\1\\2\\3', $clean);

    // Lowercasing avoids collisions on case-insensitive file systems.
    return variable_get('transliteration_file_lowercase', TRUE) ? strtolower($clean) : $clean;
  }

  // Several names were submitted: sanitize each one under its original key.
  $cleaned = array();
  foreach ($filename as $index => $name) {
    $cleaned[$index] = transliteration_clean_filename($name, $source_langcode);
  }
  return $cleaned;
}

/**
 * Transliterates text.
 *
 * Takes an input string in any language and character set, and tries to
 * represent it in US-ASCII characters by conveying, in Roman letters, the
 * pronunciation expressed by the text in some other writing system.
 *
 * This function is a thin wrapper. On its first call per request it loads the
 * module's .inc file, then it hands all arguments to
 * transliteration_process().
 *
 * @param $text
 *   UTF-8 encoded text input.
 * @param $unknown
 *   Replacement string for characters that do not have a suitable ASCII
 *   equivalent.
 * @param $source_langcode
 *   Optional ISO 639 language code that denotes the language of the input and
 *   is used to apply language-specific variations. If the source language is
 *   not known at the time of transliteration, it is recommended to set this
 *   argument to the site default language to produce consistent results.
 *   Otherwise the current display language will be used.
 * @return
 *   Transliterated text.
 *
 * @see language_default()
 */
function transliteration_get($text, $unknown = '?', $source_langcode = NULL) {
  // Remembers across calls that the include request was already made.
  static $include_requested = FALSE;

  if ($include_requested === FALSE) {
    module_load_include('inc', 'transliteration');
    $include_requested = TRUE;
  }

  $transliterated = transliteration_process($text, $unknown, $source_langcode);
  return $transliterated;
}

/**
 * Implements hook_init().
 *
 * Sanitizes file names during upload. When $_FILES['files'] is present and
 * the transliteration_file_uploads variable is enabled (default TRUE), every
 * submitted name is passed through transliteration_clean_filename(). The
 * unaltered names are kept under $_FILES['files']['orig_name'].
 *
 * Both $_FILES and $_POST are request data, so their shape is checked before
 * use. A posted language that is not a string is ignored, and an upload
 * field that carries a plain string name instead of an array of names is
 * sanitized as a single name.
 */
function transliteration_init() {
  if (empty($_FILES['files']) || !isset($_FILES['files']['name'])) {
    return;
  }
  if (!variable_get('transliteration_file_uploads', TRUE)) {
    return;
  }

  // Figure out language, which is available in $_POST['language'] for node
  // forms. Only a string can be a language code; an array value (for example
  // from "language[]=x") must not be used as an array offset below.
  $langcode = NULL;
  if (!empty($_POST['language']) && is_string($_POST['language'])) {
    $languages = language_list();
    if (isset($languages[$_POST['language']])) {
      $langcode = $_POST['language'];
    }
  }

  $names = $_FILES['files']['name'];

  // An upload field named plain "files" (not "files[...]") yields a string
  // here instead of an array keyed by field.
  if (!is_array($names)) {
    $_FILES['files']['orig_name'] = $names;
    $_FILES['files']['name'] = transliteration_clean_filename($names, $langcode);
    return;
  }

  foreach ($names as $field => $filename) {
    // Keep a copy of the unaltered file name.
    $_FILES['files']['orig_name'][$field] = $filename;
    $_FILES['files']['name'][$field] = transliteration_clean_filename($filename, $langcode);
  }
}

/**
 * Implements hook_search_preprocess().
 *
 * Transliterates text added to the search index and user submitted search
 * keywords, using the site default language and an empty replacement for
 * characters without an equivalent. The text is returned unchanged when the
 * transliteration_search variable (default TRUE) is disabled.
 */
function transliteration_search_preprocess($text) {
  if (!variable_get('transliteration_search', TRUE)) {
    return $text;
  }

  $default_langcode = language_default('language');
  return transliteration_get($text, '', $default_langcode);
}

/**
 * Implements hook_filter().
 *
 * Provides a single filter that converts all characters to US-ASCII. The
 * placeholder used for characters without a known equivalent is stored per
 * input format in the variable
 * transliteration_filter_no_known_transliteration_FORMAT and defaults to '?'.
 *
 * @param $op
 *   The operation to perform:
 *   - 'list': returns the array of filter names.
 *   - 'no cache': returns FALSE.
 *   - 'process': returns $text transliterated with the format's placeholder.
 *   - 'settings': returns the form elements for configuring the placeholder.
 *   - anything else: returns $text unchanged.
 * @param $delta
 *   Not used by this implementation.
 * @param $format
 *   The input format; only used to build the per-format variable name.
 * @param $text
 *   The text to process.
 * @param $cache_id
 *   Not used by this implementation.
 * @return
 *   Depends on $op, see above.
 */
function transliteration_filter($op, $delta = 0, $format = -1, $text = '', $cache_id = 0) {
  // The variable name doubles as the key of the settings form element.
  $placeholder_variable = "transliteration_filter_no_known_transliteration_{$format}";

  switch ($op) {
    case 'list':
      return array(
        t('Convert all characters to US-ASCII'),
      );

    case 'no cache':
      return FALSE;

    case 'process':
      return transliteration_get($text, variable_get($placeholder_variable, '?'));

    case 'settings':
      return array(
        'filter_transliteration' => array(
          '#type' => 'fieldset',
          // User-facing label, so it goes through t() like the other
          // "Transliteration" titles in this module.
          '#title' => t('Transliteration'),
          '#collapsible' => TRUE,
          '#collapsed' => FALSE,
          $placeholder_variable => array(
            '#type' => 'textfield',
            '#title' => t('Placeholder for characters with no known US-ASCII equivalent'),
            '#size' => 2,
            // The maximum length is 5 in order to accommodate unicode multibyte input.
            '#maxlength' => 5,
            '#default_value' => variable_get($placeholder_variable, '?'),
          ),
        ),
      );

    default:
      return $text;
  }
}

/**
 * Implements hook_filter_tips().
 *
 * Returns the same translated one-line tip for every call; $delta, $format
 * and $long are not used.
 */
function transliteration_filter_tips($delta, $format, $long) {
  $tip = t('Non-latin text (e.g., å, ö, 漢) will be converted to US-ASCII equivalents (a, o, ?).');
  return $tip;
}