You are here

search_files.module in Search Files 7.2

Same filename and directory in other branches
  1. 5 search_files.module
  2. 6.2 search_files.module

Organizes and provides helper functions for extracting text from files.

File

search_files.module
View source
<?php

/**
 * @file
 * Organizes and provides helper functions for extracting text from files.
 */

/**
 * Implements hook_help().
 *
 * Provides the explanatory text shown on the helper autodetect page.
 *
 * @param $path
 *   The router path help is being requested for.
 * @param $arg
 *   Path arguments supplied by the help system; not used here.
 *
 * @return
 *   HTML help text for the autodetect page, or an empty string for every
 *   other path.
 */
function search_files_help($path, $arg) {
  // Only the autodetect page carries help text.
  if ($path != 'admin/config/search/search_files/helpers/autodetect') {
    return '';
  }

  $html = '<p>' . t('This will only work:') . '</p>';
  $html .= '<ul>';

  // Preconditions for autodetection, rendered as one list item each.
  $conditions = array(
    t('If PHP safemode is disabled (required by this module anyway)'),
    t('If your web server is running Unix (Linux, BSD, Solaris, ...), not Windows (required "which" tool missing)'),
    t('If the search PATH inside Apache/PHP environment is set to include the directories containing the helpers'),
  );
  foreach ($conditions as $condition) {
    $html .= '<li>' . $condition . '</li>';
  }

  return $html . '</ul>';
}

/**
 * Implements hook_menu().
 *
 * Registers the module overview page and the helper administration pages
 * (list, add, autodetect, edit and delete).
 *
 * @return
 *   Menu item definitions keyed by router path.
 */
function search_files_menu() {
  // Every page with a callback is guarded by the same permission.
  $admin_access = array('administer search_files configuration');

  return array(
    'admin/config/search/search_files' => array(
      'title' => 'Search Files',
      'description' => 'Manage searching for files in attachments and directories',
      'page callback' => 'search_files_overview',
      'access arguments' => $admin_access,
      'type' => MENU_NORMAL_ITEM,
    ),
    'admin/config/search/search_files/helpers' => array(
      'title' => 'Helpers',
      'description' => 'List Search Files helper applications',
      'page callback' => 'search_files_helper_list',
      'access arguments' => $admin_access,
      'type' => MENU_NORMAL_ITEM,
      'weight' => 0,
    ),
    // Default tab; it defines no callback or access arguments of its own.
    'admin/config/search/search_files/helpers/list' => array(
      'title' => 'List',
      'type' => MENU_DEFAULT_LOCAL_TASK,
      'weight' => 0,
    ),
    'admin/config/search/search_files/helpers/add' => array(
      'title' => 'Add',
      'description' => 'Add Search Files helper application',
      'page callback' => 'search_files_edit',
      'access arguments' => $admin_access,
      'type' => MENU_LOCAL_TASK,
      'weight' => 1,
    ),
    'admin/config/search/search_files/helpers/autodetect' => array(
      'title' => 'Autodetect',
      'description' => 'Autodetect Search Files helper application',
      'page callback' => 'drupal_get_form',
      'page arguments' => array('search_files_helper_autodetect'),
      'access arguments' => $admin_access,
      'type' => MENU_LOCAL_TASK,
      'weight' => 2,
    ),
    'admin/config/search/search_files/helpers/edit' => array(
      'title' => 'Edit',
      'description' => 'Edit Search Files helper application',
      'page callback' => 'search_files_edit',
      'access arguments' => $admin_access,
      'type' => MENU_CALLBACK,
    ),
    'admin/config/search/search_files/helpers/delete' => array(
      'title' => 'Delete',
      'description' => 'Delete Search Files helper application',
      'page callback' => 'drupal_get_form',
      'page arguments' => array('search_files_helper_confirm_delete_form'),
      'access arguments' => $admin_access,
      'type' => MENU_CALLBACK,
    ),
  );
}

/**
 * Implements hook_permission().
 *
 * @return
 *   The two permissions defined by this module, keyed by machine name: one
 *   for administering the configuration and one for viewing results.
 */
function search_files_permission() {
  $permissions = array();
  $permissions['administer search_files configuration']['title'] = t('Administer Search Files configuration');
  $permissions['view search_files results']['title'] = t('View Search Files results');
  return $permissions;
}

/**
 * Form callback for path admin/config/search/search_files/helpers/autodetect.
 *
 * The form consists of a single "Autodetect" submit button.
 *
 * @param $form
 *   The form array to extend.
 * @param $form_state
 *   The current form state; not used here.
 *
 * @return
 *   The form array with the submit button added.
 */
function search_files_helper_autodetect($form, &$form_state) {
  $form['submit']['#type'] = 'submit';
  $form['submit']['#value'] = t('Autodetect');
  return $form;
}

/**
 * Submit callback for search_files_helper_autodetect().
 *
 * Runs helper autodetection and then sends the user to the helper list.
 *
 * @param $form
 *   The submitted form; not used here.
 * @param $form_state
 *   The form state; its 'redirect' entry is set to the helper list page.
 */
function search_files_helper_autodetect_submit($form, &$form_state) {
  // Detect the helper applications and store them in the database.
  search_files_install_auto_helper_app_configuration();

  // Let the Form API perform the redirect once form processing is complete,
  // instead of ending the request here with drupal_goto().
  $form_state['redirect'] = 'admin/config/search/search_files/helpers/list';
}

/**
 * Detects helper applications and adds to database.
 *
 * When PHP safe mode is on, shell_exec() cannot be used, so two sample
 * helpers (pdftotext and cat, both run through /usr/bin/env) are registered
 * instead. Otherwise each known extraction tool is looked up with "which"
 * and, when found, registered for its file extension and announced with a
 * status message.
 */
function search_files_install_auto_helper_app_configuration() {

  // safe_mode will inhibit shell_exec()
  if (search_files_issafemode()) {
    drupal_set_message(t('Since PHP is running in safe mode we cannot detect all possible helpers and most will have to be added manually.'));

    // load sample helper apps into database
    search_files_helper_db_add(t("Portable Document Format"), "pdf", "/usr/bin/env pdftotext %file% -");
    search_files_helper_db_add(t("Text"), "txt", "/usr/bin/env cat %file%");
    return;
  }

  // On a Mac, make sure the default (MacPorts) install locations are in the
  // search path used by every lookup below.
  $path = '';
  if (php_uname('s') == 'Darwin') {
    $path = 'export PATH=/opt/local/bin:/opt/local/sbin:$PATH;';
  }

  // PDF: prefer pdftotext and fall back to pstotext.
  $location = _search_files_which('pdftotext', $path);
  if ($location) {
    search_files_helper_db_add(t("Portable Document Format"), "pdf", $location . " %file% -");
    drupal_set_message(t('Helper app pdftotext has been detected and configured'));
  }
  else {
    $location = _search_files_which('pstotext', $path);
    if ($location) {
      search_files_helper_db_add(t("PDF"), "pdf", $location . " %file%");
      drupal_set_message(t('Helper app pstotext has been detected and configured'));
    }
  }

  // Plain text.
  $location = _search_files_which('cat', $path);
  if ($location) {
    search_files_helper_db_add(t("Text files"), "txt", $location . " %file%");
    drupal_set_message(t('Helper app cat has been detected and configured'));
  }

  // Word documents.
  $location = _search_files_which('catdoc', $path);
  if ($location) {
    search_files_helper_db_add(t("Word documents"), "doc", $location . " %file%");
    drupal_set_message(t('Helper app catdoc has been detected and configured'));
  }

  // Word 2007 documents: docx2txt.pl is run through perl, so both are needed.
  $location = _search_files_which('docx2txt.pl', $path);
  $perl_location = _search_files_which('perl', $path);
  if ($location && $perl_location) {
    search_files_helper_db_add(t("Word 2007 files"), "docx", $perl_location . ' ' . $location . " %file% -");
    drupal_set_message(t('Helper app docx2txt has been detected and configured'));
  }

  // Excel spreadsheets.
  $location = _search_files_which('xls2csv', $path);
  if ($location) {
    search_files_helper_db_add(t("Excel files"), "xls", $location . " %file%");
    drupal_set_message(t('Helper app xls2csv has been detected and configured'));
  }

  // PowerPoint presentations.
  $location = _search_files_which('catppt', $path);
  if ($location) {
    search_files_helper_db_add(t("Power Point Presentations"), "ppt", $location . " %file%");
    drupal_set_message(t('Helper app catppt has been detected and configured'));
  }

  // Rich Text Format.
  $location = _search_files_which('unrtf', $path);
  if ($location) {
    search_files_helper_db_add(t("Rich Text Format files"), "rtf", $location . " %file%");
    drupal_set_message(t('Helper app unrtf has been detected and configured'));
  }
}

/**
 * Looks up an executable with the "which" shell command.
 *
 * @param $program
 *   Name of the executable to look for.
 * @param $path_prefix
 *   Optional shell snippet run before "which", for example a PATH export.
 *
 * @return
 *   The trimmed output of "which", or an empty string when nothing usable was
 *   printed.
 */
function _search_files_which($program, $path_prefix = '') {
  $output = shell_exec($path_prefix . 'which ' . $program);

  // shell_exec() yields NULL when the command prints nothing or fails.
  $location = is_string($output) ? trim($output) : '';

  // Discard output that starts with "no " -- presumably the "no <program> in
  // <path>" message some "which" implementations print on stdout.
  return preg_replace("/^no .*\$/", "", $location);
}

/**
 * Page callback for admin/config/search/search_files path.
 *
 * @return
 *   HTML consisting of a short description of the module followed by links to
 *   the helper, attachment and directory configuration pages.
 */
function search_files_overview() {
  $html = t("Search Files in<ul><li>Attachments</li><li>Directories</li></ul> and extract their content for index and use with Drupal search.");

  // Link targets mapped to their labels, in display order.
  $links = array(
    'admin/config/search/search_files/helpers' => t('Set up helper applications'),
    'admin/config/search/search_files/attachments' => t('Set up attachments search'),
    'admin/config/search/search_files/directories' => t('Set up directories search'),
  );
  foreach ($links as $target => $label) {
    $html .= '<p>' . l($label, $target) . '</p>';
  }

  return $html;
}

/**
 * Returns an array of helper applications.
 *
 * The result is kept in a static variable, so the helper table is queried
 * only on the first call and later calls reuse the cached array.
 *
 * @return
 *   Array whose keys are extensions and values are the paths to helper apps.
 */
function search_files_get_helpers() {
  static $helpers;

  // Serve repeat calls from the cache to avoid redundant database queries.
  if (isset($helpers)) {
    return $helpers;
  }

  $helpers = array();
  $rows = db_query("SELECT * FROM {search_files_helpers}")->fetchAll();
  foreach ($rows as $row) {
    $helpers[$row->extension] = $row->helper_path;
  }
  return $helpers;
}

/**
 * Checks whether we are running in PHP safe_mode.
 *
 * The "safe_mode" ini value is matched, case-insensitively, against "1" or
 * "on" anywhere in the value.
 *
 * @return
 *   1 if we're in safe mode, 0 if not, FALSE if there's an error.
 */
function search_files_issafemode() {
  $setting = @ini_get("safe_mode");
  return preg_match('/(1|on)/i', $setting);
}

/**
 * Form callback for confirming deletion of a helper application.
 *
 * @param $form
 *   The form array to extend.
 * @param $form_state
 *   The current form state; not used here.
 * @param $helper_id
 *   ID of the helper to delete; it is cast to an integer before use.
 *
 * @return
 *   The confirmation form produced by confirm_form(), carrying the helper ID
 *   and name as value elements for the submit callback.
 */
function search_files_helper_confirm_delete_form($form, &$form_state, $helper_id) {
  $helper_id = intval($helper_id);

  // Look up the helper's name for the confirmation texts.
  $statement = db_query("SELECT name FROM {search_files_helpers} WHERE id = :hid", array(
    ':hid' => $helper_id,
  ));
  $helper_name = $statement->fetchField();

  $form['search_files_helper_id'] = array(
    '#type' => 'value',
    '#value' => $helper_id,
  );
  $form['search_files_helper_name'] = array(
    '#type' => 'value',
    '#value' => $helper_name,
  );

  $question = t('Delete %name helper?', array(
    '%name' => $helper_name,
  ));
  $cancel_path = 'admin/config/search/search_files/helpers/list';
  $description = t('Are you sure you want to delete the %name helper? The text extracted by this helper will remain in the search index until the directory is reindexed. This action cannot be undone.', array(
    '%name' => $helper_name,
  ));

  return confirm_form($form, $question, $cancel_path, $description, t('Delete'), t('Cancel'));
}

/**
 * Submit callback for search_files_helper_confirm_delete_form().
 *
 * Removes the helper application from the database table, reports the
 * deletion and sends the user back to the helper list.
 *
 * @param $form
 *   The submitted form; not used here.
 * @param $form_state
 *   The form state; the helper ID is read from its values and its 'redirect'
 *   entry is set to the helper list page.
 */
function search_files_helper_confirm_delete_form_submit($form, &$form_state) {
  db_delete('search_files_helpers')
    ->condition('id', $form_state['values']['search_files_helper_id'])
    ->execute();
  drupal_set_message(t('Helper has been deleted'));

  // Let the Form API perform the redirect once form processing is complete,
  // instead of ending the request here with drupal_goto().
  $form_state['redirect'] = 'admin/config/search/search_files/helpers/list';
}

/**
 * Returns an HTML table of the current helper apps set up in the system.
 *
 * Each row shows the helper name, its extension, and edit and delete links.
 * Below the table a note states whether PHP safe mode permits running helper
 * applications.
 *
 * @return
 *   The rendered table followed by the safe mode note, as HTML.
 */
function search_files_helper_list() {
  $output = '';
  $header = array(
    t('Helper name'),
    t('Extension'),
    array(
      'data' => t('Operations'),
      'colspan' => '2',
    ),
  );
  $result = db_query("SELECT * FROM {search_files_helpers} ORDER BY extension")
    ->fetchAll();
  $helpers = array();
  foreach ($result as $helper) {
    $helpers[] = array(
      // Names and extensions are entered by administrators; escape them.
      check_plain($helper->name),
      check_plain($helper->extension),
      l(t('edit'), 'admin/config/search/search_files/helpers/edit/' . $helper->id),
      l(t('delete'), 'admin/config/search/search_files/helpers/delete/' . $helper->id),
    );
  }
  $output .= theme('table', array(
    'header' => $header,
    'rows' => $helpers,
    'empty' => t('No helpers are configured yet. Use the Add or Autodetect tab to configure helper functions.'),
  ));

  // safe_mode will inhibit shell_exec()
  if (search_files_issafemode()) {
    $output .= '<p>' . t('<b>WARNING!</b> This server has safe_mode enabled, which inhibits use of helper applications') . '</p>';
  }
  else {
    $output .= '<p>' . t("Good. This server has safe_mode disabled, which allows use of helper applications.") . '</p>';
  }
  return $output;
}

/**
 * Edits or adds a helper.
 *
 * @param $hid
 *   ID of the helper to edit, or 0 (the default) to add a new helper.
 *
 * @return
 *   The helper form from drupal_get_form(), or MENU_NOT_FOUND when $hid does
 *   not match a stored helper.
 */
function search_files_edit($hid = 0) {
  if (!$hid) {
    return drupal_get_form('search_files_helper_edit_form');
  }

  $values = db_query('SELECT * FROM {search_files_helpers} WHERE id = :id', array(
    ':id' => $hid,
  ))
    ->fetchAssoc();

  // fetchAssoc() returns FALSE for an unknown ID; passing that to the form
  // builder would make it fail when it merges in its defaults.
  if (!$values) {
    return MENU_NOT_FOUND;
  }
  return drupal_get_form('search_files_helper_edit_form', $values);
}

/**
 * Form callback for adding/editing a file helper.
 *
 * @param $form
 *   The form array to extend.
 * @param $form_state
 *   The current form state; not used here.
 * @param $values
 *   Optional stored helper record with the keys 'id', 'name', 'extension' and
 *   'helper_path'. Missing keys fall back to sample values; a non-zero 'id'
 *   switches the form to edit mode.
 *
 * @return
 *   The form array. In edit mode it carries the helper ID and an "Update"
 *   button; in add mode it has two save buttons and the uniqueness validator.
 */
function search_files_helper_edit_form($form, &$form_state, $values = array()) {
  // Tolerate callers that pass the FALSE of a failed database lookup.
  if (!is_array($values)) {
    $values = array();
  }
  $values += array(
    'name' => t('Sample extractor'),
    'extension' => 'foo',
    'helper_path' => '/opt/bin/foo-extract %file%',
    'id' => 0,
  );
  $form['search_files_name'] = array(
    '#type' => 'textfield',
    '#title' => t('Helper name'),
    '#size' => 50,
    '#maxlength' => 50,
    '#default_value' => $values['name'],
    '#required' => TRUE,
    '#description' => t('A name for this helper configuration.'),
  );
  $form['search_files_extension'] = array(
    '#type' => 'textfield',
    '#title' => t('Extension'),
    '#size' => 10,
    '#maxlength' => 10,
    '#default_value' => $values['extension'],
    '#required' => TRUE,
    '#description' => t('Enter the extension for the files that you want the helper application to process. Do not include the period.'),
  );
  $form['search_files_helper_path'] = array(
    '#type' => 'textfield',
    '#title' => t('Helper path'),
    '#size' => 100,
    '#maxlength' => 255,
    '#default_value' => $values['helper_path'],
    // Per-element validators are registered through #element_validate in
    // Drupal 7; a #validate entry on an element is ignored.
    '#element_validate' => array(
      'search_files_helpers_validate_add_edit',
    ),
    '#required' => TRUE,
    '#description' => t('Enter the path to the helper application installed on your server. "%file%" is a placeholder for the path of the attachment file and is required. Include any command-line parameters as well (for example, pdftotext requires a - after the file to be processed).'),
  );
  if ($values['id']) {

    // editing existing
    $form['search_files_id'] = array(
      '#type' => 'value',
      '#value' => $values['id'],
    );
    $form['submit_done'] = array(
      '#type' => 'submit',
      '#value' => t('Update'),
    );
  }
  else {

    // adding a new one
    $form['submit_done'] = array(
      '#type' => 'submit',
      '#value' => t('Save and done'),
    );
    $form['submit'] = array(
      '#type' => 'submit',
      '#value' => t('Save and add another'),
    );

    // Name and extension uniqueness is only checked for new helpers.
    $form['#validate'][] = 'search_files_helper_add_form_validate';
  }
  return $form;
}

/**
 * Validate callback for the helper path field of the add/edit helper form.
 *
 * Flags the field when its value lacks the %file% token, or when the program
 * named at the start of the value cannot be found on disk.
 *
 * @param $field
 *   The form element being validated; its '#value' and '#title' are read.
 */
function search_files_helpers_validate_add_edit($field) {
  if (strpos($field['#value'], '%file%') === FALSE) {
    // form_error() attaches the message to the element itself, rather than to
    // a name derived from its human-readable title.
    form_error($field, t('"%field" must contain the token %file%', array(
      '%field' => $field['#title'],
    )));
  }

  // Check to see if helper app can be found. Everything from the first
  // whitespace on is treated as command-line arguments and dropped.
  $helper_file = preg_replace('/\\s.+$/', '', $field['#value']);
  if (!file_exists($helper_file)) {
    form_error($field, t("Can't find helper app %helper -- please verify it is installed.", array(
      '%helper' => $helper_file,
    )));
  }
}

/**
 * Submit callback for search_files_helper_edit_form().
 *
 * Updates the stored helper when the form carries a helper ID, otherwise
 * inserts a new helper. After an update, or after "Save and done", the user
 * is sent to the helper list; "Save and add another" sets no redirect.
 *
 * @param $form
 *   The submitted form; not used here.
 * @param $form_state
 *   The form state; the helper fields are read from its values, and its
 *   'redirect' entry is set when the user should return to the helper list.
 */
function search_files_helper_edit_form_submit($form, &$form_state) {
  $values = $form_state['values'];
  $list_path = 'admin/config/search/search_files/helpers/list';

  $id = 0;
  if (isset($values['search_files_id'])) {
    $id = intval($values['search_files_id']);
  }

  if ($id) {
    $num = db_update('search_files_helpers')
      ->fields(array(
        'name' => $values['search_files_name'],
        'extension' => $values['search_files_extension'],
        'helper_path' => $values['search_files_helper_path'],
      ))
      ->condition('id', $id)
      ->execute();
    if ($num) {
      drupal_set_message(t('Helper app %helper_name has been updated', array(
        '%helper_name' => $values['search_files_name'],
      )));
    }
    else {
      // Mark failures as errors so they are not displayed as status messages.
      drupal_set_message(t('An error occurred'), 'error');
    }

    // Let the Form API perform the redirect once form processing is complete,
    // instead of ending the request here with drupal_goto().
    $form_state['redirect'] = $list_path;
    return;
  }

  $result = search_files_helper_db_add($values['search_files_name'], $values['search_files_extension'], $values['search_files_helper_path']);
  if ($result) {
    drupal_set_message(t('%helper helper added', array(
      '%helper' => $values['search_files_name'],
    )));
  }
  else {
    drupal_set_message(t('An error occurred'), 'error');
  }

  // Only "Save and done" returns to the list.
  if ($form_state['clicked_button']['#id'] == 'edit-submit-done') {
    $form_state['redirect'] = $list_path;
  }
}

/**
 * Validates uniqueness of name and extension.
 *
 * Looks for another helper (one with a different ID) that already uses the
 * submitted name or extension, and flags the corresponding field.
 *
 * @param $form
 *   The submitted form; not used here.
 * @param $form_state
 *   The form state; the helper name, extension and optional ID are read from
 *   its values.
 */
function search_files_helper_add_form_validate($form, $form_state) {
  // The edit form stores the helper ID under 'search_files_id'.
  $id = 0;
  if (isset($form_state['values']['search_files_id'])) {
    $id = intval($form_state['values']['search_files_id']);
  }

  $name = $form_state['values']['search_files_name'];
  $sql = "SELECT * FROM {search_files_helpers} WHERE name = :name AND id <> :id";
  $same_name = db_query($sql, array(
    ':name' => $name,
    ':id' => $id,
  ))
    ->fetchAll();
  if (!empty($same_name)) {
    // Each error is set on its own element: form_set_error() keeps only the
    // first message per element name, and the name selects the highlighted
    // field.
    form_set_error('search_files_name', t('Helper name already in list'));
  }

  $extension = $form_state['values']['search_files_extension'];
  $sql = "SELECT * FROM {search_files_helpers} WHERE extension = :ext AND id <> :id";
  $same_extension = db_query($sql, array(
    ':ext' => $extension,
    ':id' => $id,
  ))
    ->fetchAll();
  if (!empty($same_extension)) {
    form_set_error('search_files_extension', t('Extension already in list'));
  }
}

/**
 * Inserts a helper into the database.
 *
 * Any helper already stored for the same extension is deleted first, so an
 * extension never ends up with more than one helper.
 *
 * @param $name
 *   Human-readable name of the helper.
 * @param $extension
 *   File extension the helper handles.
 * @param $helper_path
 *   Command line of the helper.
 *
 * @return
 *   Whatever the insert query's execute() call returns.
 */
function search_files_helper_db_add($name, $extension, $helper_path) {
  // Clear out the previous helper for this extension, if there is one.
  $removal = db_delete('search_files_helpers');
  $removal->condition('extension', $extension);
  $removal->execute();

  $record = array(
    'name' => $name,
    'extension' => $extension,
    'helper_path' => $helper_path,
  );
  return db_insert('search_files_helpers')
    ->fields($record)
    ->execute();
}

/**
 * Converts text to UTF-8 from whatever encoding mb_detect_encoding() reports.
 *
 * @param $text
 *   The text to convert.
 *
 * @return
 *   The result of drupal_convert_to_utf8() for the detected encoding, or the
 *   text unchanged when no encoding could be detected.
 */
function search_files_convert_to_utf8($text) {
  $encoding = mb_detect_encoding($text);

  // Detection can fail; FALSE is not a usable encoding name for conversion.
  if ($encoding === FALSE) {
    return $text;
  }
  return drupal_convert_to_utf8($text, $encoding);
}

/**
 * Returns an array of file helper extensions and names.
 *
 * @return
 *   Array whose keys are file extensions, and whose values are the
 *   helper names assigned to them.
 */
function search_files_get_file_extensions() {
  // The first selected column becomes the key and the second the value.
  $statement = db_query("SELECT extension, name FROM {search_files_helpers}");
  return $statement->fetchAllKeyed();
}

/**
 * Returns the name of a helper for a given extension.
 *
 * @param $ext
 *   Extension to find name of.
 *
 * @return
 *   The helper name stored for the extension, or NULL when the lookup yields
 *   no name (or an empty one).
 */
function search_files_helper_name($ext) {
  $statement = db_query("SELECT name FROM {search_files_helpers} WHERE extension = :ext", array(
    ':ext' => $ext,
  ));
  $name = $statement->fetchField();
  return $name ? $name : NULL;
}

/**
 * Updates the search totals for a given search type.
 *
 * Every word stored in the search dataset for the given type is flagged as
 * "dirty" (its search totals need rebuilding), after which the totals are
 * rebuilt.
 *
 * NOTE TO MAINTAINER OF THIS MODULE: The search_dirty() part of this function
 * should not be necessary, because search_index() already calls
 * search_dirty() on all indexed content. It has been left as-is for now;
 * calling search_update_totals() alone is probably enough.
 *
 * @param $type
 *   The search type (search module) to update.
 */
function search_files_update_totals($type) {
  $statement = db_query("SELECT data FROM {search_dataset} WHERE type = :type", array(
    ':type' => $type,
  ));

  foreach ($statement->fetchAll() as $dataset) {
    // The dataset text is split on single spaces; every piece is flagged.
    array_map('search_dirty', explode(" ", $dataset->data));
  }

  search_update_totals();
}

/**
 * Returns the text of a file, extracted with the helper for its extension.
 *
 * The helper registered for the file's extension is run with the file's real
 * path substituted for the %file% placeholder, and whatever it prints on
 * stdout is taken as the file text. Helper error output is appended to
 * /tmp/error-output.txt.
 *
 * @param $path
 *   The system path of the file to read.
 *
 * @return
 *   A string of the form 'file name: "<path>", text: <extracted text>'. The
 *   text part is empty when the file cannot be resolved, no helper is
 *   registered for its extension, or the helper could not be started.
 */
function search_files_get_content($path) {
  $helpers = search_files_get_helpers();

  // The extension is whatever follows the last dot of the last path segment.
  $segments = explode('/', $path);
  $name_parts = explode('.', end($segments));
  $file_extension = end($name_parts);

  $real_path = realpath($path);
  if ($real_path === FALSE) {
    $real_path = '';
  }

  // This quoted form is only used in the returned string; it is kept as it
  // was so the output format does not change.
  $quoted_file_path = '"' . escapeshellcmd($real_path) . '"';

  $text = '';
  if ($real_path !== '' && !empty($helpers[$file_extension])) {
    // Drop quotes written around the placeholder in the stored command;
    // escapeshellarg() supplies the quoting.
    $template = preg_replace('/(["\'])%file%\1/', '%file%', $helpers[$file_extension]);

    // str_replace() inserts the path literally; preg_replace() would treat
    // "$1" or "\1" inside a path as a backreference.
    $helper_command = str_replace('%file%', escapeshellarg($real_path), $template);

    $descriptorspec = array(
      // stdin is a pipe that the child will read from
      0 => array("pipe", "r"),
      // stdout is a pipe that the child will write to
      1 => array("pipe", "w"),
      // stderr is appended to a file
      2 => array("file", "/tmp/error-output.txt", "a"),
    );
    $process = proc_open($helper_command, $descriptorspec, $pipes, '/tmp', NULL);
    if (is_resource($process)) {
      // Nothing is sent to the helper: close its stdin right away so a helper
      // that reads stdin sees end-of-file instead of waiting.
      fclose($pipes[0]);

      $output = stream_get_contents($pipes[1]);
      fclose($pipes[1]);

      // All pipes must be closed before proc_close() to avoid a deadlock.
      proc_close($process);

      if ($output !== FALSE) {
        $text = $output;
      }
    }
  }

  return 'file name: ' . $quoted_file_path . ', text: ' . $text;
}

Functions

Name (sort descending) Description
search_files_convert_to_utf8 Converts text to UTF8 using the system's character encoding.
search_files_edit Edits or adds a helper.
search_files_get_content Returns the file text using the appropriate helper.
search_files_get_file_extensions Returns an array of file helper extensions and names.
search_files_get_helpers Returns an array of helper applications.
search_files_help Implements hook_help().
search_files_helpers_validate_add_edit Validate callback for search_files_helper_add_form field.
search_files_helper_add_form_validate Validates uniqueness of name and extension.
search_files_helper_autodetect Form callback for path admin/config/search/search_files/helpers/autodetect.
search_files_helper_autodetect_submit Submit callback for search_files_helper_autodetect().
search_files_helper_confirm_delete_form Form callback for confirming deletion of a helper application.
search_files_helper_confirm_delete_form_submit Submit callback for search_files_helper_confirm_delete_form().
search_files_helper_db_add Inserts a helper into the database.
search_files_helper_edit_form Form callback for adding/editing a file helper.
search_files_helper_edit_form_submit Updates the helper to the database.
search_files_helper_list Returns an HTML table of the current helper apps set up in the system.
search_files_helper_name Returns the name of a helper for a given extension.
search_files_install_auto_helper_app_configuration Detects helper applications and adds to database.
search_files_issafemode Checks whether we are running in PHP safe_mode.
search_files_menu Implements hook_menu().
search_files_overview Page callback for admin/config/search/search_files path.
search_files_permission Implements hook_permission().
search_files_update_totals Updates the search totals for a given search type.