<?php

/**
 * @file
 * Provides a file attachment search implementation for use with the Apache
 * Solr module.
 */

 * Menu callback: Apache Solr Attachments settings tab.
function apachesolr_attachments_admin_page($environment = NULL) {
  if (empty($environment)) {
    $env_id = apachesolr_default_environment();
    $environment = apachesolr_environment_load($env_id);
  else {
    $env_id = $environment['env_id'];
  $output['apachesolr_attachments_settings'] = drupal_get_form('apachesolr_attachments_settings', $env_id);
  $output['apachesolr_attachments_index_action_form'] = drupal_get_form('apachesolr_attachments_index_action_form', $env_id);
  return $output;

 * Displays the Attachment Settings Form.
 * @see apachesolr_attachments_settings_validate()
 * @see apachesolr_attachments_settings_submit()
function apachesolr_attachments_settings($form, &$form_state, $env_id) {
  $default = implode(' ', apachesolr_attachments_default_excluded());
  $form['apachesolr_attachments_excluded_extensions'] = array(
    '#type' => 'textfield',
    '#title' => t('Excluded file extensions'),
    '#default_value' => variable_get('apachesolr_attachments_excluded_extensions', $default),
    '#size' => 80,
    '#maxlength' => 255,
    '#description' => t('File extensions that are excluded from indexing. Separate extensions with a space and do not include the leading dot. Extensions are internally mapped to a MIME type, so it is not necessary to put variations that map to the same type (e.g. tif is sufficient for tif and tiff)'),
  $form['apachesolr_attachments_extract_using'] = array(
    '#type' => 'radios',
    '#title' => t('Extract using'),
    '#options' => array(
      'tika' => t('Tika (local java application)'),
      'solr' => t('Solr (remote server)'),
    '#description' => t("Extraction will be faster if run locally using tika."),
    '#default_value' => variable_get('apachesolr_attachments_extract_using', 'tika'),
  $form['apachesolr_attachments_tika_path'] = array(
    '#type' => 'textfield',
    '#title' => t('Tika directory path'),
    '#size' => 80,
    '#maxlength' => 100,
    '#description' => t("The full path to the tika directory. All library jars must be in the same directory. If on Windows, use forward slashes in the path."),
    '#default_value' => variable_get('apachesolr_attachments_tika_path', ''),
  $form['apachesolr_attachments_tika_jar'] = array(
    '#type' => 'textfield',
    '#title' => t('Tika jar file'),
    '#size' => 20,
    '#description' => t("The name of the tika CLI application jar file, e.g. tika-app-1.1.jar."),
    '#default_value' => variable_get('apachesolr_attachments_tika_jar', 'tika-app-1.1.jar'),
  $form = system_settings_form($form);
  $form['#validate'][] = 'apachesolr_attachments_settings_validate';
  $form['#submit'][] = 'apachesolr_attachments_settings_submit';
  return $form;

 * Form validation for the Apache Solr Attachments settings form.
 * @see apachesolr_attachments_settings()
function apachesolr_attachments_settings_validate($form, &$form_state) {
  if ($form_state['values']['apachesolr_attachments_extract_using'] == 'tika') {
    $path = realpath($form_state['values']['apachesolr_attachments_tika_path']);
    if (!file_exists($path . '/' . $form_state['values']['apachesolr_attachments_tika_jar'])) {
      form_set_error('apachesolr_attachments_tika_path', t('Tika jar file not found at this path.'));

 * Form submit handler for the settings Form
 * @see apachesolr_attachments_settings()
function apachesolr_attachments_settings_submit($form, &$form_state) {

  // Delete this so it's rebuilt.
  drupal_set_message(t('If you changed the allowed file extensions, you may need to delete and re-index all attachments.'));

 * Form builder for the Apachesolr Attachments actions form.
function apachesolr_attachments_index_action_form($form, &$form_state, $env_id) {
  $form = array();
  $form['action'] = array(
    '#type' => 'fieldset',
    '#title' => t('Actions'),
    '#collapsible' => TRUE,
  $form['action']['env_id'] = array(
    '#type' => 'value',
    '#value' => $env_id,
  $form['action']['reset'] = array(
    '#prefix' => '<div>',
    '#suffix' => '</div>',
    '#type' => 'submit',
    '#value' => t('Clear the attachment text extraction cache'),
    '#submit' => array(
  $form['action']['delete'] = array(
    '#prefix' => '<div>',
    '#suffix' => '</div>',
    '#type' => 'submit',
    '#value' => t('Delete the attachments from the index'),
    '#submit' => array(
  $form['action']['extract'] = array(
    '#prefix' => '<div>',
    '#suffix' => '</div>',
    '#type' => 'submit',
    '#value' => t('Test your tika extraction'),
    '#submit' => array(
  return $form;

/**
 * Submit handler for the Indexer actions form, test button.
 */
function apachesolr_attachments_index_action_form_extraction_submit($form, &$form_state) {
  // Pick up a "destination" query parameter, if one was supplied, so it can
  // be forwarded as the query of the redirect set below.
  $destination = array();
  if (isset($_GET['destination'])) {
    $destination = drupal_get_destination();
  // NOTE(review): the closing brace of the if-block above is not present in
  // this view of the file — confirm against the full source.
  // The environment ID the form builder stored as a 'value' element.
  $env_id = $form_state['values']['env_id'];
  // NOTE(review): the redirect array below shows only the 'query' option; the
  // target path element, the closing parentheses and the function's closing
  // brace are not visible here. $env_id is not used in the visible lines —
  // presumably it belongs to the missing path; verify.
  $form_state['redirect'] = array(
      'query' => $destination,

/**
 * Submit handler for the Indexer actions form, reset button.
 */
function apachesolr_attachments_index_action_form_reset_submit($form, &$form_state) {
  // Pick up a "destination" query parameter, if one was supplied, so it can
  // be forwarded as the query of the redirect set below.
  $destination = array();
  if (isset($_GET['destination'])) {
    $destination = drupal_get_destination();
  // NOTE(review): the closing brace of the if-block above is not present in
  // this view of the file — confirm against the full source.
  // The environment ID the form builder stored as a 'value' element.
  $env_id = $form_state['values']['env_id'];
  // NOTE(review): the redirect array below shows only the 'query' option; the
  // target path element, the closing parentheses and the function's closing
  // brace are not visible here. $env_id is not used in the visible lines —
  // presumably it belongs to the missing path; verify.
  $form_state['redirect'] = array(
      'query' => $destination,

/**
 * Submit handler for the Indexer actions form, delete button.
 */
function apachesolr_attachments_index_action_form_delete_submit($form, &$form_state) {
  // Pick up a "destination" query parameter, if one was supplied, so it can
  // be forwarded as the query of the redirect set below.
  $destination = array();
  if (isset($_GET['destination'])) {
    $destination = drupal_get_destination();
  // NOTE(review): the closing brace of the if-block above is not present in
  // this view of the file — confirm against the full source.
  // The environment ID the form builder stored as a 'value' element.
  $env_id = $form_state['values']['env_id'];
  // NOTE(review): the redirect array below shows only the 'query' option; the
  // target path element, the closing parentheses and the function's closing
  // brace are not visible here. $env_id is not used in the visible lines —
  // presumably it belongs to the missing path; verify.
  $form_state['redirect'] = array(
      'query' => $destination,

 * Index confirmation form
 * @see apachesolr_attachments_confirm_submit()
function apachesolr_attachments_confirm($form, $form_state, $operation) {
  $form = array();
  $form['operation'] = array(
    '#type' => 'value',
    '#value' => $operation,
  switch ($operation) {
    case 'delete':
      $text = t('Are you sure you want to delete and re-index the text of all file attachments?');
    case 'clear-cache':
      $text = t('Are you sure you want to delete the cache of extracted text from file attachments?');
  return confirm_form($form, $text, 'admin/config/search/apachesolr/attachments', NULL, t('Confirm'), t('Cancel'));

 * Form submit handler for the index confirmation form
 * @see apachesolr_attachments_confirm()
function apachesolr_attachments_confirm_submit($form, &$form_state) {
  switch ($form_state['values']['operation']) {
    case 'delete':
      if (apachesolr_attachments_delete_index() && apachesolr_attachments_solr_reindex()) {
        drupal_set_message(t('File text has been deleted from the Apache Solr index. You must now <a href="@url">run cron</a> until all files have been re-indexed.', array(
          '@url' => url('admin/reports/status/run-cron', array(
            'query' => array(
              'destination' => 'admin/config/search/apachesolr/index',
      else {
        if (module_exists('dblog')) {
          drupal_set_message(t('Could not delete file text from the Apache Solr index. Check <a href="@url">recent log messages</a>.', array(
            '@url' => url('admin/reports/dblog'),
        else {
          drupal_set_message(t('Could not delete file text from the Apache Solr index.'));
    case 'clear-cache':
      drupal_set_message(t('The local cache of extracted text has been deleted.'));
  $form_state['redirect'] = 'admin/config/search/apachesolr/attachments';

 * Function to test if our extracting with tika succeeds
function apachesolr_attachments_test_tika_extraction() {
  module_load_include('inc', 'apachesolr_attachments', 'apachesolr_attachments.index');
  $indexer_table = apachesolr_get_indexer_table('file');

  // Create new file
  $file = new stdClass();
  $file->uri = drupal_get_path('module', 'apachesolr_attachments') . '/tests/test-tika.pdf';
  $file->filemime = 'application/pdf';
  $file->fid = 0;
  $text = apachesolr_attachments_get_attachment_text($file);

  // Check if the text can be succesfully extracted. Only checking 1 word is
  // sufficient
  if (strpos($text, 'extraction')) {
    drupal_set_message(t('Text can be succesfully extracted'));
  else {
    drupal_set_message(t('Text can not be succesfully extracted. Please check your settings'), 'error');

  // Delete our test file from indexing table
    ->condition('entity_id', $file->fid)

 * @see apachesolr_delete_index()
function apachesolr_attachments_delete_index() {
  try {
    $solr = apachesolr_get_solr();
      ->deleteByQuery("entity_type:file AND hash:" . apachesolr_site_hash());
    module_load_include('inc', 'apachesolr', 'apachesolr.index');
    return TRUE;
  } catch (Exception $e) {
    watchdog('Apache Solr Attachments', nl2br(check_plain($e
      ->getMessage())), NULL, WATCHDOG_ERROR);
  return FALSE;


