You are here

function boost_crawler_run in Boost 6

Same name and namespace in other branches
  1. 7 boost_crawler/boost_crawler.module \boost_crawler_run()

The brains of the crawler.

Parameters

$expire: Has the site changed? If so, get the expire column.

1 call to boost_crawler_run()
boost_cron in ./boost.module
Implementation of hook_cron(). Performs periodic actions.
1 string reference to 'boost_crawler_run'
boost_menu in ./boost.module
Implementation of hook_menu().

File

./boost.module, line 5729
Provides static file caching for Drupal text output: pages, feeds, etc.

Code

/**
 * The brains of the crawler.
 *
 * Behaves in one of two ways depending on how it is reached:
 * - When the request path ($_GET['q']) is 'boost-crawler', this request acts
 *   as one crawler thread: it validates the key from the URL, honours the stop
 *   flag, waits while cron is still running, seeds the crawler table, spins up
 *   sibling threads (thread 1 only), fetches one batch of URLs and then calls
 *   itself again asynchronously. Almost every path in this mode ends in exit.
 * - Otherwise (e.g. called from boost_cron()) it resets the crawler state and
 *   fires the first asynchronous crawler request, provided no crawler is
 *   already running and BOOST_CRAWL_ON_CRON is enabled.
 *
 * @param $expire
 *   Has the site changed? If so, get the expire column. When left at the
 *   default of -1, a numeric $_GET['expire'] is used instead. The value is
 *   handed to boost_crawler_seed_tables(), boost_crawler_verify() and
 *   boost_async_call_crawler().
 *
 * @return
 *   TRUE when the crawler was started from cron, or when a crawler thread
 *   found no more URLs and no restart was needed; FALSE when
 *   BOOST_CRAWL_ON_CRON is off; NULL in every other non-exiting case.
 */
function boost_crawler_run($expire = -1) {
  global $_boost;
  $this_thread = isset($_GET['thread']) && is_numeric($_GET['thread']) ? $_GET['thread'] : NULL;
  $total_threads = isset($_GET['total']) && is_numeric($_GET['total']) ? $_GET['total'] : NULL;
  $expire = $expire == -1 && isset($_GET['expire']) && is_numeric($_GET['expire']) ? $_GET['expire'] : $expire;
  $self = BOOST_CRAWLER_SELF;

  // Remember the original ini values so they can be put back before exiting.
  $GLOBALS['_boost_max_execution_time'] = ini_get('max_execution_time');
  $GLOBALS['_boost_output_buffering'] = ini_get('output_buffering');

  if (isset($_GET['q']) && $_GET['q'] == 'boost-crawler') {

    // If not called via cron, require key to be present in url. The check
    // fails closed: with no key configured nothing is let through, and the
    // comparison is strict so that a missing, empty or '0' key can never
    // match an unset (FALSE) variable.
    $crawler_key = (string) variable_get('boost_crawler_key', '');
    $given_key = isset($_GET['key']) && is_string($_GET['key']) ? $_GET['key'] : '';
    if ($crawler_key === '' || $given_key !== $crawler_key) {
      drupal_access_denied();
      exit;
    }

    // Test for access on status page.
    if (!empty($_GET['test'])) {
      echo '<h1>OK</h1>';
      exit;
    }

    // Stop button code.
    if (_boost_variable_get('boost_crawler_stopped')) {

      // Wait 0 to 0.1 seconds before grabbing number of threads.
      usleep(mt_rand(0, 100000));
      db_lock_table('variable');
      $threads = _boost_variable_get('boost_crawler_number_of_threads');
      _boost_variable_set('boost_crawler_number_of_threads', (int) $threads - 1);

      // Clock out.
      _boost_variable_set('boost_crawler_thread_num_' . $this_thread, 0);
      db_unlock_tables();
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_stop'])) {
        watchdog('boost', 'Crawler - Thread %num stopped.', array(
          '%num' => $this_thread,
        ));
      }
      ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
      ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
      exit;
    }

    // Kill this thread if it doesn't have a thread number assigned to it.
    if (!isset($this_thread)) {
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_rogue'])) {
        watchdog('boost', 'Crawler - Rogue thread killed.');
      }
      exit;
    }

    // Try to prevent crawler from stalling.
    ini_set('max_execution_time', 600);

    // Return html so connection closes.
    boost_async_opp('async');

    // Turn off output buffer.
    ini_set('output_buffering', 'off');

    // Fetch the cron semaphore.
    $semaphore = variable_get('cron_semaphore', FALSE);

    // Wait 15 seconds if cron is still running and try again (let cron
    // finish); if longer than 5 minutes stop stalling and start crawling.
    if ($semaphore == TRUE && BOOST_TIME - $semaphore < 300) {
      if (_boost_variable_get('boost_crawler_sleeping')) {

        // Kill this thread; multiple crawlers sleeping.
        ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
        ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
        exit;
      }
      _boost_variable_set('boost_crawler_sleeping', TRUE);
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_sleep'])) {
        watchdog('boost', 'Crawler Sleep for 15 seconds');
      }
      sleep(15);
      _boost_variable_set('boost_crawler_sleeping', FALSE);
      boost_async_call_crawler($self, 1, NULL, $expire);
      exit;
    }

    // Crawler was forced to stop last run, wait extra time before starting up
    // again.
    // NOTE(review): $this_thread is always set at this point because the
    // rogue-thread guard above exits otherwise, so this branch can never run
    // as written. Kept unchanged; confirm the intended ordering before
    // removing or moving it.
    if (variable_get('boost_crawler_stopped', FALSE) && !isset($this_thread) && !isset($total_threads)) {
      if (_boost_variable_get('boost_crawler_sleeping')) {
        ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
        ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
        exit;
      }
      _boost_variable_set('boost_crawler_sleeping', TRUE);
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_shutdown'])) {
        watchdog('boost', 'Crawler sleeping for @x seconds, do to forced shutdown.', array(
          '@x' => 2 * BOOST_CRAWLER_THREADS * BOOST_CRAWLER_BATCH_SIZE,
        ));
      }

      // Sleep in slices, resetting the time limit before each one.
      $i = BOOST_CRAWLER_BATCH_SIZE;
      while ($i > 0) {
        _boost_set_time_limit(0);
        sleep(2 * BOOST_CRAWLER_THREADS);
        $i--;
      }
      variable_set('boost_crawler_stopped', FALSE);
      _boost_variable_set('boost_crawler_sleeping', FALSE);
      boost_async_call_crawler($self, 1, NULL, $expire);
      exit;
    }

    // Add URL's to crawler table, call self and exit.
    if (!boost_crawler_seed_tables($expire)) {
      boost_async_call_crawler($self, $this_thread, _boost_variable_get('boost_crawler_number_of_threads'), $expire);
      exit;
    }

    // Calc threads: use the stored count (or the configured maximum), but
    // scale down when there are fewer batches of work than threads.
    $total = boost_crawler_total_count() - BOOST_CRAWLER_BATCH_SIZE;
    $threads = _boost_variable_get('boost_crawler_number_of_threads');
    $threads = $threads > 0 ? $threads : BOOST_CRAWLER_THREADS;
    if ($total / BOOST_CRAWLER_BATCH_SIZE < BOOST_CRAWLER_THREADS) {
      $threads = floor($total / BOOST_CRAWLER_BATCH_SIZE);
    }

    // Sanity check.
    if (abs($threads) > BOOST_CRAWLER_THREADS) {

      // Kill this thread.
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_kill'])) {
        watchdog('boost', 'Crawler - Thread %num of %total Killed.', array(
          '%num' => $this_thread,
          '%total' => $total_threads,
        ));
      }
      ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
      ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
      exit;
    }

    // Start the clock on first run.
    if (!_boost_variable_get('boost_crawler_start_time')) {
      _boost_variable_set('boost_crawler_start_time', BOOST_TIME);
      _boost_variable_set('boost_crawler_number_of_threads', (int) $threads);

      // Clock in.
      _boost_variable_set('boost_crawler_thread_num_' . $this_thread, BOOST_TIME);
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_startup'])) {
        watchdog('boost', 'Crawler - Thread @num of @total started', array(
          '@num' => 1,
          '@total' => $threads,
        ));
      }
    }

    // Spin up threads on demand. Only thread 1 does this. A slot is (re)used
    // when it has no clock-in time or its clock-in time is older than
    // BOOST_MAX_THREAD_TIME.
    while ($threads > 0 && $this_thread == 1) {
      db_lock_table('variable');
      $thread_time = _boost_variable_get('boost_crawler_thread_num_' . $threads);
      $spawn = !$thread_time || $thread_time + BOOST_MAX_THREAD_TIME < BOOST_TIME;
      if ($spawn) {
        _boost_variable_set('boost_crawler_thread_num_' . $threads, BOOST_TIME);
      }

      // Release the lock exactly once, before any further work is done.
      db_unlock_tables();
      if ($spawn) {
        boost_async_call_crawler($self, $threads, _boost_variable_get('boost_crawler_number_of_threads'), $expire);
        if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_startup'])) {
          watchdog('boost', 'Crawler - Thread @num of @total started', array(
            '@num' => $threads,
            '@total' => _boost_variable_get('boost_crawler_number_of_threads'),
          ));
        }
        _boost_set_time_limit(0);
      }
      $threads--;
    }

    // Make sure this thread is supposed to be running.
    $thread = _boost_variable_get('boost_crawler_number_of_threads');
    if ($thread >= 1 && $this_thread > $thread) {

      // Clock out.
      if (isset($this_thread)) {
        _boost_variable_set('boost_crawler_thread_num_' . $this_thread, 0);
        if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_kill'])) {
          watchdog('boost', 'Crawler - Thread %num of %total Killed.', array(
            '%num' => $this_thread,
            '%total' => $total_threads,
          ));
        }
      }

      // Last thread standing: re-init the crawler if it missed some, up to 3
      // times.
      if (!boost_crawler_threads_alive() && _boost_variable_get('boost_crawler_number_of_tries') < 3 && boost_crawler_verify($expire)) {
        variable_set('boost_crawler_number_of_tries', (int) _boost_variable_get('boost_crawler_number_of_tries') + 1);
        _boost_variable_set('boost_crawler_number_of_threads', 1);
        if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_restart'])) {
          watchdog('boost', 'Crawler - Restarting with 1 thread, to try & get the stubborn urls cached.');
        }
        boost_async_call_crawler($self, 1, 1, $expire);
        exit;
      }
      ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
      ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
      exit;
    }

    // Clock in.
    _boost_variable_set('boost_crawler_thread_num_' . $this_thread, BOOST_TIME);

    // Wait 0 to 0.1 seconds before grabbing DB position counter, then claim
    // the next batch by advancing the shared position under the table lock.
    usleep(mt_rand(0, 100000));
    db_lock_table('variable');
    $from = _boost_variable_get('boost_crawler_position');
    _boost_variable_set('boost_crawler_position', $from + BOOST_CRAWLER_BATCH_SIZE);
    db_unlock_tables();
    $results = db_query_range("SELECT DISTINCT hash, url FROM {boost_crawler}", $from, BOOST_CRAWLER_BATCH_SIZE);
    $url = db_fetch_array($results);
    if (!$url) {

      // We are done.
      // Wait 0 to 0.1 seconds before grabbing number of threads.
      usleep(mt_rand(0, 100000));
      db_lock_table('variable');
      $threads = _boost_variable_get('boost_crawler_number_of_threads');
      _boost_variable_set('boost_crawler_number_of_threads', (int) $threads - 1);

      // Clock out.
      _boost_variable_set('boost_crawler_thread_num_' . $this_thread, 0);
      db_unlock_tables();
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_done'])) {
        watchdog('boost', 'Crawler - Thread %num of %total Done.', array(
          '%num' => $this_thread,
          '%total' => $total_threads,
        ));
      }

      // Re-init crawler if it missed some, try 3 times.
      if (!boost_crawler_threads_alive() && _boost_variable_get('boost_crawler_number_of_tries') < 3 && boost_crawler_verify($expire)) {
        variable_set('boost_crawler_number_of_tries', (int) _boost_variable_get('boost_crawler_number_of_tries') + 1);
        _boost_variable_set('boost_crawler_number_of_threads', 1);
        if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_restart'])) {
          watchdog('boost', 'Crawler - Restarting with 1 thread, to try & get the stubborn urls cached.');
        }
        boost_async_call_crawler($self, 1, 1, $expire);
        exit;
      }
      return TRUE;
    }

    // Crawl the batch. $url already holds the first row, hence do/while.
    do {

      // Delete page right before crawling it.
      if (!BOOST_OVERWRITE_FILE && BOOST_LOOPBACK_BYPASS) {
        $kill = db_result(db_query("SELECT filename FROM {boost_cache} WHERE hash_url = '%s'", $url['hash']));
        if ($kill) {
          boost_cache_kill(array(
            array(
              'filename' => $kill,
              'hash' => $url['hash'],
            ),
          ), TRUE);
        }
      }
      drupal_http_request($url['url']);
      if (BOOST_CRAWLER_THROTTLE) {
        usleep(BOOST_CRAWLER_THROTTLE);
      }
      _boost_set_time_limit(0);
    } while ($url = db_fetch_array($results));

    // Crawler for this round done, call self and exit.
    boost_async_call_crawler($self, $this_thread, _boost_variable_get('boost_crawler_number_of_threads'), $expire);
    exit;
  }
  elseif (boost_crawler_threads_alive() || _boost_variable_get('boost_crawler_sleeping')) {
    if (BOOST_VERBOSE >= 3) {
      watchdog('boost', 'Crawler already running');
    }
    drupal_set_message(t('Boost: Crawler is already running. Attempt to start crawler failed.'), 'warning');
  }
  elseif (!BOOST_CRAWL_ON_CRON) {

    // Crawler not enabled.
    return FALSE;
  }
  elseif (variable_get('cron_semaphore', FALSE) == TRUE) {

    // This function called from cron; reset & call self.
    if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_start'])) {
      watchdog('boost', 'Crawler Start %self', array(
        '%self' => $self,
      ));
    }
    db_query('TRUNCATE {boost_crawler}');
    variable_set('boost_crawler_position', 0);
    variable_set('boost_crawler_loaded_count' . BOOST_FILE_EXTENSION, 0);
    variable_set('boost_crawler_loaded_count' . BOOST_XML_EXTENSION, 0);
    variable_set('boost_crawler_loaded_count' . BOOST_JSON_EXTENSION, 0);
    variable_set('boost_crawler_loaded_count_alias', 0);
    variable_set('boost_crawl_prune_table', FALSE);
    variable_set('boost_crawler_number_of_tries', 0);
    variable_set('boost_crawler_number_of_threads', 0);
    variable_set('boost_crawler_sleeping', FALSE);
    variable_set('boost_crawler_average_generation', max(1, db_result(db_query("SELECT AVG(timer_average) FROM {boost_cache}"))));
    variable_set('boost_crawler_start_time', FALSE);
    variable_set('boost_crawler_stopped', FALSE);

    // Clear the clock-in time of every possible thread slot.
    $threads = BOOST_MAX_THREADS;
    while ($threads > 0) {
      variable_set('boost_crawler_thread_num_' . $threads, 0);
      $threads--;
    }
    boost_async_call_crawler($self, 1, NULL, $expire);
    return TRUE;
  }
}