function boost_crawler_run in Boost 6
Same name and namespace in other branches
- 7 boost_crawler/boost_crawler.module \boost_crawler_run()
The brains of the crawler.
Parameters
$expire: Has the site changed? If so, get the expire column.
1 call to boost_crawler_run()
- boost_cron in ./
boost.module - Implementation of hook_cron(). Performs periodic actions.
1 string reference to 'boost_crawler_run'
- boost_menu in ./
boost.module - Implementation of hook_menu().
File
- ./
boost.module, line 5729 - Provides static file caching for Drupal text output. Pages, Feeds, etc.
Code
/**
 * The brains of the crawler.
 *
 * This function has two entry modes, selected by the current path:
 * - Path 'boost-crawler' (HTTP request): acts as one crawler "thread". It
 *   validates the access key, honours the stop flag, waits for cron to
 *   finish, seeds the crawler table, spins up sibling threads (thread 1
 *   only), claims a batch of URLs from {boost_crawler}, requests each one
 *   and finally calls itself again asynchronously for the next batch.
 * - Any other path (e.g. called from boost_cron()): if no crawler is already
 *   running, crawling on cron is enabled and the cron semaphore is set, all
 *   crawler state variables are reset and thread 1 is started asynchronously.
 *
 * Most paths in the HTTP mode terminate the request with exit instead of
 * returning.
 *
 * @param $expire
 *   Has the site changed, if so get expire column. When left at -1 and a
 *   numeric 'expire' query parameter is present, the query value is used
 *   instead. The value is handed on to boost_crawler_seed_tables(),
 *   boost_crawler_verify() and boost_async_call_crawler().
 *
 * @return
 *   TRUE when a crawler thread ran out of URLs without triggering a restart,
 *   or when the crawler was kicked off from cron. FALSE when crawling on cron
 *   is disabled. No value (NULL) when the crawler is already running or when
 *   none of the start conditions matched.
 */
function boost_crawler_run($expire = -1) {
  global $_boost;

  // Thread bookkeeping and the expire override all arrive as query arguments.
  $this_thread = isset($_GET['thread']) && is_numeric($_GET['thread']) ? $_GET['thread'] : NULL;
  $total_threads = isset($_GET['total']) && is_numeric($_GET['total']) ? $_GET['total'] : NULL;
  $expire = $expire == -1 && isset($_GET['expire']) && is_numeric($_GET['expire']) ? $_GET['expire'] : $expire;
  $self = BOOST_CRAWLER_SELF;

  // Remember the original ini values so they can be restored before exiting.
  $GLOBALS['_boost_max_execution_time'] = ini_get('max_execution_time');
  $GLOBALS['_boost_output_buffering'] = ini_get('output_buffering');

  if ($_GET['q'] == 'boost-crawler') {
    // if not called via cron, require key to be present in url.
    // The comparison is strict and an unconfigured key never matches:
    // with a loose != a missing or empty key equals the FALSE default, and
    // numeric-looking strings (e.g. "0e...") compare equal to each other.
    $expected_key = variable_get('boost_crawler_key', FALSE);
    $supplied_key = isset($_GET['key']) && is_string($_GET['key']) ? $_GET['key'] : '';
    if (!$expected_key || $supplied_key !== (string) $expected_key) {
      drupal_access_denied();
      exit;
    }

    // Test for access on status page
    if (!empty($_GET['test'])) {
      echo '<h1>OK</h1>';
      exit;
    }

    // Stop button code
    if (_boost_variable_get('boost_crawler_stopped')) {
      // Wait 0 to 0.1 seconds before grabbing number of threads.
      usleep(mt_rand(0, 100000));
      db_lock_table('variable');
      $threads = _boost_variable_get('boost_crawler_number_of_threads');
      _boost_variable_set('boost_crawler_number_of_threads', (int) $threads - 1);
      // Clock out
      _boost_variable_set('boost_crawler_thread_num_' . $this_thread, 0);
      db_unlock_tables();
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_stop'])) {
        watchdog('boost', 'Crawler - Thread %num stopped.', array(
          '%num' => $this_thread,
        ));
      }
      ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
      ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
      exit;
    }

    // Kill this thread if it doesn't have a thread number assigned to it.
    if (!isset($this_thread)) {
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_rogue'])) {
        watchdog('boost', 'Crawler - Rogue thread killed.');
      }
      exit;
    }

    // Try to prevent crawler from stalling.
    ini_set('max_execution_time', 600);
    // Return html so connection closes
    boost_async_opp('async');
    // Turn off output buffer.
    ini_set('output_buffering', 'off');

    // Fetch the cron semaphore
    $semaphore = variable_get('cron_semaphore', FALSE);
    // Wait 15 seconds if cron is still running and try again (let cron
    // finish); if longer than 5 minutes stop stalling and start crawling.
    if ($semaphore == TRUE && BOOST_TIME - $semaphore < 300) {
      if (_boost_variable_get('boost_crawler_sleeping')) {
        // Kill this thread; multiple crawlers sleeping.
        ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
        ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
        exit;
      }
      _boost_variable_set('boost_crawler_sleeping', TRUE);
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_sleep'])) {
        watchdog('boost', 'Crawler Sleep for 15 seconds');
      }
      sleep(15);
      _boost_variable_set('boost_crawler_sleeping', FALSE);
      boost_async_call_crawler($self, 1, NULL, $expire);
      exit;
    }

    // Crawler was forced to stop last run, wait extra time before starting up
    // again.
    // NOTE(review): this branch cannot be reached as written. It requires
    // $this_thread to be unset, but requests without a thread number have
    // already exited in the "rogue thread" check above. Left in place until
    // the intended trigger is confirmed.
    if (variable_get('boost_crawler_stopped', FALSE) && !isset($this_thread) && !isset($total_threads)) {
      if (_boost_variable_get('boost_crawler_sleeping')) {
        ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
        ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
        exit;
      }
      _boost_variable_set('boost_crawler_sleeping', TRUE);
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_shutdown'])) {
        watchdog('boost', 'Crawler sleeping for @x seconds, do to forced shutdown.', array(
          '@x' => 2 * BOOST_CRAWLER_THREADS * BOOST_CRAWLER_BATCH_SIZE,
        ));
      }
      // Sleep in slices, resetting the time limit before each slice.
      $i = BOOST_CRAWLER_BATCH_SIZE;
      while ($i > 0) {
        _boost_set_time_limit(0);
        sleep(2 * BOOST_CRAWLER_THREADS);
        $i--;
      }
      variable_set('boost_crawler_stopped', FALSE);
      _boost_variable_set('boost_crawler_sleeping', FALSE);
      boost_async_call_crawler($self, 1, NULL, $expire);
      exit;
    }

    // Add URL's to crawler table, call self and exit
    if (!boost_crawler_seed_tables($expire)) {
      boost_async_call_crawler($self, $this_thread, _boost_variable_get('boost_crawler_number_of_threads'), $expire);
      exit;
    }

    // Calc Threads
    $total = boost_crawler_total_count() - BOOST_CRAWLER_BATCH_SIZE;
    $threads = _boost_variable_get('boost_crawler_number_of_threads');
    $threads = $threads > 0 ? $threads : BOOST_CRAWLER_THREADS;
    // Fewer batches than configured threads: scale the thread count down.
    if ($total / BOOST_CRAWLER_BATCH_SIZE < BOOST_CRAWLER_THREADS) {
      $threads = floor($total / BOOST_CRAWLER_BATCH_SIZE);
    }

    // Sanity Check
    if (abs($threads) > BOOST_CRAWLER_THREADS) {
      // Kill this thread
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_kill'])) {
        watchdog('boost', 'Crawler - Thread %num of %total Killed.', array(
          '%num' => $this_thread,
          '%total' => $total_threads,
        ));
      }
      ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
      ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
      exit;
    }

    // Start the clock on first run
    if (!_boost_variable_get('boost_crawler_start_time')) {
      _boost_variable_set('boost_crawler_start_time', BOOST_TIME);
      _boost_variable_set('boost_crawler_number_of_threads', (int) $threads);
      // Clock in
      _boost_variable_set('boost_crawler_thread_num_' . $this_thread, BOOST_TIME);
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_startup'])) {
        watchdog('boost', 'Crawler - Thread @num of @total started', array(
          '@num' => 1,
          '@total' => $threads,
        ));
      }
    }

    // Spin up threads on demand. Only thread 1 does this; it walks the thread
    // slots from the highest number down and (re)starts every slot that has
    // never clocked in or whose clock-in time is older than
    // BOOST_MAX_THREAD_TIME.
    while ($threads > 0 && $this_thread == 1) {
      db_lock_table('variable');
      $thread_time = _boost_variable_get('boost_crawler_thread_num_' . $threads);
      if (!$thread_time || $thread_time + BOOST_MAX_THREAD_TIME < BOOST_TIME) {
        _boost_variable_set('boost_crawler_thread_num_' . $threads, BOOST_TIME);
        // Release the lock before the async call is made.
        db_unlock_tables();
        boost_async_call_crawler($self, $threads, _boost_variable_get('boost_crawler_number_of_threads'), $expire);
        if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_startup'])) {
          watchdog('boost', 'Crawler - Thread @num of @total started', array(
            '@num' => $threads,
            '@total' => _boost_variable_get('boost_crawler_number_of_threads'),
          ));
        }
        _boost_set_time_limit(0);
      }
      else {
        // Slot is alive; just release the lock (exactly one unlock per pass).
        db_unlock_tables();
      }
      $threads--;
    }

    // Make sure this thread is supposed to be running.
    $thread = _boost_variable_get('boost_crawler_number_of_threads');
    if ($thread >= 1 && $this_thread > $thread) {
      // Clock out
      if (isset($this_thread)) {
        _boost_variable_set('boost_crawler_thread_num_' . $this_thread, 0);
        if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_kill'])) {
          watchdog('boost', 'Crawler - Thread %num of %total Killed.', array(
            '%num' => $this_thread,
            '%total' => $total_threads,
          ));
        }
      }
      // elseif (BOOST_VERBOSE >= 5) {
      //   watchdog('boost', 'Crawler - Extra Thread Killed.');
      // }
      // Last thread standing: re-init the crawler if it missed some URLs,
      // at most 3 times.
      if (!boost_crawler_threads_alive() && _boost_variable_get('boost_crawler_number_of_tries') < 3 && boost_crawler_verify($expire)) {
        variable_set('boost_crawler_number_of_tries', (int) _boost_variable_get('boost_crawler_number_of_tries') + 1);
        _boost_variable_set('boost_crawler_number_of_threads', 1);
        if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_restart'])) {
          watchdog('boost', 'Crawler - Restarting with 1 thread, to try & get the stubborn urls cached.');
        }
        boost_async_call_crawler($self, 1, 1, $expire);
        exit;
      }
      ini_set('max_execution_time', $GLOBALS['_boost_max_execution_time']);
      ini_set('output_buffering', $GLOBALS['_boost_output_buffering']);
      exit;
    }

    // Clock in
    _boost_variable_set('boost_crawler_thread_num_' . $this_thread, BOOST_TIME);

    // Wait 0 to 0.1 seconds before grabbing DB position counter.
    usleep(mt_rand(0, 100000));
    // Claim the next batch: read the shared position and advance it by one
    // batch while the variable table is locked.
    db_lock_table('variable');
    $from = _boost_variable_get('boost_crawler_position');
    _boost_variable_set('boost_crawler_position', $from + BOOST_CRAWLER_BATCH_SIZE);
    db_unlock_tables();

    $results = db_query_range("SELECT DISTINCT hash, url FROM {boost_crawler}", $from, BOOST_CRAWLER_BATCH_SIZE);
    $url = db_fetch_array($results);
    if (!$url) {
      // We Are Done
      // Wait 0 to 0.1 seconds before grabbing number of threads.
      usleep(mt_rand(0, 100000));
      db_lock_table('variable');
      $threads = _boost_variable_get('boost_crawler_number_of_threads');
      _boost_variable_set('boost_crawler_number_of_threads', (int) $threads - 1);
      // Clock out
      _boost_variable_set('boost_crawler_thread_num_' . $this_thread, 0);
      db_unlock_tables();
      if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_done'])) {
        watchdog('boost', 'Crawler - Thread %num of %total Done.', array(
          '%num' => $this_thread,
          '%total' => $total_threads,
        ));
      }
      // Re init crawler if it missed some, try 3 times
      if (!boost_crawler_threads_alive() && _boost_variable_get('boost_crawler_number_of_tries') < 3 && boost_crawler_verify($expire)) {
        variable_set('boost_crawler_number_of_tries', (int) _boost_variable_get('boost_crawler_number_of_tries') + 1);
        _boost_variable_set('boost_crawler_number_of_threads', 1);
        if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_restart'])) {
          watchdog('boost', 'Crawler - Restarting with 1 thread, to try & get the stubborn urls cached.');
        }
        boost_async_call_crawler($self, 1, 1, $expire);
        exit;
      }
      return TRUE;
    }

    // Crawl every URL of the claimed batch, starting with the row that was
    // already fetched above.
    do {
      // Delete page right before crawling it
      if (!BOOST_OVERWRITE_FILE && BOOST_LOOPBACK_BYPASS) {
        $kill = db_result(db_query("SELECT filename FROM {boost_cache} WHERE hash_url = '%s'", $url['hash']));
        if ($kill) {
          boost_cache_kill(array(
            array(
              'filename' => $kill,
              'hash' => $url['hash'],
            ),
          ), TRUE);
        }
      }
      drupal_http_request($url['url']);
      if (BOOST_CRAWLER_THROTTLE) {
        usleep(BOOST_CRAWLER_THROTTLE);
      }
      _boost_set_time_limit(0);
    } while ($url = db_fetch_array($results));

    // Crawler for this round done, call self and exit
    boost_async_call_crawler($self, $this_thread, _boost_variable_get('boost_crawler_number_of_threads'), $expire);
    exit;
  }
  elseif (boost_crawler_threads_alive() || _boost_variable_get('boost_crawler_sleeping')) {
    if (BOOST_VERBOSE >= 3) {
      watchdog('boost', 'Crawler already running');
    }
    drupal_set_message(t('Boost: Crawler is already running. Attempt to start crawler failed.'), 'warning');
  }
  elseif (!BOOST_CRAWL_ON_CRON) {
    // Crawler Not Enabled
    return FALSE;
  }
  elseif (variable_get('cron_semaphore', FALSE) == TRUE) {
    // This function called from cron; reset & call self.
    if (BOOST_VERBOSE >= 5 && isset($_boost['verbose_option_selected']['boost_crawler_run_start'])) {
      watchdog('boost', 'Crawler Start %self', array(
        '%self' => $self,
      ));
    }
    // Wipe the URL queue and reset every piece of crawler state.
    db_query('TRUNCATE {boost_crawler}');
    variable_set('boost_crawler_position', 0);
    variable_set('boost_crawler_loaded_count' . BOOST_FILE_EXTENSION, 0);
    variable_set('boost_crawler_loaded_count' . BOOST_XML_EXTENSION, 0);
    variable_set('boost_crawler_loaded_count' . BOOST_JSON_EXTENSION, 0);
    variable_set('boost_crawler_loaded_count_alias', 0);
    variable_set('boost_crawl_prune_table', FALSE);
    variable_set('boost_crawler_number_of_tries', 0);
    variable_set('boost_crawler_number_of_threads', 0);
    variable_set('boost_crawler_sleeping', FALSE);
    variable_set('boost_crawler_average_generation', max(1, db_result(db_query("SELECT AVG(timer_average) FROM {boost_cache}"))));
    variable_set('boost_crawler_start_time', FALSE);
    variable_set('boost_crawler_stopped', FALSE);
    // Clock out every possible thread slot.
    $threads = BOOST_MAX_THREADS;
    while ($threads > 0) {
      variable_set('boost_crawler_thread_num_' . $threads, 0);
      $threads--;
    }
    boost_async_call_crawler($self, 1, NULL, $expire);
    return TRUE;
  }
}