You are here

recommender.module in Recommender API 6

Providing generic recommender system algorithms.

File

recommender.module
View source
<?php

/**
 * @file
 * Providing generic recommender system algorithms.
 */

/**
 * Classical collaborative filtering algorithm based on the Pearson correlation coefficient.
 * Could be used in the classical user-user or item-item algorithm.
 * See the README file for more details.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $options an array of options
 *          'performance': whether to do calculation in memory or in database. 'auto' is to decide automatically.
 *          'missing': how to handle missing data -- 'none' do nothing; 'zero' fill in missing data with zero; 'adjusted' skip mice that don't share cheese in common.
 *          'sensitivity': if similarity is small enough to be less than a certain value (sensitivity), we just discard those
 * @return null {recommender_similarity} will be filled with similarity data
 */
function recommender_similarity_classical($app_name, $table_name, $field_mouse, $field_cheese, $field_weight, $options = array()) {

  // Resolve parameter values, defaulting missing options to avoid undefined-index notices.
  $app_id = recommender_get_app_id($app_name);
  $missing = isset($options['missing']) ? $options['missing'] : 'none';
  $op = isset($options['performance']) ? $options['performance'] : 'auto';
  if ($op == 'auto') {

    // Decide the scale of the data, then choose which algorithm to use.
    // BUGFIX: the query previously counted $field_mouse twice (count_cheese was
    // never computed), and db_result() only returns a single scalar column --
    // the row must be fetched as an array to read both counts.
    $result = db_fetch_array(db_query("SELECT COUNT(DISTINCT {$field_mouse}) count_mouse, COUNT(DISTINCT {$field_cheese}) count_cheese FROM {{$table_name}}"));

    // In-memory computation only supports the 'zero' missing-data policy and
    // is only practical when the matrix is reasonably small.
    $op = $result['count_mouse'] <= 2000 && $result['count_cheese'] <= 10000 && $missing == 'zero' ? 'memory' : 'database';
  }
  if ($op == 'memory') {
    _recommender_similarity_classical_in_memory($app_id, $table_name, $field_mouse, $field_cheese, $field_weight, $options);
  }
  elseif ($op == 'database') {
    _recommender_similarity_classical_in_database($app_id, $table_name, $field_mouse, $field_cheese, $field_weight, $options);
  }
}

/**
 * Matrix computation in memory. Fast, but requires the whole matrix to fit in memory.
 * Limitation: only supports $missing='zero'; other cases will be covered in a future release.
 */
function _recommender_similarity_classical_in_memory($app_id, $table_name, $field_mouse, $field_cheese, $field_weight, $options) {
  $created = time();

  // Read the sparse mouse-cheese weights and map every mouse/cheese id onto a
  // dense 0-based index so the data can be laid out as a plain 2-D array.
  $mouse_index = array();
  $cheese_index = array();
  $sparse = array();
  $result = db_query("SELECT {$field_mouse} field_mouse, {$field_cheese} field_cheese, SUM({$field_weight}) field_weight\n                      FROM {{$table_name}} GROUP BY field_mouse, field_cheese");
  while ($row = db_fetch_array($result)) {
    if (!array_key_exists($row['field_mouse'], $mouse_index)) {
      $mouse_index[$row['field_mouse']] = count($mouse_index);
    }
    if (!array_key_exists($row['field_cheese'], $cheese_index)) {
      $cheese_index[$row['field_cheese']] = count($cheese_index);
    }
    $sparse[$mouse_index[$row['field_mouse']]][$cheese_index[$row['field_cheese']]] = $row['field_weight'];
  }

  // Expand into a dense matrix, treating missing cells as 0.
  $num_mice = count($mouse_index);
  $num_cheese = count($cheese_index);
  $matrix = array();
  for ($i = 0; $i < $num_mice; $i++) {
    for ($j = 0; $j < $num_cheese; $j++) {
      $matrix[$i][$j] = isset($sparse[$i][$j]) ? $sparse[$i][$j] : 0;
    }
  }

  // The time-consuming part: pair-wise correlation coefficients.
  $corr =& _recommender_fast_correlation_coefficient($matrix);

  // Save to the database, skipping similarities below the sensitivity threshold.
  // TODO: batch insert to improve performance.
  $sensitivity = isset($options['sensitivity']) ? $options['sensitivity'] : 0.0001;
  $mouse_ids = array_flip($mouse_index);
  for ($i = 0; $i < $num_mice; $i++) {
    for ($j = 0; $j < $num_mice; $j++) {
      $sim = $corr[$i][$j];
      if (abs($sim) > $sensitivity) {
        db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                  VALUE(%d, %d, %d, %f, %d)", $app_id, $mouse_ids[$i], $mouse_ids[$j], $sim, $created);
      }
    }
  }

  // Purge similarity rows left over from earlier runs of this app.
  db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/**
 * Fast Pearson correlation matrix calculation.
 *
 * @param $matrix has to be indexed by [0..m-1][0..n-1]. It WILL BE MODIFIED
 *   (every row is mean-centered in place) to save memory space!!!
 * @return array the m-by-m correlation matrix, or NULL on malformed input.
 */
function &_recommender_fast_correlation_coefficient(&$matrix) {
  // A by-reference function must return a variable, not a literal,
  // otherwise PHP raises a notice.
  $failure = NULL;
  if (!isset($matrix) || !is_array($matrix)) {
    return $failure;
  }
  $m = count($matrix);
  $n = count($matrix[0]);

  // Mean-center each row and compute its (un-normalized) standard deviation.
  // Assumes the matrix has the same width on every row.
  $variance = array();
  foreach ($matrix as &$vector) {
    if (count($vector) != $n) {
      // Ragged matrix: bail out.
      return $failure;
    }

    // Adjust each element in the vector by the mean, and accumulate the variance.
    $mean = array_sum($vector) / $n;
    $sigma = 0;
    for ($i = 0; $i < $n; $i++) {
      $vector[$i] = $vector[$i] - $mean;
      $sigma += pow($vector[$i], 2);
    }

    // Due to floating point error, a very small number could really be 0,
    // but that doesn't seem to hurt in practice.
    // Note we don't divide by n here: the factor cancels out in the
    // correlation quotient below.
    $variance[] = sqrt($sigma);
  }
  // Break the reference left dangling by the foreach-by-reference loop above;
  // otherwise later writes to $vector would silently modify the last row.
  unset($vector);

  $cor_matrix = array();
  $product = array();
  for ($v1 = 0; $v1 < $m; $v1++) {
    for ($v2 = $v1; $v2 < $m; $v2++) {
      $vector_1 =& $matrix[$v1];
      $vector_2 =& $matrix[$v2];
      if ($variance[$v1] == 0 || $variance[$v2] == 0) {
        // A constant vector has no defined correlation; use 0.
        // BUGFIX: the symmetric cell [$v2][$v1] was previously left unset,
        // causing undefined-index reads in callers that scan the full matrix.
        $cor_matrix[$v1][$v2] = 0;
        $cor_matrix[$v2][$v1] = 0;
      }
      else {
        for ($i = 0; $i < $n; $i++) {
          $product[$i] = $vector_1[$i] * $vector_2[$i];
        }
        $cor = array_sum($product) / ($variance[$v1] * $variance[$v2]);
        $cor_matrix[$v1][$v2] = $cor;
        $cor_matrix[$v2][$v1] = $cor;
      }
    }
  }
  return $cor_matrix;
}

/**
 * Classical algorithm computed in the database. Fully functional (all 'missing' modes).
 * It has no memory limits because all data are kept in database tables,
 * but the performance is worse than the in-memory computation.
 *
 * @param $app_id the numeric application id (see recommender_get_app_id()).
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $options supports 'missing' ('none'/'zero'/'adjusted'); defaults to 'none'.
 * @return null {recommender_similarity} is (re)filled with similarity rows.
 */
function _recommender_similarity_classical_in_database($app_id, $table_name, $field_mouse, $field_cheese, $field_weight, $options) {

  // Get parameter values; $created timestamps this run so stale rows can be purged at the end.
  $missing = isset($options['missing']) ? $options['missing'] : 'none';
  $created = time();

  // Operations are quite different for each missing-data treatment.
  switch ($missing) {

    // In this case, we treat the missing data as missing (no matrix expansion).
    case 'none':
    default:

      // Compute pair-wise avg/stddev for each mouse over only the cheese the
      // pair shares -- these statistics differ for every pair, hence the helper table.
      db_query("DELETE FROM {recommender_helper_pair_stat}");
      db_query("INSERT INTO {recommender_helper_pair_stat}(id1, id2, avg1, avg2, stddev1, stddev2)\n                SELECT n1.{$field_mouse}, n2.{$field_mouse}, AVG(n1.{$field_weight}), AVG(n2.{$field_weight}),\n                STDDEV(n1.{$field_weight}), STDDEV(n2.{$field_weight})\n                FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.{$field_cheese}=n2.{$field_cheese}\n                GROUP BY n1.{$field_mouse}, n2.{$field_mouse}");

      // Compute the correlation (similarity). Can be time-consuming.
      // Only the half with mouse1<=mouse2 is computed here; see below.
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                SELECT %d, n1.{$field_mouse}, n2.{$field_mouse},\n                (AVG((n1.{$field_weight}-a.avg1)*(n2.{$field_weight}-a.avg2)))/(a.stddev1*a.stddev2) corr, %d\n                FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.{$field_cheese}=n2.{$field_cheese} AND n1.{$field_mouse}<=n2.{$field_mouse}\n                INNER JOIN {recommender_helper_pair_stat} a ON n1.{$field_mouse}=a.id1 AND n2.{$field_mouse}=a.id2\n                GROUP BY n1.{$field_mouse}, n2.{$field_mouse} HAVING corr<>0", $app_id, $created);

      // Insert the mirrored half (similarity is symmetric, so we saved half the
      // calculation in the operation above and just copy the rows swapped).
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                SELECT app_id, mouse2_id, mouse1_id, similarity, created FROM {recommender_similarity}\n                WHERE mouse1_id<mouse2_id AND app_id=%d AND created=%d", $app_id, $created);

      // Clean up the pair-stat helper table.
      db_query("DELETE FROM {recommender_helper_pair_stat}");
      break;

    // In this case, we treat the missing data as the default value 0.
    case 'zero':

    // This case is similar to 'zero', but adjusts the calculation to be more
    // efficient for very sparse data. Refer to the README file.
    case 'adjusted':

      // Expand the sparse matrix into {recommender_helper_matrix} with zeros for missing data.
      _recommender_expand_sparse_data($table_name, $field_mouse, $field_cheese, $field_weight, $missing);

      // Calculate mean and stddev for each mouse vector.
      db_query("DELETE FROM {recommender_helper_single_stat}");
      db_query("INSERT INTO {recommender_helper_single_stat}(id, avg, stddev)\n                SELECT mouse_id, AVG(weight), STDDEV(weight) FROM {recommender_helper_matrix}\n                GROUP BY mouse_id");

      // Mean-center the full matrix: subtract each vector's mean in place.
      db_query("UPDATE {recommender_helper_matrix} m, {recommender_helper_single_stat} s\n                SET m.weight=m.weight-s.avg WHERE m.mouse_id=s.id");

      // In 'adjusted' mode, skip mouse pairs that don't share any cheese
      // (the pair table was filled by _recommender_expand_sparse_data()).
      $sql_adjusted = $missing != 'adjusted' ? '' : "INNER JOIN {recommender_helper_pair_stat} p ON n1.mouse_id=p.id1 AND n2.mouse_id=p.id2";

      // Compute similarity. Could be time-consuming.
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                SELECT %d, n1.mouse_id, n2.mouse_id,\n                (AVG(n1.weight*n2.weight))/(s1.stddev*s2.stddev) corr, %d\n                FROM {recommender_helper_matrix} n1 INNER JOIN {recommender_helper_matrix} n2\n                ON n1.cheese_id=n2.cheese_id AND n1.mouse_id<=n2.mouse_id {$sql_adjusted}\n                INNER JOIN {recommender_helper_single_stat} s1 ON n1.mouse_id=s1.id\n                INNER JOIN {recommender_helper_single_stat} s2 ON n2.mouse_id=s2.id\n                WHERE s1.stddev<>0 AND s2.stddev<>0\n                GROUP BY n1.mouse_id, n2.mouse_id HAVING corr<>0", $app_id, $created);

      // Insert the mirrored half (see the comment in the 'none' branch above).
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                SELECT app_id, mouse2_id, mouse1_id, similarity, created FROM {recommender_similarity}\n                WHERE mouse1_id<mouse2_id AND app_id=%d AND created=%d", $app_id, $created);

      // Clear up temporary data.
      if ($missing == 'adjusted') {
        db_query("DELETE FROM {recommender_helper_pair_stat}");
      }
      db_query("DELETE FROM {recommender_helper_single_stat}");
      db_query("DELETE FROM {recommender_helper_matrix}");
      break;
  }

  // Remove similarity rows from previous runs (anything not stamped with this run's time).
  db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/**
 * Expand the sparse input table into a full matrix with missing data defaulted to zero.
 * In 'adjusted' mode, only mice that share at least one common cheese are expanded.
 *
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $missing the way of handling missing data: either 'zero' or 'adjusted'.
 * @return null {recommender_helper_matrix} (and, in 'adjusted' mode,
 *   {recommender_helper_pair_stat}) will be filled with data.
 */
function _recommender_expand_sparse_data($table_name, $field_mouse, $field_cheese, $field_weight, $missing) {

  // Clean up leftovers from any previous run.
  db_query("DELETE FROM {recommender_helper_matrix}");

  // If 'adjusted', record the pairs of mice that share at least one cheese;
  // mice that share no cheese with others will be skipped entirely.
  // This measure is extremely useful for very sparse datasets.
  if ($missing == 'adjusted') {
    db_query("DELETE FROM {recommender_helper_pair_stat}");
    db_query("INSERT INTO {recommender_helper_pair_stat}(id1, id2)\n              SELECT n1.{$field_mouse}, n2.{$field_mouse} FROM {{$table_name}} n1\n              INNER JOIN {{$table_name}} n2 ON n1.{$field_cheese}=n2.{$field_cheese}\n              GROUP BY n1.{$field_mouse}, n2.{$field_mouse}");
  }

  // Expand the sparse matrix in $table_name to the full cross-product matrix in
  // {recommender_helper_matrix}, with every cell initially 0. In 'adjusted'
  // mode, restrict the mouse side to mice that appear in the pair table.
  $sql_adjusted = $missing != 'adjusted' ? '' : "INNER JOIN (SELECT DISTINCT id1 FROM {recommender_helper_pair_stat}) p ON l.{$field_mouse}=p.id1";
  db_query("INSERT INTO {recommender_helper_matrix} SELECT m.mouse_id, c.cheese_id, 0 FROM\n            (SELECT DISTINCT l.{$field_mouse} mouse_id FROM {{$table_name}} l {$sql_adjusted}) m JOIN\n            (SELECT DISTINCT {$field_cheese} cheese_id FROM {{$table_name}}) c");

  // Overwrite the zero placeholders with the real weights where data exists.
  db_query("UPDATE {recommender_helper_matrix} m, {{$table_name}} l SET m.weight=l.{$field_weight}\n            WHERE m.mouse_id=l.{$field_mouse} AND m.cheese_id=l.{$field_cheese}");
}

/**
 * Classical weighted-average algorithm to calculate predictions from the similarity matrix.
 * Limitation: we only do database operation for now; no in-memory operation until a future release.
 * Limitation: we only do average weight; regression-based weight may be included in a future release.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $options an array of options
 *          'missing': how to handle missing data -- 'none' do nothing; 'zero' fill in missing data with zero; 'adjusted' skip mice that don't share cheese in common.
 *          'duplicate': how to handle predictions that already exist in mouse-cheese evaluation. 'preserve' or 'eliminate'
 *          'sensitivity': similarities whose absolute value is below this are treated as zero
 * @return null {recommender_prediction} will be filled with prediction data
 */
function recommender_prediction_classical($app_name, $table_name = 'recommender_link', $field_mouse = 'mouse_id', $field_cheese = 'cheese_id', $field_weight = 'weight', $options = array()) {

  // Get parameter values, with defaults for absent options.
  $app_id = recommender_get_app_id($app_name);
  $missing = isset($options['missing']) ? $options['missing'] : 'none';

  // 'preserve' keeps predictions for items the mouse already rated; could be 'eliminate'.
  $duplicate = isset($options['duplicate']) ? $options['duplicate'] : 'preserve';
  $sensitivity = isset($options['sensitivity']) ? $options['sensitivity'] : 0;
  $created = time();

  // Append missing data with 0s, then use table {recommender_helper_matrix} instead.
  if ($missing == 'zero' || $missing == 'adjusted') {
    _recommender_expand_sparse_data($table_name, $field_mouse, $field_cheese, $field_weight, $missing);
    // BUGFIX: no curly braces here -- the queries below already wrap the name
    // as {{$table_name}}, so embedding braces would double-wrap it and break
    // Drupal's table-prefix substitution when a prefix is configured.
    $table_name = 'recommender_helper_matrix';
    $field_mouse = 'mouse_id';
    $field_cheese = 'cheese_id';
    $field_weight = 'weight';
  }

  // Calculate the mean value for each mouse; used as the baseline of each prediction.
  db_query("DELETE FROM {recommender_helper_single_stat}");
  db_query("INSERT INTO {recommender_helper_single_stat}(id, avg)\n            SELECT {$field_mouse}, AVG({$field_weight}) FROM {{$table_name}} GROUP BY {$field_mouse}");

  // For some reason, a single-pass SQL with an extra
  // "INNER JOIN {recommender_helper_single_stat} m1 ON s.mouse1_id=m1.id"
  // (to add the baseline inside the aggregate) was very slow -- probably because
  // of the GROUP BY. The two-step approach below (predict the offset, then add
  // the baseline in a separate UPDATE) is much more efficient.

  // If missing data is 'adjusted', only count opinions from mice that share cheese with others.
  $sql_adjusted = $missing != 'adjusted' ? '' : "INNER JOIN {recommender_helper_pair_stat} p ON s.mouse1_id=p.id1 AND s.mouse2_id=p.id2";

  // Calculate the prediction offsets. Could be time-consuming.
  db_query("INSERT INTO {recommender_prediction}(app_id, mouse_id, cheese_id, prediction, created)\n            SELECT %d, s.mouse1_id, t.{$field_cheese},\n            SUM(s.similarity*(t.{$field_weight}-m.avg))/SUM(ABS(s.similarity)) prediction, %d\n            FROM {recommender_similarity} s INNER JOIN {recommender_helper_single_stat} m\n            ON s.mouse2_id=m.id INNER JOIN {{$table_name}} t ON s.mouse2_id=t.{$field_mouse}\n            {$sql_adjusted}\n            WHERE s.app_id=%d AND ABS(s.similarity)>%f GROUP BY s.mouse1_id, t.{$field_cheese}", $app_id, $created, $app_id, $sensitivity);

  // Adjust the prediction by adding back each mouse's average evaluation.
  db_query("UPDATE {recommender_prediction} p, {recommender_helper_single_stat} m\n            SET prediction=prediction+m.avg WHERE p.mouse_id=m.id\n            AND p.app_id=%d AND created=%d", $app_id, $created);

  // Remove predictions for mouse-cheese pairs that already have an evaluation.
  // BUGFIX: initialize $missing_sql so it is defined when $missing == 'none'.
  $missing_sql = '';
  if ($duplicate == 'eliminate') {
    if ($missing == 'zero' || $missing == 'adjusted') {
      // In the expanded matrix a weight of 0 marks filled-in missing data, not a real rating.
      $missing_sql = "WHERE {$field_weight}<>0";
    }
    db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created=%d AND\n              (mouse_id, cheese_id) IN (SELECT {$field_mouse}, {$field_cheese} FROM {{$table_name}}\n              {$missing_sql})", $app_id, $created);
  }

  // Clean up helper tables.
  db_query("DELETE FROM {recommender_helper_single_stat}");
  if ($missing == 'zero' || $missing == 'adjusted') {
    db_query("DELETE FROM {recommender_helper_matrix}");
    if ($missing == 'adjusted') {
      db_query("DELETE FROM {recommender_helper_pair_stat}");
    }
  }

  // Remove old predictions left over from the last calculation.
  db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/**
 * Co-occurrences algorithm: computes similarity among mice based on how many cheeses they share.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $options an array of options
 *          'incremental': whether to rebuild the whole similarity matrix, or incrementally update those that changed. 'rebuild' or 'update'
 * @return null {recommender_similarity} will be filled with similarity data
 */
function recommender_similarity_coocurrences($app_name, $table_name, $field_mouse, $field_cheese, $options = array()) {

  // Resolve parameters.
  $app_id = recommender_get_app_id($app_name);
  $created = time();
  $incremental = isset($options['incremental']) ? $options['incremental'] : 'rebuild';

  // Skeleton SQL shared by both modes: the shared-cheese count is the similarity score.
  $insert_sql = "INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                SELECT {$app_id}, n1.{$field_mouse}, n2.{$field_mouse}, COUNT(n1.{$field_cheese}), {$created}\n                FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.{$field_cheese}=n2.{$field_cheese}\n                GROUP BY n1.{$field_mouse}, n2.{$field_mouse}";

  if ($incremental == 'update') {
    // Incremental update; might incur more overhead than 'rebuild'.
    $max_timestamp = _recommender_updated_max($app_id);
    if ($max_timestamp == NULL) {
      // Nothing queued -- nothing to do.
      return;
    }
    $ids = _recommender_updated_list($app_id, $max_timestamp, 'mouse_id');
    db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND (mouse1_id IN {$ids} OR mouse2_id IN {$ids})", $app_id);
    db_query($insert_sql . " HAVING (n1.{$field_mouse} IN {$ids} OR n2.{$field_mouse} IN {$ids})");
    _recommender_updated_markdone($app_id, $max_timestamp);
  }
  else {
    // Rebuild the whole similarity index ('rebuild' and any unrecognized value).
    db_query($insert_sql);
    db_query("DELETE FROM {recommender_updated_queue} WHERE app_id=%d AND created<=%d", $app_id, $created);
    db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND created<>%d", $app_id, $created);
  }
}

/**
 * Implementation of the Slope One algorithm, which is supposed to be faster than other algorithms.
 * The original research paper argues Slope One supports incremental update, but that is quite
 * hard to implement; incremental update does not work for now.
 * Missing data is simply treated as missing. For Slope One we don't automatically expand the
 * matrix with zeros; handling missing data is the responsibility of the caller.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $options an array of options
 *          'extension': whether to use the 'basic', 'weighted', or 'bipolar' extension of the
 *                       algorithm. Refer to the original research paper; usually 'weighted'.
 *                       NOTE: 'bipolar' is not implemented yet and falls back to 'weighted'.
 *          'duplicate': how to handle predictions that already exist in mouse-cheese evaluation. 'preserve' or 'eliminate'
 *          'incremental': whether to 'rebuild' or to 'update'. CAUTION: 'update' doesn't work now.
 * @return null {recommender_prediction} will be filled with prediction data
 */
function recommender_prediction_slopeone($app_name, $table_name, $field_mouse, $field_cheese, $field_weight, $options = array()) {

  // Get parameter values; could be 'basic', 'weighted', or 'bipolar'.
  $app_id = recommender_get_app_id($app_name);
  $extension = isset($options['extension']) ? $options['extension'] : 'weighted';

  // Could be 'update' (currently non-functional, see docblock).
  $incremental = isset($options['incremental']) ? $options['incremental'] : 'rebuild';

  // Could be 'preserve'.
  $duplicate = isset($options['duplicate']) ? $options['duplicate'] : 'eliminate';

  $created = time();
  db_query("DELETE FROM {recommender_slopeone_dev} WHERE app_id=%d", $app_id);

  // Create dev(i,j): the average weight difference between every pair of cheeses.
  db_query("INSERT INTO {recommender_slopeone_dev}(app_id, cheese1_id, cheese2_id, count, dev)\n            SELECT %d, n1.{$field_cheese}, n2.{$field_cheese},\n            COUNT(*), AVG(n1.{$field_weight}-n2.{$field_weight}) FROM {{$table_name}} n1\n            INNER JOIN {{$table_name}} n2 ON n1.{$field_mouse}=n2.{$field_mouse}\n            AND n1.{$field_cheese} <> n2.{$field_cheese}\n            GROUP BY n1.{$field_cheese}, n2.{$field_cheese}", $app_id);

  // Build the P(u,j) aggregate expression for the chosen extension.
  if ($extension == 'basic') {
    $extension_sql = "AVG(t.{$field_weight}+p.dev)";
  }
  else {
    // The 'weighted slope one'.
    // BUGFIX: previously $extension_sql was left undefined for any value other
    // than 'basic'/'weighted' (including the documented 'bipolar'), producing
    // broken SQL. The 'bipolar' extension is not implemented yet, so it and
    // any unrecognized value fall back to 'weighted'.
    $extension_sql = "SUM((t.{$field_weight}+p.dev)*p.count)/SUM(p.count)";
  }

  // Generate predictions.
  db_query("INSERT INTO {recommender_prediction}(app_id, mouse_id, cheese_id, prediction, created)\n            SELECT %d, t.{$field_mouse}, p.cheese1_id, {$extension_sql}, %d\n            FROM {recommender_slopeone_dev} p INNER JOIN {{$table_name}} t ON p.cheese2_id=t.{$field_cheese}\n            GROUP BY t.{$field_mouse}, p.cheese1_id", $app_id, $created);

  // Remove predictions for mouse-cheese pairs that already have an evaluation.
  if ($duplicate == 'eliminate') {
    db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created=%d AND (mouse_id, cheese_id)\n              IN (SELECT {$field_mouse}, {$field_cheese} FROM {{$table_name}})", $app_id, $created);
  }

  // Remove old predictions from previous runs.
  db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/////////////////////// Helper functions /////////////////////

/**
 * Get the application id from the application name, creating a new mapping on first use.
 *
 * @param $app_name the application name; must be a non-empty string.
 * @return the numeric id of the application, or NULL if $app_name is empty.
 */
function recommender_get_app_id($app_name) {
  // empty() already covers NULL/''/unset -- the extra !isset() was redundant
  // (a declared parameter is always "set").
  if (empty($app_name)) {
    return NULL;
  }
  // The @ suppresses the warning when the table doesn't exist yet (e.g. during install).
  $id = db_result(@db_query("SELECT app_id FROM {recommender_app_map} WHERE app_name='%s'", $app_name));
  if (empty($id)) {
    // First time this app is seen: register it and read back the generated id.
    db_query("INSERT INTO {recommender_app_map}(app_name) VALUE('%s')", $app_name);
    $id = db_result(db_query("SELECT app_id FROM {recommender_app_map} WHERE app_name='%s'", $app_name));
  }
  return $id;
}

/*
 * Below is a set of incremental update helper functions
 * This is very experiment. Please use with caution.
 */

/**
 * Add a new record to the updated queue for later incremental processing.
 *
 * @param $app_name the application name.
 * @param $mouse_id the mouse whose evaluation changed.
 * @param $cheese_id the cheese whose evaluation changed.
 * @param $old_value the previous weight, or NULL if there was none.
 * @param $new_value the new weight, or NULL if the evaluation was removed.
 */
function recommender_updated_add($app_name, $mouse_id, $cheese_id, $old_value, $new_value) {
  $app_id = recommender_get_app_id($app_name);
  $created = time();
  // BUGFIX: use strict comparison. With loose ==, a legitimate value of 0 (or
  // '' or FALSE) also compares equal to NULL and would be stored as SQL NULL.
  if ($old_value === NULL) {
    // The unquoted %s placeholder below renders this as the SQL NULL keyword.
    $old_value = 'NULL';
  }
  if ($new_value === NULL) {
    $new_value = 'NULL';
  }
  db_query("INSERT INTO {recommender_updated_queue}(app_id, mouse_id, cheese_id, old, new, created)\n            VALUE(%d, %d, %d, %s, %s, %d)", $app_id, $mouse_id, $cheese_id, $old_value, $new_value, $created);
}

/**
 * Purge the updated queue for an application.
 *
 * @param $app_name the application name.
 * @param $op 'outdated' (default) removes only already-processed rows (status -100);
 *   any other value removes every queued row for the app.
 */
function recommender_updated_purge($app_name, $op = 'outdated') {
  $app_id = recommender_get_app_id($app_name);
  if ($op == 'outdated') {
    $op_sql = "AND status=-100";
  }
  else {
    $op_sql = '';
  }
  db_query("DELETE FROM {recommender_updated_queue} WHERE app_id=%d {$op_sql}", $app_id);
}

/**
 * Get the max timestamp among pending (not yet processed) rows in the updated queue.
 *
 * @param $app_id the numeric application id.
 * @return the newest pending 'created' timestamp, or NULL when nothing is pending.
 */
function _recommender_updated_max($app_id) {
  $updated_info = db_fetch_array(db_query("SELECT COUNT(*) count, MAX(created) max\n          FROM {recommender_updated_queue} WHERE app_id=%d AND status!=-100", $app_id));

  // If nothing was updated, there is nothing to do.
  return $updated_info['count'] == 0 ? NULL : $updated_info['max'];
}

/**
 * Get the list of updated items formatted as a parenthesized SQL IN list, e.g. "(1,2,3)".
 *
 * @param $app_id the numeric application id.
 * @param $max_timestamp only rows created at or before this timestamp are listed.
 * @param $field which queue column to list ('mouse_id' or 'cheese_id').
 * @return string a parenthesized comma-separated id list suitable for an IN clause.
 */
function _recommender_updated_list($app_id, $max_timestamp, $field = 'mouse_id') {
  // BUGFIX: initialize the array; previously an empty result set caused an
  // undefined-variable notice and produced the invalid SQL fragment "()".
  $updated = array();
  $result = db_query("SELECT DISTINCT {$field} FROM {recommender_updated_queue} WHERE app_id=%d AND created<=%d", $app_id, $max_timestamp);
  while ($r = db_fetch_array($result)) {
    $updated[] = $r[$field];
  }
  if (empty($updated)) {
    // "(NULL)" is valid SQL and matches nothing in an IN clause.
    return '(NULL)';
  }
  return '(' . implode(',', $updated) . ')';
}

// Mark items in the updated queue as processed (status -100), up to and
// including $max_timestamp, so they are skipped by the next incremental run.
function _recommender_updated_markdone($app_id, $max_timestamp) {
  db_query("UPDATE {recommender_updated_queue} SET status=-100 WHERE app_id=%d AND created<=%d", $app_id, $max_timestamp);
}

Functions

Name (sorted descending) Description
recommender_get_app_id Get the application id from the application name.
recommender_prediction_classical Classical weight-average algorithm to calculate prediction from the similarity matrix, based on average weight. Limitation: we only do database operation for now. no in-memory operation available until future release. Limitation: we only do average…
recommender_prediction_slopeone This is the implementation of the slope-one algorithm, which is supposed to be faster than other algorithms. From the original research paper, the author argues slope-one supports incremental update. But this is quite hard to implement. The incremental…
recommender_similarity_classical classical collaborative filtering algorithm based on correlation coefficient. could be used in the classical user-user or item-item algorithm see the README file for more details
recommender_similarity_coocurrences Co-occurrences algorithm that computes similarity among mice based on how many cheeses they share.
recommender_updated_add
recommender_updated_purge
_recommender_expand_sparse_data Expand the sparse database table to have the missing data default as zero. 'adjusted' means that only mouses that share common cheese will be expanded.
_recommender_fast_correlation_coefficient Fast correlation matrix calculation.
_recommender_similarity_classical_in_database Classical algorithm computed in databases. Full functioning. It has no memory limits because all data are saved in database. But the performance is worse than in-memory computation.
_recommender_similarity_classical_in_memory Matrix computation in memory. Fast. But require lots of memory. Limitation: only support $missing='zero', will cover other cases in future release
_recommender_updated_list
_recommender_updated_markdone
_recommender_updated_max