You are here

function _recommender_similarity_classical_in_database in Recommender API 5

Same name and namespace in other branches
  1. 6 recommender.module \_recommender_similarity_classical_in_database()

Classical algorithm computed in databases. Full functioning. It has no memory limits because all data are saved in database. But the performance is worse than in-memory computation.

1 call to _recommender_similarity_classical_in_database()
recommender_similarity_classical in ./recommender.module
classical collaborative filtering algorithm based on correlation coefficient. could be used in the classical user-user or item-item algorithm see the README file for more details

File

./recommender.module, line 161
Providing generic recommender system algorithms.

Code

function _recommender_similarity_classical_in_database($app_id, $table_name, $field_mouse, $field_cheese, $field_weight, $options) {

  // get param values
  $missing = isset($options['missing']) ? $options['missing'] : 'none';
  $created = time();

  // operations would be quite different for different missing data treament
  switch ($missing) {

    // in this case, we treat the missing data as missing.
    case 'none':
    default:

      // compute pair-wise avg/stddev for each mouse, which is different in each pair.
      db_query("DELETE FROM {recommender_helper_pair_stat}");
      db_query("INSERT INTO {recommender_helper_pair_stat}(id1, id2, avg1, avg2, stddev1, stddev2)\n                SELECT n1.{$field_mouse}, n2.{$field_mouse}, AVG(n1.{$field_weight}), AVG(n2.{$field_weight}),\n                STDDEV(n1.{$field_weight}), STDDEV(n2.{$field_weight})\n                FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.{$field_cheese}=n2.{$field_cheese}\n                GROUP BY n1.{$field_mouse}, n2.{$field_mouse}");

      // compute similarity. would be time-consuming
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                SELECT %d, n1.{$field_mouse}, n2.{$field_mouse},\n                (AVG((n1.{$field_weight}-a.avg1)*(n2.{$field_weight}-a.avg2)))/(a.stddev1*a.stddev2) corr, %d\n                FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.{$field_cheese}=n2.{$field_cheese} AND n1.{$field_mouse}<=n2.{$field_mouse}\n                INNER JOIN {recommender_helper_pair_stat} a ON n1.{$field_mouse}=a.id1 AND n2.{$field_mouse}=a.id2\n                GROUP BY n1.{$field_mouse}, n2.{$field_mouse} HAVING corr<>0", $app_id, $created);

      // insert duplicate ones (to save time, we didn't do half of the calculation in operation above)
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                SELECT app_id, mouse2_id, mouse1_id, similarity, created FROM {recommender_similarity}\n                WHERE mouse1_id<mouse2_id AND app_id=%d AND created=%d", $app_id, $created);

      // clean up
      db_query("DELETE FROM {recommender_helper_pair_stat}");
      break;

    // in this case, we treat the missing data as default value 0;
    case 'zero':

    // this case is similar to 'zero', but adjust the calculation to be more efficient for very sparse data.
    // refer to README file
    case 'adjusted':

      // expand the matrix to have zeros for missing data.
      _recommender_expand_sparse_data($table_name, $field_mouse, $field_cheese, $field_weight, $missing);

      // calcualate mean and stddev for each vector
      db_query("DELETE FROM {recommender_helper_single_stat}");
      db_query("INSERT INTO {recommender_helper_single_stat}(id, avg, stddev)\n                SELECT mouse_id, AVG(weight), STDDEV(weight) FROM {recommender_helper_matrix}\n                GROUP BY mouse_id");

      // addjust the full matrix by the mean of each vector.
      db_query("UPDATE {recommender_helper_matrix} m, {recommender_helper_single_stat} s\n                SET m.weight=m.weight-s.avg WHERE m.mouse_id=s.id");

      // we skip those mice that don't share cheese with other mice in 'adjusted' mode.
      $sql_adjusted = $missing != 'adjusted' ? '' : "INNER JOIN {recommender_helper_pair_stat} p ON n1.mouse_id=p.id1 AND n2.mouse_id=p.id2";

      // compute similarity. could be time-consuming.
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                SELECT %d, n1.mouse_id, n2.mouse_id,\n                (AVG(n1.weight*n2.weight))/(s1.stddev*s2.stddev) corr, %d\n                FROM {recommender_helper_matrix} n1 INNER JOIN {recommender_helper_matrix} n2\n                ON n1.cheese_id=n2.cheese_id AND n1.mouse_id<=n2.mouse_id {$sql_adjusted}\n                INNER JOIN {recommender_helper_single_stat} s1 ON n1.mouse_id=s1.id\n                INNER JOIN {recommender_helper_single_stat} s2 ON n2.mouse_id=s2.id\n                WHERE s1.stddev<>0 AND s2.stddev<>0\n                GROUP BY n1.mouse_id, n2.mouse_id HAVING corr<>0", $app_id, $created);

      // insert duplicate ones (to save time, we didn't do half of the calculation in operation above)
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created)\n                SELECT app_id, mouse2_id, mouse1_id, similarity, created FROM {recommender_similarity}\n                WHERE mouse1_id<mouse2_id AND app_id=%d AND created=%d", $app_id, $created);

      // clear up temporary data
      if ($missing == 'adjusted') {
        db_query("DELETE FROM {recommender_helper_pair_stat}");
      }
      db_query("DELETE FROM {recommender_helper_single_stat}");
      db_query("DELETE FROM {recommender_helper_matrix}");
      break;
  }

  // remove old data.
  db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND created<>%d", $app_id, $created);
}