You are here

function _remove_duplicates_get_nodes in Remove Duplicates 7

Get all nodes with selected type / field.

Parameters

string $node_type_machine_name: The node type to fetch.

string $node_field_machine_name: The {field} to join with the fetched node type.

bool $case_sensitive: If TRUE, duplicates detection is case sensitive Otherwise, duplicates detection is case insensitive.

Return value

array An array of node objects with 3 properties : nid, status, {field}.

1 call to _remove_duplicates_get_nodes()
_remove_duplicates_get_duplicate_node_groups in ./remove_duplicates.module
Get all duplicate node grouped according to selected field.

File

./remove_duplicates.module, line 747
Remove duplicate nodes according to node fields or Custom fields.

Code

function _remove_duplicates_get_nodes($node_type_machine_name = NULL, $node_field_machine_name = NULL, $case_sensitive = TRUE) {
  $records = array();

  // EntityFieldQuery does not support GROUP BY nor DISTINCT.
  // See : https://drupal.org/node/1565708
  // Using raw database calls instead.
  if (in_array(strtolower($node_field_machine_name), array(
    'title',
  ))) {

    // For basic title field.
    $field = $node_field_machine_name;
    if (Database::getConnection()
      ->databaseType() == 'pgsql') {

      // In PostgreSQL string comparisons are case sensitive by default.
      // Nested Query pattern (case sensitive) :
      // @code
      //  SELECT s.{field} AS {field},
      //  COUNT( * ) AS duplicate
      //  FROM node s
      //  WHERE (
      //    s.type = {node_type_machine_name}
      //  )
      //  GROUP BY {$field}
      // @endcode
      // Nested Query pattern (case insensitive) :
      // @code
      //  SELECT s.lowered as {field},
      //  SUM(s.duplicate) AS duplicate
      //  FROM (
      //    SELECT s.{field} AS {field}, lower(s.{field}) AS lowered,
      //    COUNT( * ) AS duplicate
      //    FROM node s
      //    WHERE (
      //      s.type = {node_type_machine_name}
      //    )
      //    GROUP BY lowered, {field}
      //  ) s
      //  GROUP BY lowered
      // @endcode
      $nested_query = db_select('node', 's');
      $nested_query
        ->fields('s', array(
        $field,
      ));
      $nested_query
        ->addExpression('COUNT(*)', 'duplicate');
      $nested_query
        ->condition('s.type', $node_type_machine_name, '=');
      if ($case_sensitive) {
        $nested_query
          ->groupBy($field);
      }
      else {
        $nested_query
          ->addExpression('lower(s.' . $field . ')', 'lowered');
        $nested_query
          ->groupBy('lowered');
        $nested_query
          ->groupBy($field);
        $nested_query = db_select($nested_query, 's');
        $nested_query
          ->addExpression('s.lowered', $field);
        $nested_query
          ->addExpression('SUM(s.duplicate)', 'duplicate');
        $nested_query
          ->groupBy('lowered');
      }

      // Sub Query pattern (both cases) :
      // @code
      //  SELECT n.{field} AS {field}
      //  FROM {$nested_query} n
      //  WHERE duplicates > 1
      // @endcode
      $sub_query = db_select($nested_query, 'n');
      $sub_query
        ->fields('n', array(
        $field,
      ));
      $sub_query
        ->condition('duplicate', 1, '>');
    }
    else {

      // In MySQL nonbinary string comparisons are case insensitive by default.
      // See : https://dev.mysql.com/doc/refman/5.0/en/case-sensitivity.html
      // Sub Query pattern (case sensitive) :
      // @code
      //  SELECT n.{field} AS {field}, md5(n.{field}) AS checksum,
      //  COUNT( * ) AS duplicate
      //  FROM node n
      //  WHERE (
      //    n.type = {node_type_machine_name}
      //  )
      //  GROUP BY checksum
      //  HAVING count( duplicate ) >1
      // @endcode
      // Sub Query pattern (case insensitive) :
      // @code
      //  SELECT n.{field} AS {field},
      //  COUNT( * ) AS duplicate
      //  FROM node n
      //  WHERE (
      //    n.type = {node_type_machine_name}
      //  )
      //  GROUP BY {field}
      //  HAVING count( duplicate ) > 1
      // @endcode
      $sub_query = db_select('node', 'n');
      $sub_query
        ->fields('n', array(
        $field,
      ));
      $sub_query
        ->addExpression('COUNT(*)', 'duplicate');
      if ($case_sensitive) {
        $sub_query
          ->addExpression('md5(n.' . $field . ')', 'checksum');
        $sub_query
          ->groupBy('checksum');
      }
      else {
        $sub_query
          ->groupBy($field);
      }
      $sub_query
        ->condition('n.type', $node_type_machine_name, '=');
      $sub_query
        ->havingCondition('duplicate', 1, '>');
    }

    // Main Query pattern (case sensitive) :
    // @code
    //  SELECT n.nid AS nid, n.uid AS uid, n.status AS status,
    //  n.created AS created, n.changed AS changed,
    //  n.{field} AS {field}, u.name AS name
    //  FROM node n
    //  INNER JOIN users u ON n.uid = u.uid
    //  INNER JOIN (
    //    {sub_query}
    //  ) nn ON md5(n.{field}) = md5(nn.{field})
    //  WHERE (
    //    n.type = {node_type_machine_name}
    //  )
    //  ORDER BY {field} ASC
    // @endcode
    // Main Query pattern (case insensitive) :
    // @code
    //  SELECT n.nid AS nid, n.uid AS uid, n.status AS status,
    //  n.created AS created, n.changed AS changed,
    //  n.{field} AS {field}, u.name AS name
    //  FROM node n
    //  INNER JOIN users u ON n.uid = u.uid
    //  INNER JOIN (
    //    {sub_query}
    //  ) nn ON n.{field} = nn.{field}
    //  WHERE (
    //    n.type = {node_type_machine_name}
    //  )
    //  ORDER BY {field} ASC
    // @endcode
    $main_query = db_select('node', 'n');
    $main_query
      ->fields('n', array(
      'nid',
      'uid',
      'status',
      'created',
      'changed',
      $field,
    ));
    $main_query
      ->fields('u', array(
      'name',
    ));
    $main_query
      ->join('users', 'u', 'n.uid = u.uid');
    if ($case_sensitive) {
      $main_query
        ->join($sub_query, 'nn', 'md5(n.' . $field . ') = md5(nn.' . $field . ')');
    }
    else {
      if (Database::getConnection()
        ->databaseType() == 'pgsql') {
        $main_query
          ->join($sub_query, 'nn', 'lower(n.' . $field . ') = lower(nn.' . $field . ')');
      }
      else {
        $main_query
          ->join($sub_query, 'nn', 'n.' . $field . ' = nn.' . $field);
      }
    }
    $main_query
      ->condition('n.type', $node_type_machine_name, '=');
    $main_query
      ->orderBy($field, 'ASC');
    $records = $main_query
      ->execute()
      ->fetchall();
  }
  else {

    // For Custom Fields.
    $field_info = field_info_field($node_field_machine_name);
    if (!empty($field_info['storage']['details']['sql']['FIELD_LOAD_CURRENT'])) {
      $table = key($field_info['storage']['details']['sql']['FIELD_LOAD_CURRENT']);
      $field = current($field_info['storage']['details']['sql']['FIELD_LOAD_CURRENT'][$table]);
      if (db_table_exists($table) && db_field_exists($table, $field)) {
        if (Database::getConnection()
          ->databaseType() == 'pgsql') {

          // Nested Query pattern (case sensitive) :
          // @code
          //  SELECT cf.{field} AS {field},
          //  COUNT( * ) AS duplicate
          //  FROM node s
          //  INNER JOIN {table} cf ON cf.bundle = {node_type_machine_name}
          //  AND cf.entity_type = 'node' AND cf.entity_id = s.nid
          //  WHERE (
          //    s.type = {node_type_machine_name}
          //  )
          //  GROUP BY {$field}
          // @endcode
          // Nested Query pattern (case insensitive) :
          // @code
          //  SELECT s.lowered as {field},
          //  SUM(s.duplicate) AS duplicate
          //  FROM (
          //    SELECT cf.{field} AS {field}, lower(cf.{field}) AS lowered,
          //    COUNT( * ) AS duplicate
          //    FROM node s
          //    INNER JOIN {table} cf ON cf.bundle = {node_type_machine_name}
          //    AND cf.entity_type = 'node' AND cf.entity_id = s.nid
          //    WHERE (
          //      s.type = {node_type_machine_name}
          //    )
          //    GROUP BY lowered, {field}
          //  ) s
          //  GROUP BY lowered
          // @endcode
          $nested_query = db_select('node', 's');
          $nested_query
            ->addJoin('INNER', $table, 'cf', 'cf.bundle = :bundle AND cf.entity_type = :entity_type AND s.nid = cf.entity_id ', array(
            ':bundle' => $node_type_machine_name,
            ':entity_type' => 'node',
          ));
          $nested_query
            ->fields('cf', array(
            $field,
          ));
          $nested_query
            ->addExpression('COUNT(*)', 'duplicate');
          $nested_query
            ->condition('s.type', $node_type_machine_name, '=');
          if ($case_sensitive) {
            $nested_query
              ->groupBy($field);
          }
          else {
            $nested_query
              ->addExpression('lower(cf.' . $field . ')', 'lowered');
            $nested_query
              ->groupBy('lowered');
            $nested_query
              ->groupBy($field);
            $nested_query = db_select($nested_query, 's');
            $nested_query
              ->addExpression('s.lowered', $field);
            $nested_query
              ->addExpression('SUM(s.duplicate)', 'duplicate');
            $nested_query
              ->groupBy('lowered');
          }

          // Sub Query pattern (both cases) :
          // @code
          //  SELECT cf.{field} AS {field}
          //  FROM {$nested_query} cf
          //  WHERE duplicates > 1
          // @endcode
          $sub_query = db_select($nested_query, 'cf');
          $sub_query
            ->fields('cf', array(
            $field,
          ));
          $sub_query
            ->condition('duplicate', 1, '>');
        }
        else {

          // Sub Query pattern (case sensitive) :
          // @code
          //  SELECT cf.{field} AS {field}, md5(cf.{field}) AS checksum,
          //  COUNT( * ) AS duplicate
          //  FROM node n
          //  INNER JOIN {table} cf ON cf.bundle = {node_type_machine_name}
          //  AND cf.entity_type = 'node' AND cf.entity_id = n.nid
          //  WHERE (
          //    n.type = {node_type_machine_name}
          //  )
          //  GROUP BY checksum
          //  HAVING count( duplicate ) > 1
          // @endcode
          // Sub Query pattern (case insensitive) :
          // @code
          //  SELECT cf.{field} AS {field},
          //  COUNT( * ) AS duplicate
          //  FROM node n
          //  INNER JOIN {table} cf ON cf.bundle = {node_type_machine_name}
          //  AND cf.entity_type = 'node' AND cf.entity_id = n.nid
          //  WHERE (
          //    n.type = {node_type_machine_name}
          //  )
          //  GROUP BY {field}
          //  HAVING count( duplicate ) > 1
          // @endcode
          $sub_query = db_select('node', 'n');
          $sub_query
            ->addJoin('INNER', $table, 'cf', 'cf.bundle = :bundle AND cf.entity_type = :entity_type AND n.nid = cf.entity_id ', array(
            ':bundle' => $node_type_machine_name,
            ':entity_type' => 'node',
          ));
          $sub_query
            ->condition('n.type', $node_type_machine_name, '=');
          $sub_query
            ->fields('cf', array(
            $field,
          ));
          $sub_query
            ->addExpression('COUNT(*)', 'duplicate');
          if ($case_sensitive) {
            $sub_query
              ->addExpression('md5(cf.' . $field . ')', 'checksum');
            $sub_query
              ->groupBy('checksum');
          }
          else {
            $sub_query
              ->groupBy($field);
          }
          $sub_query
            ->havingCondition('duplicate', 1, '>');
        }

        // Main Query pattern (case sensitive) :
        // @code
        //  SELECT n.nid AS nid, n.uid AS uid, n.status AS status,
        //  n.created AS created, n.changed AS changed, n.title AS title,
        //  u.name AS name, cf.{field} AS {field}
        //  FROM node n
        //  INNER JOIN users u ON n.uid = u.uid
        //  INNER JOIN {table} cf ON cf.bundle = {node_type_machine_name}
        //  AND cf.entity_type = 'node' AND cf.entity_id = n.nid
        //  INNER JOIN (
        //    {sub_query}
        //  ) nn ON md5(cf.{field}) = md5(nn.{field})
        //  WHERE (
        //    n.type = {node_type_machine_name}
        //  )
        //  ORDER BY {field} ASC
        // @endcode
        // Main Query pattern (case insensitive) :
        // @code
        //  SELECT n.nid AS nid, n.uid AS uid, n.status AS status,
        //  n.created AS created, n.changed AS changed, n.title AS title,
        //  u.name AS name, cf.{field} AS {field}
        //  FROM node n
        //  INNER JOIN users u ON n.uid = u.uid
        //  INNER JOIN {table} cf ON cf.bundle = {node_type_machine_name}
        //  AND cf.entity_type = 'node' AND cf.entity_id = n.nid
        //  INNER JOIN (
        //    {sub_query}
        //  ) nn ON cf.{field} = nn.{field}
        //  WHERE (
        //    n.type = {node_type_machine_name}
        //  )
        //  ORDER BY {field} ASC
        // @endcode
        $main_query = db_select('node', 'n');
        $main_query
          ->fields('n', array(
          'nid',
          'uid',
          'status',
          'created',
          'changed',
          'title',
        ));
        $main_query
          ->fields('u', array(
          'name',
        ));
        $main_query
          ->fields('cf', array(
          $field,
        ));
        $main_query
          ->addJoin('INNER', $table, 'cf', 'cf.bundle = :bundle AND cf.entity_type = :entity_type AND n.nid = cf.entity_id ', array(
          ':bundle' => $node_type_machine_name,
          ':entity_type' => 'node',
        ));
        $main_query
          ->join('users', 'u', 'n.uid = u.uid');
        if ($case_sensitive) {
          $main_query
            ->join($sub_query, 'nn', 'md5(cf.' . $field . ') = md5(nn.' . $field . ')');
        }
        else {
          if (Database::getConnection()
            ->databaseType() == 'pgsql') {
            $main_query
              ->join($sub_query, 'nn', 'lower(cf.' . $field . ') = lower(nn.' . $field . ')');
          }
          else {
            $main_query
              ->join($sub_query, 'nn', 'cf.' . $field . ' = nn.' . $field);
          }
        }
        $main_query
          ->condition('n.type', $node_type_machine_name, '=');
        $main_query
          ->orderBy($field, 'ASC');
        $records = $main_query
          ->execute()
          ->fetchall();
      }
    }
  }
  return $records;
}