You are here

ParserCSV.php in Feeds 8.2

Contains CSV Parser.

Functions in this file are independent of the Feeds specific implementation. Thanks to jpetso http://drupal.org/user/56020 for most of the code in this file.

Namespace

Drupal\feeds

File

lib/Drupal/feeds/ParserCSV.php
View source
<?php

/**
 * @file
 * Contains CSV Parser.
 *
 * Functions in this file are independent of the Feeds specific implementation.
 * Thanks to jpetso http://drupal.org/user/56020 for most of the code in this
 * file.
 */
namespace Drupal\feeds;

use Iterator;

/**
 * Functionality to parse CSV files into a two dimensional array.
 */
class ParserCSV {

  /**
   * The column delimiter string; may be longer than one byte.
   */
  private $delimiter;

  /**
   * Whether the first non-empty line of the input is skipped.
   */
  private $skipFirstLine;

  /**
   * Array of column names used as row keys, or FALSE for numeric keys.
   */
  private $columnNames;

  /**
   * Parsing timeout in milliseconds, or FALSE for no timeout.
   */
  private $timeout;

  /**
   * Whether the last parse() call stopped because of the timeout.
   */
  private $timeoutReached;

  /**
   * Position handed to the iterator's rewind() when parse() starts.
   */
  private $startByte;

  /**
   * Maximum number of rows to parse per parse() call, 0 for no limit.
   */
  private $lineLimit;

  /**
   * Position reported by the iterator when parse() stopped early, else 0.
   */
  private $lastLinePos;

  /**
   * Initializes the parser with its defaults.
   *
   * Defaults: comma delimiter, first line not skipped, no column names, no
   * timeout, start at byte 0 and no line limit.
   */
  public function __construct() {
    $this->delimiter = ',';
    $this->skipFirstLine = FALSE;
    $this->columnNames = FALSE;
    $this->timeout = FALSE;
    $this->timeoutReached = FALSE;
    $this->startByte = 0;
    $this->lineLimit = 0;
    $this->lastLinePos = 0;
    // Kept for backward compatibility with existing callers. Note that this
    // ini setting is deprecated as of PHP 8.1.
    ini_set('auto_detect_line_endings', TRUE);
  }

  /**
   * Set the column delimiter string.
   * By default, the comma (',') is used as delimiter.
   *
   * Delimiters longer than one byte are supported. An empty delimiter means
   * that lines are not split at all: each line yields a single field.
   */
  public function setDelimiter($delimiter) {
    $this->delimiter = $delimiter;
  }

  /**
   * Set this to TRUE if the parser should skip the first line of the CSV text,
   * which might be desired if the first line contains the column names.
   * By default, this is set to FALSE and the first line is not skipped.
   */
  public function setSkipFirstLine($skipFirstLine) {
    $this->skipFirstLine = $skipFirstLine;
  }

  /**
   * Specify an array of column names if you know them in advance, or FALSE
   * (which is the default) to unset any prior column names. If no column names
   * are set, the parser will put each row into a simple numerically indexed
   * array. If column names are given, the parser will create arrays with
   * these column names as array keys instead.
   */
  public function setColumnNames($columnNames) {
    $this->columnNames = $columnNames;
  }

  /**
   * Define the time (in milliseconds) after which the parser stops parsing,
   * even if it has not yet finished processing the CSV data. If the timeout
   * has been reached before parsing is done, the parse() method will return
   * an incomplete list of rows - a single row will never be cut off in the
   * middle, though. By default, no timeout (@p $timeout == FALSE) is defined.
   *
   * You can check if the timeout has been reached by calling the
   * timeoutReached() method after parse() has been called.
   */
  public function setTimeout($timeout) {
    $this->timeout = $timeout;
  }

  /**
   * After calling the parse() method, determine if the timeout (set by the
   * setTimeout() method) has been reached.
   *
   * @deprecated Use lastLinePos() instead to determine whether a file has
   *   finished parsing.
   */
  public function timeoutReached() {
    return $this->timeoutReached;
  }

  /**
   * Define the number of lines to parse in one parsing operation.
   *
   * By default, all lines of a file are being parsed.
   */
  public function setLineLimit($lines) {
    $this->lineLimit = $lines;
  }

  /**
   * Get the byte number where the parser left off after last parse() call.
   *
   * @return
   *  0 if all lines or no line has been parsed, the byte position of where a
   *  timeout or the line limit has been reached otherwise. This position can be
   *  used to set the start byte for the next iteration after parse() has
   *  reached the timeout set with setTimeout() or the line limit set with
   *  setLineLimit().
   *
   * @see ParserCSV::setStartByte()
   */
  public function lastLinePos() {
    return $this->lastLinePos;
  }

  /**
   * Set the byte where file should be started to read.
   *
   * Useful when parsing a file in batches.
   *
   * @return
   *   The start byte that was just set.
   */
  public function setStartByte($start) {
    return $this->startByte = $start;
  }

  /**
   * Parse CSV files into a two dimensional array.
   *
   * Parsing starts at the position set with setStartByte() and stops early
   * when the timeout set with setTimeout() or the line limit set with
   * setLineLimit() is reached; lastLinePos() then tells where to resume.
   *
   * @param Iterator $lineIterator
   *   An Iterator object that yields line strings, e.g. ParserCSVIterator.
   *   Besides the Iterator methods, its rewind() is called with the start
   *   byte as argument and it must provide a currentPos() method.
   *
   * @return
   *   Two dimensional array that contains the data in the CSV file. Rows are
   *   numerically indexed unless column names have been set, in which case
   *   they are keyed by column name and missing fields are empty strings.
   */
  public function parse(Iterator $lineIterator) {
    $skipLine = $this->skipFirstLine;
    $rows = array();
    $this->timeoutReached = FALSE;
    $this->lastLinePos = 0;

    // microtime(TRUE) returns seconds as a float; without the argument it
    // returns a "msec sec" string that cannot be used for arithmetic. The
    // timeout is given in milliseconds, so convert it to seconds.
    $maxTime = empty($this->timeout) ? FALSE : microtime(TRUE) + $this->timeout / 1000;
    $linesParsed = 0;

    // Byte length of the delimiter, which may span more than one character.
    $delimiterLength = strlen((string) $this->delimiter);

    for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) {

      // Make really sure we've got lines without trailing newlines.
      $line = trim($lineIterator->current(), "\r\n");

      // Skip empty lines. A strict comparison is required here: empty() would
      // also discard a line that consists of the single character "0".
      if ($line === '') {
        continue;
      }

      // If the first line contains column names, skip it.
      if ($skipLine) {
        $skipLine = FALSE;
        continue;
      }

      // The actual parser. explode() is unfortunately not suitable because the
      // delimiter might be located inside a quoted field, and that would break
      // the field and/or require additional effort to re-join the fields.
      $quoted = FALSE;
      $currentIndex = 0;
      $currentField = '';
      $fields = array();

      // We must use strlen() as we're parsing byte by byte using strpos(), so
      // drupal_strlen() will not work properly. The "<=" lets the loop run
      // once more at the end of the line to finalize the last field.
      while ($currentIndex <= strlen($line)) {
        if ($quoted) {
          $nextQuoteIndex = strpos($line, '"', $currentIndex);
          if ($nextQuoteIndex === FALSE) {

            // There's a line break before the quote is closed, so fetch the
            // next line and start from there.
            $currentField .= substr($line, $currentIndex);
            $lineIterator->next();
            if (!$lineIterator->valid()) {

              // Whoa, an unclosed quote! Well whatever, let's just ignore
              // that shortcoming and record it nevertheless.
              $fields[] = $currentField;
              break;
            }

            // Ok, so, on with fetching the next line, as mentioned above.
            $currentField .= "\n";
            $line = trim($lineIterator->current(), "\r\n");
            $currentIndex = 0;
            continue;
          }

          // There's actually another quote in this line...
          // find out whether it's escaped or not.
          $currentField .= substr($line, $currentIndex, $nextQuoteIndex - $currentIndex);
          if (isset($line[$nextQuoteIndex + 1]) && $line[$nextQuoteIndex + 1] === '"') {

            // Escaped quote, add a single one to the field and proceed quoted.
            $currentField .= '"';
            $currentIndex = $nextQuoteIndex + 2;
          }
          else {

            // End of the quoted section, close the quote and let the
            // $quoted == FALSE block finalize the field.
            $quoted = FALSE;
            $currentIndex = $nextQuoteIndex + 1;
          }
        }
        else {

          // $quoted == FALSE
          // First, let's find out where the next character of interest is.
          // An empty delimiter never matches, so the line stays one field.
          $nextQuoteIndex = strpos($line, '"', $currentIndex);
          $nextDelimiterIndex = $delimiterLength > 0 ? strpos($line, $this->delimiter, $currentIndex) : FALSE;
          if ($nextQuoteIndex === FALSE) {
            $nextIndex = $nextDelimiterIndex;
          }
          elseif ($nextDelimiterIndex === FALSE) {
            $nextIndex = $nextQuoteIndex;
          }
          else {
            $nextIndex = min($nextQuoteIndex, $nextDelimiterIndex);
          }
          if ($nextIndex === FALSE) {

            // This line is done, add the rest of it as last field.
            $currentField .= substr($line, $currentIndex);
            $fields[] = $currentField;
            break;
          }
          elseif ($nextIndex === $nextDelimiterIndex) {

            // A delimiter comes first: the field ends right before it and the
            // next field starts right after the complete delimiter string.
            $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
            $fields[] = $currentField;
            $currentField = '';
            $currentIndex = $nextIndex + $delimiterLength;

            // Continue with the next field.
          }
          else {

            // $line[$nextIndex] == '"'
            $quoted = TRUE;
            $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
            $currentIndex = $nextIndex + 1;

            // Continue this field in the $quoted == TRUE block.
          }
        }
      }

      // End of CSV parser. We've now got all the fields of the line as strings
      // in the $fields array.
      if (empty($this->columnNames)) {
        $row = $fields;
      }
      else {
        $row = array();
        foreach ($this->columnNames as $columnName) {
          // array_shift() returns NULL once the fields are used up, which
          // turns missing trailing columns into empty strings.
          $field = array_shift($fields);
          $row[$columnName] = isset($field) ? $field : '';
        }
      }
      $rows[] = $row;

      // Quit parsing if timeout has been reached or requested lines have been
      // reached.
      if ($maxTime !== FALSE && microtime(TRUE) > $maxTime) {
        $this->timeoutReached = TRUE;
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
      $linesParsed++;
      if ($this->lineLimit && $linesParsed >= $this->lineLimit) {
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
    }
    return $rows;
  }

}

Classes

Name (sort descending) | Description
ParserCSV Functionality to parse CSV files into a two dimensional array.