You are here

class HtmlLinkExtractor in Link checker 8

Class HtmlLinkExtractor.

Plugin annotation


@LinkExtractor(
  id = "html_link_extractor",
  label = @Translation("HTML extractor"),
  field_types = {
    "text",
    "text_long",
    "text_with_summary",
  }
)

Hierarchy

Expanded class hierarchy of HtmlLinkExtractor

File

src/Plugin/LinkExtractor/HtmlLinkExtractor.php, line 21

Namespace

Drupal\linkchecker\Plugin\LinkExtractor
View source
class HtmlLinkExtractor extends LinkExtractorBase {

  /**
   * Extracts URLs from the HTML markup stored in a single field value.
   *
   * Which tags are inspected is controlled by the "extract.from_*" Linkchecker
   * settings; a tag family whose setting is not truthy is skipped entirely.
   *
   * @param array $value
   *   The field item values. Only the 'value' key is read; it is parsed as
   *   HTML markup.
   *
   * @return string[]
   *   The unique, non-empty URLs found in the markup, grouped by tag family in
   *   the order the families are checked below. Keys are carried over from the
   *   intermediate list, so they are not guaranteed to be sequential.
   */
  protected function extractUrlFromField(array $value) {
    // empty() also covers a missing 'value' key, so an incomplete field item
    // yields no URLs instead of triggering an undefined index notice.
    if (empty($value['value'])) {
      return [];
    }

    $html_dom = Html::load($value['value']);
    $settings = $this->linkcheckerSetting;
    $urls = [];

    // Hyperlinks: anchors first, then image map areas.
    if ($settings->get('extract.from_a')) {
      $urls = array_merge(
        $urls,
        $this->attributeValues($html_dom, 'a', ['href']),
        $this->attributeValues($html_dom, 'area', ['href'])
      );
    }

    // Audio tags, including their nested source and track tags.
    if ($settings->get('extract.from_audio')) {
      $urls = array_merge($urls, $this->mediaUrls($html_dom, 'audio', ['src']));
    }

    // Embed tags.
    if ($settings->get('extract.from_embed')) {
      $urls = array_merge(
        $urls,
        $this->attributeValues($html_dom, 'embed', ['src', 'pluginurl', 'pluginspage'])
      );
    }

    // Iframe tags.
    if ($settings->get('extract.from_iframe')) {
      $urls = array_merge($urls, $this->attributeValues($html_dom, 'iframe', ['src']));
    }

    // Image tags.
    if ($settings->get('extract.from_img')) {
      $urls = array_merge($urls, $this->attributeValues($html_dom, 'img', ['src', 'longdesc']));
    }

    // Object tags, including their nested param tags.
    if ($settings->get('extract.from_object')) {
      $urls = array_merge($urls, $this->objectUrls($html_dom));
    }

    // Video tags, including their nested source and track tags.
    if ($settings->get('extract.from_video')) {
      $urls = array_merge($urls, $this->mediaUrls($html_dom, 'video', ['poster', 'src']));
    }

    // Drop empty values (missing attributes read as ''), then duplicates.
    return array_unique(array_filter($urls));
  }

  /**
   * Reads the given attributes from every element with the given tag name.
   *
   * @param \DOMDocument|\DOMElement $context
   *   The node whose descendants are searched.
   * @param string $tag
   *   The tag name to look for.
   * @param string[] $attributes
   *   The attribute names to read from each matching element, in order.
   *
   * @return string[]
   *   One entry per element and attribute; an empty string where the
   *   attribute is absent.
   */
  private function attributeValues($context, $tag, array $attributes) {
    $values = [];
    foreach ($context->getElementsByTagName($tag) as $element) {
      foreach ($attributes as $attribute) {
        $values[] = $element->getAttribute($attribute);
      }
    }
    return $values;
  }

  /**
   * Collects URLs from media tags and their nested source and track tags.
   *
   * @param \DOMDocument|\DOMElement $context
   *   The node whose descendants are searched.
   * @param string $tag
   *   The media tag name, e.g. 'audio' or 'video'.
   * @param string[] $attributes
   *   The attribute names to read from the media tag itself, in order.
   *
   * @return string[]
   *   For each media element: its own attribute values, then the 'src' of
   *   every nested source tag, then the 'src' of every nested track tag.
   */
  private function mediaUrls($context, $tag, array $attributes) {
    $urls = [];
    foreach ($context->getElementsByTagName($tag) as $media) {
      foreach ($attributes as $attribute) {
        $urls[] = $media->getAttribute($attribute);
      }
      foreach (['source', 'track'] as $child_tag) {
        foreach ($media->getElementsByTagName($child_tag) as $child) {
          $urls[] = $child->getAttribute('src');
        }
      }
    }
    return $urls;
  }

  /**
   * Collects URLs from object tags and their nested param tags.
   *
   * @param \DOMDocument|\DOMElement $context
   *   The node whose descendants are searched.
   *
   * @return string[]
   *   For each object element: its 'data' and 'codebase' attribute values,
   *   followed by the 'value' of every nested param tag that is recognised as
   *   carrying a URL.
   */
  private function objectUrls($context) {
    // @todo Try to extract links in unknown "flashvars" values
    //   (e.g., file=http://, data=http://).
    $url_param_names = [
      'archive',
      'filename',
      'href',
      'movie',
      'src',
      'url',
    ];

    $urls = [];
    foreach ($context->getElementsByTagName('object') as $object) {
      $urls[] = $object->getAttribute('data');
      $urls[] = $object->getAttribute('codebase');

      foreach ($object->getElementsByTagName('param') as $param) {
        $param_value = $param->getAttribute('value');

        // A missing attribute reads as '', which matches neither check below,
        // so no separate hasAttribute() test is needed.
        if (in_array($param->getAttribute('name'), $url_param_names, TRUE)) {
          $urls[] = $param_value;
        }

        // NOTE(review): this matches a param whose "src" attribute is the
        // literal string "movie". It is kept exactly as the original logic
        // had it; confirm whether that is the intended markup.
        if ($param->getAttribute('src') === 'movie') {
          $urls[] = $param_value;
        }
      }
    }
    return $urls;
  }

}

Members

Name (sort descending) Modifiers Type Description Overrides
HtmlLinkExtractor::extractUrlFromField protected function Extracts URLs from a field. Overrides LinkExtractorBase::extractUrlFromField
LinkExtractorBase::$linkcheckerSetting protected property The Linkchecker settings.
LinkExtractorBase::create public static function Creates an instance of the plugin. Overrides ContainerFactoryPluginInterface::create
LinkExtractorBase::extract public function Extracts links from field list. Overrides LinkExtractorInterface::extract
LinkExtractorBase::__construct public function LinkExtractorBase plugin constructor. Overrides PluginBase::__construct
PluginBase::$configuration protected property Configuration information passed into the plugin. 1
PluginBase::$pluginDefinition protected property The plugin implementation definition. 1
PluginBase::$pluginId protected property The plugin_id.
PluginBase::DERIVATIVE_SEPARATOR constant A string which is used to separate base plugin IDs from the derivative ID.
PluginBase::getBaseId public function Gets the base_plugin_id of the plugin instance. Overrides DerivativeInspectionInterface::getBaseId
PluginBase::getDerivativeId public function Gets the derivative_id of the plugin instance. Overrides DerivativeInspectionInterface::getDerivativeId
PluginBase::getPluginDefinition public function Gets the definition of the plugin implementation. Overrides PluginInspectionInterface::getPluginDefinition 3
PluginBase::getPluginId public function Gets the plugin_id of the plugin instance. Overrides PluginInspectionInterface::getPluginId
PluginBase::isConfigurable public function Determines if the plugin is configurable.