class Markdown_Parser in Markdown 5

Same name and namespace in other branches
6 markdown.php \Markdown_Parser
Hierarchy

class \Markdown_Parser
Expanded class hierarchy of Markdown_Parser
File

./markdown.php, line 200
View source
class Markdown_Parser {

  # Regex to match balanced [brackets].
  # Needed to insert a maximum bracket depth while converting to PHP.
  # The constructor repeats a pattern fragment $nested_*_depth times to
  # build the matching $nested_*_re property.
  var $nested_brackets_depth = 6;
  var $nested_brackets_re;
  var $nested_url_parenthesis_depth = 4;
  var $nested_url_parenthesis_re;

  # Table of hash values for escaped characters:
  # the constructor wraps the preg_quote()d list in a character class and
  # stores it in $escape_chars_re.
  var $escape_chars = '\\`*_{}[]()>#+-.!';
  var $escape_chars_re;

  # Suffix appended to the empty elements emitted by this class
  # (<hr, <br, <img). Change to ">" for HTML output.
  var $empty_element_suffix = MARKDOWN_EMPTY_ELEMENT_SUFFIX;
  # Tab width; block patterns accept up to ($tab_width - 1) leading spaces.
  var $tab_width = MARKDOWN_TAB_WIDTH;

  # Change to `true` to disallow markup or entities.
  # When $no_markup is set, hashHTMLBlocks() returns its input unchanged.
  var $no_markup = false;
  var $no_entities = false;

  # Predefined urls and titles for reference links and images.
  # setup() copies them into $urls and $titles before each transformation.
  var $predef_urls = array();
  var $predef_titles = array();
  function __construct() {
    #
    # Constructor. Initializes the derived member variables:
    #  - detab and emphasis helpers (_initDetab, prepareItalicsAndBold),
    #  - the unrolled "balanced brackets / parentheses" patterns,
    #  - the escapable-character class,
    #  - the gamut tables, sorted in ascending priority order.
    #
    # PHP 8 no longer treats a method named after its class as the
    # constructor, so the initialization lives here; Markdown_Parser()
    # below stays available for PHP 4 and for existing explicit calls.
    #
    $this->_initDetab();
    $this->prepareItalicsAndBold();

    # Unroll the nesting: N opening fragments followed by N closing ones.
    $this->nested_brackets_re = str_repeat('(?>[^\\[\\]]+|\\[', $this->nested_brackets_depth) . str_repeat('\\])*', $this->nested_brackets_depth);
    $this->nested_url_parenthesis_re = str_repeat('(?>[^()\\s]+|\\(', $this->nested_url_parenthesis_depth) . str_repeat('(?>\\)))*', $this->nested_url_parenthesis_depth);
    $this->escape_chars_re = '[' . preg_quote($this->escape_chars) . ']';

    # Sort document, block, and span gamut in ascending priority order.
    asort($this->document_gamut);
    asort($this->block_gamut);
    asort($this->span_gamut);
  }
  function Markdown_Parser() {
    #
    # PHP 4 style constructor, kept for backward compatibility (PHP 4
    # itself and callers such as parent::Markdown_Parser()).
    # The class-qualified call always runs this class's initialization,
    # even if a subclass defines its own __construct().
    #
    Markdown_Parser::__construct();
  }

  # Internal hashes used during transformation.
  # $urls and $titles map a lower-cased link id to its URL / title;
  # $html_hashes maps a hash token (see hashPart) to the text it replaces.
  var $urls = array();
  var $titles = array();
  var $html_hashes = array();

  # Status flag to avoid invalid nesting.
  # doAnchors() sets it while it runs and returns its input untouched
  # when it is already set.
  var $in_anchor = false;
  function setup() {
    #
    # Called before the transformation process starts to set up the
    # parser state: link tables start from the predefined urls/titles,
    # the hash table is emptied and the anchor nesting flag is cleared.
    #
    $this->urls = $this->predef_urls;
    $this->titles = $this->predef_titles;
    $this->html_hashes = array();

    # Must be the member, not a local: a flag left set by an earlier,
    # interrupted run would otherwise make doAnchors() skip every link.
    $this->in_anchor = false;
  }
  function teardown() {
    #
    # Called once the transformation is finished: empties the per-run
    # tables so they do not keep memory allocated unnecessarily.
    #
    $this->html_hashes = array();
    $this->titles = array();
    $this->urls = array();
  }
  function transform($text) {
    #
    # Main entry point. Normalizes the input text, runs every method of
    # the document gamut over it and returns the result followed by a
    # newline.
    #
    $this->setup();

    # Drop a leading UTF-8 BOM and every \x1A character (the marker
    # character used inside hash tokens).
    $text = preg_replace('{^\\xEF\\xBB\\xBF|\\x1A}', '', $text);

    # Line endings: DOS (\r\n) and Mac (\r) become Unix (\n).
    $text = preg_replace('{\\r\\n?}', "\n", $text);

    # Make sure the text ends with a couple of newlines, then convert
    # all tabs to spaces.
    $text = $this->detab($text . "\n\n");

    # Turn block-level HTML blocks into hash entries.
    $text = $this->hashHTMLBlocks($text);

    # Empty out lines made only of spaces, so later patterns can match
    # consecutive blank lines with /\n+/ instead of /[ ]*\n+/.
    $text = preg_replace('/^[ ]+$/m', '', $text);

    # Run the document gamut methods in their (already sorted) order.
    foreach (array_keys($this->document_gamut) as $pass) {
      $text = $this->{$pass}($text);
    }

    $this->teardown();
    return $text . "\n";
  }
  # Document-level passes applied by transform(): method name => priority.
  # The constructor asort()s this table, so lower priorities run first.
  var $document_gamut = array(
    # Strip link definitions, store in hashes.
    "stripLinkDefinitions" => 20,
    # Run the block-level transformations on the whole document.
    "runBasicBlockGamut" => 30,
  );
  function stripLinkDefinitions($text) {
    #
    # Removes link definitions of the form
    #     ^[id]: url "optional title"
    # from $text and returns what is left. Each definition is handed to
    # _stripLinkDefinitions_callback, which records its URL and title.
    #
    $less_than_tab = $this->tab_width - 1;
    $handler = array(
      &$this,
      '_stripLinkDefinitions_callback',
    );
    $definition_re = '{
							^[ ]{0,' . $less_than_tab . '}\\[(.+)\\][ ]?:	# id = $1
							  [ ]*
							  \\n?				# maybe *one* newline
							  [ ]*
							<?(\\S+?)>?			# url = $2
							  [ ]*
							  \\n?				# maybe one newline
							  [ ]*
							(?:
								(?<=\\s)			# lookbehind for whitespace
								["(]
								(.*?)			# title = $3
								[")]
								[ ]*
							)?	# title is optional
							(?:\\n+|\\Z)
			}xm';
    return preg_replace_callback($definition_re, $handler, $text);
  }
  function _stripLinkDefinitions_callback($matches) {
    #
    # Records one link definition under its lower-cased id: $2 is the
    # URL, $3 the optional title. The definition is replaced by an
    # empty string.
    #
    $id = strtolower($matches[1]);
    $this->urls[$id] = $matches[2];

    # Bound by reference: when the title group did not take part in
    # the match this stores null rather than raising a notice.
    $this->titles[$id] =& $matches[3];

    return '';
  }
  function hashHTMLBlocks($text) {
    #
    # Replaces block-level raw HTML found in $text by hash tokens (via
    # _hashHTMLBlocks_callback) and returns the resulting text. Returns
    # $text untouched when markup is disabled ($no_markup).
    #
    # Capture groups of the main pattern:
    #   $1 whole block, $2 tag name from list "b", $3 tag name from
    #   list "a", $4 "hr", $5 processing-instruction delimiter (? or %).
    #
    if ($this->no_markup) {
      return $text;
    }
    $less_than_tab = $this->tab_width - 1;

    # Hashify HTML blocks:
    # We only want to do this for block-level HTML tags, such as headers,
    # lists, and tables. That's because we still want to wrap <p>s around
    # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
    # phrase emphasis, and spans. The list of tags we're looking for is
    # hard-coded:
    #
    # *  List "a" is made of tags which can be both inline or block-level.
    #    These will be treated block-level when the start tag is alone on
    #    its line, otherwise they're not matched here and will be taken as
    #    inline later.
    # *  List "b" is made of tags which are always block-level;
    #
    $block_tags_a_re = 'ins|del';
    $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|' . 'script|noscript|form|fieldset|iframe|math';

    # Regular expression for the content of a block tag.
    $nested_tags_level = 4;
    $attr = '
			(?>				# optional tag attributes
			  \\s			# starts with whitespace
			  (?>
				[^>"/]+		# text outside quotes
			  |
				/+(?!>)		# slash not followed by ">"
			  |
				"[^"]*"		# text inside double quotes (tolerate ">")
			  |
				\'[^\']*\'	# text inside single quotes (tolerate ">")
			  )*
			)?	
			';
    $content = str_repeat('
				(?>
				  [^<]+			# content without tag
				|
				  <\\2			# nested opening tag
					' . $attr . '	# attributes
					(?>
					  />
					|
					  >', $nested_tags_level) . '.*?' . str_repeat('
					  </\\2\\s*>	# closing nested tag
					)
				  |				
					<(?!/\\2\\s*>	# other tags with a different name
				  )
				)*', $nested_tags_level);

    # Same content pattern for list "a", whose tag name is captured in $3.
    $content2 = str_replace('\\2', '\\3', $content);

    # First, look for nested blocks, e.g.:
    # 	<div>
    # 		<div>
    # 		tags for inner block must be indented.
    # 		</div>
    # 	</div>
    #
    # The outermost tags must start at the left margin for this to match, and
    # the inner nested divs must be indented.
    # We need to do this before the next, more liberal match, because the next
    # match will start at the first `<div>` and stop at the first `</div>`.
    #
    # The processing-instruction branch closes with \5, the group that
    # captured its own "?" or "%". It previously used \2, which is never
    # set in that branch, so the branch could not match anything.
    $text = preg_replace_callback('{(?>
			(?>
				(?<=\\n\\n)		# Starting after a blank line
				|				# or
				\\A\\n?			# the beginning of the doc
			)
			(						# save in $1

			  # Match from `\\n<tag>` to `</tag>\\n`, handling nested tags
			  # in between.
					
						[ ]{0,' . $less_than_tab . '}
						<(' . $block_tags_b_re . ')# start tag = $2
						' . $attr . '>			# attributes followed by > and \\n
						' . $content . '		# content, support nesting
						</\\2>				# the matching end tag
						[ ]*				# trailing spaces/tabs
						(?=\\n+|\\Z)	# followed by a newline or end of document

			| # Special version for tags of group a.

						[ ]{0,' . $less_than_tab . '}
						<(' . $block_tags_a_re . ')# start tag = $3
						' . $attr . '>[ ]*\\n	# attributes followed by >
						' . $content2 . '		# content, support nesting
						</\\3>				# the matching end tag
						[ ]*				# trailing spaces/tabs
						(?=\\n+|\\Z)	# followed by a newline or end of document
					
			| # Special case just for <hr />. It was easier to make a special
			  # case than to make the other regex more complicated.
			
						[ ]{0,' . $less_than_tab . '}
						<(hr)				# start tag = $2
						' . $attr . '			# attributes
						/?>					# the matching end tag
						[ ]*
						(?=\\n{2,}|\\Z)		# followed by a blank line or end of document
			
			| # Special case for standalone HTML comments:
			
					[ ]{0,' . $less_than_tab . '}
					(?s:
						<!-- .*? -->
					)
					[ ]*
					(?=\\n{2,}|\\Z)		# followed by a blank line or end of document
			
			| # PHP and ASP-style processor instructions (<? and <%)
			
					[ ]{0,' . $less_than_tab . '}
					(?s:
						<([?%])			# $5
						.*?
						\\5>
					)
					[ ]*
					(?=\\n{2,}|\\Z)		# followed by a blank line or end of document
					
			)
			)}Sxmi', array(
      &$this,
      '_hashHTMLBlocks_callback',
    ), $text);
    return $text;
  }
  function _hashHTMLBlocks_callback($matches) {
    #
    # Stores the matched HTML block ($1) as a block-level hash and
    # returns its token surrounded by blank lines.
    #
    $token = $this->hashBlock($matches[1]);
    return "\n\n" . $token . "\n\n";
  }
  function hashPart($text, $boundary = 'X') {
    #
    # Stores $text in $html_hashes and returns the unique token that
    # stands for it in the text stream; the token is turned back into
    # the stored text when unhash is called.
    #
    # $boundary is the character placed on both sides of the token. By
    # convention "B" marks block elements that must not be wrapped into
    # paragraph tags at the end, ":" marks elements that are word
    # separators, and "X" is used in the general case.
    #

    # Resolve tokens already present in $text first, so a single
    # `unhash` at the end is enough.
    $resolved = $this->unhash($text);

    # The counter is a function-level static, so it keeps counting
    # across calls.
    static $serial = 0;
    $serial++;
    $token = $boundary . "\32" . $serial . $boundary;

    $this->html_hashes[$token] = $resolved;
    return $token;
  }
  function hashBlock($text) {
    #
    # Convenience wrapper: hashPart with the block-level boundary "B".
    #
    $token = $this->hashPart($text, 'B');
    return $token;
  }
  # Block-level passes: method name => priority. The constructor asort()s
  # the table and runBasicBlockGamut() calls the methods in that order.
  var $block_gamut = array(
    #
    # These are all the transformations that form block-level
    # tags like paragraphs, headers, and list items.
    #
    "doHeaders" => 10,
    "doHorizontalRules" => 20,
    "doLists" => 40,
    "doCodeBlocks" => 50,
    "doBlockQuotes" => 60,
  );
  function runBlockGamut($text) {
    #
    # Runs the block gamut transformations on $text after hashing its
    # raw HTML blocks.
    #
    # The hashing has to be redone for each block, not only once at the
    # beginning: HTML blocks inside list items may have been indented,
    # and an earlier hashHTMLBlocks pass would have seen them as code
    # blocks.
    #
    return $this->runBasicBlockGamut($this->hashHTMLBlocks($text));
  }
  function runBasicBlockGamut($text) {
    #
    # Runs the block gamut transformations without hashing HTML blocks
    # first. Useful when the HTML blocks are known to be hashed already,
    # as in the first whole-document pass.
    #
    foreach (array_keys($this->block_gamut) as $pass) {
      $text = $this->{$pass}($text);
    }

    # Finally form paragraphs and restore hashed blocks.
    return $this->formParagraphs($text);
  }
  function doHorizontalRules($text) {
    #
    # Replaces every horizontal-rule line (three or more -, * or _
    # markers, optionally separated by spaces) with a hashed <hr> tag.
    #
    $rule_re = '{
				^[ ]{0,3}	# Leading space
				([-*_])		# $1: First marker
				(?>			# Repeated marker group
					[ ]{0,2}	# Zero, one, or two spaces.
					\\1			# Marker character
				){2,}		# Group repeated at least twice
				[ ]*		# Tailing spaces
				$			# End of line.
			}mx';

    # One token is created up front and reused for every rule found.
    $hr_token = $this->hashBlock("<hr{$this->empty_element_suffix}");
    return preg_replace($rule_re, "\n" . $hr_token . "\n", $text);
  }
  # Span-level passes: method name => priority. The constructor asort()s
  # the table and runSpanGamut() calls the methods in that order.
  var $span_gamut = array(
    #
    # These are all the transformations that occur *within* block-level
    # tags like paragraphs, headers, and list items.
    #
    # Process character escapes, code spans, and inline HTML
    # in one shot.
    "parseSpan" => -30,
    # Process anchor and image tags. Images must come first,
    # because ![foo][f] looks like an anchor.
    "doImages" => 10,
    "doAnchors" => 20,
    # Make links out of things like `<http://example.com/>`
    # Must come after doAnchors, because you can use < and >
    # delimiters in inline links like [this](<url>).
    "doAutoLinks" => 30,
    "encodeAmpsAndAngles" => 40,
    "doItalicsAndBold" => 50,
    "doHardBreaks" => 60,
  );
  function runSpanGamut($text) {
    #
    # Applies every span gamut transformation to $text, in the order of
    # the (sorted) $span_gamut table, and returns the result.
    #
    foreach (array_keys($this->span_gamut) as $pass) {
      $text = $this->{$pass}($text);
    }
    return $text;
  }
  function doHardBreaks($text) {
    #
    # Hard breaks: two or more spaces right before a newline are
    # replaced through _doHardBreaks_callback.
    #
    $handler = array(
      &$this,
      '_doHardBreaks_callback',
    );
    return preg_replace_callback('/ {2,}\\n/', $handler, $text);
  }
  function _doHardBreaks_callback($matches) {
    #
    # Returns the hash token of a <br> tag followed by a newline; the
    # matched text itself is not used.
    #
    $break = "<br{$this->empty_element_suffix}\n";
    return $this->hashPart($break);
  }
  function doAnchors($text) {
    #
    # Turns Markdown link shortcuts into XHTML <a> tags.
    #
    # Does nothing when called while another doAnchors call is in
    # progress ($in_anchor), so links are never nested.
    #
    if ($this->in_anchor) {
      return $text;
    }
    $this->in_anchor = true;

    #
    # First pass, reference-style links: [link text] [id]
    #
    $reference_handler = array(
      &$this,
      '_doAnchors_reference_callback',
    );
    $text = preg_replace_callback('{
			(					# wrap whole match in $1
			  \\[
				(' . $this->nested_brackets_re . ')	# link text = $2
			  \\]

			  [ ]?				# one optional space
			  (?:\\n[ ]*)?		# one optional newline followed by spaces

			  \\[
				(.*?)		# id = $3
			  \\]
			)
			}xs', $reference_handler, $text);

    #
    # Second pass, inline-style links: [link text](url "optional title")
    #
    $inline_handler = array(
      &$this,
      '_DoAnchors_inline_callback',
    );
    $text = preg_replace_callback('{
			(				# wrap whole match in $1
			  \\[
				(' . $this->nested_brackets_re . ')	# link text = $2
			  \\]
			  \\(			# literal paren
				[ ]*
				(?:
					<(\\S*)>	# href = $3
				|
					(' . $this->nested_url_parenthesis_re . ')	# href = $4
				)
				[ ]*
				(			# $5
				  ([\'"])	# quote char = $6
				  (.*?)		# Title = $7
				  \\6		# matching quote
				  [ ]*	# ignore any spaces/tabs between closing quote and )
				)?			# title is optional
			  \\)
			)
			}xs', $inline_handler, $text);

    #
    # Reference-style shortcuts ([link text] with no id part) are not
    # processed: that third pass is disabled. It would have to come
    # last, in case there is also [link test][1] or [link test](/foo).
    #
    $this->in_anchor = false;
    return $text;
  }
  function _doAnchors_reference_callback($matches) {
    #
    # Builds the <a> tag for a reference-style link ($2 = link text,
    # $3 = id) and returns its hash token. When the id is unknown the
    # matched text ($1) is returned unchanged.
    #
    $label = $matches[2];
    $id = isset($matches[3]) ? $matches[3] : '';
    if ($id == "") {
      # Shortcut links like [this][] or [this] use the text as id.
      $id = $label;
    }

    # Ids are lower-cased and embedded newlines become spaces.
    $id = preg_replace('{[ ]?\\n}', ' ', strtolower($id));

    if (!isset($this->urls[$id])) {
      return $matches[1];
    }

    $href = $this->encodeAttribute($this->urls[$id]);
    $tag = "<a href=\"{$href}\"";
    if (isset($this->titles[$id])) {
      $tooltip = $this->encodeAttribute($this->titles[$id]);
      $tag .= " title=\"{$tooltip}\"";
    }
    $label = $this->runSpanGamut($label);
    $tag .= ">{$label}</a>";
    return $this->hashPart($tag);
  }
  function _doAnchors_inline_callback($matches) {
    #
    # Builds the <a> tag for an inline link and returns its hash token.
    # $2 = link text, $3 = URL written as <url>, $4 = bare URL,
    # $7 = optional title.
    #
    $label = $this->runSpanGamut($matches[2]);
    if ($matches[3] == '') {
      $href = $matches[4];
    }
    else {
      $href = $matches[3];
    }

    # By reference: stays unset (no notice) when there was no title.
    $tooltip =& $matches[7];

    $href = $this->encodeAttribute($href);
    $tag = "<a href=\"{$href}\"";
    if (isset($tooltip)) {
      $tooltip = $this->encodeAttribute($tooltip);
      $tag .= " title=\"{$tooltip}\"";
    }

    # NOTE(review): the link text goes through the span gamut a second
    # time here; kept as is to preserve the existing output.
    $label = $this->runSpanGamut($label);
    $tag .= ">{$label}</a>";
    return $this->hashPart($tag);
  }
  function doImages($text) {
    #
    # Turns Markdown image shortcuts into <img> tags.
    #

    #
    # First pass, reference-style labeled images: ![alt text][id]
    #
    $reference_handler = array(
      &$this,
      '_doImages_reference_callback',
    );
    $text = preg_replace_callback('{
			(				# wrap whole match in $1
			  !\\[
				(' . $this->nested_brackets_re . ')		# alt text = $2
			  \\]

			  [ ]?				# one optional space
			  (?:\\n[ ]*)?		# one optional newline followed by spaces

			  \\[
				(.*?)		# id = $3
			  \\]

			)
			}xs', $reference_handler, $text);

    #
    # Second pass, inline images: ![alt text](url "optional title")
    # Don't forget: encode * and _
    #
    $inline_handler = array(
      &$this,
      '_doImages_inline_callback',
    );
    return preg_replace_callback('{
			(				# wrap whole match in $1
			  !\\[
				(' . $this->nested_brackets_re . ')		# alt text = $2
			  \\]
			  \\s?			# One optional whitespace character
			  \\(			# literal paren
				[ ]*
				(?:
					<(\\S*)>	# src url = $3
				|
					(' . $this->nested_url_parenthesis_re . ')	# src url = $4
				)
				[ ]*
				(			# $5
				  ([\'"])	# quote char = $6
				  (.*?)		# title = $7
				  \\6		# matching quote
				  [ ]*
				)?			# title is optional
			  \\)
			)
			}xs', $inline_handler, $text);
  }
  function _doImages_reference_callback($matches) {
    #
    # Builds the <img> tag for a reference-style image ($2 = alt text,
    # $3 = id) and returns its hash token. When there is no such link
    # id the matched text ($1) is left intact.
    #
    $alt = $matches[2];
    $id = strtolower($matches[3]);
    if ($id == "") {
      # Shortcut images like ![this][] use the alt text as id.
      $id = strtolower($alt);
    }
    $alt = $this->encodeAttribute($alt);

    if (!isset($this->urls[$id])) {
      return $matches[1];
    }

    $src = $this->encodeAttribute($this->urls[$id]);
    $tag = "<img src=\"{$src}\" alt=\"{$alt}\"";
    if (isset($this->titles[$id])) {
      $tooltip = $this->encodeAttribute($this->titles[$id]);
      $tag .= " title=\"{$tooltip}\"";
    }
    $tag .= $this->empty_element_suffix;
    return $this->hashPart($tag);
  }
  function _doImages_inline_callback($matches) {
    #
    # Builds the <img> tag for an inline image and returns its hash
    # token. $2 = alt text, $3 = URL written as <url>, $4 = bare URL,
    # $7 = optional title.
    #
    if ($matches[3] == '') {
      $src = $matches[4];
    }
    else {
      $src = $matches[3];
    }

    # By reference: stays unset (no notice) when there was no title.
    $tooltip =& $matches[7];

    $alt = $this->encodeAttribute($matches[2]);
    $src = $this->encodeAttribute($src);
    $tag = "<img src=\"{$src}\" alt=\"{$alt}\"";
    if (isset($tooltip)) {
      $tooltip = $this->encodeAttribute($tooltip);
      $tag .= " title=\"{$tooltip}\"";
    }
    $tag .= $this->empty_element_suffix;
    return $this->hashPart($tag);
  }
  function doHeaders($text) {
    #
    # Converts both header syntaxes; the setext form is handled first.
    #
    # Setext-style headers:
    #	  Header 1
    #	  ========
    #
    #	  Header 2
    #	  --------
    #
    $setext_handler = array(
      &$this,
      '_doHeaders_callback_setext',
    );
    $text = preg_replace_callback('{ ^(.+?)[ ]*\\n(=+|-+)[ ]*\\n+ }mx', $setext_handler, $text);

    # atx-style headers:
    #	# Header 1
    #	## Header 2
    #	## Header 2 with closing hashes ##
    #	...
    #	###### Header 6
    #
    $atx_handler = array(
      &$this,
      '_doHeaders_callback_atx',
    );
    return preg_replace_callback('{
				^(\\#{1,6})	# $1 = string of #\'s
				[ ]*
				(.+?)		# $2 = Header text
				[ ]*
				\\#*			# optional closing #\'s (not counted)
				\\n+
			}xm', $atx_handler, $text);
  }
  function _doHeaders_callback_setext($matches) {
    #
    # Turns a setext header ($1 = text, $2 = underline) into a hashed
    # <h1> ("=" underline) or <h2> ("-" underline) block.
    #
    $underline = $matches[2];

    # Terrible hack to check we haven't found an empty list item:
    # a "-" line followed by a single "-" is left as it was.
    if ($underline == '-' && preg_match('{^-(?: |$)}', $matches[1])) {
      return $matches[0];
    }

    if ($underline[0] == '=') {
      $level = 1;
    }
    else {
      $level = 2;
    }
    $heading = $this->runSpanGamut($matches[1]);
    $token = $this->hashBlock("<h{$level}>" . $heading . "</h{$level}>");
    return "\n" . $token . "\n\n";
  }
  function _doHeaders_callback_atx($matches) {
    #
    # Turns an atx header into a hashed <hN> block, N being the number
    # of leading # characters ($1); $2 is the header text.
    #
    $level = strlen($matches[1]);
    $heading = $this->runSpanGamut($matches[2]);
    $token = $this->hashBlock("<h{$level}>" . $heading . "</h{$level}>");
    return "\n" . $token . "\n\n";
  }
  function doLists($text) {
    #
    # Forms HTML ordered (numbered) and unordered (bulleted) lists.
    # Bulleted lists are converted first, then numbered ones; each
    # whole list found is handed to _doLists_callback.
    #
    $less_than_tab = $this->tab_width - 1;
    $handler = array(
      &$this,
      '_doLists_callback',
    );

    # Patterns matching a bullet, then a number marker.
    $markers_relist = array(
      '[*+-]',
      '\\d+[.]',
    );
    foreach ($markers_relist as $marker_re) {

      # Re-usable pattern to match any entire ul or ol list:
      $whole_list_re = '
				(								# $1 = whole list
				  (								# $2
					[ ]{0,' . $less_than_tab . '}
					(' . $marker_re . ')			# $3 = first list item marker
					[ ]+
				  )
				  (?s:.+?)
				  (								# $4
					  \\z
					|
					  \\n{2,}
					  (?=\\S)
					  (?!						# Negative lookahead for another list item marker
						[ ]*
						' . $marker_re . '[ ]+
					  )
				  )
				)
			';

      # Nested lists use a different prefix than top-level lists;
      # the comment in processListItems explains why.
      if ($this->list_level) {
        $list_re = '{
						^
						' . $whole_list_re . '
					}mx';
      }
      else {
        $list_re = '{
						(?:(?<=\\n)\\n|\\A\\n?) # Must eat the newline
						' . $whole_list_re . '
					}mx';
      }
      $text = preg_replace_callback($list_re, $handler, $text);
    }
    return $text;
  }
  function _doLists_callback($matches) {
    #
    # Converts one whole list ($1; $3 = its first item marker) into a
    # hashed <ul> or <ol> block.
    #
    $marker_ul_re = '[*+-]';
    $marker_ol_re = '\\d+[.]';

    # The first marker decides the list type and which marker pattern
    # is used to split the items.
    if (preg_match("/{$marker_ul_re}/", $matches[3])) {
      $list_type = "ul";
      $item_marker_re = $marker_ul_re;
    }
    else {
      $list_type = "ol";
      $item_marker_re = $marker_ol_re;
    }

    $items = $this->processListItems($matches[1] . "\n", $item_marker_re);
    $token = $this->hashBlock("<{$list_type}>\n" . $items . "</{$list_type}>");
    return "\n" . $token . "\n\n";
  }
  # List nesting depth: processListItems() increments it on entry and
  # decrements it on exit; doLists() picks its list prefix pattern
  # depending on whether it is zero.
  var $list_level = 0;
  function processListItems($list_str, $marker_any_re) {
    #
    #	Splits the contents of a single ordered or unordered list
    #	($list_str) into individual items, using $marker_any_re to
    #	recognize item markers, and returns the converted items.
    #
    # $this->list_level keeps track of when we're inside a list: it is
    # incremented when a list is entered and decremented when it is
    # left. Zero means we're not in a list anymore.
    #
    # This matters because outside of a list, something like
    #
    #		I recommend upgrading to version
    #		8. Oops, now this line is treated
    #		as a sub-list.
    #
    # has to stay a single paragraph, even though its second line starts
    # with a digit-period-space sequence, whereas inside a list (or
    # sub-list) that line starts a sub-list. What a kludge, huh? This is
    # an aspect of Markdown's syntax that's hard to parse perfectly
    # without resorting to mind-reading. Perhaps the solution is to
    # change the syntax rules such that sub-lists must start with a
    # starting cardinal number; e.g. "1." or "a.".
    #
    $this->list_level++;

    # Trailing blank lines are reduced to a single newline.
    $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);

    $handler = array(
      &$this,
      '_processListItems_callback',
    );
    $items = preg_replace_callback('{
			(\\n)?							# leading line = $1
			(^[ ]*)							# leading whitespace = $2
			(' . $marker_any_re . '				# list marker and space = $3
				(?:[ ]+|(?=\\n))	# space only required if item is not empty
			)
			((?s:.*?))						# list item text   = $4
			(?:(\\n+(?=\\n))|\\n)				# tailing blank line = $5
			(?= \\n* (\\z | \\2 (' . $marker_any_re . ') (?:[ ]+|(?=\\n))))
			}xm', $handler, $list_str);

    $this->list_level--;
    return $items;
  }
  function _processListItems_callback($matches) {
    #
    # Converts one list item into an <li> element.
    # $1 = leading blank line, $2 = leading whitespace, $3 = marker and
    # its spaces, $4 = item text, $5 = tailing blank line.
    #
    $body = $matches[4];

    # An item is handled as block content when a blank line precedes
    # it, follows it, or appears inside it.
    $block_item = !empty($matches[1]) || !empty($matches[5]) || preg_match('/\\n{2,}/', $body);

    if ($block_item) {
      # Replace marker with the appropriate whitespace indentation
      $body = $matches[2] . str_repeat(' ', strlen($matches[3])) . $body;
      $body = $this->runBlockGamut($this->outdent($body) . "\n");
    }
    else {
      # Recursion for sub-lists:
      $body = $this->doLists($this->outdent($body));
      $body = preg_replace('/\\n+$/', '', $body);
      $body = $this->runSpanGamut($body);
    }
    return "<li>" . $body . "</li>\n";
  }
  function doCodeBlocks($text) {
    #
    #	Finds indented code blocks and passes them to
    #	_doCodeBlocks_callback, which produces the `<pre><code>` markup.
    #
    $handler = array(
      &$this,
      '_doCodeBlocks_callback',
    );
    $code_block_re = '{
				(?:\\n\\n|\\A\\n?)
				(	            # $1 = the code block -- one or more lines, starting with a space/tab
				  (?>
					[ ]{' . $this->tab_width . '}  # Lines must start with a tab or a tab-width of spaces
					.*\\n+
				  )+
				)
				((?=^[ ]{0,' . $this->tab_width . '}\\S)|\\Z)	# Lookahead for non-space at line-start, or end of doc
			}xm';
    return preg_replace_callback($code_block_re, $handler, $text);
  }
  function _doCodeBlocks_callback($matches) {
    #
    # Turns one indented code block ($1) into a hashed
    # <pre><code> block surrounded by blank lines.
    #
    $code = htmlspecialchars($this->outdent($matches[1]), ENT_NOQUOTES);

    # Leading and trailing newlines are removed.
    $code = preg_replace('/\\A\\n+|\\n+\\z/', '', $code);

    $token = $this->hashBlock("<pre><code>{$code}\n</code></pre>");
    return "\n\n" . $token . "\n\n";
  }
  function makeCodeSpan($code) {
    #
    # Creates the code span markup for $code: the text is trimmed,
    # HTML-escaped (quotes left alone), wrapped in <code> and hashed.
    # Called from handleSpanToken.
    #
    $escaped = htmlspecialchars(trim($code), ENT_NOQUOTES);
    $span = "<code>{$escaped}</code>";
    return $this->hashPart($span);
  }
  # Emphasis token patterns, keyed by the marker currently open:
  # '' holds the pattern used when no such emphasis is open, the other
  # keys hold the pattern used while that marker is open.
  # prepareItalicsAndBold() combines the three tables into
  # $em_strong_prepared_relist, keyed by "{$em}{$strong}".
  var $em_relist = array(
    '' => '(?:(?<!\\*)\\*(?!\\*)|(?<!_)_(?!_))(?=\\S)(?![.,:;]\\s)',
    '*' => '(?<=\\S)(?<!\\*)\\*(?!\\*)',
    '_' => '(?<=\\S)(?<!_)_(?!_)',
  );
  # Same layout for two-character (strong) markers.
  var $strong_relist = array(
    '' => '(?:(?<!\\*)\\*\\*(?!\\*)|(?<!_)__(?!_))(?=\\S)(?![.,:;]\\s)',
    '**' => '(?<=\\S)(?<!\\*)\\*\\*(?!\\*)',
    '__' => '(?<=\\S)(?<!_)__(?!_)',
  );
  # Same layout for three-character (em + strong) markers.
  var $em_strong_relist = array(
    '' => '(?:(?<!\\*)\\*\\*\\*(?!\\*)|(?<!_)___(?!_))(?=\\S)(?![.,:;]\\s)',
    '***' => '(?<=\\S)(?<!\\*)\\*\\*\\*(?!\\*)',
    '___' => '(?<=\\S)(?<!_)___(?!_)',
  );
  # Filled by prepareItalicsAndBold(); read by doItalicsAndBold().
  var $em_strong_prepared_relist;
  function prepareItalicsAndBold() {
    #
    # Prepares the regular expressions used to search for emphasis
    # tokens in any context: one expression per combination of open em
    # and open strong markers, stored in $em_strong_prepared_relist
    # under the key "{$em}{$strong}".
    #
    foreach ($this->em_relist as $em => $em_re) {
      foreach ($this->strong_relist as $strong => $strong_re) {
        $context = "{$em}{$strong}";

        # Allowed token expressions; the three-character one, when
        # there is one for this context, is tried first.
        $alternatives = array(
          $em_re,
          $strong_re,
        );
        if (isset($this->em_strong_relist[$context])) {
          array_unshift($alternatives, $this->em_strong_relist[$context]);
        }

        # Master expression: a single capturing alternation.
        $this->em_strong_prepared_relist[$context] = '{(' . implode('|', $alternatives) . ')}';
      }
    }
  }
  function doItalicsAndBold($text) {
    #
    # Converts emphasis markers in $text into <em> / <strong> elements
    # and returns the result.
    #
    # The text is consumed one emphasis token at a time. $token_stack
    # holds the markers currently open and $text_stack the text
    # collected since each of them (index 0 is the innermost one).
    # $em and $strong hold the open em / strong marker ('' when none)
    # and select the token pattern to search with. Completed spans are
    # replaced by hash tokens (hashPart).
    #
    $token_stack = array(
      '',
    );
    $text_stack = array(
      '',
    );
    $em = '';
    $strong = '';
    # True right after an opening three-character marker.
    $tree_char_em = false;
    while (1) {

      #
      # Get prepared regular expression for searching emphasis tokens
      # in current context.
      #
      $token_re = $this->em_strong_prepared_relist["{$em}{$strong}"];

      #
      # Each loop iteration searches for the next emphasis token and
      # handles it below: $parts[0] is the text before the token,
      # $parts[1] the token and $parts[2] the remaining text.
      #
      $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
      $text_stack[0] .= $parts[0];
      $token =& $parts[1];
      $text =& $parts[2];
      if (empty($token)) {

        # Reached end of text span: empty stack without emitting
        # any more emphasis. Unclosed markers are put back as text.
        while ($token_stack[0]) {
          $text_stack[1] .= array_shift($token_stack);
          $text_stack[0] .= array_shift($text_stack);
        }
        break;
      }
      $token_len = strlen($token);
      if ($tree_char_em) {

        # Reached closing marker while inside a three-char emphasis.
        if ($token_len == 3) {

          # Three-char closing marker, close em and strong.
          array_shift($token_stack);
          $span = array_shift($text_stack);
          $span = $this
            ->runSpanGamut($span);
          $span = "<strong><em>{$span}</em></strong>";
          $text_stack[0] .= $this
            ->hashPart($span);
          $em = '';
          $strong = '';
        }
        else {

          # Other closing marker: close one em or strong and
          # change current token state to match the other
          $token_stack[0] = str_repeat($token[0], 3 - $token_len);
          $tag = $token_len == 2 ? "strong" : "em";
          $span = $text_stack[0];
          $span = $this
            ->runSpanGamut($span);
          $span = "<{$tag}>{$span}</{$tag}>";
          $text_stack[0] = $this
            ->hashPart($span);
          ${$tag} = '';

          # $$tag stands for $em or $strong
        }
        $tree_char_em = false;
      }
      else {
        if ($token_len == 3) {
          if ($em) {

            # Reached closing marker for both em and strong.
            # Close the two innermost spans, innermost first.
            for ($i = 0; $i < 2; ++$i) {
              $shifted_token = array_shift($token_stack);
              $tag = strlen($shifted_token) == 2 ? "strong" : "em";
              $span = array_shift($text_stack);
              $span = $this
                ->runSpanGamut($span);
              $span = "<{$tag}>{$span}</{$tag}>";
              $text_stack[0] .= $this
                ->hashPart($span);
              ${$tag} = '';

              # $$tag stands for $em or $strong
            }
          }
          else {

            # Reached opening three-char emphasis marker. Push on token
            # stack; will be handled by the special condition above.
            $em = $token[0];
            $strong = "{$em}{$em}";
            array_unshift($token_stack, $token);
            array_unshift($text_stack, '');
            $tree_char_em = true;
          }
        }
        else {
          if ($token_len == 2) {
            if ($strong) {

              # Unwind any dangling emphasis marker:
              if (strlen($token_stack[0]) == 1) {
                $text_stack[1] .= array_shift($token_stack);
                $text_stack[0] .= array_shift($text_stack);
              }

              # Closing strong marker:
              array_shift($token_stack);
              $span = array_shift($text_stack);
              $span = $this
                ->runSpanGamut($span);
              $span = "<strong>{$span}</strong>";
              $text_stack[0] .= $this
                ->hashPart($span);
              $strong = '';
            }
            else {
              # Opening strong marker.
              array_unshift($token_stack, $token);
              array_unshift($text_stack, '');
              $strong = $token;
            }
          }
          else {

            # Here $token_len == 1
            if ($em) {
              if (strlen($token_stack[0]) == 1) {

                # Closing emphasis marker:
                array_shift($token_stack);
                $span = array_shift($text_stack);
                $span = $this
                  ->runSpanGamut($span);
                $span = "<em>{$span}</em>";
                $text_stack[0] .= $this
                  ->hashPart($span);
                $em = '';
              }
              else {
                # The innermost open marker is not an em marker:
                # keep this token as plain text.
                $text_stack[0] .= $token;
              }
            }
            else {
              # Opening emphasis marker.
              array_unshift($token_stack, $token);
              array_unshift($text_stack, '');
              $em = $token;
            }
          }
        }
      }
    }
    return $text_stack[0];
  }
  function doBlockQuotes($text) {

    #
    # Locate every run of consecutive ">"-quoted lines (together with the
    # lazy continuation lines and blank lines that follow them) and hand
    # each run to _doBlockQuotes_callback for conversion.
    #
    #	Params:
    #		$text - string in which blockquotes are looked for
    #
    #	Returns the text with every matched run replaced by the callback result.
    #
    $blockquote_re = '/
			  (								# Wrap whole match in $1
				(?>
				  ^[ ]*>[ ]?			# ">" at the start of a line
					.+\\n					# rest of the first line
				  (.+\\n)*					# subsequent consecutive lines
				  \\n*						# blanks
				)+
			  )
			/xm';
    $handler = array(
      &$this,
      '_doBlockQuotes_callback',
    );
    return preg_replace_callback($blockquote_re, $handler, $text);
  }
  function _doBlockQuotes_callback($matches) {

    #
    # Turn one matched run of quoted lines into a hashed <blockquote> block.
    #
    #	Params:
    #		$matches - regex match array; $matches[1] holds the quoted lines
    #

    # Remove one level of ">" quoting and empty out whitespace-only lines.
    $inner = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $matches[1]);

    # Run the block gamut on the unquoted content (the recursion step).
    $inner = $this->runBlockGamut($inner);

    # Indent every line of the result by two spaces.
    $inner = preg_replace('/^/m', "  ", $inner);

    # That indentation would alter <pre> content, so it is stripped again
    # inside every <pre>...</pre> section. Method names are case-insensitive
    # in PHP, so the callback name below resolves to _doBlockQuotes_callback2.
    $pre_fixer = array(
      &$this,
      '_DoBlockQuotes_callback2',
    );
    $inner = preg_replace_callback('{(\\s*<pre>.+?</pre>)}sx', $pre_fixer, $inner);

    $wrapped = "<blockquote>\n{$inner}\n</blockquote>";
    return "\n" . $this->hashBlock($wrapped) . "\n\n";
  }
  function _doBlockQuotes_callback2($matches) {

    #
    # Strip the two-space indent from the start of every line of the
    # matched <pre> section ($matches[1]) and return the result.
    #
    return preg_replace('/^  /m', '', $matches[1]);
  }
  function formParagraphs($text) {

    #
    # Split the text on blank lines and wrap every chunk that is not a
    # hashed block in <p>...</p>.
    #
    #	Params:
    #		$text - string to process with html <p> tags
    #
    #	Returns the chunks joined back together with one blank line between them.
    #

    # Drop leading and trailing newlines, then cut on runs of 2+ newlines.
    $trimmed = preg_replace('/\\A\\n+|\\n+\\z/', '', $text);
    $chunks = preg_split('/\\n{2,}/', $trimmed, -1, PREG_SPLIT_NO_EMPTY);

    foreach ($chunks as $index => $chunk) {

      # A chunk consisting solely of a block hash token is replaced by the
      # content stored for it in html_hashes, without any <p> wrapping.
      if (preg_match('/^B\\x1A[0-9]+B$/', $chunk)) {
        $chunks[$index] = $this->html_hashes[$chunk];
        continue;
      }

      # Anything else is a paragraph: run the span gamut, swap the leading
      # spaces for the opening tag, append the closing tag and unhash.
      $paragraph = $this->runSpanGamut($chunk);
      $paragraph = preg_replace('/^([ ]*)/', "<p>", $paragraph);
      $paragraph .= "</p>";
      $chunks[$index] = $this->unhash($paragraph);
    }
    return implode("\n\n", $chunks);
  }
  function encodeAttribute($text) {

    #
    # Make $text safe for use inside a double-quoted HTML attribute:
    # ampersands and "<" are encoded by encodeAmpsAndAngles, then every
    # double quote becomes &quot;. Single quotes are left untouched, so the
    # result is *not* suitable for single-quoted attributes.
    #
    $encoded = $this->encodeAmpsAndAngles($text);
    return str_replace('"', '&quot;', $encoded);
  }
  function encodeAmpsAndAngles($text) {

    #
    # Encode the ampersands and "<" characters of $text.
    #
    # In no-entities mode every "&" becomes "&amp;". Otherwise only the
    # ampersands that do not start something shaped like a character entity
    # are encoded (approach taken from Nat Irons's Amputator MT plugin:
    # <http://bumppo.net/projects/amputator/>). Every "<" is then encoded.
    #
    $amp_encoded = $this->no_entities
      ? str_replace('&', '&amp;', $text)
      : preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\\w+);)/', '&amp;', $text);

    return str_replace('<', '&lt;', $amp_encoded);
  }
  function doAutoLinks($text) {

    #
    # Convert <scheme:...> URLs and <address@domain> e-mail addresses
    # written between angle brackets into links. URLs are handled first,
    # e-mail addresses second; each match is delegated to its callback.
    #

    # URLs: <http://...>, <https://...>, <ftp://...>, <dict:...>
    $url_re = '{<((https?|ftp|dict):[^\'">\\s]+)>}i';

    # Email addresses: <address@domain.foo>
    $email_re = '{
			<
			(?:mailto:)?
			(
				[-.\\w\\x80-\\xFF]+
				\\@
				[-a-z0-9\\x80-\\xFF]+(\\.[-a-z0-9\\x80-\\xFF]+)*\\.[a-z]+
			)
			>
			}xi';

    $linked = preg_replace_callback($url_re, array(
      &$this,
      '_doAutoLinks_url_callback',
    ), $text);
    return preg_replace_callback($email_re, array(
      &$this,
      '_doAutoLinks_email_callback',
    ), $linked);
  }
  function _doAutoLinks_url_callback($matches) {

    #
    # Build an anchor whose href and visible text are both the
    # attribute-encoded URL captured in $matches[1], and return it hashed.
    #
    $href = $this->encodeAttribute($matches[1]);
    $anchor = '<a href="' . $href . '">' . $href . '</a>';
    return $this->hashPart($anchor);
  }
  function _doAutoLinks_email_callback($matches) {

    #
    # Turn the address captured in $matches[1] into an obfuscated mailto
    # link (see encodeEmailAddress) and return it hashed.
    #
    $anchor = $this->encodeEmailAddress($matches[1]);
    return $this->hashPart($anchor);
  }
  function encodeEmailAddress($addr) {

    #
    #	Input: an email address, e.g. "foo@example.com"
    #
    #	Output: a mailto link for that address in which most characters of
    #		both the href and the link text are written as decimal or
    #		hexadecimal character entities, in the hope of foiling address
    #		harvesting spam bots.
    #
    #	The choice between raw, hex and decimal is pseudo-random but
    #	deterministic: it is seeded from crc32() of the "mailto:" string, so
    #	the same address always yields the same markup.
    #
    #	Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
    #	With some optimizations by Milian Wolff.
    #
    $addr = "mailto:" . $addr;
    $pieces = preg_split('/(?<!^)(?!$)/', $addr);
    $seed = (int) abs(crc32($addr) / strlen($addr));

    $total = count($pieces);
    for ($pos = 0; $pos < $total; ++$pos) {
      $code = ord($pieces[$pos]);

      # Non-ASCII bytes are passed through untouched.
      if ($code >= 128) {
        continue;
      }

      # Pseudo-random value in 0..99 derived from the seed and position.
      $roll = $seed * (1 + $pos) % 100;

      # Roughly 10% of the characters stay raw -- but never '@', which is
      # always encoded.
      if ($roll > 90 && $pieces[$pos] != '@') {
        continue;
      }

      # The remainder is split about evenly between hex and decimal.
      $pieces[$pos] = $roll < 45
        ? '&#x' . dechex($code) . ';'
        : '&#' . $code . ';';
    }

    $href = implode('', $pieces);

    # The first seven pieces are "mailto:"; the link text leaves them out.
    $label = implode('', array_slice($pieces, 7));
    return "<a href=\"{$href}\">{$label}</a>";
  }
  function parseSpan($str) {

    #
    # Take the string $str and parse it into tokens, hashing embedded HTML,
    # escaped characters and handling code spans.
    #
    # The string is consumed left to right: on every pass the text before
    # the next token is copied to the output as is, and the token itself is
    # replaced by whatever handleSpanToken returns for it.
    #

    # Alternatives for inline HTML; left out entirely in no-markup mode.
    $markup_re = '';
    if (!$this->no_markup) {
      $markup_re = '
				|
					<!--    .*?     -->		# comment
				|
					<\\?.*?\\?> | <%.*?%>		# processing instruction
				|
					<[/!$]?[-a-zA-Z0-9:]+	# regular tags
					(?>
						\\s
						(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
					)?
					>
			';
    }

    $span_re = '{
				(
					\\\\' . $this->escape_chars_re . '
				|
					(?<![`\\\\])
					`+						# code span marker
			' . $markup_re . '
				)
				}xs';

    $output = '';
    do {

      # Cut at the next tag, opening code span marker or escaped
      # character: [0] = text before it, [1] = the token, [2] = the rest.
      $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);

      # Text preceding the token goes to the output unchanged.
      if ($parts[0] != "") {
        $output .= $parts[0];
      }

      # No token means the end of the string has been reached.
      $found = isset($parts[1]);
      if ($found) {

        # handleSpanToken receives the remainder by reference and may
        # consume part of it (e.g. the body of a code span).
        $output .= $this->handleSpanToken($parts[1], $parts[2]);
        $str = $parts[2];
      }
    } while ($found);

    return $output;
  }
  function handleSpanToken($token, &$str) {

    #
    # Handle $token provided by parseSpan by determining its nature and
    # returning the corresponding value that should replace it.
    #
    #	Params:
    #		$token - the token text; its first character selects the handling
    #		$str   - remaining text after the token, passed by reference; for
    #		         a code span the consumed part is removed from it
    #
    $lead = $token[0];

    # Backslash escape: emit the escaped character as a hashed entity.
    if ($lead == "\\") {
      return $this->hashPart("&#" . ord($token[1]) . ";");
    }

    # Backtick run: look for the matching end marker in the remaining text.
    if ($lead == "`") {
      $closing_re = '/^(.*?[^`])' . preg_quote($token) . '(?!`)(.*)$/sm';
      if (!preg_match($closing_re, $str, $matches)) {

        # No ending marker found: the backticks are returned as text.
        return $token;
      }
      $str = $matches[2];
      return $this->hashPart($this->makeCodeSpan($matches[1]));
    }

    # Anything else is hashed as is.
    return $this->hashPart($token);
  }
  function outdent($text) {

    #
    # Remove one level of indentation from the start of every line: either
    # a single tab or between one and tab_width spaces.
    #
    $indent_re = '/^(\\t|[ ]{1,' . $this->tab_width . '})/m';
    return preg_replace($indent_re, '', $text);
  }

  # String length function for detab. `_initDetab` will create a function to

  # handle UTF-8 if the default function does not exist.
  var $utf8_strlen = 'mb_strlen';
  function detab($text) {

    #
    # Replace tabs with the appropriate amount of space.
    #
    # Only lines that actually contain a tab are touched; each of them is
    # rebuilt by _detab_callback, which pads the text between tabs with
    # spaces.
    #
    $expander = array(
      &$this,
      '_detab_callback',
    );
    return preg_replace_callback('/^.*\\t.*$/m', $expander, $text);
  }
  function _detab_callback($matches) {

    #
    # Expand the tabs of one line ($matches[0]): the line is cut at every
    # tab and reassembled, inserting before each following segment enough
    # spaces to reach the next multiple of tab_width.
    #

    # Length function named by the utf8_strlen property.
    $count_chars = $this->utf8_strlen;
    $width = $this->tab_width;

    $segments = explode("\t", $matches[0]);

    # The first segment starts the line; the others are appended in turn.
    $expanded = array_shift($segments);
    foreach ($segments as $segment) {
      $padding = $width - $count_chars($expanded, 'UTF-8') % $width;
      $expanded .= str_repeat(" ", $padding) . $segment;
    }
    return $expanded;
  }
  function _initDetab() {

    #
    # Check for the availability of the function in the `utf8_strlen` property
    # (initially `mb_strlen`). If the function is not available, point the
    # property at a fallback that loosely counts the number of UTF-8
    # characters with a regular expression.
    #
    if (function_exists($this->utf8_strlen)) {
      return;
    }

    # create_function() was deprecated in PHP 7.2 and removed in PHP 8.0, so
    # the fallback is declared as an ordinary named function instead. The
    # declaration is guarded so that constructing several parsers does not
    # attempt to redeclare it.
    if (!function_exists('_markdown_utf8_strlen_fallback')) {

      # Counts one character per single byte in 0x00-0xBF and one per lead
      # byte 0xC0-0xFF together with its continuation bytes. Extra arguments
      # supplied by callers (such as an encoding name) are ignored.
      function _markdown_utf8_strlen_fallback($text) {
        return preg_match_all('/[\\x00-\\xBF]|[\\xC0-\\xFF][\\x80-\\xBF]*/', $text, $m);
      }
    }
    $this->utf8_strlen = '_markdown_utf8_strlen_fallback';
  }
  function unhash($text) {

    #
    # Swap back in everything that was replaced by a hash token. A token is
    # any character, the \x1A byte, a run of digits and the same character
    # again; each one found is resolved by _unhash_callback.
    #
    $restorer = array(
      &$this,
      '_unhash_callback',
    );
    return preg_replace_callback('/(.)\\x1A[0-9]+\\1/', $restorer, $text);
  }
  function _unhash_callback($matches) {

    # The complete matched token is the key into the html_hashes table.
    $token = $matches[0];
    return $this->html_hashes[$token];
  }

}
Members

Name	Type	Overrides
Markdown_Parser::$block_gamut	property
Markdown_Parser::$document_gamut	property
Markdown_Parser::$empty_element_suffix	property
Markdown_Parser::$em_relist	property	1
Markdown_Parser::$em_strong_prepared_relist	property
Markdown_Parser::$em_strong_relist	property	1
Markdown_Parser::$escape_chars	property
Markdown_Parser::$escape_chars_re	property
Markdown_Parser::$html_hashes	property
Markdown_Parser::$in_anchor	property
Markdown_Parser::$list_level	property
Markdown_Parser::$nested_brackets_depth	property
Markdown_Parser::$nested_brackets_re	property
Markdown_Parser::$nested_url_parenthesis_depth	property
Markdown_Parser::$nested_url_parenthesis_re	property
Markdown_Parser::$no_entities	property
Markdown_Parser::$no_markup	property
Markdown_Parser::$predef_titles	property
Markdown_Parser::$predef_urls	property
Markdown_Parser::$span_gamut	property
Markdown_Parser::$strong_relist	property	1
Markdown_Parser::$tab_width	property
Markdown_Parser::$titles	property
Markdown_Parser::$urls	property
Markdown_Parser::$utf8_strlen	property
Markdown_Parser::detab	function
Markdown_Parser::doAnchors	function
Markdown_Parser::doAutoLinks	function
Markdown_Parser::doBlockQuotes	function
Markdown_Parser::doCodeBlocks	function
Markdown_Parser::doHardBreaks	function
Markdown_Parser::doHeaders	function	1
Markdown_Parser::doHorizontalRules	function
Markdown_Parser::doImages	function
Markdown_Parser::doItalicsAndBold	function
Markdown_Parser::doLists	function
Markdown_Parser::encodeAmpsAndAngles	function
Markdown_Parser::encodeAttribute	function
Markdown_Parser::encodeEmailAddress	function
Markdown_Parser::formParagraphs	function	1
Markdown_Parser::handleSpanToken	function
Markdown_Parser::hashBlock	function
Markdown_Parser::hashHTMLBlocks	function	1
Markdown_Parser::hashPart	function
Markdown_Parser::makeCodeSpan	function
Markdown_Parser::Markdown_Parser	function
Markdown_Parser::outdent	function
Markdown_Parser::parseSpan	function
Markdown_Parser::prepareItalicsAndBold	function
Markdown_Parser::processListItems	function
Markdown_Parser::runBasicBlockGamut	function
Markdown_Parser::runBlockGamut	function
Markdown_Parser::runSpanGamut	function
Markdown_Parser::setup	function	1
Markdown_Parser::stripLinkDefinitions	function
Markdown_Parser::teardown	function	1
Markdown_Parser::transform	function
Markdown_Parser::unhash	function
Markdown_Parser::_detab_callback	function
Markdown_Parser::_doAnchors_inline_callback	function
Markdown_Parser::_doAnchors_reference_callback	function
Markdown_Parser::_doAutoLinks_email_callback	function
Markdown_Parser::_doAutoLinks_url_callback	function
Markdown_Parser::_doBlockQuotes_callback	function
Markdown_Parser::_doBlockQuotes_callback2	function
Markdown_Parser::_doCodeBlocks_callback	function
Markdown_Parser::_doHardBreaks_callback	function
Markdown_Parser::_doHeaders_callback_atx	function	1
Markdown_Parser::_doHeaders_callback_setext	function	1
Markdown_Parser::_doImages_inline_callback	function
Markdown_Parser::_doImages_reference_callback	function
Markdown_Parser::_doLists_callback	function
Markdown_Parser::_hashHTMLBlocks_callback	function
Markdown_Parser::_initDetab	function
Markdown_Parser::_processListItems_callback	function
Markdown_Parser::_stripLinkDefinitions_callback	function
Markdown_Parser::_unhash_callback	function
You are here

class Markdown_Parser in Markdown 5

Hierarchy

File

Members

API Navigation