<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><meta http-equiv="content-type" content="text/html; charset=utf-8" />
<title>[55203] trunk: Introduce HTML API with HTML Tag Processor</title>
</head>
<body>
<style type="text/css"><!--
/* Commit metadata list (revision, author, date) rendered inside #msg. */
#msg dl.meta { border: 1px #006 solid; background: #369; padding: 6px; color: #fff; }
#msg dl.meta dt { float: left; width: 6em; font-weight: bold; }
#msg dt:after { content:':';}
#msg dl, #msg dt, #msg ul, #msg li, #header, #footer, #logmsg { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt; }
#msg dl a { font-weight: bold}
#msg dl a:link { color:#fc3; }
#msg dl a:active { color:#ff0; }
#msg dl a:visited { color:#cc6; }
/* Section headings such as "Modified Paths", "Added Paths" and "Diff". */
h3 { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt; font-weight: bold; }
/* Log message: plain <pre> form under #msg, and rich-text form under #logmsg. */
/* NOTE(review): no element with id "logmsg" appears in the visible markup; these rules look like template defaults. */
#msg pre { white-space: pre-line; overflow: auto; background: #ffc; border: 1px #fa0 solid; padding: 6px; }
#logmsg { background: #ffc; border: 1px #fa0 solid; padding: 1em 1em 0 1em; }
#logmsg p, #logmsg pre, #logmsg blockquote { margin: 0 0 1em 0; }
#logmsg p, #logmsg li, #logmsg dt, #logmsg dd { line-height: 14pt; }
#logmsg h1, #logmsg h2, #logmsg h3, #logmsg h4, #logmsg h5, #logmsg h6 { margin: .5em 0; }
#logmsg h1:first-child, #logmsg h2:first-child, #logmsg h3:first-child, #logmsg h4:first-child, #logmsg h5:first-child, #logmsg h6:first-child { margin-top: 0; }
#logmsg ul, #logmsg ol { padding: 0; list-style-position: inside; margin: 0 0 0 1em; }
#logmsg ul { text-indent: -1em; padding-left: 1em; }#logmsg ol { text-indent: -1.5em; padding-left: 1.5em; }
#logmsg > ul, #logmsg > ol { margin: 0 0 1em 0; }
#logmsg pre { background: #eee; padding: 1em; }
#logmsg blockquote { border: 1px solid #fa0; border-left-width: 10px; padding: 1em 1em 0 1em; background: white;}
#logmsg dl { margin: 0; }
#logmsg dt { font-weight: bold; }
#logmsg dd { margin: 0; padding: 0 0 0.5em 0; }
#logmsg dd:before { content:'\00bb';}
#logmsg table { border-spacing: 0px; border-collapse: collapse; border-top: 4px solid #fa0; border-bottom: 1px solid #fa0; background: #fff; }
#logmsg table th { text-align: left; font-weight: normal; padding: 0.2em 0.5em; border-top: 1px dotted #fa0; }
#logmsg table td { text-align: right; border-top: 1px dotted #fa0; padding: 0.2em 0.5em; }
#logmsg table thead th { text-align: center; border-bottom: 1px solid #fa0; }
#logmsg table th.Corner { text-align: left; }
#logmsg hr { border: none 0; border-top: 2px dashed #fa0; height: 1px; }
/* Page header and footer bars. NOTE(review): neither id is used in the visible markup. */
#header, #footer { color: #fff; background: #636; border: 1px #300 solid; padding: 6px; }
/* Diff rendering: one bordered box per changed file; <ins> lines are tinted green and <del> lines red. */
#patch { width: 100%; }
#patch h4 {font-family: verdana,arial,helvetica,sans-serif;font-size:10pt;padding:8px;background:#369;color:#fff;margin:0;}
#patch .propset h4, #patch .binary h4 {margin:0;}
#patch pre {padding:0;line-height:1.2em;margin:0;}
#patch .diff {width:100%;background:#eee;padding: 0 0 10px 0;overflow:auto;}
#patch .propset .diff, #patch .binary .diff {padding:10px 0;}
#patch span {display:block;padding:0 10px;}
#patch .modfile, #patch .addfile, #patch .delfile, #patch .propset, #patch .binary, #patch .copfile {border:1px solid #ccc;margin:10px 0;}
#patch ins {background:#dfd;text-decoration:none;display:block;padding:0 10px;}
#patch del {background:#fdd;text-decoration:none;display:block;padding:0 10px;}
/* NOTE(review): ".info" below is not scoped under #patch, so it matches any element with class "info"; confirm this is intended. */
#patch .lines, .info {color:#888;background:#fff;}
--></style>
<div id="msg">
<dl class="meta" style="font-size: 105%">
<dt style="float: left; width: 6em; font-weight: bold">Revision</dt> <dd><a style="font-weight: bold" href="https://core.trac.wordpress.org/changeset/55203">55203</a><script type="application/ld+json">{"@context":"http://schema.org","@type":"EmailMessage","description":"Review this Commit","action":{"@type":"ViewAction","url":"https://core.trac.wordpress.org/changeset/55203","name":"Review Commit"}}</script></dd>
<dt style="float: left; width: 6em; font-weight: bold">Author</dt> <dd>azaozz</dd>
<dt style="float: left; width: 6em; font-weight: bold">Date</dt> <dd>2023-02-03 01:03:59 +0000 (Fri, 03 Feb 2023)</dd>
</dl>
<pre style='padding-left: 1em; margin: 2em 0; border-left: 2px solid #ccc; line-height: 1.25; font-size: 105%; font-family: sans-serif'>Introduce HTML API with HTML Tag Processor
This commit pulls in the HTML Tag Processor from the Gutenberg repository.
The Tag Processor attempts to be an HTML5-spec-compliant parser that provides the ability in PHP to find specific HTML tags and then add, remove, or update attributes on that tag. It provides a safe and reliable way to modify the attribute on HTML tags.
More information: https://github.com/WordPress/wordpress-develop/pull/3920.
Props: antonvlasenko, bernhard-reiter, costdev, dmsnell, felixarntz, gziolo, hellofromtonya, zieladam, flixos90, ntsekouras, peterwilsoncc, swissspidy, andrewserong, onemaggie, get_dave, aristath, scruffian, justlevine, andraganescu, noisysocks, dlh, soean, cbirdsong, revgeorge, azaozz.
Fixes <a href="https://core.trac.wordpress.org/ticket/57575">#57575</a>.</pre>
<h3>Modified Paths</h3>
<ul>
<li><a href="#trunksrcwpsettingsphp">trunk/src/wp-settings.php</a></li>
</ul>
<h3>Added Paths</h3>
<ul>
<li>trunk/src/wp-includes/html-api/</li>
<li><a href="#trunksrcwpincludeshtmlapiclasswphtmlattributetokenphp">trunk/src/wp-includes/html-api/class-wp-html-attribute-token.php</a></li>
<li><a href="#trunksrcwpincludeshtmlapiclasswphtmlspanphp">trunk/src/wp-includes/html-api/class-wp-html-span.php</a></li>
<li><a href="#trunksrcwpincludeshtmlapiclasswphtmltagprocessorphp">trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php</a></li>
<li><a href="#trunksrcwpincludeshtmlapiclasswphtmltextreplacementphp">trunk/src/wp-includes/html-api/class-wp-html-text-replacement.php</a></li>
<li>trunk/tests/phpunit/tests/html/</li>
<li><a href="#trunktestsphpunittestshtmlwpHtmlTagProcessorbookmarkphp">trunk/tests/phpunit/tests/html/wpHtmlTagProcessor-bookmark.php</a></li>
<li><a href="#trunktestsphpunittestshtmlwpHtmlTagProcessorphp">trunk/tests/phpunit/tests/html/wpHtmlTagProcessor.php</a></li>
</ul>
</div>
<div id="patch">
<h3>Diff</h3>
<a id="trunksrcwpincludeshtmlapiclasswphtmlattributetokenphp"></a>
<div class="addfile"><h4 style="background-color: #eee; color: inherit; margin: 1em 0; padding: 1.3em; font-size: 115%">Added: trunk/src/wp-includes/html-api/class-wp-html-attribute-token.php</h4>
<pre class="diff"><span>
<span class="info" style="display: block; padding: 0 10px; color: #888">--- trunk/src/wp-includes/html-api/class-wp-html-attribute-token.php (rev 0)
+++ trunk/src/wp-includes/html-api/class-wp-html-attribute-token.php 2023-02-03 01:03:59 UTC (rev 55203)
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -0,0 +1,89 @@
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+<?php
+/**
+ * HTML Tag Processor: Attribute token structure class.
+ *
+ * @package WordPress
+ * @subpackage HTML-API
+ * @since 6.2.0
+ */
+
+/**
+ * Data structure for the attribute token that allows to drastically improve performance.
+ *
+ * This class is for internal usage of the WP_HTML_Tag_Processor class.
+ *
+ * @access private
+ * @since 6.2.0
+ *
+ * @see WP_HTML_Tag_Processor
+ */
+class WP_HTML_Attribute_Token {
+ /**
+ * Attribute name.
+ *
+ * @since 6.2.0
+ * @var string
+ */
+ public $name;
+
+ /**
+ * The string offset where the attribute value starts.
+ *
+ * @since 6.2.0
+ * @var int
+ */
+ public $value_starts_at;
+
+ /**
+ * How many bytes the value occupies in the input HTML.
+ *
+ * @since 6.2.0
+ * @var int
+ */
+ public $value_length;
+
+ /**
+ * The string offset where the attribute name starts.
+ *
+ * @since 6.2.0
+ * @var int
+ */
+ public $start;
+
+ /**
+ * The string offset after the attribute value or its name.
+ *
+ * @since 6.2.0
+ * @var int
+ */
+ public $end;
+
+ /**
+ * Whether the attribute is a boolean attribute with value `true`.
+ *
+ * @since 6.2.0
+ * @var bool
+ */
+ public $is_true;
+
+ /**
+ * Constructor. Copies each argument onto the matching public property.
+ *
+ * @since 6.2.0
+ *
+ * @param string $name Attribute name.
+ * @param int $value_start The string offset where the attribute value starts (stored as `$value_starts_at`).
+ * @param int $value_length Number of bytes attribute value spans.
+ * @param int $start The string offset where the attribute name starts.
+ * @param int $end The string offset after the attribute value or its name.
+ * @param bool $is_true Whether the attribute is a boolean attribute with true value.
+ */
+ public function __construct( $name, $value_start, $value_length, $start, $end, $is_true ) {
+ $this->name = $name;
+ $this->value_starts_at = $value_start;
+ $this->value_length = $value_length;
+ $this->start = $start;
+ $this->end = $end;
+ $this->is_true = $is_true;
+ }
+}
</ins></span></pre></div>
<a id="trunksrcwpincludeshtmlapiclasswphtmlspanphp"></a>
<div class="addfile"><h4 style="background-color: #eee; color: inherit; margin: 1em 0; padding: 1.3em; font-size: 115%">Added: trunk/src/wp-includes/html-api/class-wp-html-span.php</h4>
<pre class="diff"><span>
<span class="info" style="display: block; padding: 0 10px; color: #888">--- trunk/src/wp-includes/html-api/class-wp-html-span.php (rev 0)
+++ trunk/src/wp-includes/html-api/class-wp-html-span.php 2023-02-03 01:03:59 UTC (rev 55203)
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -0,0 +1,52 @@
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+<?php
+/**
+ * HTML Span: Represents a textual span inside an HTML document.
+ *
+ * @package WordPress
+ * @subpackage HTML-API
+ * @since 6.2.0
+ */
+
+/**
+ * Represents a textual span inside an HTML document.
+ *
+ * This is a two-tuple in disguise, used to avoid the memory
+ * overhead involved in using an array for the same purpose.
+ *
+ * This class is for internal usage of the WP_HTML_Tag_Processor class.
+ *
+ * @access private
+ * @since 6.2.0
+ *
+ * @see WP_HTML_Tag_Processor
+ */
+class WP_HTML_Span {
+ /**
+ * Byte offset into document where span begins.
+ *
+ * @since 6.2.0
+ * @var int
+ */
+ public $start;
+
+ /**
+ * Byte offset into document where span ends.
+ *
+ * @since 6.2.0
+ * @var int
+ */
+ public $end;
+
+ /**
+ * Constructor. Stores both offsets as given; no validation is performed.
+ *
+ * @since 6.2.0
+ *
+ * @param int $start Byte offset into document where span begins.
+ * @param int $end Byte offset into document where span ends.
+ */
+ public function __construct( $start, $end ) {
+ $this->start = $start;
+ $this->end = $end;
+ }
+}
</ins></span></pre></div>
<a id="trunksrcwpincludeshtmlapiclasswphtmltagprocessorphp"></a>
<div class="addfile"><h4 style="background-color: #eee; color: inherit; margin: 1em 0; padding: 1.3em; font-size: 115%">Added: trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php</h4>
<pre class="diff"><span>
<span class="info" style="display: block; padding: 0 10px; color: #888">--- trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php (rev 0)
+++ trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php 2023-02-03 01:03:59 UTC (rev 55203)
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -0,0 +1,2264 @@
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+<?php
+/**
+ * Scans through an HTML document to find specific tags, then
+ * transforms those tags by adding, removing, or updating the
+ * values of the HTML attributes within that tag (opener).
+ *
+ * Does not fully parse HTML or _recurse_ into the HTML structure
+ * Instead this scans linearly through a document and only parses
+ * the HTML tag openers.
+ *
+ * ### Possible future direction for this module
+ *
+ * - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c".
+ * This would increase the size of the changes for some operations but leave more
+ * natural-looking output HTML.
+ * - Decode HTML character references within class names when matching. E.g. match having
+ * class `1<"2` needs to recognize `class="1&lt;&quot;2"`. Currently the Tag Processor
+ * will fail to find the right tag if the class name is encoded as such.
+ * - Properly decode HTML character references in `get_attribute()`. PHP's
+ * `html_entity_decode()` is wrong in a couple ways: it doesn't account for the
+ * no-ambiguous-ampersand rule, and it improperly handles the way semicolons may
+ * or may not terminate a character reference.
+ *
+ * @package WordPress
+ * @subpackage HTML-API
+ * @since 6.2.0
+ */
+
+/**
+ * Modifies attributes in an HTML document for tags matching a query.
+ *
+ * ## Usage
+ *
+ * Use of this class requires three steps:
+ *
+ * 1. Create a new class instance with your input HTML document.
+ * 2. Find the tag(s) you are looking for.
+ * 3. Request changes to the attributes in those tag(s).
+ *
+ * Example:
+ * ```php
+ * $tags = new WP_HTML_Tag_Processor( $html );
+ * if ( $tags->next_tag( [ 'tag_name' => 'option' ] ) ) {
+ * $tags->set_attribute( 'selected', true );
+ * }
+ * ```
+ *
+ * ### Finding tags
+ *
+ * The `next_tag()` function moves the internal cursor through
+ * your input HTML document until it finds a tag meeting any of
+ * the supplied restrictions in the optional query argument. If
+ * no argument is provided then it will find the next HTML tag,
+ * regardless of what kind it is.
+ *
+ * If you want to _find whatever the next tag is_:
+ * ```php
+ * $tags->next_tag();
+ * ```
+ *
+ * | Goal | Query |
+ * |-----------------------------------------------------------|----------------------------------------------------------------------------|
+ * | Find any tag. | `$tags->next_tag();` |
+ * | Find next image tag. | `$tags->next_tag( [ 'tag_name' => 'img' ] );` |
+ * | Find next tag containing the `fullwidth` CSS class. | `$tags->next_tag( [ 'class_name' => 'fullwidth' ] );` |
+ * | Find next image tag containing the `fullwidth` CSS class. | `$tags->next_tag( [ 'tag_name' => 'img', 'class_name' => 'fullwidth' ] );` |
+ *
+ * If a tag was found meeting your criteria then `next_tag()`
+ * will return `true` and you can proceed to modify it. If it
+ * returns `false`, however, it failed to find the tag and
+ * moved the cursor to the end of the file.
+ *
+ * Once the cursor reaches the end of the file the processor
+ * is done and if you want to reach an earlier tag you will
+ * need to recreate the processor and start over, as it's
+ * unable to back up or move in reverse.
+ *
+ * See the section on bookmarks for an exception to this
+ * no-backing-up rule.
+ *
+ * #### Custom queries
+ *
+ * Sometimes it's necessary to further inspect an HTML tag than
+ * the query syntax here permits. In these cases one may further
+ * inspect the search results using the read-only functions
+ * provided by the processor or external state or variables.
+ *
+ * Example:
+ * ```php
+ * // Paint up to the first five DIV or SPAN tags marked with the "jazzy" style.
+ * $remaining_count = 5;
+ * while ( $remaining_count > 0 && $tags->next_tag() ) {
+ * if (
+ * ( 'DIV' === $tags->get_tag() || 'SPAN' === $tags->get_tag() ) &&
+ * 'jazzy' === $tags->get_attribute( 'data-style' )
+ * ) {
+ * $tags->add_class( 'theme-style-everest-jazz' );
+ * $remaining_count--;
+ * }
+ * }
+ * ```
+ *
+ * `get_attribute()` will return `null` if the attribute wasn't present
+ * on the tag when it was called. It may return `""` (the empty string)
+ * in cases where the attribute was present but its value was empty.
+ * For boolean attributes, those whose name is present but no value is
+ * given, it will return `true` (the only way to set `false` for an
+ * attribute is to remove it).
+ *
+ * ### Modifying HTML attributes for a found tag
+ *
+ * Once you've found the start of an opening tag you can modify
+ * any number of the attributes on that tag. You can set a new
+ * value for an attribute, remove the entire attribute, or do
+ * nothing and move on to the next opening tag.
+ *
+ * Example:
+ * ```php
+ * if ( $tags->next_tag( [ 'class_name' => 'wp-group-block' ] ) ) {
+ * $tags->set_attribute( 'title', 'This groups the contained content.' );
+ * $tags->remove_attribute( 'data-test-id' );
+ * }
+ * ```
+ *
+ * If `set_attribute()` is called for an existing attribute it will
+ * overwrite the existing value. Similarly, calling `remove_attribute()`
+ * for a non-existing attribute has no effect on the document. Both
+ * of these methods are safe to call without knowing if a given attribute
+ * exists beforehand.
+ *
+ * ### Modifying CSS classes for a found tag
+ *
+ * The tag processor treats the `class` attribute as a special case.
+ * Because it's a common operation to add or remove CSS classes, this
+ * interface adds helper methods to make that easier.
+ *
+ * As with attribute values, adding or removing CSS classes is a safe
+ * operation that doesn't require checking if the attribute or class
+ * exists before making changes. If removing the only class then the
+ * entire `class` attribute will be removed.
+ *
+ * Example:
+ * ```php
+ * // from `<span>Yippee!</span>`
+ * // to `<span class="is-active">Yippee!</span>`
+ * $tags->add_class( 'is-active' );
+ *
+ * // from `<span class="excited">Yippee!</span>`
+ * // to `<span class="excited is-active">Yippee!</span>`
+ * $tags->add_class( 'is-active' );
+ *
+ * // from `<span class="is-active heavy-accent">Yippee!</span>`
+ * // to `<span class="is-active heavy-accent">Yippee!</span>`
+ * $tags->add_class( 'is-active' );
+ *
+ * // from `<input type="text" class="is-active rugby not-disabled" length="24">`
+ * // to `<input type="text" class="is-active not-disabled" length="24">`
+ * $tags->remove_class( 'rugby' );
+ *
+ * // from `<input type="text" class="rugby" length="24">`
+ * // to `<input type="text" length="24">`
+ * $tags->remove_class( 'rugby' );
+ *
+ * // from `<input type="text" length="24">`
+ * // to `<input type="text" length="24">`
+ * $tags->remove_class( 'rugby' );
+ * ```
+ *
+ * When class changes are enqueued but a direct change to `class` is made via
+ * `set_attribute` then the changes to `set_attribute` (or `remove_attribute`)
+ * will take precedence over those made through `add_class` and `remove_class`.
+ *
+ * ### Bookmarks
+ *
+ * While scanning through the input HTML document it's possible to set
+ * a named bookmark when a particular tag is found. Later on, after
+ * continuing to scan other tags, it's possible to `seek` to one of
+ * the set bookmarks and then proceed again from that point forward.
+ *
+ * Because bookmarks create processing overhead one should avoid
+ * creating too many of them. As a rule, create only bookmarks
+ * of known string literal names; avoid creating "mark_{$index}"
+ * and so on. It's fine from a performance standpoint to create a
+ * bookmark and update it frequently, such as within a loop.
+ *
+ * ```php
+ * $total_todos = 0;
+ * while ( $p->next_tag( [ 'tag_name' => 'UL', 'class_name' => 'todo' ] ) ) {
+ * $p->set_bookmark( 'list-start' );
+ * while ( $p->next_tag( [ 'tag_closers' => 'visit' ] ) ) {
+ * if ( 'UL' === $p->get_tag() && $p->is_tag_closer() ) {
+ * $p->set_bookmark( 'list-end' );
+ * $p->seek( 'list-start' );
+ * $p->set_attribute( 'data-contained-todos', (string) $total_todos );
+ * $total_todos = 0;
+ * $p->seek( 'list-end' );
+ * break;
+ * }
+ *
+ * if ( 'LI' === $p->get_tag() && ! $p->is_tag_closer() ) {
+ * $total_todos++;
+ * }
+ * }
+ * }
+ * ```
+ *
+ * ## Design and limitations
+ *
+ * The Tag Processor is designed to linearly scan HTML documents and tokenize
+ * HTML tags and their attributes. It's designed to do this as efficiently as
+ * possible without compromising parsing integrity. Therefore it will be
+ * slower than some methods of modifying HTML, such as those incorporating
+ * over-simplified PCRE patterns, but will not introduce the defects and
+ * failures that those methods bring in, which lead to broken page renders
+ * and often to security vulnerabilities. On the other hand, it will be faster
+ * than full-blown HTML parsers such as DOMDocument and use considerably
+ * less memory. It requires a negligible memory overhead, enough to consider
+ * it a zero-overhead system.
+ *
+ * The performance characteristics are maintained by avoiding tree construction
+ * and semantic cleanups which are specified in HTML5. Because of this, for
+ * example, it's not possible for the Tag Processor to associate any given
+ * opening tag with its corresponding closing tag, or to return the inner markup
+ * inside an element. Systems may be built on top of the Tag Processor to do
+ * this, but the Tag Processor is and should be constrained so it can remain an
+ * efficient, low-level, and reliable HTML scanner.
+ *
+ * The Tag Processor's design incorporates a "garbage-in-garbage-out" philosophy.
+ * HTML5 specifies that certain invalid content be transformed into different forms
+ * for display, such as removing null bytes from an input document and replacing
+ * invalid characters with the Unicode replacement character U+FFFD �. Where errors
+ * or transformations exist within the HTML5 specification, the Tag Processor leaves
+ * those invalid inputs untouched, passing them through to the final browser to handle.
+ * While this implies that certain operations will be non-spec-compliant, such as
+ * reading the value of an attribute with invalid content, it also preserves a
+ * simplicity and efficiency for handling those error cases.
+ *
+ * Most operations within the Tag Processor are designed to minimize the difference
+ * between an input and output document for any given change. For example, the
+ * `add_class` and `remove_class` methods preserve whitespace and the class ordering
+ * within the `class` attribute; and when encountering tags with duplicated attributes,
+ * the Tag Processor will leave those invalid duplicate attributes where they are but
+ * update the proper attribute which the browser will read for parsing its value. An
+ * exception to this rule is that all attribute updates store their values as
+ * double-quoted strings, meaning that attributes on input with single-quoted or
+ * unquoted values will appear in the output with double-quotes.
+ *
+ * @since 6.2.0
+ */
+class WP_HTML_Tag_Processor {
+ /**
+ * The maximum number of bookmarks allowed to exist at
+ * any given time.
+ *
+ * @see set_bookmark()
+ * @since 6.2.0
+ * @var int
+ */
+ const MAX_BOOKMARKS = 10;
+
+ /**
+ * Maximum number of times seek() can be called.
+ * Prevents accidental infinite loops.
+ *
+ * @see seek()
+ * @since 6.2.0
+ * @var int
+ */
+ const MAX_SEEK_OPS = 1000;
+
+ /**
+ * The HTML document to parse.
+ *
+ * @since 6.2.0
+ * @var string
+ */
+ private $html;
+
+ /**
+ * The last query passed to next_tag().
+ *
+ * @since 6.2.0
+ * @var array|null
+ */
+ private $last_query;
+
+ /**
+ * The tag name this processor currently scans for.
+ *
+ * @since 6.2.0
+ * @var string|null
+ */
+ private $sought_tag_name;
+
+ /**
+ * The CSS class name this processor currently scans for.
+ *
+ * @since 6.2.0
+ * @var string|null
+ */
+ private $sought_class_name;
+
+ /**
+ * The match offset this processor currently scans for.
+ *
+ * @since 6.2.0
+ * @var int|null
+ */
+ private $sought_match_offset;
+
+ /**
+ * Whether to visit tag closers, e.g. </div>, when walking an input document.
+ *
+ * @since 6.2.0
+ * @var bool
+ */
+ private $stop_on_tag_closers;
+
+ /**
+ * Holds updated HTML as updates are applied.
+ *
+ * Updates and unmodified portions of the input document are
+ * appended to this value as they are applied. It will hold
+ * a copy of the updated document up until the point of the
+ * latest applied update. The fully-updated HTML document
+ * will comprise this value plus the part of the input document
+ * which follows that latest update.
+ *
+ * @see $bytes_already_copied
+ *
+ * @since 6.2.0
+ * @var string
+ */
+ private $output_buffer = '';
+
+ /**
+ * How many bytes from the original HTML document have been read and parsed.
+ *
+ * This value points to the latest byte offset in the input document which
+ * has been already parsed. It is the internal cursor for the Tag Processor
+ * and updates while scanning through the HTML tokens.
+ *
+ * @since 6.2.0
+ * @var int
+ */
+ private $bytes_already_parsed = 0;
+
+ /**
+ * How many bytes from the input HTML document have already been
+ * copied into the output buffer.
+ *
+ * Lexical updates are enqueued and processed in batches. Prior
+ * to any given update in the input document, there might exist
+ * a span of HTML unaffected by any changes. This span ought to
+ * be copied verbatim into the output buffer before applying the
+ * following update. This value will point to the starting byte
+ * offset in the input document where that unaffected span of
+ * HTML starts.
+ *
+ * @since 6.2.0
+ * @var int
+ */
+ private $bytes_already_copied = 0;
+
+ /**
+ * Byte offset in input document where current tag name starts.
+ *
+ * Example:
+ * ```
+ * <div id="test">...
+ * 01234
+ * - tag name starts at 1
+ * ```
+ *
+ * @since 6.2.0
+ * @var int|null
+ */
+ private $tag_name_starts_at;
+
+ /**
+ * Byte length of current tag name.
+ *
+ * Example:
+ * ```
+ * <div id="test">...
+ * 01234
+ * --- tag name length is 3
+ * ```
+ *
+ * @since 6.2.0
+ * @var int|null
+ */
+ private $tag_name_length;
+
+ /**
+ * Byte offset in input document where current tag token ends.
+ *
+ * Example:
+ * ```
+ * <div id="test">...
+ * 0 1 |
+ * 01234567890123456
+ * --- tag ends at 14
+ * ```
+ *
+ * @since 6.2.0
+ * @var int|null
+ */
+ private $tag_ends_at;
+
+ /**
+ * Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>.
+ *
+ * @var bool
+ */
+ private $is_closing_tag;
+
+ /**
+ * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name.
+ *
+ * Example:
+ * ```php
+ * // supposing the parser is working through this content
+ * // and stops after recognizing the `id` attribute
+ * // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8">
+ * // ^ parsing will continue from this point
+ * $this->attributes = [
+ * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 16, false )
+ * ];
+ *
+ * // when picking up parsing again, or when asking to find the
+ * // `class` attribute we will continue and add to this array
+ * $this->attributes = [
+ * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 16, false ),
+ * 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 30, false )
+ * ];
+ *
+ * // Note that attribute values are not stored in the index, only the
+ * // offsets and lengths needed to read them from the input HTML.
+ * ```
+ *
+ * @since 6.2.0
+ * @var WP_HTML_Attribute_Token[]
+ */
+ private $attributes = array();
+
+ /**
+ * Which class names to add or remove from a tag.
+ *
+ * These are tracked separately from attribute updates because they are
+ * semantically distinct, whereas this interface exists for the common
+ * case of adding and removing class names while other attributes are
+ * generally modified as with DOM `setAttribute` calls.
+ *
+ * When modifying an HTML document these will eventually be collapsed
+ * into a single `set_attribute( 'class', $changes )` call.
+ *
+ * Example:
+ * ```php
+ * // Add the `wp-block-group` class, remove the `wp-group` class.
+ * $classname_updates = [
+ * // Indexed by a comparable class name
+ * 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS,
+ * 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS
+ * ];
+ * ```
+ *
+ * @since 6.2.0
+ * @var bool[]
+ */
+ private $classname_updates = array();
+
+ /**
+ * Tracks a semantic location in the original HTML which
+ * shifts with updates as they are applied to the document.
+ *
+ * @since 6.2.0
+ * @var WP_HTML_Span[]
+ */
+ protected $bookmarks = array();
+
+ const ADD_CLASS = true;
+ const REMOVE_CLASS = false;
+ const SKIP_CLASS = null;
+
+ /**
+ * Lexical replacements to apply to input HTML document.
+ *
+ * "Lexical" in this class refers to the part of this class which
+ * operates on pure text _as text_ and not as HTML. There's a line
+ * between the public interface, with HTML-semantic methods like
+ * `set_attribute` and `add_class`, and an internal state that tracks
+ * text offsets in the input document.
+ *
+ * When higher-level HTML methods are called, those have to transform their
+ * operations (such as setting an attribute's value) into text diffing
+ * operations (such as replacing the sub-string from indices A to B with
+ * some given new string). These text-diffing operations are the lexical
+ * updates.
+ *
+ * As new higher-level methods are added they need to collapse their
+ * operations into these lower-level lexical updates since that's the
+ * Tag Processor's internal language of change. Any code which creates
+ * these lexical updates must ensure that they do not cross HTML syntax
+ * boundaries, however, so these should never be exposed outside of this
+ * class or any classes which intentionally expand its functionality.
+ *
+ * These are enqueued while editing the document instead of being immediately
+ * applied to avoid processing overhead, string allocations, and string
+ * copies when applying many updates to a single document.
+ *
+ * Example:
+ * ```php
+ * // Replace an attribute stored with a new value, indices
+ * // sourced from the lazily-parsed HTML recognizer.
+ * $start = $attributes['src']->start;
+ * $end = $attributes['src']->end;
+ * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value );
+ *
+ * // Correspondingly, something like this will appear in this array.
+ * $lexical_updates = [
+ * WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' )
+ * ];
+ * ```
+ *
+ * @since 6.2.0
+ * @var WP_HTML_Text_Replacement[]
+ */
+ protected $lexical_updates = array();
+
+ /**
+ * Tracks and limits `seek()` calls to prevent accidental infinite loops.
+ *
+ * @see seek
+ * @since 6.2.0
+ * @var int
+ */
+ protected $seek_count = 0;
+
+ /**
+ * Constructor. Stores the input HTML document; no scanning happens here.
+ *
+ * @since 6.2.0
+ *
+ * @param string $html HTML to process.
+ */
+ public function __construct( $html ) {
+ $this->html = $html;
+ }
+
+ /**
+ * Finds the next tag matching the $query.
+ *
+ * @since 6.2.0
+ *
+ * @param array|string|null $query {
+ * Optional. Which tag name to find, having which class, etc. Default is to find any tag.
+ *
+ * @type string|null $tag_name Which tag to find, or `null` for "any tag."
+ * @type int|null $match_offset Find the Nth tag matching all search criteria.
+ * 1 for "first" tag, 3 for "third," etc.
+ * Defaults to first tag.
+ * @type string|null $class_name Tag must contain this whole class name to match.
+ * @type string|null $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. </div>.
+ * }
+ * @return bool Whether a tag was matched.
+ */
+ public function next_tag( $query = null ) {
+ $this->parse_query( $query );
+ $already_found = 0;
+
+ do {
+ if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ return false;
+ }
+
+ // Find the next tag if it exists; otherwise move the cursor to the end of the document.
+ if ( false === $this->parse_next_tag() ) {
+ $this->bytes_already_parsed = strlen( $this->html );
+
+ return false;
+ }
+
+ // Parse all of its attributes.
+ while ( $this->parse_next_attribute() ) {
+ continue;
+ }
+
+ // Ensure that the tag closes before the end of the document.
+ $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
+ if ( false === $tag_ends_at ) {
+ return false;
+ }
+ $this->tag_ends_at = $tag_ends_at;
+ $this->bytes_already_parsed = $tag_ends_at;
+
+ // Finally, check if the parsed tag and its attributes match the search query.
+ if ( $this->matches() ) {
+ ++$already_found;
+ }
+
+ /*
+ * For non-DATA sections which might contain text that looks like HTML tags but
+ * isn't, scan with the appropriate alternative mode. Looking at the first letter
+ * of the tag name as a pre-check avoids a string allocation when it's not needed.
+ */
+ $t = $this->html[ $this->tag_name_starts_at ];
+ if ( ! $this->is_closing_tag && ( 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) ) {
+ $tag_name = $this->get_tag();
+
+ if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
+ $this->bytes_already_parsed = strlen( $this->html );
+ return false;
+ } elseif (
+ ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) &&
+ ! $this->skip_rcdata( $tag_name )
+ ) {
+ $this->bytes_already_parsed = strlen( $this->html );
+ return false;
+ }
+ }
+ } while ( $already_found < $this->sought_match_offset );
+
+ return true;
+ }
+
+
+ /**
+ * Sets a bookmark in the HTML document.
+ *
+ * Bookmarks represent specific places or tokens in the HTML
+ * document, such as a tag opener or closer. When applying
+ * edits to a document, such as setting an attribute, the
+ * text offsets of that token may shift; the bookmark is
+ * kept updated with those shifts and remains stable unless
+ * the entire span of text in which the token sits is removed.
+ *
+ * Release bookmarks when they are no longer needed.
+ *
+ * Example:
+ * ```
+ * <main><h2>Surprising fact you may not know!</h2></main>
+ * ^ ^
+ * \-|-- this `H2` opener bookmark tracks the token
+ *
+ * <main class="clickbait"><h2>Surprising fact you may no…
+ * ^ ^
+ * \-|-- it shifts with edits
+ * ```
+ *
+ * Bookmarks provide the ability to seek to a previously-scanned
+ * place in the HTML document. This avoids the need to re-scan
+ * the entire document.
+ *
+ * Example:
+ * ```
+ * <ul><li>One</li><li>Two</li><li>Three</li></ul>
+ * ^^^^
+ * want to note this last item
+ *
+ * $p = new WP_HTML_Tag_Processor( $html );
+ * $in_list = false;
+ * while ( $p->next_tag( [ 'tag_closers' => $in_list ? 'visit' : 'skip' ] ) ) {
+ * if ( 'UL' === $p->get_tag() ) {
+ * if ( $p->is_tag_closer() ) {
+ * $in_list = false;
+ * $p->set_bookmark( 'resume' );
+ * if ( $p->seek( 'last-li' ) ) {
+ * $p->add_class( 'last-li' );
+ * }
+ * $p->seek( 'resume' );
+ * $p->release_bookmark( 'last-li' );
+ * $p->release_bookmark( 'resume' );
+ * } else {
+ * $in_list = true;
+ * }
+ * }
+ *
+ * if ( 'LI' === $p->get_tag() ) {
+ * $p->set_bookmark( 'last-li' );
+ * }
+ * }
+ * ```
+ *
+ * Bookmarks intentionally hide the internal string offsets
+ * to which they refer. They are maintained internally as
+ * updates are applied to the HTML document and therefore
+ * retain their "position" - the location to which they
+ * originally pointed. The inability to use bookmarks with
+ * functions like `substr` is therefore intentional to guard
+ * against accidentally breaking the HTML.
+ *
+ * Because bookmarks allocate memory and require processing
+ * for every applied update, they are limited and require
+ * a name. They should not be created with programmatically-made
+ * names, such as "li_{$index}" with some loop. As a general
+ * rule they should only be created with string-literal names
+ * like "start-of-section" or "last-paragraph".
+ *
+ * Bookmarks are a powerful tool to enable complicated behavior.
+ * Consider double-checking that you need this tool if you are
+ * reaching for it, as inappropriate use could lead to broken
+ * HTML structure or unwanted processing overhead.
+ *
+ * @since 6.2.0
+ *
+ * @param string $name Identifies this particular bookmark.
+ * @return bool Whether the bookmark was successfully created.
+ */
+ public function set_bookmark( $name ) {
+ // A bookmark can only be placed on a tag which is currently matched.
+ if ( null === $this->tag_name_starts_at ) {
+ return false;
+ }
+
+ // Replacing an existing bookmark is always allowed; only new names count against the limit.
+ if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= self::MAX_BOOKMARKS ) {
+ _doing_it_wrong(
+ __METHOD__,
+ __( 'Too many bookmarks: cannot create any more.' ),
+ '6.2.0'
+ );
+ return false;
+ }
+
+ /*
+ * The span has to start on the "<" which opens the token, because seek()
+ * resumes scanning from the start of the span and the scanner looks for
+ * the next "<" from there. The tag name is preceded by "<" in a tag
+ * opener but by "</" in a tag closer, so a closer starts two bytes
+ * before its name. Starting on the "/" would make seek() skip the
+ * bookmarked closer and land on whichever tag follows it.
+ */
+ $opener_length = $this->is_closing_tag ? 2 : 1;
+
+ $this->bookmarks[ $name ] = new WP_HTML_Span(
+ $this->tag_name_starts_at - $opener_length,
+ $this->tag_ends_at
+ );
+
+ return true;
+ }
+
+
+ /**
+ * Discards a bookmark once it has served its purpose.
+ *
+ * Every stored bookmark has to be kept up to date while edits
+ * are applied, so dropping unused ones avoids that small cost.
+ *
+ * @since 6.2.0
+ *
+ * @param string $name Name of the bookmark to remove.
+ * @return bool Whether the bookmark already existed before removal.
+ */
+ public function release_bookmark( $name ) {
+ $was_set = array_key_exists( $name, $this->bookmarks );
+
+ if ( $was_set ) {
+ unset( $this->bookmarks[ $name ] );
+ }
+
+ return $was_set;
+ }
+
+
+ /**
+ * Skips contents of title and textarea tags.
+ *
+ * Scans forward from the cursor for the tag closer matching the given
+ * tag name. On success the cursor is left just past that closer.
+ *
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
+ * @since 6.2.0
+ *
+ * @param string $tag_name The uppercase tag name which will close the RCDATA region.
+ * @return bool Whether an end to the RCDATA region was found before the end of the document.
+ */
+ private function skip_rcdata( $tag_name ) {
+ $html = $this->html;
+ $doc_length = strlen( $html );
+ $tag_length = strlen( $tag_name );
+
+ $at = $this->bytes_already_parsed;
+
+ while ( false !== $at && $at < $doc_length ) {
+ $at = strpos( $html, '</', $at );
+
+ /*
+ * If there is no possible tag closer then fail.
+ *
+ * A closer needs room for the "</", the entire tag name, and at
+ * least one more byte which terminates the name. Requiring all of
+ * that up front keeps every string offset read below in bounds.
+ */
+ if ( false === $at || ( $at + 2 + $tag_length ) >= $doc_length ) {
+ $this->bytes_already_parsed = $doc_length;
+ return false;
+ }
+
+ $at += 2;
+
+ /*
+ * Find a case-insensitive match to the tag name.
+ *
+ * Because tag names are limited to US-ASCII there is no
+ * need to perform any kind of Unicode normalization when
+ * comparing; any character which could be impacted by such
+ * normalization could not be part of a tag name.
+ *
+ * The given tag name is uppercase, so each document byte is
+ * compared both as-is and after upper-casing it.
+ */
+ for ( $i = 0; $i < $tag_length; $i++ ) {
+ $tag_char = $tag_name[ $i ];
+ $html_char = $html[ $at + $i ];
+
+ if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
+ // Not this closer: resume the search after the bytes already compared.
+ $at += $i;
+ continue 2;
+ }
+ }
+
+ $at += $tag_length;
+ $this->bytes_already_parsed = $at;
+
+ /*
+ * Ensure that the tag name terminates to avoid matching on
+ * substrings of a longer tag name. For example, the sequence
+ * "</textarearug" should not match for "</textarea" even
+ * though "textarea" is found within the text.
+ */
+ $c = $html[ $at ];
+ if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
+ continue;
+ }
+
+ // Tag closers may carry attributes; step over them to reach the end of the closer.
+ while ( $this->parse_next_attribute() ) {
+ continue;
+ }
+ $at = $this->bytes_already_parsed;
+ if ( $at >= $doc_length ) {
+ return false;
+ }
+
+ if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) {
+ ++$this->bytes_already_parsed;
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Skips contents of script tags.
+ *
+ * Scans the script data for the `</script` closer which ends the element,
+ * tracking the "escaped" mode entered through `<!--` and the "double-escaped"
+ * mode entered through a `<script` found while escaped. A `</script` seen in
+ * the double-escaped mode only drops back to the escaped mode; in the other
+ * modes it ends the element. On success the cursor is left just past the
+ * `>` of the closing tag.
+ *
+ * @since 6.2.0
+ *
+ * @return bool Whether the script tag was closed before the end of the document.
+ */
+ private function skip_script_data() {
+ $state = 'unescaped';
+ $html = $this->html;
+ $doc_length = strlen( $html );
+ $at = $this->bytes_already_parsed;
+
+ while ( false !== $at && $at < $doc_length ) {
+ // Jump to the next byte which could start a sequence of interest.
+ $at += strcspn( $html, '-<', $at );
+
+ /*
+ * For all script states a "-->" transitions
+ * back into the normal unescaped script mode,
+ * even if that's the current state.
+ */
+ if (
+ $at + 2 < $doc_length &&
+ '-' === $html[ $at ] &&
+ '-' === $html[ $at + 1 ] &&
+ '>' === $html[ $at + 2 ]
+ ) {
+ $at += 3;
+ $state = 'unescaped';
+ continue;
+ }
+
+ /*
+ * With at most one byte left nothing can close the script anymore.
+ * Returning here, instead of looping again without advancing the
+ * cursor, prevents an endless loop on input ending in "<" or "-".
+ */
+ if ( $at + 1 >= $doc_length ) {
+ return false;
+ }
+
+ /*
+ * Everything of interest past here starts with "<".
+ * Check this byte and advance past it regardless.
+ */
+ if ( '<' !== $html[ $at++ ] ) {
+ continue;
+ }
+
+ /*
+ * Unlike with "-->", the "<!--" only transitions
+ * into the escaped mode if not already there.
+ *
+ * Inside the escaped modes it will be ignored; and
+ * should never break out of the double-escaped
+ * mode and back into the escaped mode.
+ *
+ * While this requires a mode change, it does not
+ * impact the parsing otherwise, so continue
+ * parsing after updating the state.
+ */
+ if (
+ $at + 2 < $doc_length &&
+ '!' === $html[ $at ] &&
+ '-' === $html[ $at + 1 ] &&
+ '-' === $html[ $at + 2 ]
+ ) {
+ $at += 3;
+ $state = 'unescaped' === $state ? 'escaped' : $state;
+ continue;
+ }
+
+ if ( '/' === $html[ $at ] ) {
+ $is_closing = true;
+ ++$at;
+ } else {
+ $is_closing = false;
+ }
+
+ /*
+ * At this point the only remaining state-changes occur with the
+ * <script> and </script> tags; unless one of these appears next,
+ * proceed scanning to the next potential token in the text.
+ *
+ * The length check also guarantees that one more byte follows
+ * the six letters, which is relied upon right below.
+ */
+ if ( ! (
+ $at + 6 < $doc_length &&
+ ( 's' === $html[ $at ] || 'S' === $html[ $at ] ) &&
+ ( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) &&
+ ( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) &&
+ ( 'i' === $html[ $at + 3 ] || 'I' === $html[ $at + 3 ] ) &&
+ ( 'p' === $html[ $at + 4 ] || 'P' === $html[ $at + 4 ] ) &&
+ ( 't' === $html[ $at + 5 ] || 'T' === $html[ $at + 5 ] )
+ ) ) {
+ ++$at;
+ continue;
+ }
+
+ /*
+ * Ensure that the script tag terminates to avoid matching on
+ * substrings of a non-match. For example, the sequence
+ * "<script123" should not end a script region even though
+ * "<script" is found within the text.
+ */
+ $at += 6;
+ $c = $html[ $at ];
+ if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
+ ++$at;
+ continue;
+ }
+
+ // A nested "<script" while escaped enters the double-escaped mode.
+ if ( 'escaped' === $state && ! $is_closing ) {
+ $state = 'double-escaped';
+ continue;
+ }
+
+ // A "</script" while double-escaped only returns to the escaped mode.
+ if ( 'double-escaped' === $state && $is_closing ) {
+ $state = 'escaped';
+ continue;
+ }
+
+ if ( $is_closing ) {
+ $this->bytes_already_parsed = $at;
+ if ( $this->bytes_already_parsed >= $doc_length ) {
+ return false;
+ }
+
+ // Tag closers may carry attributes; step over them to reach the end of the closer.
+ while ( $this->parse_next_attribute() ) {
+ continue;
+ }
+
+ /*
+ * Attribute parsing may have moved the cursor to or past the end
+ * of the document, so confirm it is in range before reading it.
+ */
+ if (
+ $this->bytes_already_parsed < $doc_length &&
+ '>' === $html[ $this->bytes_already_parsed ]
+ ) {
+ ++$this->bytes_already_parsed;
+ return true;
+ }
+ }
+
+ ++$at;
+ }
+
+ return false;
+ }
+
+ /**
+ * Parses the next tag.
+ *
+ * This will find and start parsing the next tag, including
+ * the opening `<`, the potential closer `/`, and the tag
+ * name. It does not parse the attributes or scan to the
+ * closing `>`; these are left for other methods.
+ *
+ * Comments, CDATA sections, DOCTYPE declarations, and other
+ * `<!` or `<?` constructs are stepped over along the way.
+ * Pending updates for the previous tag are applied before
+ * scanning starts.
+ *
+ * @since 6.2.0
+ *
+ * @return bool Whether a tag was found before the end of the document.
+ */
+ private function parse_next_tag() {
+ // Flush edits for the previous tag and reset the per-tag state.
+ $this->after_tag();
+
+ $html = $this->html;
+ $doc_length = strlen( $html );
+ $at = $this->bytes_already_parsed;
+
+ while ( false !== $at && $at < $doc_length ) {
+ $at = strpos( $html, '<', $at );
+ if ( false === $at ) {
+ return false;
+ }
+
+ /*
+ * The "<" may be the final byte of the document, in which
+ * case there is no following byte to inspect.
+ */
+ if ( $at + 1 < $doc_length && '/' === $html[ $at + 1 ] ) {
+ $this->is_closing_tag = true;
+ $at++;
+ } else {
+ $this->is_closing_tag = false;
+ }
+
+ /*
+ * HTML tag names must start with [a-zA-Z] otherwise they are not tags.
+ * For example, "<3" is rendered as text, not a tag opener. If at least
+ * one letter follows the "<" then _it is_ a tag, but if the following
+ * character is anything else it _is not a tag_.
+ *
+ * It's not uncommon to find non-tags starting with `<` in an HTML
+ * document, so it's good for performance to make this pre-check before
+ * continuing to attempt to parse a tag name.
+ *
+ * Reference:
+ * * https://html.spec.whatwg.org/multipage/parsing.html#data-state
+ * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+ */
+ $tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
+ if ( $tag_name_prefix_length > 0 ) {
+ ++$at;
+ // The name runs from its leading letters up to the first whitespace, "/", or ">".
+ $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
+ $this->tag_name_starts_at = $at;
+ $this->bytes_already_parsed = $at + $this->tag_name_length;
+ return true;
+ }
+
+ /*
+ * Abort if no tag is found before the end of
+ * the document. There is nothing left to parse.
+ */
+ if ( $at + 1 >= $doc_length ) {
+ return false;
+ }
+
+ /*
+ * <! transitions to markup declaration open state
+ * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+ */
+ if ( '!' === $html[ $at + 1 ] ) {
+ /*
+ * <!-- opens a comment – skip to the nearest -->
+ * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+ */
+ if (
+ $doc_length > $at + 3 &&
+ '-' === $html[ $at + 2 ] &&
+ '-' === $html[ $at + 3 ]
+ ) {
+ $closer_at = strpos( $html, '-->', $at + 4 );
+ if ( false === $closer_at ) {
+ return false;
+ }
+
+ $at = $closer_at + 3;
+ continue;
+ }
+
+ /*
+ * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]>
+ * The CDATA is case-sensitive.
+ * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+ */
+ if (
+ $doc_length > $at + 8 &&
+ '[' === $html[ $at + 2 ] &&
+ 'C' === $html[ $at + 3 ] &&
+ 'D' === $html[ $at + 4 ] &&
+ 'A' === $html[ $at + 5 ] &&
+ 'T' === $html[ $at + 6 ] &&
+ 'A' === $html[ $at + 7 ] &&
+ '[' === $html[ $at + 8 ]
+ ) {
+ $closer_at = strpos( $html, ']]>', $at + 9 );
+ if ( false === $closer_at ) {
+ return false;
+ }
+
+ $at = $closer_at + 3;
+ continue;
+ }
+
+ /*
+ * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest >
+ * These are ASCII-case-insensitive.
+ * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+ */
+ if (
+ $doc_length > $at + 8 &&
+ ( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
+ ( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
+ ( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
+ ( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) &&
+ ( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) &&
+ ( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) &&
+ ( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] )
+ ) {
+ $closer_at = strpos( $html, '>', $at + 9 );
+ if ( false === $closer_at ) {
+ return false;
+ }
+
+ $at = $closer_at + 1;
+ continue;
+ }
+
+ /*
+ * Anything else here is an incorrectly-opened comment and transitions
+ * to the bogus comment state - skip to the nearest >.
+ *
+ * When no ">" follows, the position becomes `false`,
+ * which ends the loop and reports that no tag was found.
+ */
+ $at = strpos( $html, '>', $at + 1 );
+ continue;
+ }
+
+ /*
+ * <? transitions to a bogus comment state – skip to the nearest >
+ * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+ */
+ if ( '?' === $html[ $at + 1 ] ) {
+ $closer_at = strpos( $html, '>', $at + 2 );
+ if ( false === $closer_at ) {
+ return false;
+ }
+
+ $at = $closer_at + 1;
+ continue;
+ }
+
+ // Not a tag or a recognized construct: keep looking after this byte.
+ ++$at;
+ }
+
+ return false;
+ }
+
+ /**
+ * Parses the next attribute.
+ *
+ * Starts at the current cursor position inside a tag, reads one
+ * attribute name and its optional value, and advances the cursor
+ * past it. For tag openers the attribute is recorded unless an
+ * attribute of the same lowercased name was already seen; for
+ * tag closers attributes are stepped over without being recorded.
+ *
+ * @since 6.2.0
+ *
+ * @return bool Whether an attribute was found before the end of the document.
+ */
+ private function parse_next_attribute() {
+ // Skip whitespace and slashes.
+ $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
+ if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ return false;
+ }
+
+ /*
+ * Treat the equal sign as a part of the attribute
+ * name if it is the first encountered byte.
+ *
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
+ */
+ $name_length = '=' === $this->html[ $this->bytes_already_parsed ]
+ ? 1 + strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 )
+ : strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed );
+
+ // No attribute, just tag closer; or the name runs to the end of the document.
+ if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) {
+ return false;
+ }
+
+ $attribute_start = $this->bytes_already_parsed;
+ $attribute_name = substr( $this->html, $attribute_start, $name_length );
+ $this->bytes_already_parsed += $name_length;
+ if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ return false;
+ }
+
+ // Whitespace is allowed between the name and a following "=".
+ $this->skip_whitespace();
+ if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ return false;
+ }
+
+ $has_value = '=' === $this->html[ $this->bytes_already_parsed ];
+ if ( $has_value ) {
+ // Step over the "=" and any whitespace before the value.
+ ++$this->bytes_already_parsed;
+ $this->skip_whitespace();
+ if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ return false;
+ }
+
+ switch ( $this->html[ $this->bytes_already_parsed ] ) {
+ // Quoted value: it extends to the next occurrence of the same quote.
+ case "'":
+ case '"':
+ $quote = $this->html[ $this->bytes_already_parsed ];
+ $value_start = $this->bytes_already_parsed + 1;
+ $value_length = strcspn( $this->html, $quote, $value_start );
+ // One past the closing quote; beyond the document if the quote is never closed.
+ $attribute_end = $value_start + $value_length + 1;
+ $this->bytes_already_parsed = $attribute_end;
+ break;
+
+ // Unquoted value: it extends to the next whitespace or ">".
+ default:
+ $value_start = $this->bytes_already_parsed;
+ $value_length = strcspn( $this->html, "> \t\f\r\n", $value_start );
+ $attribute_end = $value_start + $value_length;
+ $this->bytes_already_parsed = $attribute_end;
+ }
+ } else {
+ // Attribute without a value: its span is only the name.
+ $value_start = $this->bytes_already_parsed;
+ $value_length = 0;
+ $attribute_end = $attribute_start + $name_length;
+ }
+
+ // An attribute reaching the end of the document is not reported.
+ if ( $attribute_end >= strlen( $this->html ) ) {
+ return false;
+ }
+
+ // Attributes on tag closers are stepped over but never recorded.
+ if ( $this->is_closing_tag ) {
+ return true;
+ }
+
+ /*
+ * > There must never be two or more attributes on
+ * > the same start tag whose names are an ASCII
+ * > case-insensitive match for each other.
+ * - HTML 5 spec
+ *
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
+ */
+ $comparable_name = strtolower( $attribute_name );
+
+ // If an attribute is listed many times, only use the first declaration and ignore the rest.
+ if ( ! array_key_exists( $comparable_name, $this->attributes ) ) {
+ $this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token(
+ $attribute_name,
+ $value_start,
+ $value_length,
+ $attribute_start,
+ $attribute_end,
+ ! $has_value
+ );
+ }
+
+ return true;
+ }
+
+ /**
+ * Advances the internal cursor over any run of whitespace found at its current position.
+ *
+ * @since 6.2.0
+ *
+ * @return void
+ */
+ private function skip_whitespace() {
+ // Count the consecutive whitespace bytes starting at the cursor.
+ $whitespace_length = strspn( $this->html, " \t\f\r\n", $this->bytes_already_parsed );
+
+ $this->bytes_already_parsed += $whitespace_length;
+ }
+
+ /**
+ * Flushes pending updates for the tag being left and resets the per-tag state.
+ *
+ * @since 6.2.0
+ *
+ * @return void
+ */
+ private function after_tag() {
+ // Class changes are turned into attribute updates first so they get applied with the rest.
+ $this->class_name_updates_to_attributes_updates();
+ $this->apply_attributes_updates();
+
+ // Forget everything known about the tag which was just left.
+ $this->attributes = array();
+ $this->is_closing_tag = null;
+ $this->tag_ends_at = null;
+ $this->tag_name_length = null;
+ $this->tag_name_starts_at = null;
+ }
+
+ /**
+ * Converts class name updates into tag attributes updates
+ * (they are accumulated in different data formats for performance).
+ *
+ * The new `class` value is rebuilt from the existing value (an enqueued
+ * update if there is one, otherwise the value in the document), leaving
+ * out removed classes and appending added ones. The result is enqueued
+ * through `set_attribute()` or `remove_attribute()`, and the list of
+ * class name updates is emptied.
+ *
+ * @see $lexical_updates
+ * @see $classname_updates
+ *
+ * @since 6.2.0
+ *
+ * @return void
+ */
+ private function class_name_updates_to_attributes_updates() {
+ if ( count( $this->classname_updates ) === 0 ) {
+ return;
+ }
+
+ // An enqueued removal (`null`) or boolean value (`true`) leaves no class names to start from.
+ $existing_class = $this->get_enqueued_attribute_value( 'class' );
+ if ( null === $existing_class || true === $existing_class ) {
+ $existing_class = '';
+ }
+
+ // Without an enqueued update, start from the raw value found in the document.
+ if ( false === $existing_class && isset( $this->attributes['class'] ) ) {
+ $existing_class = substr(
+ $this->html,
+ $this->attributes['class']->value_starts_at,
+ $this->attributes['class']->value_length
+ );
+ }
+
+ // Neither an enqueued update nor an attribute in the document exists.
+ if ( false === $existing_class ) {
+ $existing_class = '';
+ }
+
+ /**
+ * Updated "class" attribute value.
+ *
+ * This is incrementally built while scanning through the existing class
+ * attribute, skipping removed classes on the way, and then appending
+ * added classes at the end. Only when finished processing will the
+ * value contain the final new value.
+ *
+ * @var string $class
+ */
+ $class = '';
+
+ /**
+ * Tracks the cursor position in the existing
+ * class attribute value while parsing.
+ *
+ * @var int $at
+ */
+ $at = 0;
+
+ /**
+ * Indicates if there's any need to modify the existing class attribute.
+ *
+ * If a call to `add_class()` and `remove_class()` wouldn't impact
+ * the `class` attribute value then there's no need to rebuild it.
+ * For example, when adding a class that's already present or
+ * removing one that isn't.
+ *
+ * This flag enables a performance optimization when none of the enqueued
+ * class updates would impact the `class` attribute; namely, that the
+ * processor can continue without modifying the input document, as if
+ * none of the `add_class()` or `remove_class()` calls had been made.
+ *
+ * This flag is set upon the first change that requires a string update.
+ *
+ * @var bool $modified
+ */
+ $modified = false;
+
+ // Remove unwanted classes by only copying the ones which are kept.
+ $existing_class_length = strlen( $existing_class );
+ while ( $at < $existing_class_length ) {
+ // Skip to the first non-whitespace character.
+ $ws_at = $at;
+ $ws_length = strspn( $existing_class, " \t\f\r\n", $ws_at );
+ $at += $ws_length;
+
+ // Capture the class name – it's everything until the next whitespace.
+ $name_length = strcspn( $existing_class, " \t\f\r\n", $at );
+ if ( 0 === $name_length ) {
+ // If no more class names are found then that's the end.
+ break;
+ }
+
+ $name = substr( $existing_class, $at, $name_length );
+ $at += $name_length;
+
+ // Determine whether this class is marked for removal.
+ $remove_class = (
+ isset( $this->classname_updates[ $name ] ) &&
+ self::REMOVE_CLASS === $this->classname_updates[ $name ]
+ );
+
+ // If a class has already been seen then skip it; it should not be added twice.
+ if ( ! $remove_class ) {
+ $this->classname_updates[ $name ] = self::SKIP_CLASS;
+ }
+
+ // A removed class is simply not copied; start processing the next one.
+ if ( $remove_class ) {
+ $modified = true;
+ continue;
+ }
+
+ /*
+ * Otherwise, append it to the new "class" attribute value.
+ *
+ * There are options for handling whitespace between classes.
+ * Preserving the existing whitespace produces fewer changes
+ * to the HTML content and should clarify the before/after
+ * content when debugging the modified output.
+ *
+ * This approach contrasts normalizing the inter-class
+ * whitespace to a single space, which might appear cleaner
+ * in the output HTML but produce a noisier change.
+ */
+ $class .= substr( $existing_class, $ws_at, $ws_length );
+ $class .= $name;
+ }
+
+ // Add new classes by appending those which haven't already been seen.
+ foreach ( $this->classname_updates as $name => $operation ) {
+ if ( self::ADD_CLASS === $operation ) {
+ $modified = true;
+
+ $class .= strlen( $class ) > 0 ? ' ' : '';
+ $class .= $name;
+ }
+ }
+
+ // The class updates are consumed whether or not they changed anything.
+ $this->classname_updates = array();
+ if ( ! $modified ) {
+ return;
+ }
+
+ // An empty result removes the attribute instead of leaving an empty `class`.
+ if ( strlen( $class ) > 0 ) {
+ $this->set_attribute( 'class', $class );
+ } else {
+ $this->remove_attribute( 'class' );
+ }
+ }
+
+ /**
+ * Applies attribute updates to HTML document.
+ *
+ * Copies the enqueued replacements, in document order, into the output
+ * buffer together with the unchanged text between them, shifts every
+ * bookmark by the size changes which precede it, and empties the list
+ * of enqueued updates.
+ *
+ * @since 6.2.0
+ *
+ * @return void
+ */
+ private function apply_attributes_updates() {
+ if ( ! count( $this->lexical_updates ) ) {
+ return;
+ }
+
+ /*
+ * Attribute updates can be enqueued in any order but updates
+ * to the document must occur in lexical order; that is, each
+ * replacement must be made before all others which follow it
+ * at later string indices in the input document.
+ *
+ * Sorting avoids making out-of-order replacements which
+ * can lead to mangled output, partially-duplicated
+ * attributes, and overwritten attributes.
+ */
+ usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) );
+
+ foreach ( $this->lexical_updates as $diff ) {
+ // Copy the untouched text before this replacement, then the replacement itself.
+ $this->output_buffer .= substr( $this->html, $this->bytes_already_copied, $diff->start - $this->bytes_already_copied );
+ $this->output_buffer .= $diff->text;
+ // Resume copying after the replaced span.
+ $this->bytes_already_copied = $diff->end;
+ }
+
+ /*
+ * Adjust bookmark locations to account for how the text
+ * replacements adjust offsets in the input document.
+ */
+ foreach ( $this->bookmarks as $bookmark ) {
+ /*
+ * Each lexical update which appears before the bookmark's endpoints
+ * might shift the offsets for those endpoints. Loop through each change
+ * and accumulate the total shift for each bookmark, then apply that
+ * shift after tallying the full delta.
+ */
+ $head_delta = 0;
+ $tail_delta = 0;
+
+ foreach ( $this->lexical_updates as $diff ) {
+ $update_head = $bookmark->start >= $diff->start;
+ $update_tail = $bookmark->end >= $diff->start;
+
+ // The updates are sorted by start, so none of the remaining ones can affect this bookmark.
+ if ( ! $update_head && ! $update_tail ) {
+ break;
+ }
+
+ // Net change in length caused by this replacement.
+ $delta = strlen( $diff->text ) - ( $diff->end - $diff->start );
+
+ if ( $update_head ) {
+ $head_delta += $delta;
+ }
+
+ if ( $update_tail ) {
+ $tail_delta += $delta;
+ }
+ }
+
+ $bookmark->start += $head_delta;
+ $bookmark->end += $tail_delta;
+ }
+
+ $this->lexical_updates = array();
+ }
+
+ /**
+ * Repositions the Tag Processor on the tag identified by a bookmark.
+ *
+ * The number of calls to seek() is capped in order to
+ * guard against accidental infinite loops.
+ *
+ * @since 6.2.0
+ *
+ * @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
+ * @return bool Whether the internal cursor was successfully moved to the bookmark's location.
+ */
+ public function seek( $bookmark_name ) {
+ $is_known_bookmark = array_key_exists( $bookmark_name, $this->bookmarks );
+ if ( ! $is_known_bookmark ) {
+ _doing_it_wrong(
+ __METHOD__,
+ __( 'Unknown bookmark name.' ),
+ '6.2.0'
+ );
+ return false;
+ }
+
+ // Every attempt with a known bookmark counts towards the limit, including the rejected ones.
+ ++$this->seek_count;
+ if ( $this->seek_count > self::MAX_SEEK_OPS ) {
+ _doing_it_wrong(
+ __METHOD__,
+ __( 'Too many calls to seek() - this can lead to performance issues.' ),
+ '6.2.0'
+ );
+ return false;
+ }
+
+ // Pending updates have to be flushed before the bookmark's offset is read.
+ $this->get_updated_html();
+
+ // Rewind both cursors to the start of the bookmark and rebuild the output up to that point.
+ $target = $this->bookmarks[ $bookmark_name ]->start;
+
+ $this->bytes_already_parsed = $target;
+ $this->bytes_already_copied = $target;
+ $this->output_buffer = substr( $this->html, 0, $target );
+
+ // Consume the bookmarked tag so that it becomes the currently matched tag.
+ return $this->next_tag();
+ }
+
+ /**
+ * Orders two WP_HTML_Text_Replacement objects by where they start in the document.
+ *
+ * Replacements starting at the same offset are ordered by their
+ * text and, should that be identical as well, by where they end.
+ *
+ * @since 6.2.0
+ *
+ * @param WP_HTML_Text_Replacement $a First attribute update.
+ * @param WP_HTML_Text_Replacement $b Second attribute update.
+ * @return int Comparison value for string order.
+ */
+ private static function sort_start_ascending( $a, $b ) {
+ $start_order = $a->start - $b->start;
+
+ if ( 0 === $start_order ) {
+ $text_order = 0;
+ if ( isset( $a->text, $b->text ) ) {
+ $text_order = strcmp( $a->text, $b->text );
+ }
+
+ /*
+ * Falling back to the end offsets should not happen in practice, as it
+ * means both replacements start at the same place with the same text.
+ */
+ return 0 !== $text_order ? $text_order : $a->end - $b->end;
+ }
+
+ return $start_order;
+ }
+
+ /**
+ * Return the enqueued value for a given attribute, if one exists.
+ *
+ * Enqueued updates can take different data types:
+ * - If an update is enqueued and is boolean, the return will be `true`
+ * - If an update is otherwise enqueued, the return will be the string value of that update.
+ * - If an attribute is enqueued to be removed, the return will be `null` to indicate that.
+ * - If no updates are enqueued, the return will be `false` to differentiate from "removed."
+ *
+ * @since 6.2.0
+ *
+ * @param string $comparable_name The attribute name in its comparable form.
+ * @return string|boolean|null Value of enqueued update if present, otherwise false.
+ */
+ private function get_enqueued_attribute_value( $comparable_name ) {
+ // Nothing enqueued for this attribute: report `false`, which is distinct from "removed".
+ if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) {
+ return false;
+ }
+
+ $replacement = $this->lexical_updates[ $comparable_name ]->text;
+
+ // A removal replaces the whole attribute span with nothing.
+ if ( '' === $replacement ) {
+ return null;
+ }
+
+ /*
+ * An update carrying a value always contains an `=`, with the value
+ * following it wrapped in double quotes, e.g. `type="text"`. The value
+ * therefore starts two bytes after the `=` (skipping the opening quote)
+ * and ends right before the final byte (the closing quote).
+ *
+ * The text of the update may differ from the comparable name in
+ * surrounding whitespace and letter casing, which is why the `=` is
+ * searched for instead of being located by the length of the name.
+ */
+ $equals_offset = strpos( $replacement, '=' );
+ if ( false !== $equals_offset ) {
+ return html_entity_decode( substr( $replacement, $equals_offset + 2, -1 ) );
+ }
+
+ // Without an `=` the update is a boolean attribute consisting of only its name.
+ return true;
+ }
+
+ /**
+ * Returns the value of a requested attribute from a matched tag opener if that attribute exists.
+ *
+ * Example:
+ * ```php
+ * $p = new WP_HTML_Tag_Processor( '<div enabled class="test" data-test-id="14">Test</div>' );
+ * $p->next_tag( [ 'class_name' => 'test' ] ) === true;
+ * $p->get_attribute( 'data-test-id' ) === '14';
+ * $p->get_attribute( 'enabled' ) === true;
+ * $p->get_attribute( 'aria-label' ) === null;
+ *
+ * $p->next_tag( [] ) === false;
+ * $p->get_attribute( 'class' ) === null;
+ * ```
+ *
+ * @since 6.2.0
+ *
+ * @param string $name Name of attribute whose value is requested.
+ * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
+ */
+ public function get_attribute( $name ) {
+ // Without a matched tag there are no attributes to report.
+ if ( null === $this->tag_name_starts_at ) {
+ return null;
+ }
+
+ // Attribute names are matched case-insensitively, so compare in lowercase.
+ $comparable = strtolower( $name );
+
+ /*
+ * For every attribute other than `class` it's possible to perform a quick check if
+ * there's an enqueued lexical update whose value takes priority over what's found in
+ * the input document.
+ *
+ * The `class` attribute is special though because of the exposed helpers `add_class`
+ * and `remove_class`. These form a builder for the `class` attribute, so an additional
+ * check for enqueued class changes is required in addition to the check for any enqueued
+ * attribute values. If any exist, those enqueued class changes must first be flushed out
+ * into an attribute value update.
+ *
+ * The comparable name is checked so that requests such as `CLASS` or `Class`
+ * also see the pending class changes.
+ */
+ if ( 'class' === $comparable ) {
+ $this->class_name_updates_to_attributes_updates();
+ }
+
+ // Return any enqueued attribute value updates if they exist.
+ $enqueued_value = $this->get_enqueued_attribute_value( $comparable );
+ if ( false !== $enqueued_value ) {
+ return $enqueued_value;
+ }
+
+ // No pending update and no such attribute in the document.
+ if ( ! isset( $this->attributes[ $comparable ] ) ) {
+ return null;
+ }
+
+ $attribute = $this->attributes[ $comparable ];
+
+ /*
+ * This flag distinguishes an attribute with no value
+ * from an attribute with an empty string value. For
+ * unquoted attributes this could look very similar.
+ * It refers to whether an `=` follows the name.
+ *
+ * e.g. <div boolean-attribute empty-attribute=></div>
+ * ¹ ²
+ * 1. Attribute `boolean-attribute` is `true`.
+ * 2. Attribute `empty-attribute` is `""`.
+ */
+ if ( true === $attribute->is_true ) {
+ return true;
+ }
+
+ // The stored span covers the raw value in the document; decode its character references.
+ $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length );
+
+ return html_entity_decode( $raw_value );
+ }
+
+ /**
+ * Gets lowercase names of all attributes matching a given prefix in the current tag.
+ *
+ * Note that matching is case-insensitive. This is in accordance with the spec:
+ *
+ * > There must never be two or more attributes on
+ * > the same start tag whose names are an ASCII
+ * > case-insensitive match for each other.
+ * - HTML 5 spec
+ *
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
+ *
+ * Example:
+ * ```php
+ * $p = new WP_HTML_Tag_Processor( '<div data-ENABLED class="test" DATA-test-id="14">Test</div>' );
+ * $p->next_tag( [ 'class_name' => 'test' ] ) === true;
+ * $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' );
+ *
+ * $p->next_tag( [] ) === false;
+ * $p->get_attribute_names_with_prefix( 'data-' ) === null;
+ * ```
+ *
+ * @since 6.2.0
+ *
+ * @param string $prefix Prefix of requested attribute names.
+ * @return array|null List of attribute names, or `null` when no tag opener is matched.
+ */
+ public function get_attribute_names_with_prefix( $prefix ) {
+ // Only a matched tag opener has attributes to list.
+ if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
+ return null;
+ }
+
+ // Attribute names are stored in lowercase, so the prefix is compared in lowercase too.
+ $comparable = strtolower( $prefix );
+
+ $matches = array();
+ foreach ( array_keys( $this->attributes ) as $attr_name ) {
+ if ( str_starts_with( $attr_name, $comparable ) ) {
+ $matches[] = $attr_name;
+ }
+ }
+ return $matches;
+ }
+
+ /**
+ * Returns the uppercase name of the matched tag.
+ *
+ * Example:
+ * ```php
+ * $p = new WP_HTML_Tag_Processor( '<DIV CLASS="test">Test</DIV>' );
+ * $p->next_tag( [] ) === true;
+ * $p->get_tag() === 'DIV';
+ *
+ * $p->next_tag( [] ) === false;
+ * $p->get_tag() === null;
+ * ```
+ *
+ * @since 6.2.0
+ *
+ * @return string|null Name of currently matched tag in input HTML, or `null` if none found.
+ */
+	public function get_tag() {
+		// Without a matched tag there is no name to report.
+		if ( null === $this->tag_name_starts_at ) {
+			return null;
+		}
+
+		// Slice the tag name out of the document and normalize it to uppercase.
+		return strtoupper(
+			substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length )
+		);
+	}
+
+	/**
+	 * Reports whether the tag token the processor currently points at is a closing tag.
+	 *
+	 * Example:
+	 * ```php
+	 *     $p = new WP_HTML_Tag_Processor( '<div></div>' );
+	 *     $p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] );
+	 *     $p->is_tag_closer() === false;
+	 *
+	 *     $p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] );
+	 *     $p->is_tag_closer() === true;
+	 * ```
+	 *
+	 * @since 6.2.0
+	 *
+	 * @return bool Whether the current tag is a tag closer.
+	 */
+	public function is_tag_closer() {
+		$is_closer = $this->is_closing_tag;
+
+		return $is_closer;
+	}
+
+ /**
+ * Updates or creates a new attribute on the currently matched tag with the passed value.
+ *
+ * For boolean attributes special handling is provided:
+ * - When `true` is passed as the value, then only the attribute name is added to the tag.
+ * - When `false` is passed, the attribute gets removed if it existed before.
+ *
+ * For string attributes, the value is escaped using the `esc_attr` function.
+ *
+ * @since 6.2.0
+ *
+ * @param string $name The attribute name to target.
+ * @param string|bool $value The new attribute value.
+ * @return bool Whether an attribute value was set.
+ */
+	public function set_attribute( $name, $value ) {
+		// Attributes can only be written while a tag opener is matched.
+		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
+			return false;
+		}
+
+		/*
+		 * WordPress rejects more characters than are strictly forbidden
+		 * in HTML5. This is to prevent additional security risks deeper
+		 * in the WordPress and plugin stack. Specifically the
+		 * less-than (<) greater-than (>) and ampersand (&) aren't allowed.
+		 *
+		 * The use of a PCRE match enables looking for specific Unicode
+		 * code points without writing a UTF-8 decoder. Whereas scanning
+		 * for one-byte characters is trivial (with `strcspn`), scanning
+		 * for the longer byte sequences would be more complicated. Given
+		 * that this shouldn't be in the hot path for execution, it's a
+		 * reasonable compromise in efficiency without introducing a
+		 * noticeable impact on the overall system.
+		 *
+		 * @see https://html.spec.whatwg.org/#attributes-2
+		 *
+		 * @TODO as the only regex pattern maybe we should take it out? are
+		 *       Unicode patterns available broadly in Core?
+		 */
+		if ( preg_match(
+			'~[' .
+				// Syntax-like characters.
+				'"\'>&</ =' .
+				// Control characters.
+				'\x{00}-\x{1F}' .
+				// HTML noncharacters.
+				'\x{FDD0}-\x{FDEF}' .
+				'\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}' .
+				'\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}' .
+				'\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}' .
+				'\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}' .
+				'\x{10FFFE}\x{10FFFF}' .
+			']~Ssu',
+			$name
+		) ) {
+			_doing_it_wrong(
+				__METHOD__,
+				__( 'Invalid attribute name.' ),
+				'6.2.0'
+			);
+
+			return false;
+		}
+
+		/*
+		 * > The values "true" and "false" are not allowed on boolean attributes.
+		 * > To represent a false value, the attribute has to be omitted altogether.
+		 *     - HTML5 spec, https://html.spec.whatwg.org/#boolean-attributes
+		 */
+		if ( false === $value ) {
+			return $this->remove_attribute( $name );
+		}
+
+		if ( true === $value ) {
+			// A boolean attribute is written as the bare name, without a value.
+			$updated_attribute = $name;
+		} else {
+			$escaped_new_value = esc_attr( $value );
+			$updated_attribute = "{$name}=\"{$escaped_new_value}\"";
+		}
+
+		/*
+		 * > There must never be two or more attributes on
+		 * > the same start tag whose names are an ASCII
+		 * > case-insensitive match for each other.
+		 *     - HTML 5 spec
+		 *
+		 * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
+		 */
+		$comparable_name = strtolower( $name );
+
+		if ( isset( $this->attributes[ $comparable_name ] ) ) {
+			/*
+			 * Update an existing attribute.
+			 *
+			 * Example – set attribute id to "new" in <div id="initial_id" />:
+			 *
+			 *     <div id="initial_id"/>
+			 *          ^-------------^
+			 *          start         end
+			 *     replacement: `id="new"`
+			 *
+			 *     Result: <div id="new"/>
+			 *
+			 * The update is keyed by the lowercased name, exactly like the
+			 * "create" branch below and like `remove_attribute()`. Keying by
+			 * the name as passed would let `set_attribute( 'ID', … )` and
+			 * `set_attribute( 'id', … )` enqueue two replacements for the
+			 * same span, and would hide this update from `remove_attribute()`.
+			 */
+			$existing_attribute                        = $this->attributes[ $comparable_name ];
+			$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
+				$existing_attribute->start,
+				$existing_attribute->end,
+				$updated_attribute
+			);
+		} else {
+			/*
+			 * Create a new attribute at the tag's name end.
+			 *
+			 * Example – add attribute id="new" to <div />:
+			 *
+			 *     <div/>
+			 *         ^
+			 *         start and end
+			 *     replacement: ` id="new"`
+			 *
+			 *     Result: <div id="new"/>
+			 */
+			$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
+				$this->tag_name_starts_at + $this->tag_name_length,
+				$this->tag_name_starts_at + $this->tag_name_length,
+				' ' . $updated_attribute
+			);
+		}
+
+		/*
+		 * Any calls to update the `class` attribute directly should wipe out any
+		 * enqueued class changes from `add_class` and `remove_class`.
+		 */
+		if ( 'class' === $comparable_name && ! empty( $this->classname_updates ) ) {
+			$this->classname_updates = array();
+		}
+
+		return true;
+	}
+
+ /**
+ * Remove an attribute from the currently-matched tag.
+ *
+ * @since 6.2.0
+ *
+ * @param string $name The attribute name to remove.
+ * @return bool Whether an attribute was removed.
+ */
+	public function remove_attribute( $name ) {
+		// Like `set_attribute()`, only act while a tag opener is matched.
+		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
+			return false;
+		}
+
+		/*
+		 * > There must never be two or more attributes on
+		 * > the same start tag whose names are an ASCII
+		 * > case-insensitive match for each other.
+		 *     - HTML 5 spec
+		 *
+		 * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
+		 */
+		$name = strtolower( $name );
+
+		/*
+		 * Any calls to update the `class` attribute directly should wipe out any
+		 * enqueued class changes from `add_class` and `remove_class`.
+		 */
+		if ( 'class' === $name && count( $this->classname_updates ) !== 0 ) {
+			$this->classname_updates = array();
+		}
+
+		/*
+		 * If updating an attribute that didn't exist in the input
+		 * document, then remove the enqueued update and move on.
+		 *
+		 * For example, this might occur when calling `remove_attribute()`
+		 * after calling `set_attribute()` for the same attribute
+		 * and when that attribute wasn't originally present.
+		 *
+		 * Note that `false` is returned in this case because nothing
+		 * is removed from the input document itself.
+		 */
+		if ( ! isset( $this->attributes[ $name ] ) ) {
+			if ( isset( $this->lexical_updates[ $name ] ) ) {
+				unset( $this->lexical_updates[ $name ] );
+			}
+			return false;
+		}
+
+		/*
+		 * Removes an existing tag attribute.
+		 *
+		 * Example – remove the attribute id from <div id="initial_id"/>:
+		 *
+		 *     <div id="initial_id"/>
+		 *          ^-------------^
+		 *          start         end
+		 *     replacement: ``
+		 *
+		 *     Result: <div />
+		 */
+		$this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement(
+			$this->attributes[ $name ]->start,
+			$this->attributes[ $name ]->end,
+			''
+		);
+
+		return true;
+	}
+
+ /**
+ * Adds a new class name to the currently matched tag.
+ *
+ * @since 6.2.0
+ *
+ * @param string $class_name The class name to add.
+ * @return bool Whether the class was set to be added.
+ */
+	public function add_class( $class_name ) {
+		/*
+		 * A class can only be enqueued while a tag opener is matched.
+		 * Report `false` whenever nothing is enqueued so that the return
+		 * value reflects whether the class was actually set to be added.
+		 */
+		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
+			return false;
+		}
+
+		// The change is only recorded here; it is not applied to the document in this method.
+		$this->classname_updates[ $class_name ] = self::ADD_CLASS;
+
+		return true;
+	}
+
+ /**
+ * Removes a class name from the currently matched tag.
+ *
+ * @since 6.2.0
+ *
+ * @param string $class_name The class name to remove.
+ * @return bool Whether the class was set to be removed.
+ */
+	public function remove_class( $class_name ) {
+		/*
+		 * A class can only be enqueued for removal while a tag opener is
+		 * matched. Report `false` whenever nothing is enqueued so that the
+		 * return value reflects whether the class was actually set to be removed.
+		 */
+		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
+			return false;
+		}
+
+		// The change is only recorded here; it is not applied to the document in this method.
+		$this->classname_updates[ $class_name ] = self::REMOVE_CLASS;
+
+		return true;
+	}
+
+	/**
+	 * Converts the HTML Tag Processor to a string.
+	 *
+	 * Casting the processor to a string yields the same value as
+	 * calling `get_updated_html()`.
+	 *
+	 * @since 6.2.0
+	 * @see get_updated_html
+	 *
+	 * @return string The processed HTML.
+	 */
+	public function __toString() {
+		$updated_html = $this->get_updated_html();
+
+		return $updated_html;
+	}
+
+ /**
+ * Returns the HTML document with all enqueued updates applied.
+ *
+ * When class or attribute updates are pending, they are flushed, the
+ * internal document is replaced with the updated HTML, and the current
+ * tag is re-parsed through `next_tag()` so that later calls operate on
+ * the updated document. When nothing is pending, the document is
+ * returned without applying anything.
+ *
+ * @since 6.2.0
+ *
+ * @return string The processed HTML.
+ */
+ public function get_updated_html() {
+ // Nothing is pending when both the class-name queue and the lexical-update queue are empty.
+ $requires_no_updating = 0 === count( $this->classname_updates ) && 0 === count( $this->lexical_updates );
+
+ /*
+ * When there is nothing more to update and nothing has already been
+ * updated, return the original document and avoid a string copy.
+ */
+ if ( $requires_no_updating && 0 === $this->bytes_already_copied ) {
+ return $this->html;
+ }
+
+ /*
+ * If there are no updates left to apply, but some have already
+ * been applied, then finish by copying the rest of the input
+ * to the end of the updated document and return.
+ */
+ if ( $requires_no_updating && $this->bytes_already_copied > 0 ) {
+ return $this->output_buffer . substr( $this->html, $this->bytes_already_copied );
+ }
+
+ // Apply the updates, rewind to before the current tag, and reparse the attributes.
+ // This snapshot must be taken before the updates below modify `$this->output_buffer`.
+ $content_up_to_opened_tag_name = $this->output_buffer . substr(
+ $this->html,
+ $this->bytes_already_copied,
+ $this->tag_name_starts_at + $this->tag_name_length - $this->bytes_already_copied
+ );
+
+ /*
+ * 1. Apply the edits by flushing them to the output buffer and updating the copied byte count.
+ *
+ * Note: `apply_attributes_updates()` modifies `$this->output_buffer`.
+ */
+ $this->class_name_updates_to_attributes_updates();
+ $this->apply_attributes_updates();
+
+ /*
+ * 2. Replace the original HTML with the now-updated HTML so that it's possible to
+ * seek to a previous location and have a consistent view of the updated document.
+ */
+ $this->html = $this->output_buffer . substr( $this->html, $this->bytes_already_copied );
+ $this->output_buffer = $content_up_to_opened_tag_name;
+ $this->bytes_already_copied = strlen( $this->output_buffer );
+
+ /*
+ * 3. Point this tag processor at the original tag opener and consume it
+ *
+ * At this point the internal cursor points to the end of the tag name.
+ * Rewind before the tag name starts so that it's as if the cursor didn't
+ * move; a call to `next_tag()` will reparse the recently-updated attributes
+ * and additional calls to modify the attributes will apply at this same
+ * location.
+ *
+ * <p>Previous HTML<em>More HTML</em></p>
+ * ^ | back up by the length of the tag name plus the opening <
+ * \<-/ back up by strlen("em") + 1 ==> 3
+ */
+ $this->bytes_already_parsed = strlen( $content_up_to_opened_tag_name ) - $this->tag_name_length - 1;
+ $this->next_tag();
+
+ return $this->html;
+ }
+
+	/**
+	 * Parses tag query input into internal search criteria.
+	 *
+	 * A query identical to the previously parsed one is not parsed again.
+	 * Entries of the wrong type are ignored and leave the default in place.
+	 *
+	 * @since 6.2.0
+	 *
+	 * @param array|string|null $query {
+	 *     Optional. Which tag name to find, having which class, etc. Default is to find any tag.
+	 *
+	 *     @type string|null $tag_name     Which tag to find, or `null` for "any tag."
+	 *     @type int|null    $match_offset Find the Nth tag matching all search criteria.
+	 *                                     1 for "first" tag, 3 for "third," etc.
+	 *                                     Defaults to first tag. Values below 1 are ignored.
+	 *     @type string|null $class_name   Tag must contain this class name to match.
+	 *     @type string      $tag_closers  "visit" or "skip": whether to stop on tag closers, e.g. </div>.
+	 *                                     Any value other than "visit" skips tag closers.
+	 * }
+	 * @return void
+	 */
+	private function parse_query( $query ) {
+		// The same non-null query as last time leaves the parsed criteria untouched.
+		if ( null !== $query && $query === $this->last_query ) {
+			return;
+		}
+
+		// Reset every criterion to its default before reading the new query.
+		$this->last_query          = $query;
+		$this->sought_tag_name     = null;
+		$this->sought_class_name   = null;
+		$this->sought_match_offset = 1;
+		$this->stop_on_tag_closers = false;
+
+		// A single string value means "find the tag of this name".
+		if ( is_string( $query ) ) {
+			$this->sought_tag_name = $query;
+			return;
+		}
+
+		// An empty query parameter applies no restrictions on the search.
+		if ( null === $query ) {
+			return;
+		}
+
+		// If not using the string interface, an associative array is required.
+		if ( ! is_array( $query ) ) {
+			_doing_it_wrong(
+				__METHOD__,
+				__( 'The query argument must be an array or a tag name.' ),
+				'6.2.0'
+			);
+			return;
+		}
+
+		if ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) ) {
+			$this->sought_tag_name = $query['tag_name'];
+		}
+
+		if ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) {
+			$this->sought_class_name = $query['class_name'];
+		}
+
+		// The offset is 1-based: only positive integers replace the default of 1.
+		if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) {
+			$this->sought_match_offset = $query['match_offset'];
+		}
+
+		if ( isset( $query['tag_closers'] ) ) {
+			$this->stop_on_tag_closers = 'visit' === $query['tag_closers'];
+		}
+	}
+
+
+ /**
+ * Checks whether a given tag and its attributes match the search criteria.
+ *
+ * @since 6.2.0
+ *
+ * @return boolean Whether the given tag and its attribute match the search criteria.
+ */
+	private function matches() {
+		// Tag closers only match when the query asked to visit them.
+		if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) {
+			return false;
+		}
+
+		// Does the tag name match the requested tag name in a case-insensitive manner?
+		if ( null !== $this->sought_tag_name ) {
+			/*
+			 * String (byte) length lookup is fast. If they aren't the
+			 * same length then they can't be the same string values.
+			 */
+			if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
+				return false;
+			}
+
+			/*
+			 * Check each character to determine if they are the same.
+			 * Defer calls to `strtoupper()` to avoid them when possible.
+			 * Calling `strcasecmp()` here tested slower than comparing each
+			 * character, so unless benchmarks show otherwise, it should
+			 * not be used.
+			 *
+			 * It's expected that most of the time that this runs, a
+			 * lower-case tag name will be supplied and the input will
+			 * contain lower-case tag names, thus normally bypassing
+			 * the case comparison code.
+			 *
+			 * When the bytes differ, both sides are upper-cased before
+			 * comparing. Upper-casing only the document's character would
+			 * make a lower-case query such as "div" miss a "<DIV>" tag.
+			 */
+			for ( $i = 0; $i < $this->tag_name_length; $i++ ) {
+				$html_char = $this->html[ $this->tag_name_starts_at + $i ];
+				$tag_char  = $this->sought_tag_name[ $i ];
+
+				if ( $html_char !== $tag_char && strtoupper( $html_char ) !== strtoupper( $tag_char ) ) {
+					return false;
+				}
+			}
+		}
+
+		$needs_class_name = null !== $this->sought_class_name;
+
+		if ( $needs_class_name && ! isset( $this->attributes['class'] ) ) {
+			return false;
+		}
+
+		/*
+		 * Match byte-for-byte (case-sensitive and encoding-form-sensitive) on the class name.
+		 *
+		 * This will overlook certain classes that exist in other lexical variations
+		 * than was supplied to the search query, but requires more complicated searching.
+		 */
+		if ( $needs_class_name ) {
+			$class_name_length = strlen( $this->sought_class_name );
+
+			/*
+			 * An empty string can never be a class token. Bail out early:
+			 * on PHP 8 `strpos()` accepts an empty needle and reports a match
+			 * at the search offset, so the scan below would never advance.
+			 */
+			if ( 0 === $class_name_length ) {
+				return false;
+			}
+
+			$class_start = $this->attributes['class']->value_starts_at;
+			$class_end   = $class_start + $this->attributes['class']->value_length;
+			$class_at    = $class_start;
+
+			/*
+			 * Ensure that boundaries surround the class name to avoid matching on
+			 * substrings of a longer name. For example, the sequence "not-odd"
+			 * should not match for the class "odd" even though "odd" is found
+			 * within the class attribute text.
+			 *
+			 * See https://html.spec.whatwg.org/#attributes-3
+			 * See https://html.spec.whatwg.org/#space-separated-tokens
+			 */
+			while (
+				// phpcs:ignore WordPress.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
+				false !== ( $class_at = strpos( $this->html, $this->sought_class_name, $class_at ) ) &&
+				$class_at < $class_end
+			) {
+				/*
+				 * Verify this class starts at a boundary.
+				 */
+				if ( $class_at > $class_start ) {
+					$character = $this->html[ $class_at - 1 ];
+
+					if ( ' ' !== $character && "\t" !== $character && "\f" !== $character && "\r" !== $character && "\n" !== $character ) {
+						$class_at += $class_name_length;
+						continue;
+					}
+				}
+
+				/*
+				 * Verify this class ends at a boundary as well.
+				 */
+				if ( $class_at + $class_name_length < $class_end ) {
+					$character = $this->html[ $class_at + $class_name_length ];
+
+					if ( ' ' !== $character && "\t" !== $character && "\f" !== $character && "\r" !== $character && "\n" !== $character ) {
+						$class_at += $class_name_length;
+						continue;
+					}
+				}
+
+				return true;
+			}
+
+			return false;
+		}
+
+		return true;
+	}
+}
</ins></span></pre></div>
<a id="trunksrcwpincludeshtmlapiclasswphtmltextreplacementphp"></a>
<div class="addfile"><h4 style="background-color: #eee; color: inherit; margin: 1em 0; padding: 1.3em; font-size: 115%">Added: trunk/src/wp-includes/html-api/class-wp-html-text-replacement.php</h4>
<pre class="diff"><span>
<span class="info" style="display: block; padding: 0 10px; color: #888">--- trunk/src/wp-includes/html-api/class-wp-html-text-replacement.php (rev 0)
+++ trunk/src/wp-includes/html-api/class-wp-html-text-replacement.php 2023-02-03 01:03:59 UTC (rev 55203)
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -0,0 +1,59 @@
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+<?php
+/**
+ * HTML Tag Processor: Text replacement class.
+ *
+ * @package WordPress
+ * @subpackage HTML-API
+ * @since 6.2.0
+ */
+
+/**
+ * Describes one pending edit to a document: the byte span to replace and the text to put in its place.
+ *
+ * Keeping edits in this structure is intended to drastically improve performance.
+ *
+ * This class is for internal usage of the WP_HTML_Tag_Processor class.
+ *
+ * @access private
+ * @since 6.2.0
+ *
+ * @see WP_HTML_Tag_Processor
+ */
+class WP_HTML_Text_Replacement {
+	/**
+	 * Byte offset into the document at which the replaced span begins.
+	 *
+	 * @since 6.2.0
+	 * @var int
+	 */
+	public $start;
+
+	/**
+	 * Byte offset into the document at which the replaced span ends.
+	 *
+	 * @since 6.2.0
+	 * @var int
+	 */
+	public $end;
+
+	/**
+	 * Text inserted into the document in place of the content between start and end.
+	 *
+	 * @since 6.2.0
+	 * @var string
+	 */
+	public $text;
+
+	/**
+	 * Constructor.
+	 *
+	 * Stores the three values on the instance as given.
+	 *
+	 * @since 6.2.0
+	 *
+	 * @param int    $start Byte offset into document where replacement span begins.
+	 * @param int    $end   Byte offset into document where replacement span ends.
+	 * @param string $text  Span of text to insert in document to replace existing content from start to end.
+	 */
+	public function __construct( $start, $end, $text ) {
+		$this->text  = $text;
+		$this->end   = $end;
+		$this->start = $start;
+	}
+}
</ins></span></pre></div>
<a id="trunksrcwpsettingsphp"></a>
<div class="modfile"><h4 style="background-color: #eee; color: inherit; margin: 1em 0; padding: 1.3em; font-size: 115%">Modified: trunk/src/wp-settings.php</h4>
<pre class="diff"><span>
<span class="info" style="display: block; padding: 0 10px; color: #888">--- trunk/src/wp-settings.php 2023-02-03 00:45:24 UTC (rev 55202)
+++ trunk/src/wp-settings.php 2023-02-03 01:03:59 UTC (rev 55203)
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -234,6 +234,10 @@
</span><span class="cx" style="display: block; padding: 0 10px"> require ABSPATH . WPINC . '/class-wp-oembed-controller.php';
</span><span class="cx" style="display: block; padding: 0 10px"> require ABSPATH . WPINC . '/media.php';
</span><span class="cx" style="display: block; padding: 0 10px"> require ABSPATH . WPINC . '/http.php';
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+require ABSPATH . WPINC . '/html-api/class-wp-html-attribute-token.php';
+require ABSPATH . WPINC . '/html-api/class-wp-html-span.php';
+require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php';
+require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php';
</ins><span class="cx" style="display: block; padding: 0 10px"> require ABSPATH . WPINC . '/class-wp-http.php';
</span><span class="cx" style="display: block; padding: 0 10px"> require ABSPATH . WPINC . '/class-wp-http-streams.php';
</span><span class="cx" style="display: block; padding: 0 10px"> require ABSPATH . WPINC . '/class-wp-http-curl.php';
</span></span></pre></div>
<a id="trunktestsphpunittestshtmlwpHtmlTagProcessorbookmarkphp"></a>
<div class="addfile"><h4 style="background-color: #eee; color: inherit; margin: 1em 0; padding: 1.3em; font-size: 115%">Added: trunk/tests/phpunit/tests/html/wpHtmlTagProcessor-bookmark.php</h4>
<pre class="diff"><span>
<span class="info" style="display: block; padding: 0 10px; color: #888">--- trunk/tests/phpunit/tests/html/wpHtmlTagProcessor-bookmark.php (rev 0)
+++ trunk/tests/phpunit/tests/html/wpHtmlTagProcessor-bookmark.php 2023-02-03 01:03:59 UTC (rev 55203)
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -0,0 +1,381 @@
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+<?php
+/**
+ * Unit tests covering WP_HTML_Tag_Processor bookmark functionality.
+ *
+ * @package WordPress
+ * @subpackage HTML-API
+ */
+
+/**
+ * @group html-api
+ *
+ * @coversDefaultClass WP_HTML_Tag_Processor
+ */
+class Tests_HTML_wpHtmlTagProcessor_Bookmark extends WP_UnitTestCase {
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_bookmark
+ */
+	public function test_set_bookmark() {
+		$processor = new WP_HTML_Tag_Processor( '<ul><li>One</li><li>Two</li><li>Three</li></ul>' );
+
+		// Bookmark the first list item.
+		$processor->next_tag( 'li' );
+		$first_was_set = $processor->set_bookmark( 'first li' );
+		$this->assertTrue( $first_was_set, 'Could not allocate a "first li" bookmark' );
+
+		// Bookmark the second list item under a new name.
+		$processor->next_tag( 'li' );
+		$second_was_set = $processor->set_bookmark( 'second li' );
+		$this->assertTrue( $second_was_set, 'Could not allocate a "second li" bookmark' );
+
+		// Re-using an existing name moves that bookmark to the current tag.
+		$first_was_moved = $processor->set_bookmark( 'first li' );
+		$this->assertTrue( $first_was_moved, 'Could not move the "first li" bookmark' );
+	}
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::release_bookmark
+ */
+	public function test_release_bookmark() {
+		$processor = new WP_HTML_Tag_Processor( '<ul><li>One</li><li>Two</li><li>Three</li></ul>' );
+		$processor->next_tag( 'li' );
+
+		// Releasing a name that was never set must fail.
+		$released_unknown = $processor->release_bookmark( 'first li' );
+		$this->assertFalse( $released_unknown, 'Released a non-existing bookmark' );
+
+		// Once the bookmark exists, releasing it must succeed.
+		$processor->set_bookmark( 'first li' );
+		$released_known = $processor->release_bookmark( 'first li' );
+		$this->assertTrue( $released_known, 'Could not release a bookmark' );
+	}
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::seek
+ */
+	public function test_seek() {
+		$processor = new WP_HTML_Tag_Processor( '<ul><li>One</li><li>Two</li><li>Three</li></ul>' );
+
+		// Remember where the first list item is.
+		$processor->next_tag( 'li' );
+		$processor->set_bookmark( 'first li' );
+
+		// Modify the second list item.
+		$processor->next_tag( 'li' );
+		$processor->set_attribute( 'foo-2', 'bar-2' );
+
+		// Jump back and modify the first list item.
+		$processor->seek( 'first li' );
+		$processor->set_attribute( 'foo-1', 'bar-1' );
+
+		$expected_html = '<ul><li foo-1="bar-1">One</li><li foo-2="bar-2">Two</li><li>Three</li></ul>';
+
+		$this->assertSame(
+			$expected_html,
+			$processor->get_updated_html(),
+			'Did not seek to the intended bookmark locations'
+		);
+	}
+
+ /**
+ * WP_HTML_Tag_Processor used to test for the diffs affecting
+ * the adjusted bookmark position while simultaneously adjusting
+ * the bookmark in question. As a result, updating the bookmarks
+ * of a next tag while removing two subsequent attributes in
+ * a previous tag unfolded like this:
+ *
+ * 1. Check if the first removed attribute is before the bookmark:
+ *
+ * <button twenty_one_characters 7_chars></button><button></button>
+ * ^-------------------^ ^
+ * diff applied here the bookmark is here
+ *
+ * (Yes it is)
+ *
+ * 2. Move the bookmark to the left by the attribute length:
+ *
+ * <button twenty_one_characters 7_chars></button><button></button>
+ * ^
+ * the bookmark is here
+ *
+ * 3. Check if the second removed attribute is before the bookmark:
+ *
+ * <button twenty_one_characters 7_chars></button><button></button>
+ * ^ ^-----^
+ * bookmark diff
+ *
+ * This time, it isn't!
+ *
+ * The fix in the WP_HTML_Tag_Processor involves doing all the checks
+ * before moving the bookmark. This test is here to guard us from
+ * the erroneous behavior accidentally returning one day.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::seek
+ * @covers WP_HTML_Tag_Processor::set_bookmark
+ */
+ public function test_removing_long_attributes_doesnt_break_seek() {
+ // Two buttons: the first carries the two attributes that get removed below.
+ $input = <<<HTML
+ <button twenty_one_characters 7_chars></button><button></button>
+HTML;
+ $p = new WP_HTML_Tag_Processor( $input );
+ // Bookmark both buttons before making any change.
+ $p->next_tag( 'button' );
+ $p->set_bookmark( 'first' );
+ $p->next_tag( 'button' );
+ $p->set_bookmark( 'second' );
+
+ $this->assertTrue(
+ $p->seek( 'first' ),
+ 'Seek() to the first button has failed'
+ );
+ // Remove both attributes from the first button, which precedes the "second" bookmark.
+ $p->remove_attribute( 'twenty_one_characters' );
+ $p->remove_attribute( '7_chars' );
+
+ // The "second" bookmark must still be reachable after both removals.
+ $this->assertTrue(
+ $p->seek( 'second' ),
+ 'Seek() to the second button has failed'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::seek
+ * @covers WP_HTML_Tag_Processor::set_bookmark
+ */
+ public function test_bookmarks_complex_use_case() {
+ // Fixture: nested divs followed by four buttons that each get a bookmark below.
+ $input = <<<HTML
+<div selected class="merge-message" checked>
+ <div class="select-menu d-inline-block">
+ <div checked class="BtnGroup MixedCaseHTML position-relative" />
+ <div checked class="BtnGroup MixedCaseHTML position-relative">
+ <button type="button" class="merge-box-button btn-group-merge rounded-left-2 btn BtnGroup-item js-details-target hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Merge pull request
+ </button>
+
+ <button type="button" class="merge-box-button btn-group-squash rounded-left-2 btn BtnGroup-item js-details-target hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Squash and merge
+ </button>
+
+ <button type="button" class="merge-box-button btn-group-rebase rounded-left-2 btn BtnGroup-item js-details-target hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Rebase and merge
+ </button>
+
+ <button aria-label="Select merge method" disabled="disabled" type="button" data-view-component="true" class="select-menu-button btn BtnGroup-item"></button>
+ </div>
+ </div>
+</div>
+HTML;
+ $expected_output = <<<HTML
+<div selected class="merge-message" checked>
+ <div class="select-menu d-inline-block">
+ <div class="BtnGroup MixedCaseHTML position-relative" />
+ <div checked class="BtnGroup MixedCaseHTML position-relative">
+ <button type="submit" class="merge-box-button btn-group-merge rounded-left-2 btn BtnGroup-item js-details-target hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Merge pull request
+ </button>
+
+ <button class="hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Squash and merge
+ </button>
+
+ <button id="rebase-and-merge" disabled="">
+ Rebase and merge
+ </button>
+
+ <button id="last-button" ></button>
+ </div>
+ </div>
+</div>
+HTML;
+ $p = new WP_HTML_Tag_Processor( $input );
+ // Walk forward once, bookmarking the third div and each of the four buttons.
+ $p->next_tag( 'div' );
+ $p->next_tag( 'div' );
+ $p->next_tag( 'div' );
+ $p->set_bookmark( 'first div' );
+ $p->next_tag( 'button' );
+ $p->set_bookmark( 'first button' );
+ $p->next_tag( 'button' );
+ $p->set_bookmark( 'second button' );
+ $p->next_tag( 'button' );
+ $p->set_bookmark( 'third button' );
+ $p->next_tag( 'button' );
+ $p->set_bookmark( 'fourth button' );
+
+ // From here on the bookmarks are visited out of document order.
+ $p->seek( 'first button' );
+ $p->set_attribute( 'type', 'submit' );
+
+ $this->assertTrue(
+ $p->seek( 'third button' ),
+ 'Seek() to the third button failed'
+ );
+ $p->remove_attribute( 'class' );
+ $p->remove_attribute( 'type' );
+ $p->remove_attribute( 'aria-expanded' );
+ $p->set_attribute( 'id', 'rebase-and-merge' );
+ $p->remove_attribute( 'data-details-container' );
+
+ $this->assertTrue(
+ $p->seek( 'first div' ),
+ 'Seek() to the first div failed'
+ );
+ // Passing `false` removes the boolean attribute.
+ $p->set_attribute( 'checked', false );
+
+ $this->assertTrue(
+ $p->seek( 'fourth button' ),
+ 'Seek() to the fourth button failed'
+ );
+ $p->set_attribute( 'id', 'last-button' );
+ $p->remove_attribute( 'class' );
+ $p->remove_attribute( 'type' );
+ // The fourth button has no `checked` attribute in the fixture.
+ $p->remove_attribute( 'checked' );
+ $p->remove_attribute( 'aria-label' );
+ $p->remove_attribute( 'disabled' );
+ $p->remove_attribute( 'data-view-component' );
+
+ $this->assertTrue(
+ $p->seek( 'second button' ),
+ 'Seek() to the second button failed'
+ );
+ $p->remove_attribute( 'type' );
+ $p->set_attribute( 'class', 'hx_create-pr-button' );
+
+ $this->assertSame(
+ $expected_output,
+ $p->get_updated_html(),
+ 'Performing several attribute updates on different tags does not produce the expected HTML snippet'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::seek
+ */
+	public function test_updates_bookmark_for_additions_after_both_sides() {
+		$processor = new WP_HTML_Tag_Processor( '<div>First</div><div>Second</div>' );
+
+		// Bookmark the first div, then enqueue a change on the second div.
+		$processor->next_tag();
+		$processor->set_bookmark( 'first' );
+		$processor->next_tag();
+		$processor->add_class( 'second' );
+
+		// Return to the first div and change it as well.
+		$processor->seek( 'first' );
+		$processor->add_class( 'first' );
+
+		$expected_html = '<div class="first">First</div><div class="second">Second</div>';
+
+		$this->assertSame(
+			$expected_html,
+			$processor->get_updated_html(),
+			'The bookmark was updated incorrectly in response to HTML markup updates'
+		);
+	}
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::seek
+ */
+	public function test_updates_bookmark_for_additions_before_both_sides() {
+		$processor = new WP_HTML_Tag_Processor( '<div>First</div><div>Second</div>' );
+
+		// Bookmark both divs before changing anything.
+		$processor->next_tag();
+		$processor->set_bookmark( 'first' );
+		$processor->next_tag();
+		$processor->set_bookmark( 'second' );
+
+		// Change the earlier div first...
+		$processor->seek( 'first' );
+		$processor->add_class( 'first' );
+
+		// ...then the later one, whose bookmark sits after the first change.
+		$processor->seek( 'second' );
+		$processor->add_class( 'second' );
+
+		$expected_html = '<div class="first">First</div><div class="second">Second</div>';
+
+		$this->assertSame(
+			$expected_html,
+			$processor->get_updated_html(),
+			'The bookmark was updated incorrectly in response to HTML markup updates'
+		);
+	}
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::seek
+ */
+	public function test_updates_bookmark_for_deletions_after_both_sides() {
+		$p = new WP_HTML_Tag_Processor( '<div>First</div><div disabled>Second</div>' );
+
+		// Bookmark the first div, then enqueue a removal on the second div.
+		$p->next_tag();
+		$p->set_bookmark( 'first' );
+		$p->next_tag();
+		$p->remove_attribute( 'disabled' );
+
+		// Return to the first div and add a boolean attribute there.
+		$p->seek( 'first' );
+		$p->set_attribute( 'untouched', true );
+
+		$this->assertSame(
+			/*
+			 * It shouldn't be necessary to assert the extra space after the tag
+			 * following the attribute removal, but doing so makes the test easier
+			 * to see than it would be if parsing the output HTML for proper
+			 * validation. If the Tag Processor changes so that this space no longer
+			 * appears then this test should be updated to reflect that. The space
+			 * is not required.
+			 */
+			'<div untouched>First</div><div >Second</div>',
+			$p->get_updated_html(),
+			// Message now reads as a sentence and matches the sibling bookmark tests.
+			'The bookmark was updated incorrectly in response to HTML markup updates'
+		);
+	}
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::seek
+ */
+	public function test_updates_bookmark_for_deletions_before_both_sides() {
+		$processor = new WP_HTML_Tag_Processor( '<div disabled>First</div><div>Second</div>' );
+
+		// Bookmark both divs before changing anything.
+		$processor->next_tag();
+		$processor->set_bookmark( 'first' );
+		$processor->next_tag();
+		$processor->set_bookmark( 'second' );
+
+		// Remove an attribute from the earlier div...
+		$processor->seek( 'first' );
+		$processor->remove_attribute( 'disabled' );
+
+		// ...then add one to the later div, whose bookmark sits after the removal.
+		$processor->seek( 'second' );
+		$processor->set_attribute( 'safe', true );
+
+		/*
+		 * It shouldn't be necessary to assert the extra space after the tag
+		 * following the attribute removal, but doing so makes the test easier
+		 * to see than it would be if parsing the output HTML for proper
+		 * validation. If the Tag Processor changes so that this space no longer
+		 * appears then this test should be updated to reflect that. The space
+		 * is not required.
+		 */
+		$expected_html = '<div >First</div><div safe>Second</div>';
+
+		$this->assertSame(
+			$expected_html,
+			$processor->get_updated_html(),
+			'The bookmark was updated incorrectly in response to HTML markup updates'
+		);
+	}
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_bookmark
+ */
+	public function test_limits_the_number_of_bookmarks() {
+		$processor = new WP_HTML_Tag_Processor( '<ul><li>One</li><li>Two</li><li>Three</li></ul>' );
+		$processor->next_tag( 'li' );
+
+		// Allocate bookmarks up to the allowed maximum; each one must succeed.
+		$i = 0;
+		while ( $i < WP_HTML_Tag_Processor::MAX_BOOKMARKS ) {
+			$this->assertTrue( $processor->set_bookmark( "bookmark $i" ), "Could not allocate the bookmark #$i" );
+			$i++;
+		}
+
+		// One more than the maximum must be refused and flagged as incorrect usage.
+		$this->setExpectedIncorrectUsage( 'WP_HTML_Tag_Processor::set_bookmark' );
+		$this->assertFalse( $processor->set_bookmark( 'final bookmark' ), "Allocated $i bookmarks, which is one above the limit" );
+	}
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::seek
+ */
+ public function test_limits_the_number_of_seek_calls() {
+ $p = new WP_HTML_Tag_Processor( '<ul><li>One</li><li>Two</li><li>Three</li></ul>' );
+ $p->next_tag( 'li' );
+ $p->set_bookmark( 'bookmark' );
+
+ for ( $i = 0; $i < WP_HTML_Tag_Processor::MAX_SEEK_OPS; $i++ ) {
+ $this->assertTrue( $p->seek( 'bookmark' ), 'Could not seek to the "bookmark"' );
+ }
+
+ $this->setExpectedIncorrectUsage( 'WP_HTML_Tag_Processor::seek' );
+ $this->assertFalse( $p->seek( 'bookmark' ), "$i-th seek() to the bookmark succeeded, even though it should exceed the allowed limit" );
+ }
+}
</ins></span></pre></div>
<a id="trunktestsphpunittestshtmlwpHtmlTagProcessorphp"></a>
<div class="addfile"><h4 style="background-color: #eee; color: inherit; margin: 1em 0; padding: 1.3em; font-size: 115%">Added: trunk/tests/phpunit/tests/html/wpHtmlTagProcessor.php</h4>
<pre class="diff"><span>
<span class="info" style="display: block; padding: 0 10px; color: #888">--- trunk/tests/phpunit/tests/html/wpHtmlTagProcessor.php (rev 0)
+++ trunk/tests/phpunit/tests/html/wpHtmlTagProcessor.php 2023-02-03 01:03:59 UTC (rev 55203)
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -0,0 +1,2016 @@
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+<?php
+/**
+ * Unit tests covering WP_HTML_Tag_Processor functionality.
+ *
+ * @package WordPress
+ * @subpackage HTML-API
+ */
+
+/**
+ * @group html-api
+ *
+ * @coversDefaultClass WP_HTML_Tag_Processor
+ */
+class Tests_HTML_wpHtmlTagProcessor extends WP_UnitTestCase {
+ const HTML_SIMPLE = '<div id="first"><span id="second">Text</span></div>';
+ const HTML_WITH_CLASSES = '<div class="main with-border" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ const HTML_MALFORMED = '<div><span class="d-md-none" Notifications</span><span class="d-none d-md-inline">Back to notifications</span></div>';
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_tag
+ */
+ public function test_get_tag_returns_null_before_finding_tags() {
+ $p = new WP_HTML_Tag_Processor( '<div>Test</div>' );
+
+ $this->assertNull( $p->get_tag(), 'Calling get_tag() without selecting a tag did not return null' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_tag
+ */
+ public function test_get_tag_returns_null_when_not_in_open_tag() {
+ $p = new WP_HTML_Tag_Processor( '<div>Test</div>' );
+
+ $this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' );
+ $this->assertNull( $p->get_tag(), 'Accessing a non-existing tag did not return null' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_tag
+ */
+ public function test_get_tag_returns_open_tag_name() {
+ $p = new WP_HTML_Tag_Processor( '<div>Test</div>' );
+
+ $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' );
+ $this->assertSame( 'DIV', $p->get_tag(), 'Accessing an existing tag name did not return "DIV"' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_returns_null_before_finding_tags() {
+ $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
+
+ $this->assertNull( $p->get_attribute( 'class' ), 'Accessing an attribute without selecting a tag did not return null' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_returns_null_when_not_in_open_tag() {
+ $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
+
+ $this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' );
+ $this->assertNull( $p->get_attribute( 'class' ), 'Accessing an attribute of a non-existing tag did not return null' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_returns_null_when_in_closing_tag() {
+ $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
+
+ $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' );
+ $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Querying an existing closing tag did not return true' );
+ $this->assertNull( $p->get_attribute( 'class' ), 'Accessing an attribute of a closing tag did not return null' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_returns_null_when_attribute_missing() {
+ $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
+
+ $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' );
+ $this->assertNull( $p->get_attribute( 'test-id' ), 'Accessing a non-existing attribute did not return null' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_returns_attribute_value() {
+ $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
+
+ $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' );
+ $this->assertSame( 'test', $p->get_attribute( 'class' ), 'Accessing a class="test" attribute value did not return "test"' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_returns_true_for_boolean_attribute() {
+ $p = new WP_HTML_Tag_Processor( '<div enabled class="test">Test</div>' );
+
+ $this->assertTrue( $p->next_tag( array( 'class_name' => 'test' ) ), 'Querying an existing tag did not return true' );
+ $this->assertTrue( $p->get_attribute( 'enabled' ), 'Accessing a boolean "enabled" attribute value did not return true' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_returns_string_for_truthy_attributes() {
+ $p = new WP_HTML_Tag_Processor( '<div enabled=enabled checked=1 hidden="true" class="test">Test</div>' );
+
+ $this->assertTrue( $p->next_tag(), 'Querying an existing tag did not return true' );
+ $this->assertSame( 'enabled', $p->get_attribute( 'enabled' ), 'Accessing an enabled=enabled attribute value did not return "enabled"' );
+ $this->assertSame( '1', $p->get_attribute( 'checked' ), 'Accessing a checked=1 attribute value did not return "1"' );
+ $this->assertSame( 'true', $p->get_attribute( 'hidden' ), 'Accessing a hidden="true" attribute value did not return "true"' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_decodes_html_character_references() {
+ $p = new WP_HTML_Tag_Processor( '<div id="the &quot;grande&quot; is &lt; 32oz&dagger;"></div>' );
+ $p->next_tag();
+
+ $this->assertSame( 'the "grande" is < 32oz†', $p->get_attribute( 'id' ), 'HTML Attribute value was returned without decoding character references' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_attributes_parser_treats_slash_as_attribute_separator() {
+ $p = new WP_HTML_Tag_Processor( '<div a/b/c/d/e="test">Test</div>' );
+
+ $this->assertTrue( $p->next_tag(), 'Querying an existing tag did not return true' );
+ $this->assertTrue( $p->get_attribute( 'a' ), 'Accessing an existing attribute did not return true' );
+ $this->assertTrue( $p->get_attribute( 'b' ), 'Accessing an existing attribute did not return true' );
+ $this->assertTrue( $p->get_attribute( 'c' ), 'Accessing an existing attribute did not return true' );
+ $this->assertTrue( $p->get_attribute( 'd' ), 'Accessing an existing attribute did not return true' );
+ $this->assertSame( 'test', $p->get_attribute( 'e' ), 'Accessing an existing e="test" did not return "test"' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ *
+ * @dataProvider data_attribute_name_case_variants
+ *
+ * @param string $attribute_name Name of data-enabled attribute with case variations.
+ */
+ public function test_get_attribute_is_case_insensitive_for_attributes_with_values( $attribute_name ) {
+ $p = new WP_HTML_Tag_Processor( '<div DATA-enabled="true">Test</div>' );
+ $p->next_tag();
+
+ $this->assertSame(
+ 'true',
+ $p->get_attribute( $attribute_name ),
+ 'Accessing an attribute by a differently-cased name did not return its value'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ *
+ * @dataProvider data_attribute_name_case_variants
+ *
+ * @param string $attribute_name Name of data-enabled attribute with case variations.
+ */
+ public function test_attributes_parser_is_case_insensitive_for_attributes_without_values( $attribute_name ) {
+ $p = new WP_HTML_Tag_Processor( '<div DATA-enabled>Test</div>' );
+ $p->next_tag();
+
+ $this->assertTrue(
+ $p->get_attribute( $attribute_name ),
+ 'Accessing a boolean attribute by a differently-cased name did not return true'
+ );
+ }
+
+ /**
+ * Data provider.
+ *
+ * @return array[]
+ */
+ public function data_attribute_name_case_variants() {
+ return array(
+ array( 'DATA-enabled' ),
+ array( 'data-enabled' ),
+ array( 'DATA-ENABLED' ),
+ array( 'DatA-EnABled' ),
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::remove_attribute
+ */
+ public function test_remove_attribute_is_case_insensitive() {
+ $p = new WP_HTML_Tag_Processor( '<div DATA-enabled="true">Test</div>' );
+ $p->next_tag();
+ $p->remove_attribute( 'data-enabled' );
+
+ $this->assertSame( '<div >Test</div>', $p->get_updated_html(), 'A case-insensitive remove_attribute call did not remove the attribute' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_set_attribute_is_case_insensitive() {
+ $p = new WP_HTML_Tag_Processor( '<div DATA-enabled="true">Test</div>' );
+ $p->next_tag();
+ $p->set_attribute( 'data-enabled', 'abc' );
+
+ $this->assertSame( '<div data-enabled="abc">Test</div>', $p->get_updated_html(), 'A case-insensitive set_attribute call did not update the existing attribute' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix
+ */
+ public function test_get_attribute_names_with_prefix_returns_null_before_finding_tags() {
+ $p = new WP_HTML_Tag_Processor( '<div data-foo="bar">Test</div>' );
+ $this->assertNull(
+ $p->get_attribute_names_with_prefix( 'data-' ),
+ 'Accessing attributes by their prefix did not return null when no tag was selected'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix
+ */
+ public function test_get_attribute_names_with_prefix_returns_null_when_not_in_open_tag() {
+ $p = new WP_HTML_Tag_Processor( '<div data-foo="bar">Test</div>' );
+ $p->next_tag( 'p' );
+ $this->assertNull( $p->get_attribute_names_with_prefix( 'data-' ), 'Accessing attributes of a non-existing tag did not return null' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix
+ */
+ public function test_get_attribute_names_with_prefix_returns_null_when_in_closing_tag() {
+ $p = new WP_HTML_Tag_Processor( '<div data-foo="bar">Test</div>' );
+ $p->next_tag( 'div' );
+ $p->next_tag( array( 'tag_closers' => 'visit' ) );
+
+ $this->assertNull( $p->get_attribute_names_with_prefix( 'data-' ), 'Accessing attributes of a closing tag did not return null' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix
+ */
+ public function test_get_attribute_names_with_prefix_returns_empty_array_when_no_attributes_present() {
+ $p = new WP_HTML_Tag_Processor( '<div>Test</div>' );
+ $p->next_tag( 'div' );
+
+ $this->assertSame( array(), $p->get_attribute_names_with_prefix( 'data-' ), 'Accessing the attributes on a tag without any did not return an empty array' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix
+ */
+ public function test_get_attribute_names_with_prefix_returns_matching_attribute_names_in_lowercase() {
+ $p = new WP_HTML_Tag_Processor( '<div DATA-enabled class="test" data-test-ID="14">Test</div>' );
+ $p->next_tag();
+
+ $this->assertSame(
+ array( 'data-enabled', 'data-test-id' ),
+ $p->get_attribute_names_with_prefix( 'data-' ),
+ 'Accessing attributes by their prefix did not return their lowercase names'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix
+ */
+ public function test_get_attribute_names_with_prefix_returns_attribute_added_by_set_attribute() {
+ $p = new WP_HTML_Tag_Processor( '<div data-foo="bar">Test</div>' );
+ $p->next_tag();
+ $p->set_attribute( 'data-test-id', '14' );
+
+ $this->assertSame(
+ '<div data-test-id="14" data-foo="bar">Test</div>',
+ $p->get_updated_html(),
+ "Updated HTML doesn't include attribute added via set_attribute"
+ );
+ $this->assertSame(
+ array( 'data-test-id', 'data-foo' ),
+ $p->get_attribute_names_with_prefix( 'data-' ),
+ "Accessing attribute names doesn't find attribute added via set_attribute"
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::__toString
+ */
+ public function test_to_string_returns_updated_html() {
+ $p = new WP_HTML_Tag_Processor( '<hr id="remove" /><div enabled class="test">Test</div><span id="span-id"></span>' );
+ $p->next_tag();
+ $p->remove_attribute( 'id' );
+
+ $p->next_tag();
+ $p->set_attribute( 'id', 'div-id-1' );
+ $p->add_class( 'new_class_1' );
+
+ $this->assertSame(
+ $p->get_updated_html(),
+ (string) $p,
+ 'get_updated_html() returned a different value than __toString()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_updated_html
+ */
+ public function test_get_updated_html_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag() {
+ $p = new WP_HTML_Tag_Processor( '<hr id="remove" /><div enabled class="test">Test</div><span id="span-id"></span>' );
+ $p->next_tag();
+ $p->remove_attribute( 'id' );
+
+ $p->next_tag();
+ $p->set_attribute( 'id', 'div-id-1' );
+ $p->add_class( 'new_class_1' );
+
+ $this->assertSame(
+ '<hr /><div id="div-id-1" enabled class="test new_class_1">Test</div><span id="span-id"></span>',
+ $p->get_updated_html(),
+ 'Calling get_updated_html after updating the attributes of the second tag returned different HTML than expected'
+ );
+
+ $p->set_attribute( 'id', 'div-id-2' );
+ $p->add_class( 'new_class_2' );
+
+ $this->assertSame(
+ '<hr /><div id="div-id-2" enabled class="test new_class_1 new_class_2">Test</div><span id="span-id"></span>',
+ $p->get_updated_html(),
+ 'Calling get_updated_html after updating the attributes of the second tag for the second time returned different HTML than expected'
+ );
+
+ $p->next_tag();
+ $p->remove_attribute( 'id' );
+
+ $this->assertSame(
+ '<hr /><div id="div-id-2" enabled class="test new_class_1 new_class_2">Test</div><span ></span>',
+ $p->get_updated_html(),
+ 'Calling get_updated_html after removing the id attribute of the third tag returned different HTML than expected'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_updated_html
+ */
+ public function test_get_updated_html_without_updating_any_attributes_returns_the_original_html() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+
+ $this->assertSame(
+ self::HTML_SIMPLE,
+ $p->get_updated_html(),
+ 'Calling get_updated_html() without performing any updates did not return the initial HTML snippet'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ */
+ public function test_next_tag_with_no_arguments_should_find_the_next_existing_tag() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+
+ $this->assertTrue( $p->next_tag(), 'Querying an existing tag did not return true' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ */
+ public function test_next_tag_should_return_false_for_a_non_existing_tag() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+
+ $this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ * @covers WP_HTML_Tag_Processor::is_tag_closer
+ */
+ public function test_next_tag_should_stop_on_closers_only_when_requested() {
+ $p = new WP_HTML_Tag_Processor( '<div><img /></div>' );
+
+ $this->assertTrue( $p->next_tag( array( 'tag_name' => 'div' ) ), 'Did not find desired tag opener' );
+ $this->assertFalse( $p->next_tag( array( 'tag_name' => 'div' ) ), 'Visited an unwanted tag, a tag closer' );
+
+ $p = new WP_HTML_Tag_Processor( '<div><img /></div>' );
+ $p->next_tag(
+ array(
+ 'tag_name' => 'div',
+ 'tag_closers' => 'visit',
+ )
+ );
+
+ $this->assertFalse( $p->is_tag_closer(), 'Indicated a tag opener is a tag closer' );
+ $this->assertTrue(
+ $p->next_tag(
+ array(
+ 'tag_name' => 'div',
+ 'tag_closers' => 'visit',
+ )
+ ),
+ 'Did not stop at desired tag closer'
+ );
+ $this->assertTrue( $p->is_tag_closer(), 'Indicated a tag closer is a tag opener' );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_set_attribute_on_a_non_existing_tag_does_not_change_the_markup() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+
+ $this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' );
+ $this->assertFalse( $p->next_tag( 'div' ), 'Querying a non-existing tag did not return false' );
+
+ $p->set_attribute( 'id', 'primary' );
+
+ $this->assertSame(
+ self::HTML_SIMPLE,
+ $p->get_updated_html(),
+ 'Calling get_updated_html after updating a non-existing tag returned an HTML that was different from the original HTML'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ * @covers WP_HTML_Tag_Processor::remove_attribute
+ * @covers WP_HTML_Tag_Processor::add_class
+ * @covers WP_HTML_Tag_Processor::remove_class
+ */
+ public function test_attribute_ops_on_tag_closer_do_not_change_the_markup() {
+ $p = new WP_HTML_Tag_Processor( '<div id=3></div invalid-id=4>' );
+ $p->next_tag(
+ array(
+ 'tag_name' => 'div',
+ 'tag_closers' => 'visit',
+ )
+ );
+
+ $this->assertFalse( $p->is_tag_closer(), 'Skipped tag opener' );
+
+ $p->next_tag(
+ array(
+ 'tag_name' => 'div',
+ 'tag_closers' => 'visit',
+ )
+ );
+
+ $this->assertTrue( $p->is_tag_closer(), 'Skipped tag closer' );
+ $this->assertFalse( $p->set_attribute( 'id', 'test' ), "Allowed setting an attribute on a tag closer when it shouldn't have" );
+ $this->assertFalse( $p->remove_attribute( 'invalid-id' ), "Allowed removing an attribute on a tag closer when it shouldn't have" );
+ $this->assertFalse( $p->add_class( 'sneaky' ), "Allowed adding a class on a tag closer when it shouldn't have" );
+ $this->assertFalse( $p->remove_class( 'not-appearing-in-this-test' ), "Allowed removing a class on a tag closer when it shouldn't have" );
+ $this->assertSame(
+ '<div id=3></div invalid-id=4>',
+ $p->get_updated_html(),
+ 'Calling get_updated_html after attempting updates on a tag closer returned an HTML that was different from the original HTML'
+ );
+ }
+
+ /**
+ * Passing a double quote inside of an attribute value could lead to an XSS attack as follows:
+ *
+ * ```php
+ * $p = new WP_HTML_Tag_Processor( '<div class="header"></div>' );
+ * $p->next_tag();
+ * $p->set_attribute('class', '" onclick="alert');
+ * echo $p;
+ * // <div class="" onclick="alert"></div>
+ * ```
+ *
+ * To prevent it, `set_attribute` calls `esc_attr()` on its given values.
+ *
+ * ```php
+ * <div class="&quot; onclick=&quot;alert"></div>
+ * ```
+ *
+ * @ticket 56299
+ *
+ * @dataProvider data_set_attribute_prevents_xss
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ *
+ * @param string $attribute_value A value with potential XSS exploit.
+ */
+ public function test_set_attribute_prevents_xss( $attribute_value ) {
+ $p = new WP_HTML_Tag_Processor( '<div></div>' );
+ $p->next_tag();
+ $p->set_attribute( 'test', $attribute_value );
+
+ /*
+ * Testing the escaping is hard using tools that properly parse
+ * HTML because they might interpret the escaped values. It's hard
+ * with tools that don't understand HTML because they might get
+ * confused by improperly-escaped values.
+ *
+ * Since the input HTML is known, the test will do what looks like
+ * the opposite of what is expected to be done with this library.
+ * But by doing so, the test (a) has full control over the
+ * content and (b) looks at the raw values.
+ */
+ $match = null;
+ preg_match( '~^<div test=(.*)></div>$~', $p->get_updated_html(), $match );
+ list( , $actual_value ) = $match;
+
+ $this->assertSame( '"' . esc_attr( $attribute_value ) . '"', $actual_value, 'Entities were not properly escaped in the attribute value' );
+ }
+
+ /**
+ * Data provider.
+ *
+ * @return string[][]
+ */
+ public function data_set_attribute_prevents_xss() {
+ return array(
+ array( '"' ),
+ array( '&quot;' ),
+ array( '&' ),
+ array( '&amp;' ),
+ array( '&euro;' ),
+ array( "'" ),
+ array( '<>' ),
+ array( '&quot";' ),
+ array( '" onclick="alert(\'1\');"><span onclick=""></span><script>alert("1")</script>' ),
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_set_attribute_with_a_non_existing_attribute_adds_a_new_attribute_to_the_markup() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $p->next_tag();
+ $p->set_attribute( 'test-attribute', 'test-value' );
+
+ $this->assertSame(
+ '<div test-attribute="test-value" id="first"><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML does not include attribute added via set_attribute()'
+ );
+ $this->assertSame(
+ 'test-value',
+ $p->get_attribute( 'test-attribute' ),
+ 'get_attribute() (called after get_updated_html()) did not return attribute added via set_attribute()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_returns_updated_values_before_they_are_applied() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $p->next_tag();
+ $p->set_attribute( 'test-attribute', 'test-value' );
+
+ $this->assertSame(
+ 'test-value',
+ $p->get_attribute( 'test-attribute' ),
+ 'get_attribute() (called before get_updated_html()) did not return attribute added via set_attribute()'
+ );
+ $this->assertSame(
+ '<div test-attribute="test-value" id="first"><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML does not include attribute added via set_attribute()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_returns_updated_values_before_they_are_applied_with_different_name_casing() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $p->next_tag();
+ $p->set_attribute( 'test-ATTribute', 'test-value' );
+
+ $this->assertSame(
+ 'test-value',
+ $p->get_attribute( 'test-attribute' ),
+ 'get_attribute() (called before get_updated_html()) did not return attribute added via set_attribute()'
+ );
+ $this->assertSame(
+ '<div test-ATTribute="test-value" id="first"><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML does not include attribute added via set_attribute()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_reflects_added_class_names_before_they_are_applied() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $p->next_tag();
+ $p->add_class( 'my-class' );
+
+ $this->assertSame(
+ 'my-class',
+ $p->get_attribute( 'class' ),
+ 'get_attribute() (called before get_updated_html()) did not return class name added via add_class()'
+ );
+ $this->assertSame(
+ '<div class="my-class" id="first"><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML does not include class name added via add_class()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_reflects_added_class_names_before_they_are_applied_and_retains_classes_from_previous_add_class_calls() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $p->next_tag();
+ $p->add_class( 'my-class' );
+
+ $this->assertSame(
+ 'my-class',
+ $p->get_attribute( 'class' ),
+ 'get_attribute() (called before get_updated_html()) did not return class name added via add_class()'
+ );
+
+ $p->add_class( 'my-other-class' );
+
+ $this->assertSame(
+ 'my-class my-other-class',
+ $p->get_attribute( 'class' ),
+ 'get_attribute() (called before get_updated_html()) did not return class names added via subsequent add_class() calls'
+ );
+ $this->assertSame(
+ '<div class="my-class my-other-class" id="first"><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML does not include class names added via subsequent add_class() calls'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_reflects_removed_attribute_before_it_is_applied() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $p->next_tag();
+ $p->remove_attribute( 'id' );
+
+ $this->assertNull(
+ $p->get_attribute( 'id' ),
+ 'get_attribute() (called before get_updated_html()) returned attribute that was removed by remove_attribute()'
+ );
+ $this->assertSame(
+ '<div ><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML includes attribute that was removed by remove_attribute()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_reflects_adding_and_then_removing_an_attribute_before_those_updates_are_applied() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $p->next_tag();
+ $p->set_attribute( 'test-attribute', 'test-value' );
+ $p->remove_attribute( 'test-attribute' );
+
+ $this->assertNull(
+ $p->get_attribute( 'test-attribute' ),
+ 'get_attribute() (called before get_updated_html()) returned attribute that was added via set_attribute() and then removed by remove_attribute()'
+ );
+ $this->assertSame(
+ self::HTML_SIMPLE,
+ $p->get_updated_html(),
+ 'Updated HTML includes attribute that was added via set_attribute() and then removed by remove_attribute()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_reflects_setting_and_then_removing_an_existing_attribute_before_those_updates_are_applied() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $p->next_tag();
+ $p->set_attribute( 'id', 'test-value' );
+ $p->remove_attribute( 'id' );
+
+ $this->assertNull(
+ $p->get_attribute( 'id' ),
+ 'get_attribute() (called before get_updated_html()) returned attribute that was overwritten by set_attribute() and then removed by remove_attribute()'
+ );
+ $this->assertSame(
+ '<div ><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML includes attribute that was overwritten by set_attribute() and then removed by remove_attribute()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_reflects_removed_class_names_before_they_are_applied() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $p->next_tag();
+ $p->remove_class( 'with-border' );
+
+ $this->assertSame(
+ 'main',
+ $p->get_attribute( 'class' ),
+ 'get_attribute() (called before get_updated_html()) returned the wrong attribute after calling remove_class()'
+ );
+ $this->assertSame(
+ '<div class="main" id="first"><span class="not-main bold with-border" id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML includes wrong attribute after calling remove_class()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_reflects_setting_and_then_removing_a_class_name_before_those_updates_are_applied() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $p->next_tag();
+ $p->add_class( 'foo-class' );
+ $p->remove_class( 'foo-class' );
+
+ $this->assertSame(
+ 'main with-border',
+ $p->get_attribute( 'class' ),
+ 'get_attribute() (called before get_updated_html()) returned class name that was added via add_class() and then removed by remove_class()'
+ );
+ $this->assertSame(
+ self::HTML_WITH_CLASSES,
+ $p->get_updated_html(),
+ 'Updated HTML includes class that was added via add_class() and then removed by remove_class()'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::get_attribute
+ */
+ public function test_get_attribute_reflects_duplicating_and_then_removing_an_existing_class_name_before_those_updates_are_applied() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $p->next_tag();
+ $p->add_class( 'with-border' );
+ $p->remove_class( 'with-border' );
+
+ $this->assertSame(
+ 'main',
+ $p->get_attribute( 'class' ),
+ 'get_attribute() (called before get_updated_html()) returned class name that was duplicated via add_class() and then removed by remove_class()'
+ );
+ $this->assertSame(
+ '<div class="main" id="first"><span class="not-main bold with-border" id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML includes class that was duplicated via add_class() and then removed by remove_class()'
+ );
+ }
+
+ /**
+ * According to HTML spec, only the first instance of an attribute counts.
+ * The other ones are ignored.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_update_first_attribute_when_duplicated_attributes_exist() {
+ $p = new WP_HTML_Tag_Processor( '<div id="update-me" id="ignored-id"><span id="second">Text</span></div>' );
+ $p->next_tag();
+ $p->set_attribute( 'id', 'updated-id' );
+
+ $this->assertSame(
+ '<div id="updated-id" id="ignored-id"><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Proper (first) appearance of attribute was not updated when duplicates exist'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_set_attribute_with_an_existing_attribute_name_updates_its_value_in_the_markup() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $p->next_tag();
+ $p->set_attribute( 'id', 'new-id' );
+ $this->assertSame(
+ '<div id="new-id"><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Existing attribute was not updated'
+ );
+ }
+
+ /**
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_next_tag_and_set_attribute_in_a_loop_update_all_tags_in_the_markup() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ while ( $p->next_tag() ) {
+ $p->set_attribute( 'data-foo', 'bar' );
+ }
+
+ $this->assertSame(
+ '<div data-foo="bar" id="first"><span data-foo="bar" id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Not all tags were updated when looping with next_tag() and set_attribute()'
+ );
+ }
+
+ /**
+ * Removing an attribute that's listed many times, e.g. `<div id="a" id="b" />` should remove
+ * all its instances and output just `<div />`.
+ *
+ * Today, however, WP_HTML_Tag_Processor only removes the first such attribute. It seems like a corner case
+ * and introducing additional complexity to correctly handle this scenario doesn't seem to be worth it.
+ * Let's revisit if and when this becomes a problem.
+ *
+ * This test is in place to confirm this behavior, which while incorrect, is well-defined.
+ * A later fix introduced to the Tag Processor should update this test to reflect the
+ * wanted and correct behavior.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::remove_attribute
+ */
+ public function test_remove_first_when_duplicated_attribute() {
+ $p = new WP_HTML_Tag_Processor( '<div id="update-me" id="ignored-id"><span id="second">Text</span></div>' );
+ $p->next_tag();
+ $p->remove_attribute( 'id' );
+
+ /*
+ * The expected markup keeps the whitespace that surrounded the removed
+ * attribute, so two consecutive spaces precede the remaining `id`.
+ */
+ $this->assertSame(
+ '<div  id="ignored-id"><span id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'First attribute (when duplicates exist) was not removed'
+ );
+ }
+
+ /**
+ * Verifies that remove_attribute() drops an attribute that is present on the tag.
+ *
+ * The expected markup keeps the space that preceded the removed attribute.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::remove_attribute
+ */
+ public function test_remove_attribute_with_an_existing_attribute_name_removes_it_from_the_markup() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $processor->next_tag();
+ $processor->remove_attribute( 'id' );
+
+ $expected_html = '<div ><span id="second">Text</span></div>';
+ $this->assertSame( $expected_html, $processor->get_updated_html(), 'Attribute was not removed' );
+ }
+
+ /**
+ * Verifies that remove_attribute() leaves the markup untouched when the
+ * named attribute is not present on the tag.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::remove_attribute
+ */
+ public function test_remove_attribute_with_a_non_existing_attribute_name_does_not_change_the_markup() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $processor->next_tag();
+ $processor->remove_attribute( 'no-such-attribute' );
+
+ $failure_message = 'Content was changed when attempting to remove an attribute that did not exist';
+ $this->assertSame( self::HTML_SIMPLE, $processor->get_updated_html(), $failure_message );
+ }
+
+ /**
+ * Verifies that add_class() creates a `class` attribute on a tag that has none,
+ * and that the new value is visible both in the markup and via get_attribute().
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ */
+ public function test_add_class_creates_a_class_attribute_when_there_is_none() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $processor->next_tag();
+ $processor->add_class( 'foo-class' );
+
+ $expected_html = '<div class="foo-class" id="first"><span id="second">Text</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ 'Updated HTML does not include class name added via add_class()'
+ );
+
+ $expected_class = 'foo-class';
+ $this->assertSame(
+ $expected_class,
+ $processor->get_attribute( 'class' ),
+ "get_attribute( 'class' ) did not return class name added via add_class()"
+ );
+ }
+
+ /**
+ * Verifies that two add_class() calls on a tag without a `class` attribute
+ * produce a single attribute listing both names in call order.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ */
+ public function test_calling_add_class_twice_creates_a_class_attribute_with_both_class_names_when_there_is_no_class_attribute() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $processor->next_tag();
+ $processor->add_class( 'foo-class' );
+ $processor->add_class( 'bar-class' );
+
+ $expected_html = '<div class="foo-class bar-class" id="first"><span id="second">Text</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ 'Updated HTML does not include class names added via subsequent add_class() calls'
+ );
+
+ $expected_class = 'foo-class bar-class';
+ $this->assertSame(
+ $expected_class,
+ $processor->get_attribute( 'class' ),
+ "get_attribute( 'class' ) did not return class names added via subsequent add_class() calls"
+ );
+ }
+
+ /**
+ * Verifies that remove_class() is a no-op on a tag without a `class` attribute:
+ * the markup is unchanged and get_attribute( 'class' ) still yields null.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::remove_class
+ */
+ public function test_remove_class_does_not_change_the_markup_when_there_is_no_class_attribute() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $processor->next_tag();
+ $processor->remove_class( 'foo-class' );
+
+ $html_message = 'Updated HTML includes class name that was removed by remove_class()';
+ $this->assertSame( self::HTML_SIMPLE, $processor->get_updated_html(), $html_message );
+
+ $attribute_message = "get_attribute( 'class' ) did not return null for class name that was removed by remove_class()";
+ $this->assertNull( $processor->get_attribute( 'class' ), $attribute_message );
+ }
+
+ /**
+ * Verifies that add_class() appends new names after the ones already listed
+ * in an existing `class` attribute.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ */
+ public function test_add_class_appends_class_names_to_the_existing_class_attribute_when_one_already_exists() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $processor->next_tag();
+ $processor->add_class( 'foo-class' );
+ $processor->add_class( 'bar-class' );
+
+ $expected_html = '<div class="main with-border foo-class bar-class" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ 'Updated HTML does not reflect class names added to existing class attribute via subsequent add_class() calls'
+ );
+
+ $expected_class = 'main with-border foo-class bar-class';
+ $this->assertSame(
+ $expected_class,
+ $processor->get_attribute( 'class' ),
+ "get_attribute( 'class' ) does not reflect class names added to existing class attribute via subsequent add_class() calls"
+ );
+ }
+
+ /**
+ * Verifies that remove_class() drops only the named class from an existing
+ * `class` attribute. The expected value keeps the space that followed the
+ * removed name.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::remove_class
+ */
+ public function test_remove_class_removes_a_single_class_from_the_class_attribute_when_one_exists() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $processor->next_tag();
+ $processor->remove_class( 'main' );
+
+ $expected_html = '<div class=" with-border" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ 'Updated HTML does not reflect class name removed from existing class attribute via remove_class()'
+ );
+
+ $expected_class = ' with-border';
+ $this->assertSame(
+ $expected_class,
+ $processor->get_attribute( 'class' ),
+ "get_attribute( 'class' ) does not reflect class name removed from existing class attribute via remove_class()"
+ );
+ }
+
+ /**
+ * Verifies that removing every class listed on a tag removes the whole
+ * `class` attribute, and that get_attribute( 'class' ) then yields null.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::remove_class
+ */
+ public function test_calling_remove_class_with_all_listed_class_names_removes_the_existing_class_attribute_from_the_markup() {
+ $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $p->next_tag();
+ $p->remove_class( 'main' );
+ $p->remove_class( 'with-border' );
+
+ /*
+ * The expected markup keeps the whitespace that surrounded the removed
+ * `class` attribute, so two consecutive spaces precede `id`.
+ */
+ $this->assertSame(
+ '<div  id="first"><span class="not-main bold with-border" id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML does not reflect class attribute removed via subsequent remove_class() calls'
+ );
+ $this->assertNull(
+ $p->get_attribute( 'class' ),
+ "get_attribute( 'class' ) did not return null for class attribute removed via subsequent remove_class() calls"
+ );
+ }
+
+ /**
+ * Verifies that add_class() with a name already present on the tag does not
+ * list that name a second time.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ */
+ public function test_add_class_does_not_add_duplicate_class_names() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $processor->next_tag();
+ $processor->add_class( 'with-border' );
+
+ $expected_html = '<div class="main with-border" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ 'Updated HTML does not reflect deduplicated class name added via add_class()'
+ );
+
+ $expected_class = 'main with-border';
+ $this->assertSame(
+ $expected_class,
+ $processor->get_attribute( 'class' ),
+ "get_attribute( 'class' ) does not reflect deduplicated class name added via add_class()"
+ );
+ }
+
+ /**
+ * Verifies that re-adding the first listed class via add_class() neither
+ * duplicates it nor moves it: the original class order is kept.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ */
+ public function test_add_class_preserves_class_name_order_when_a_duplicate_class_name_is_added() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $processor->next_tag();
+ $processor->add_class( 'main' );
+
+ $expected_html = '<div class="main with-border" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ 'Updated HTML does not reflect class name order after adding duplicated class name via add_class()'
+ );
+
+ $expected_class = 'main with-border';
+ $this->assertSame(
+ $expected_class,
+ $processor->get_attribute( 'class' ),
+ "get_attribute( 'class' ) does not reflect class name order after adding duplicated class name added via add_class()"
+ );
+ }
+
+ /**
+ * Verifies add_class() on a `class` attribute padded with extra whitespace:
+ * the expected value keeps the leading whitespace and ends with the added
+ * class name instead of the original trailing whitespace.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ */
+ public function test_add_class_when_there_is_a_class_attribute_with_excessive_whitespaces() {
+ $input_html = '<div class=" main with-border " id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $processor = new WP_HTML_Tag_Processor( $input_html );
+ $processor->next_tag();
+ $processor->add_class( 'foo-class' );
+
+ $expected_html = '<div class=" main with-border foo-class" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ 'Updated HTML does not reflect existing excessive whitespace after adding class name via add_class()'
+ );
+
+ $expected_class = ' main with-border foo-class';
+ $this->assertSame(
+ $expected_class,
+ $processor->get_attribute( 'class' ),
+ "get_attribute( 'class' ) does not reflect existing excessive whitespace after adding class name via add_class()"
+ );
+ }
+
+ /**
+ * Verifies remove_class() on a `class` attribute padded with extra whitespace:
+ * the expected value keeps the whitespace in front of the remaining class
+ * name and ends right after it.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::remove_class
+ */
+ public function test_remove_class_preserves_whitespaces_when_there_is_a_class_attribute_with_excessive_whitespaces() {
+ $p = new WP_HTML_Tag_Processor(
+ '<div class=" main with-border " id="first"><span class="not-main bold with-border" id="second">Text</span></div>'
+ );
+ $p->next_tag();
+ $p->remove_class( 'with-border' );
+
+ $this->assertSame(
+ '<div class=" main" id="first"><span class="not-main bold with-border" id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML does not reflect existing excessive whitespace after removing class name via remove_class()'
+ );
+ // The failure message names the method under test, remove_class().
+ $this->assertSame(
+ ' main',
+ $p->get_attribute( 'class' ),
+ "get_attribute( 'class' ) does not reflect existing excessive whitespace after removing class name via remove_class()"
+ );
+ }
+
+ /**
+ * Verifies that removing every class from a whitespace-padded `class`
+ * attribute removes the attribute entirely, and that
+ * get_attribute( 'class' ) then yields null.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::remove_class
+ */
+ public function test_removing_all_classes_removes_the_existing_class_attribute_from_the_markup_even_when_excessive_whitespaces_are_present() {
+ $p = new WP_HTML_Tag_Processor(
+ '<div class=" main with-border " id="first"><span class="not-main bold with-border" id="second">Text</span></div>'
+ );
+ $p->next_tag();
+ $p->remove_class( 'main' );
+ $p->remove_class( 'with-border' );
+
+ /*
+ * The expected markup keeps the whitespace that surrounded the removed
+ * `class` attribute, so two consecutive spaces precede `id`.
+ */
+ $this->assertSame(
+ '<div  id="first"><span class="not-main bold with-border" id="second">Text</span></div>',
+ $p->get_updated_html(),
+ 'Updated HTML does not reflect removed class attribute after removing all class names via remove_class()'
+ );
+ $this->assertNull(
+ $p->get_attribute( 'class' ),
+ "get_attribute( 'class' ) did not return null after removing all class names via remove_class()"
+ );
+ }
+
+ /**
+ * Checks how add_class() and set_attribute( 'class', ... ) interact depending on call order.
+ *
+ * - add_class() first, then set_attribute(): the directly set value wins and the
+ * enqueued class is dropped, so the final class is "$value".
+ * - set_attribute() first, then add_class(): the added class is appended to the
+ * directly set value, so the final class is "$value $different_value".
+ *
+ * Here the updated HTML is read before get_attribute() is called.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_set_attribute_takes_priority_over_add_class() {
+ $html_message = "Calling get_updated_html after updating first tag's attributes did not return the expected HTML";
+ $class_message = "Calling get_attribute after updating first tag's attributes did not return the expected class name";
+
+ // Scenario 1: add_class() is enqueued first and then superseded by set_attribute().
+ $processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $processor->next_tag();
+ $processor->add_class( 'add_class' );
+ $processor->set_attribute( 'class', 'set_attribute' );
+
+ $expected_html = '<div class="set_attribute" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $this->assertSame( $expected_html, $processor->get_updated_html(), $html_message );
+ $this->assertSame( 'set_attribute', $processor->get_attribute( 'class' ), $class_message );
+
+ // Scenario 2: set_attribute() runs first and add_class() appends to its value.
+ $processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $processor->next_tag();
+ $processor->set_attribute( 'class', 'set_attribute' );
+ $processor->add_class( 'add_class' );
+
+ $expected_html = '<div class="set_attribute add_class" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $this->assertSame( $expected_html, $processor->get_updated_html(), $html_message );
+ $this->assertSame( 'set_attribute add_class', $processor->get_attribute( 'class' ), $class_message );
+ }
+
+ /**
+ * Checks how add_class() and set_attribute( 'class', ... ) interact depending on call order,
+ * reading the enqueued value through get_attribute() before get_updated_html() is called.
+ *
+ * - add_class() first, then set_attribute(): the directly set value wins and the
+ * enqueued class is dropped, so the final class is "$value".
+ * - set_attribute() first, then add_class(): the added class is appended to the
+ * directly set value, so the final class is "$value $different_value".
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_set_attribute_takes_priority_over_add_class_even_before_updating() {
+ $class_message = "Calling get_attribute after updating first tag's attributes did not return the expected class name";
+ $html_message = "Calling get_updated_html after updating first tag's attributes did not return the expected HTML";
+
+ // Scenario 1: add_class() is enqueued first and then superseded by set_attribute().
+ $processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $processor->next_tag();
+ $processor->add_class( 'add_class' );
+ $processor->set_attribute( 'class', 'set_attribute' );
+
+ // The attribute is read first on purpose; the HTML is only generated afterwards.
+ $this->assertSame( 'set_attribute', $processor->get_attribute( 'class' ), $class_message );
+ $expected_html = '<div class="set_attribute" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $this->assertSame( $expected_html, $processor->get_updated_html(), $html_message );
+
+ // Scenario 2: set_attribute() runs first and add_class() appends to its value.
+ $processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+ $processor->next_tag();
+ $processor->set_attribute( 'class', 'set_attribute' );
+ $processor->add_class( 'add_class' );
+
+ $this->assertSame( 'set_attribute add_class', $processor->get_attribute( 'class' ), $class_message );
+ $expected_html = '<div class="set_attribute add_class" id="first"><span class="not-main bold with-border" id="second">Text</span></div>';
+ $this->assertSame( $expected_html, $processor->get_updated_html(), $html_message );
+ }
+
+ /**
+ * Verifies that add_class() called after set_attribute( 'class', true ) yields a
+ * `class` attribute holding just the added name. The updated HTML is read
+ * before get_attribute() is called.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ */
+ public function test_add_class_overrides_boolean_class_attribute() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $processor->next_tag();
+ $processor->set_attribute( 'class', true );
+ $processor->add_class( 'add_class' );
+
+ $expected_html = '<div class="add_class" id="first"><span id="second">Text</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ "Updated HTML doesn't reflect class added via add_class that was originally set as boolean attribute"
+ );
+
+ $expected_class = 'add_class';
+ $this->assertSame(
+ $expected_class,
+ $processor->get_attribute( 'class' ),
+ "get_attribute (called after get_updated_html()) doesn't reflect class added via add_class that was originally set as boolean attribute"
+ );
+ }
+
+ /**
+ * Verifies that add_class() called after set_attribute( 'class', true ) yields a
+ * `class` attribute holding just the added name, when get_attribute() is
+ * consulted before get_updated_html().
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ */
+ public function test_add_class_overrides_boolean_class_attribute_even_before_updating() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+ $processor->next_tag();
+ $processor->set_attribute( 'class', true );
+ $processor->add_class( 'add_class' );
+
+ // The attribute is read first on purpose; the HTML is only generated afterwards.
+ $expected_class = 'add_class';
+ $this->assertSame(
+ $expected_class,
+ $processor->get_attribute( 'class' ),
+ "get_attribute (called before get_updated_html()) doesn't reflect class added via add_class that was originally set as boolean attribute"
+ );
+
+ $expected_html = '<div class="add_class" id="first"><span id="second">Text</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ "Updated HTML doesn't reflect class added via add_class that was originally set as boolean attribute"
+ );
+ }
+
+ /**
+ * Runs several queries and updates against one larger document and compares
+ * the complete result with the expected markup.
+ *
+ * Steps, in order:
+ * 1. First DIV: set `data-details` to a JSON-like string and add a class.
+ * 2. Next two DIVs with class `BtnGroup`: remove that class, add two others.
+ * 3. Third BUTTON with class `btn` (`match_offset` 3): remove its `class` attribute.
+ * 4. Search for a tag that does not exist, then call set_attribute(); the
+ * expected markup contains no `class="test"`, so that call must have no effect.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ * @covers WP_HTML_Tag_Processor::remove_attribute
+ * @covers WP_HTML_Tag_Processor::add_class
+ * @covers WP_HTML_Tag_Processor::remove_class
+ * @covers WP_HTML_Tag_Processor::get_updated_html
+ */
+ public function test_advanced_use_case() {
+ $input = <<<HTML
+<div selected class="merge-message" checked>
+ <div class="select-menu d-inline-block">
+ <div checked class="BtnGroup MixedCaseHTML position-relative" />
+ <div checked class="BtnGroup MixedCaseHTML position-relative">
+ <button type="button" class="merge-box-button btn-group-merge rounded-left-2 btn BtnGroup-item js-details-target hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Merge pull request
+ </button>
+
+ <button type="button" class="merge-box-button btn-group-squash rounded-left-2 btn BtnGroup-item js-details-target hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Squash and merge
+ </button>
+
+ <button type="button" class="merge-box-button btn-group-rebase rounded-left-2 btn BtnGroup-item js-details-target hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Rebase and merge
+ </button>
+
+ <button aria-label="Select merge method" disabled="disabled" type="button" data-view-component="true" class="select-menu-button btn BtnGroup-item"></button>
+ </div>
+ </div>
+</div>
+HTML;
+
+ /*
+ * Two details of the expected markup are easy to miss:
+ * - The double quotes inside the `data-details` value are written as `&quot;`,
+ * because raw quotes would end the double-quoted attribute early.
+ * - The third BUTTON keeps the whitespace that surrounded its removed `class`
+ * attribute, so two consecutive spaces follow `type="button"`.
+ */
+ $expected_output = <<<HTML
+<div data-details="{ &quot;key&quot;: &quot;value&quot; }" selected class="merge-message is-processed" checked>
+ <div class="select-menu d-inline-block">
+ <div checked class=" MixedCaseHTML position-relative button-group Another-Mixed-Case" />
+ <div checked class=" MixedCaseHTML position-relative button-group Another-Mixed-Case">
+ <button type="button" class="merge-box-button btn-group-merge rounded-left-2 btn BtnGroup-item js-details-target hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Merge pull request
+ </button>
+
+ <button type="button" class="merge-box-button btn-group-squash rounded-left-2 btn BtnGroup-item js-details-target hx_create-pr-button" aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Squash and merge
+ </button>
+
+ <button type="button"  aria-expanded="false" data-details-container=".js-merge-pr" disabled="">
+ Rebase and merge
+ </button>
+
+ <button aria-label="Select merge method" disabled="disabled" type="button" data-view-component="true" class="select-menu-button btn BtnGroup-item"></button>
+ </div>
+ </div>
+</div>
+HTML;
+
+ $p = new WP_HTML_Tag_Processor( $input );
+
+ // Step 1: outermost DIV.
+ $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' );
+ $p->set_attribute( 'data-details', '{ "key": "value" }' );
+ $p->add_class( 'is-processed' );
+
+ // Step 2a: first DIV carrying the `BtnGroup` class.
+ $this->assertTrue(
+ $p->next_tag(
+ array(
+ 'tag_name' => 'div',
+ 'class_name' => 'BtnGroup',
+ )
+ ),
+ 'Querying an existing tag did not return true'
+ );
+ $p->remove_class( 'BtnGroup' );
+ $p->add_class( 'button-group' );
+ $p->add_class( 'Another-Mixed-Case' );
+
+ // Step 2b: second DIV carrying the `BtnGroup` class.
+ $this->assertTrue(
+ $p->next_tag(
+ array(
+ 'tag_name' => 'div',
+ 'class_name' => 'BtnGroup',
+ )
+ ),
+ 'Querying an existing tag did not return true'
+ );
+ $p->remove_class( 'BtnGroup' );
+ $p->add_class( 'button-group' );
+ $p->add_class( 'Another-Mixed-Case' );
+
+ // Step 3: the third matching BUTTON loses its whole `class` attribute.
+ $this->assertTrue(
+ $p->next_tag(
+ array(
+ 'tag_name' => 'button',
+ 'class_name' => 'btn',
+ 'match_offset' => 3,
+ )
+ ),
+ 'Querying an existing tag did not return true'
+ );
+ $p->remove_attribute( 'class' );
+
+ // Step 4: no tag is matched any more, so this update must not appear in the output.
+ $this->assertFalse( $p->next_tag( 'non-existent' ), 'Querying a non-existing tag did not return false' );
+ $p->set_attribute( 'class', 'test' );
+ $this->assertSame( $expected_output, $p->get_updated_html(), 'Calling get_updated_html after updating the attributes did not return the expected HTML' );
+ }
+
+ /**
+ * Verifies that attributes quoted with single quotes are parsed: the `id` of
+ * the DIV is removed, and the `id` of the SPAN is replaced (the expected
+ * markup shows the new value in double quotes).
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ */
+ public function test_correctly_parses_html_attributes_wrapped_in_single_quotation_marks() {
+ $processor = new WP_HTML_Tag_Processor( '<div id=\'first\'><span id=\'second\'>Text</span></div>' );
+
+ $div_query = array(
+ 'tag_name' => 'div',
+ 'id' => 'first',
+ );
+ $processor->next_tag( $div_query );
+ $processor->remove_attribute( 'id' );
+
+ $span_query = array(
+ 'tag_name' => 'span',
+ 'id' => 'second',
+ );
+ $processor->next_tag( $span_query );
+ $processor->set_attribute( 'id', 'single-quote' );
+
+ $expected_html = '<div ><span id="single-quote">Text</span></div>';
+ $this->assertSame( $expected_html, $processor->get_updated_html(), 'Did not remove single-quoted attribute' );
+ }
+
+ /**
+ * Verifies that set_attribute( $name, true ) adds a value-less (boolean)
+ * attribute to the tag.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_set_attribute_with_value_equal_to_true_adds_a_boolean_html_attribute_with_implicit_value() {
+ $input_html = '<form action="/action_page.php"><input type="checkbox" name="vehicle" value="Bike"><label for="vehicle">I have a bike</label></form>';
+ $processor = new WP_HTML_Tag_Processor( $input_html );
+ $processor->next_tag( 'input' );
+ $processor->set_attribute( 'checked', true );
+
+ $expected_html = '<form action="/action_page.php"><input checked type="checkbox" name="vehicle" value="Bike"><label for="vehicle">I have a bike</label></form>';
+ $this->assertSame( $expected_html, $processor->get_updated_html(), 'Did not add "checked" as an expected boolean attribute' );
+ }
+
+ /**
+ * Verifies that set_attribute( $name, false ) removes an existing boolean
+ * attribute from the tag.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_setting_a_boolean_attribute_to_false_removes_it_from_the_markup() {
+ $p = new WP_HTML_Tag_Processor(
+ '<form action="/action_page.php"><input checked type="checkbox" name="vehicle" value="Bike"><label for="vehicle">I have a bike</label></form>'
+ );
+ $p->next_tag( 'input' );
+ $p->set_attribute( 'checked', false );
+
+ /*
+ * The expected markup keeps the whitespace that surrounded the removed
+ * `checked` attribute, so two consecutive spaces precede `type`.
+ */
+ $this->assertSame(
+ '<form action="/action_page.php"><input  type="checkbox" name="vehicle" value="Bike"><label for="vehicle">I have a bike</label></form>',
+ $p->get_updated_html(),
+ 'Did not remove boolean attribute when set to false'
+ );
+ }
+
+ /**
+ * Verifies that set_attribute( $name, false ) leaves the markup unchanged
+ * when the tag does not carry that attribute.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_setting_a_missing_attribute_to_false_does_not_change_the_markup() {
+ $html_input = '<form action="/action_page.php"><input type="checkbox" name="vehicle" value="Bike"><label for="vehicle">I have a bike</label></form>';
+
+ $processor = new WP_HTML_Tag_Processor( $html_input );
+ $processor->next_tag( 'input' );
+ $processor->set_attribute( 'checked', false );
+
+ $failure_message = 'Changed the markup unexpectedly when setting a non-existing attribute to false';
+ $this->assertSame( $html_input, $processor->get_updated_html(), $failure_message );
+ }
+
+ /**
+ * Verifies that setting an existing value-less (boolean) attribute to a
+ * string gives it an explicit quoted value in the markup.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_setting_a_boolean_attribute_to_a_string_value_adds_explicit_value_to_the_markup() {
+ $input_html = '<form action="/action_page.php"><input checked type="checkbox" name="vehicle" value="Bike"><label for="vehicle">I have a bike</label></form>';
+ $processor = new WP_HTML_Tag_Processor( $input_html );
+ $processor->next_tag( 'input' );
+ $processor->set_attribute( 'checked', 'checked' );
+
+ $expected_html = '<form action="/action_page.php"><input checked="checked" type="checkbox" name="vehicle" value="Bike"><label for="vehicle">I have a bike</label></form>';
+ $this->assertSame( $expected_html, $processor->get_updated_html(), 'Did not add string value to existing boolean attribute' );
+ }
+
+ /**
+ * Feeds the processor a document consisting of a lone, unclosed SCRIPT opener
+ * and calls next_tag() twice.
+ *
+ * As the test name states, the point is that scanning terminates: the second
+ * next_tag() call carries no assertion, and the test passes simply by
+ * reaching the end of the method.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ */
+ public function test_unclosed_script_tag_should_not_cause_an_infinite_loop() {
+ $p = new WP_HTML_Tag_Processor( '<script>' );
+ $p->next_tag();
+ $this->assertSame( 'SCRIPT', $p->get_tag(), 'Did not find script tag' );
+ // Intentionally not asserted: this call only has to return.
+ $p->next_tag();
+ }
+
+ /**
+ * Verifies that tag-like text inside a SCRIPT element is not reported as a
+ * tag: the first tag found is the SCRIPT, the second is the DIV after it.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ *
+ * @dataProvider data_next_tag_ignores_script_tag_contents
+ *
+ * @param string $script_then_div HTML to test.
+ */
+ public function test_next_tag_ignores_script_tag_contents( $script_then_div ) {
+ $processor = new WP_HTML_Tag_Processor( $script_then_div );
+
+ $processor->next_tag();
+ $first_tag = $processor->get_tag();
+ $this->assertSame( 'SCRIPT', $first_tag, 'The first found tag was not "script"' );
+
+ $processor->next_tag();
+ $second_tag = $processor->get_tag();
+ $this->assertSame( 'DIV', $second_tag, 'The second found tag was not "div"' );
+ }
+
+ /**
+ * Data provider.
+ *
+ * Each data set holds a single HTML string that starts with a SCRIPT element
+ * and contains a DIV which the consuming test expects to be the second tag
+ * found. The array keys describe the script-content situation being exercised.
+ *
+ * @return array[]
+ */
+ public function data_next_tag_ignores_script_tag_contents() {
+ return array(
+ 'Simple script tag' => array(
+ '<script><span class="d-none d-md-inline">Back to notifications</span></script><div></div>',
+ ),
+
+ 'Simple uppercase script tag' => array(
+ '<script><span class="d-none d-md-inline">Back to notifications</span></SCRIPT><div></div>',
+ ),
+
+ 'Script with a comment opener inside should end at the next script tag closer (dash dash escaped state)' => array(
+ '<script class="d-md-none"><!--</script><div></div>-->',
+ ),
+
+ 'Script with a comment opener and a script tag opener inside should end two script tag closer later (double escaped state)' => array(
+ '<script class="d-md-none"><!--<script><span1></script><span2></span2></script><div></div>-->',
+ ),
+
+ 'Double escaped script with a tricky opener' => array(
+ '<script class="d-md-none"><!--<script attr="</script>"></script>"><div></div>',
+ ),
+
+ 'Double escaped script with a tricky closer' => array(
+ '<script class="d-md-none"><!--<script><span></script attr="</script>"><div></div>',
+ ),
+
+ 'Double escaped, then escaped, then double escaped' => array(
+ '<script class="d-md-none"><!--<script></script><script></script><span></span></script><div></div>',
+ ),
+
+ 'Script with a commented-out script tag opener inside should end at the next script tag closer (dash dash escaped state)' => array(
+ '<script class="d-md-none"><!--<script>--><span></script><div></div>-->',
+ ),
+
+ 'Script closer with another script tag in closer attributes' => array(
+ '<script><span class="d-none d-md-inline">Back to notifications</title</span></script <script><div></div>',
+ ),
+
+ 'Script closer with attributes' => array(
+ '<script class="d-md-none"><span class="d-none d-md-inline">Back to notifications</span></script id="test"><div></div>',
+ ),
+
+ 'Script opener with title closer inside' => array(
+ '<script class="d-md-none"></title></script><div></div>',
+ ),
+
+ 'Complex script with many parsing states' => array(
+ '<script class="d-md-none"><!--<script>--><scRipt><span><!--<span><Script</script>--></scripT><div></div>-->',
+ ),
+ );
+ }
+
+ /**
+ * Verifies that tag-like text inside an RCDATA element is not reported as a
+ * tag: the first tag found is the RCDATA element itself, the second is the
+ * DIV after it.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ *
+ * @dataProvider data_next_tag_ignores_contents_of_rcdata_tag
+ *
+ * @param string $rcdata_then_div HTML with RCDATA before a DIV.
+ * @param string $rcdata_tag RCDATA tag.
+ */
+ public function test_next_tag_ignores_contents_of_rcdata_tag( $rcdata_then_div, $rcdata_tag ) {
+ $processor = new WP_HTML_Tag_Processor( $rcdata_then_div );
+
+ $processor->next_tag();
+ $first_tag = $processor->get_tag();
+ $this->assertSame( $rcdata_tag, $first_tag, "The first found tag was not '$rcdata_tag'" );
+
+ $processor->next_tag();
+ $second_tag = $processor->get_tag();
+ $this->assertSame( 'DIV', $second_tag, "The second found tag was not 'div'" );
+ }
+
+ /**
+ * Data provider.
+ *
+ * Each data set supplies two values: `rcdata_then_div`, an HTML string that
+ * opens with a TEXTAREA or TITLE element and also contains a DIV, and
+ * `rcdata_tag`, the upper-case name of that opening element.
+ *
+ * @return array[]
+ */
+ public function data_next_tag_ignores_contents_of_rcdata_tag() {
+ return array(
+ 'simple textarea' => array(
+ 'rcdata_then_div' => '<textarea><span class="d-none d-md-inline">Back to notifications</span></textarea><div></div>',
+ 'rcdata_tag' => 'TEXTAREA',
+ ),
+ 'simple title' => array(
+ 'rcdata_then_div' => '<title><span class="d-none d-md-inline">Back to notifications</title</span></title><div></div>',
+ 'rcdata_tag' => 'TITLE',
+ ),
+ 'comment opener inside a textarea tag should be ignored' => array(
+ 'rcdata_then_div' => '<textarea class="d-md-none"><!--</textarea><div></div>-->',
+ 'rcdata_tag' => 'TEXTAREA',
+ ),
+ 'textarea closer with another textarea tag in closer attributes' => array(
+ 'rcdata_then_div' => '<textarea><span class="d-none d-md-inline">Back to notifications</title</span></textarea <textarea><div></div>',
+ 'rcdata_tag' => 'TEXTAREA',
+ ),
+ 'textarea closer with attributes' => array(
+ 'rcdata_then_div' => '<textarea class="d-md-none"><span class="d-none d-md-inline">Back to notifications</span></textarea id="test"><div></div>',
+ 'rcdata_tag' => 'TEXTAREA',
+ ),
+ 'textarea opener with title closer inside' => array(
+ 'rcdata_then_div' => '<textarea class="d-md-none"></title></textarea><div></div>',
+ 'rcdata_tag' => 'TEXTAREA',
+ ),
+ );
+ }
+
+ /**
+ * Verifies that the first DIV reported by next_tag( 'div' ) is the one
+ * carrying the boolean `target` attribute, i.e. that DIV-like text inside
+ * script and RCDATA regions is not matched.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ *
+ * @dataProvider data_skips_contents_of_script_and_rcdata_regions
+ *
+ * @param string $input_html HTML with multiple divs, one of which carries the "target" attribute.
+ */
+ public function test_skips_contents_of_script_and_rcdata_regions( $input_html ) {
+ $processor = new WP_HTML_Tag_Processor( $input_html );
+ $processor->next_tag( 'div' );
+
+ // assertTrue() is strict, so the attribute must come back as boolean true.
+ $target_attribute = $processor->get_attribute( 'target' );
+ $this->assertTrue(
+ $target_attribute,
+ 'Did not properly skip over script and rcdata regions; incorrectly found tags inside'
+ );
+ }
+
+ /**
+ * Data provider.
+ *
+ * Each data set holds a single HTML string containing several DIV openers,
+ * exactly one of which carries the boolean `target` attribute. The consuming
+ * test expects that DIV to be the first one matched.
+ *
+ * @return array[]
+ */
+ public function data_skips_contents_of_script_and_rcdata_regions() {
+ return array(
+ 'Balanced SCRIPT tags' => array( '<script>console.log("<div>");</script><div target><div>' ),
+ 'Unexpected SCRIPT closer after DIV' => array( 'console.log("<div target>")</script><div><div>' ),
+ 'Unexpected SCRIPT closer before DIV' => array( 'console.log("<span>")</script><div target><div>' ),
+ 'Missing SCRIPT closer' => array( '<script>console.log("<div>");<div><div></script><div target>' ),
+ 'TITLE before DIV' => array( '<title><div></title><div target><div>' ),
+ 'SCRIPT inside TITLE' => array( '<title><script><div></title><div target><div></script><div>' ),
+ 'TITLE in TEXTAREA' => array( '<textarea><div><title><div></textarea><div target></title><div>' ),
+ );
+ }
+
+ /**
+ * Verifies that tags can be found and updated even when their closers
+ * overlap (`<span>…<p>…</span>…</p>`).
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_can_query_and_update_wrongly_nested_tags() {
+ $processor = new WP_HTML_Tag_Processor( '<span>123<p>456</span>789</p>' );
+
+ $processor->next_tag( 'span' );
+ $processor->set_attribute( 'class', 'span-class' );
+
+ $processor->next_tag( 'p' );
+ $processor->set_attribute( 'class', 'p-class' );
+
+ $expected_html = '<span class="span-class">123<p class="p-class">456</span>789</p>';
+ $this->assertSame( $expected_html, $processor->get_updated_html(), 'Did not find overlapping p tag' );
+ }
+
+ /**
+ * Verifies that an attribute with an unusual name (`Notifications<`) can be
+ * removed from the first SPAN of the malformed fixture.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::next_tag
+ * @covers WP_HTML_Tag_Processor::remove_attribute
+ */
+ public function test_removing_specific_attributes_in_malformed_html() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_MALFORMED );
+ $processor->next_tag( 'span' );
+ $processor->remove_attribute( 'Notifications<' );
+
+ $expected_html = '<div><span class="d-md-none" /span><span class="d-none d-md-inline">Back to notifications</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ 'Did not remove "Notifications<" attribute in malformed input'
+ );
+ }
+
+ /**
+ * Verifies that an `id` can be added to each of the two SPAN tags found in
+ * the malformed fixture.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ */
+ public function test_updating_specific_attributes_in_malformed_html() {
+ $processor = new WP_HTML_Tag_Processor( self::HTML_MALFORMED );
+
+ $processor->next_tag( 'span' );
+ $processor->set_attribute( 'id', 'first' );
+
+ $processor->next_tag( 'span' );
+ $processor->set_attribute( 'id', 'second' );
+
+ $expected_html = '<div><span id="first" class="d-md-none" Notifications</span><span id="second" class="d-none d-md-inline">Back to notifications</span></div>';
+ $this->assertSame(
+ $expected_html,
+ $processor->get_updated_html(),
+ 'Did not add id attributes properly to malformed input'
+ );
+ }
+
+ /**
+ * Applies a fixed sequence of updates and compares the result with the
+ * expected markup: the first tag found gets `foo="bar"` and the class
+ * `firstTag`; add_class( 'secondTag' ) is then called after advancing again.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ *
+ * @dataProvider data_updating_attributes
+ *
+ * @param string $html HTML to process.
+ * @param string $expected Expected updated HTML.
+ */
+ public function test_updating_attributes( $html, $expected ) {
+ $processor = new WP_HTML_Tag_Processor( $html );
+
+ // Updates for the first tag found.
+ $processor->next_tag();
+ $processor->set_attribute( 'foo', 'bar' );
+ $processor->add_class( 'firstTag' );
+
+ // Update issued after advancing a second time.
+ $processor->next_tag();
+ $processor->add_class( 'secondTag' );
+
+ $updated_html = $processor->get_updated_html();
+ $this->assertSame( $expected, $updated_html, 'Did not properly add attributes and class names' );
+ }
+
+ /**
+ * Data provider.
+ *
+ * Each data set supplies an `input` HTML string and the `expected` markup
+ * after the updates performed by test_updating_attributes(). In the data
+ * sets whose expected markup contains no `secondTag` class, the input holds
+ * only one real tag opener outside of comments, CDATA and tag-like text.
+ *
+ * @return array[]
+ */
+ public function data_updating_attributes() {
+ return array(
+ 'tags inside of a comment' => array(
+ 'input' => '<!-- this is a comment. no <strong>tags</strong> allowed --><span>test</span>',
+ 'expected' => '<!-- this is a comment. no <strong>tags</strong> allowed --><span class="firstTag" foo="bar">test</span>',
+ ),
+ 'does not parse <3' => array(
+ 'input' => '<3 is a heart but <t3> is a tag.<span>test</span>',
+ 'expected' => '<3 is a heart but <t3 class="firstTag" foo="bar"> is a tag.<span class="secondTag">test</span>',
+ ),
+ 'does not parse <*' => array(
+ 'input' => 'The applicative operator <* works well in Haskell; is what?<span>test</span>',
+ 'expected' => 'The applicative operator <* works well in Haskell; is what?<span class="firstTag" foo="bar">test</span>',
+ ),
+ '</> in content' => array(
+ 'input' => '</><span>test</span>',
+ 'expected' => '</><span class="firstTag" foo="bar">test</span>',
+ ),
+ 'custom asdf attribute' => array(
+ 'input' => '<hr asdf="test"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" asdf="test"><span class="secondTag">test</span>',
+ ),
+ 'custom data-* attribute' => array(
+ 'input' => '<div data-foo="bar"><p>Some content for a <span>test</span></p></div>',
+ 'expected' => '<div class="firstTag" foo="bar" data-foo="bar"><p class="secondTag">Some content for a <span>test</span></p></div>',
+ ),
+ 'tag inside of CDATA' => array(
+ 'input' => '<![CDATA[This <is> a <strong id="yes">HTML Tag</strong>]]><span>test</span>',
+ 'expected' => '<![CDATA[This <is> a <strong id="yes">HTML Tag</strong>]]><span class="firstTag" foo="bar">test</span>',
+ ),
+ );
+ }
+
+ /**
+ * Verifies that attribute and class updates still land on the first two
+ * matched tags when the surrounding markup is malformed.
+ *
+ * @ticket 56299
+ *
+ * @covers WP_HTML_Tag_Processor::add_class
+ * @covers WP_HTML_Tag_Processor::set_attribute
+ *
+ * @dataProvider data_updating_attributes_in_malformed_html
+ *
+ * @param string $html HTML to process.
+ * @param string $expected Expected updated HTML.
+ */
+ public function test_updating_attributes_in_malformed_html( $html, $expected ) {
+ $processor = new WP_HTML_Tag_Processor( $html );
+
+ // Update whichever tag is matched first.
+ $processor->next_tag();
+ $processor->set_attribute( 'foo', 'bar' );
+ $processor->add_class( 'firstTag' );
+
+ // Move on and update the next match.
+ $processor->next_tag();
+ $processor->add_class( 'secondTag' );
+
+ $actual = $processor->get_updated_html();
+ $failure_message = 'Did not properly update attributes and classnames given malformed input';
+
+ $this->assertSame( $expected, $actual, $failure_message );
+ }
+
+ /**
+ * Data provider for test_updating_attributes_in_malformed_html().
+ *
+ * Each data set is an array holding:
+ * - 'input': malformed or unusual HTML handed to the tag processor.
+ * - 'expected': HTML expected back after the test's updates.
+ *
+ * Character references inside these fixtures (`&quot;`, `&notit;`,
+ * `&imperial;`, `&quo;`, `&;`) are literal text and must stay undecoded:
+ * decoding `&quot;` would end the quoted attribute value early, and
+ * decoding the `&not` prefix of `&notit;` would remove the invalid entity
+ * that data set is named after.
+ *
+ * @return array[]
+ */
+ public function data_updating_attributes_in_malformed_html() {
+ // Built at runtime so the fixture source itself contains no raw NUL byte.
+ $null_byte = chr( 0 );
+
+ return array(
+ 'Invalid entity inside attribute value' => array(
+ 'input' => '<img src="https://s0.wp.com/i/atat.png" title="&; First <title> is &notit;" TITLE="second title" title="An Imperial &imperial; AT-AT"><span>test</span>',
+ 'expected' => '<img class="firstTag" foo="bar" src="https://s0.wp.com/i/atat.png" title="&; First <title> is &notit;" TITLE="second title" title="An Imperial &imperial; AT-AT"><span class="secondTag">test</span>',
+ ),
+ 'HTML tag opening inside attribute value' => array(
+ 'input' => '<pre id="<code" class="wp-block-code <code is poetry>"><code>This <is> a <strong is="true">thing.</code></pre><span>test</span>',
+ 'expected' => '<pre foo="bar" id="<code" class="wp-block-code <code is poetry> firstTag"><code class="secondTag">This <is> a <strong is="true">thing.</code></pre><span>test</span>',
+ ),
+ 'HTML tag brackets in attribute values and data markup' => array(
+ 'input' => '<pre id="<code->-block->" class="wp-block-code <code is poetry>"><code>This <is> a <strong is="true">thing.</code></pre><span>test</span>',
+ 'expected' => '<pre foo="bar" id="<code->-block->" class="wp-block-code <code is poetry> firstTag"><code class="secondTag">This <is> a <strong is="true">thing.</code></pre><span>test</span>',
+ ),
+ 'Single and double quotes in attribute value' => array(
+ 'input' => '<p title="Demonstrating how to use single quote (\') and double quote (&quot;)"><span>test</span>',
+ 'expected' => '<p class="firstTag" foo="bar" title="Demonstrating how to use single quote (\') and double quote (&quot;)"><span class="secondTag">test</span>',
+ ),
+ 'Unquoted attribute values' => array(
+ 'input' => '<hr a=1 a=2 a=3 a=5 /><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" a=1 a=2 a=3 a=5 /><span class="secondTag">test</span>',
+ ),
+ 'Double-quotes escaped in double-quote attribute value' => array(
+ 'input' => '<hr title="This is a &quot;double-quote&quot;"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" title="This is a &quot;double-quote&quot;"><span class="secondTag">test</span>',
+ ),
+ 'Unquoted attribute value' => array(
+ 'input' => '<hr id=code><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id=code><span class="secondTag">test</span>',
+ ),
+ 'Unquoted attribute value with tag-like value' => array(
+ 'input' => '<hr id= <code> ><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id= <code> ><span class="secondTag">test</span>',
+ ),
+ 'Unquoted attribute value with tag-like value followed by tag-like data' => array(
+ 'input' => '<hr id=code>><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id=code>><span class="secondTag">test</span>',
+ ),
+ 'id=&quo;code' => array(
+ 'input' => '<hr id=&quo;code><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id=&quo;code><span class="secondTag">test</span>',
+ ),
+ 'id/test=5' => array(
+ 'input' => '<hr id/test=5><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id/test=5><span class="secondTag">test</span>',
+ ),
+ '<hr> as the id value' => array(
+ 'input' => '<hr title="<hr>"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" title="<hr>"><span class="secondTag">test</span>',
+ ),
+ 'id=>code' => array(
+ 'input' => '<hr id=>code><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id=>code><span class="secondTag">test</span>',
+ ),
+ 'id"quo="test"' => array(
+ 'input' => '<hr id"quo="test"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id"quo="test"><span class="secondTag">test</span>',
+ ),
+ 'id without double quotation marks around null byte' => array(
+ 'input' => '<hr id' . $null_byte . 'zero="test"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id' . $null_byte . 'zero="test"><span class="secondTag">test</span>',
+ ),
+ 'Unexpected > before an attribute' => array(
+ 'input' => '<hr >id="test"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" >id="test"><span class="secondTag">test</span>',
+ ),
+ 'Unexpected = before an attribute' => array(
+ 'input' => '<hr =id="test"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" =id="test"><span class="secondTag">test</span>',
+ ),
+ 'Unexpected === before an attribute' => array(
+ 'input' => '<hr ===name="value"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" ===name="value"><span class="secondTag">test</span>',
+ ),
+ 'Missing closing data-tag tag' => array(
+ 'input' => 'The applicative operator <* works well in Haskell; <data-tag> is what?<span>test</span>',
+ 'expected' => 'The applicative operator <* works well in Haskell; <data-tag class="firstTag" foo="bar"> is what?<span class="secondTag">test</span>',
+ ),
+ 'Missing closing t3 tag' => array(
+ 'input' => '<3 is a heart but <t3> is a tag.<span>test</span>',
+ 'expected' => '<3 is a heart but <t3 class="firstTag" foo="bar"> is a tag.<span class="secondTag">test</span>',
+ ),
+ 'invalid comment opening tag' => array(
+ 'input' => '<?comment --><span>test</span>',
+ 'expected' => '<?comment --><span class="firstTag" foo="bar">test</span>',
+ ),
+ '=asdf as attribute name' => array(
+ 'input' => '<hr =asdf="tes"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" =asdf="tes"><span class="secondTag">test</span>',
+ ),
+ '== as attribute name with value' => array(
+ 'input' => '<hr ==="test"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" ==="test"><span class="secondTag">test</span>',
+ ),
+ '=5 as attribute' => array(
+ 'input' => '<hr =5><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" =5><span class="secondTag">test</span>',
+ ),
+ '= as attribute' => array(
+ 'input' => '<hr =><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" =><span class="secondTag">test</span>',
+ ),
+ '== as attribute' => array(
+ 'input' => '<hr ==><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" ==><span class="secondTag">test</span>',
+ ),
+ '=== as attribute' => array(
+ 'input' => '<hr ===><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" ===><span class="secondTag">test</span>',
+ ),
+ 'unsupported disabled attribute' => array(
+ 'input' => '<hr disabled><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" disabled><span class="secondTag">test</span>',
+ ),
+ 'malformed custom attributes' => array(
+ 'input' => '<hr a"sdf="test"><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" a"sdf="test"><span class="secondTag">test</span>',
+ ),
+ 'Multiple unclosed tags treated as a single tag' => array(
+ 'input' => <<<HTML
+ <hr id=">"code
+ <hr id="value>"code
+ <hr id="/>"code
+ <hr id="value/>"code
+ />
+ <span>test</span>
+HTML
+ ,
+ 'expected' => <<<HTML
+ <hr class="firstTag" foo="bar" id=">"code
+ <hr id="value>"code
+ <hr id="/>"code
+ <hr id="value/>"code
+ />
+ <span class="secondTag">test</span>
+HTML
+ ,
+ ),
+ '<hr id =5>' => array(
+ 'input' => '<hr id =5><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id =5><span class="secondTag">test</span>',
+ ),
+ '<hr id a =5>' => array(
+ 'input' => '<hr id a =5><span>test</span>',
+ 'expected' => '<hr class="firstTag" foo="bar" id a =5><span class="secondTag">test</span>',
+ ),
+ );
+ }
+}
</ins></span></pre>
</div>
</div>
</body>
</html>