View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.devacfr.maven.skins.reflow;
20  
21  import javax.annotation.Nonnull;
22  import javax.annotation.Nullable;
23  
24  import java.text.Normalizer;
25  import java.text.Normalizer.Form;
26  import java.util.ArrayList;
27  import java.util.Arrays;
28  import java.util.Collection;
29  import java.util.Collections;
30  import java.util.HashSet;
31  import java.util.List;
32  import java.util.Locale;
33  import java.util.Map;
34  import java.util.Map.Entry;
35  import java.util.Set;
36  import java.util.Stack;
37  import java.util.regex.Pattern;
38  
39  import com.google.common.base.Strings;
40  import com.google.common.collect.Lists;
41  import org.apache.velocity.tools.ToolContext;
42  import org.apache.velocity.tools.config.DefaultKey;
43  import org.apache.velocity.tools.generic.SafeConfig;
44  import org.apache.velocity.tools.generic.ValueParser;
45  import org.jsoup.Jsoup;
46  import org.jsoup.internal.StringUtil;
47  import org.jsoup.nodes.Document;
48  import org.jsoup.nodes.Element;
49  import org.jsoup.nodes.Node;
50  import org.jsoup.parser.Tag;
51  
52  import static java.util.Collections.emptyList;
53  import static java.util.Objects.requireNonNull;
54  
55  /**
56   * An Apache Velocity tool that provides utility methods to manipulate HTML code using
57   * <a href="http://jsoup.org/">jsoup</a> HTML5 parser.
58   * <p>
59   * The methods utilise <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">CSS selectors</a> to refer to
60   * specific elements for manipulation.
61   * </p>
62   *
63   * @author Andrius Velykis
64   * @author Christophe Friederich
65   * @since 1.0
66   * @see <a href="http://jsoup.org/">jsoup HTML parser</a>
67   * @see <a href= "http://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a>
68   */
69  @DefaultKey("htmlTool")
70  public class HtmlTool extends SafeConfig {
71  
72      private static final int SLUG_SIZE = 50;
73  
    /** Default separator used to generate slug heading names. */
75      public static final String DEFAULT_SLUG_SEPARATOR = "-";
76  
    /** Prefix of the heading ids associated with the table of contents. */
78      private static final String SEPARATOR_TOC = "_toc_";
79  
80      /** A list of all HTML heading classes (h1-6). */
81      private static final List<String> HEADINGS = Collections
82              .unmodifiableList(Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
83  
84      /** Enum indicating separator handling strategy for document partitioning. */
85      public enum JoinSeparator {
86          /**
87           * Keep separators at the start of partitions. The first partition will not have a separator.
88           */
89          AFTER,
90          /**
91           * Keep separators at the end of partitions. The last partition will not have a separator.
92           */
93          BEFORE,
94          /** Drop separators altogether. */
95          NO
96      }
97  
    /** Character encoding applied to the output settings of parsed documents (defaults to UTF-8). */
99      private String outputEncoding = "UTF-8";
100 
101     private boolean prettyPrint = true;
102 
103     /**
104      * {@inheritDoc}
105      *
106      * @see SafeConfig#configure(ValueParser)
107      */
108     @Override
109     protected void configure(final ValueParser values) {
110 
111         // retrieve the Velocity context for output encoding
112         final Object velocityContext = values.get("velocityContext");
113 
114         if (!(velocityContext instanceof ToolContext)) {
115             return;
116         }
117 
118         final ToolContext ctxt = (ToolContext) velocityContext;
119 
120         // get the output encoding
121         final Object outputEncodingObj = ctxt.get("outputEncoding");
122         if (outputEncodingObj instanceof String) {
123             this.outputEncoding = (String) outputEncodingObj;
124         }
125 
126         final Object prettyPrint = ctxt.get("prettyPrint");
127         if (prettyPrint instanceof Boolean) {
128             this.prettyPrint = (Boolean) prettyPrint;
129         }
130     }
131 
132     /**
133      * Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters
134      * (e.g. newline, tab) convert to a simple space
135      *
136      * @param html
137      *            html content to normalise.
138      * @return Returns normalised string.
139      */
140     @Nullable public String normaliseWhitespace(@Nullable final String html) {
141         if (Strings.isNullOrEmpty(html)) {
142             return null;
143         }
144         return StringUtil.normaliseWhitespace(html);
145     }
146 
147     /**
148      * Splits the given HTML content into partitions based on the given separator selector. The separators themselves
149      * are dropped from the results.
150      *
151      * @param content
152      *            body HTML content to split (can not be empty or {@code null}).
153      * @param separatorCssSelector
154      *            CSS selector for separators (can not be empty or {@code null}).
155      * @return a list of HTML partitions split on separator locations, but without the separators.
156      * @since 1.0
157      * @see #split(String, String, JoinSeparator)
158      */
159     public List<String> split(@Nonnull final String content, @Nonnull final String separatorCssSelector) {
160         return split(content, separatorCssSelector, JoinSeparator.NO);
161     }
162 
163     /**
164      * Splits the given HTML content into partitions based on the given separator selector. The separators are kept as
165      * first elements of the partitions.
166      * <p>
167      * Note that the first part is removed if the split was successful. This is because the first part does not include
168      * the separator.
169      * </p>
170      *
171      * @param content
172      *            HTML content to split
173      * @param separatorCssSelector
174      *            CSS selector for separators
175      * @return a list of HTML partitions split on separator locations (except the first one), with separators at the
176      *         beginning of each partition
177      * @since 1.0
178      * @see #split(String, String, JoinSeparator)
179      */
180     public List<String> splitOnStarts(final @Nonnull String content, final @Nonnull String separatorCssSelector) {
181 
182         final List<String> result = split(content, separatorCssSelector, JoinSeparator.AFTER);
183 
184         if (result == null || result.size() <= 1) {
185             // no result or just one part - return what we have
186             return result;
187         }
188 
189         // otherwise, drop the first part - the first split will be the first 'start'
190         // e.g. if we split on headings, the first part will contain everything
191         // before the first heading.
192         return result.subList(1, result.size());
193     }
194 
195     /**
196      * Splits the given HTML content into partitions based on the given separator selector. The separators are either
197      * dropped or joined with before/after depending on the indicated separator strategy.
198      *
199      * @param content
200      *            HTML content to split
201      * @param separatorCssSelector
202      *            CSS selector for separators
203      * @param separatorStrategy
204      *            strategy to drop or keep separators, one of "after", "before" or "no"
205      * @return a list of HTML partitions split on separator locations.
206      * @since 1.0
207      * @see #split(String, String, JoinSeparator)
208      */
209     public List<String> split(final @Nonnull String content,
210         final @Nonnull String separatorCssSelector,
211         final String separatorStrategy) {
212 
213         JoinSeparator sepStrategy;
214         if ("before".equals(separatorStrategy)) {
215             sepStrategy = JoinSeparator.BEFORE;
216         } else if ("after".equals(separatorStrategy)) {
217             sepStrategy = JoinSeparator.AFTER;
218         } else {
219             sepStrategy = JoinSeparator.NO;
220         }
221 
222         return split(content, separatorCssSelector, sepStrategy);
223     }
224 
225     /**
226      * Splits the given HTML content into partitions based on the given separator selector.The separators are either
227      * dropped or joined with before/after depending on the indicated separator strategy.
228      * <p>
229      * Note that splitting algorithm tries to resolve nested elements so that returned partitions are self-contained
230      * HTML elements. The nesting is normally contained within the first applicable partition.
231      * </p>
232      *
233      * @param content
234      *            Body HTML content to split
235      * @param separatorCssSelector
236      *            CSS selector for separators
237      * @param separatorStrategy
238      *            strategy to drop or keep separators
239      * @return a list of HTML partitions split on separator locations. If no splitting occurs, returns the original
240      *         content as the single element of the list
241      * @since 1.0
242      */
243     public List<String> split(@Nonnull final String content,
244         @Nonnull final String separatorCssSelector,
245         @Nonnull final JoinSeparator separatorStrategy) {
246 
247         requireNonNull(separatorStrategy);
248         final Element body = parse(content).body();
249 
250         final List<Element> separators = body.select(separatorCssSelector);
251         if (separators.size() > 0) {
252             final List<List<Element>> partitions = split(separators, separatorStrategy, body);
253 
254             final List<String> sectionHtml = new ArrayList<>();
255 
256             for (final List<Element> partition : partitions) {
257                 final String html = outerHtml(partition);
258                 if (!Strings.isNullOrEmpty(html)) {
259                     sectionHtml.add(outerHtml(partition));
260                 }
261             }
262 
263             return sectionHtml;
264         } else {
265             // nothing to split
266             return Collections.singletonList(content);
267         }
268     }
269 
270     /**
271      * Recursively splits the {@code parent} element based on the given {@code separators}. If a separator is
272      * encountered in the parent, it is split on that position. The outstanding nested elements go with the first of the
273      * partitions in each case.
274      *
275      * @param separators
276      * @param separatorStrategy
277      * @param parent
278      * @return list of partitions (as lists of root elements for each partition). Partition can be an empty list, e.g.
279      *         if the separator is at the start of the content.
280      */
281     private static List<List<Element>> split(final Collection<Element> separators,
282         final JoinSeparator separatorStrategy,
283         final Element parent) {
284 
285         final List<List<Element>> partitions = Lists.newLinkedList();
286 
287         for (final Element child : parent.children()) {
288 
289             if (separators.contains(child)) {
290                 // split here and do not go deeper
291 
292                 // first ensure there was a partition before
293                 // otherwise the split is not recognised on an outer level
294                 getLastPartition(partitions);
295 
296                 if (separatorStrategy == JoinSeparator.BEFORE) {
297                     // add to the last partition
298                     getLastPartition(partitions).add(child);
299                 }
300 
301                 // add an empty new partition
302                 final List<Element> newPartition = Lists.newLinkedList();
303                 partitions.add(newPartition);
304 
305                 if (separatorStrategy == JoinSeparator.AFTER) {
306                     // add to the new partition
307                     newPartition.add(child);
308                 }
309 
310             } else {
311                 // go deeper
312                 final List<List<Element>> childPartitions = split(separators, separatorStrategy, child);
313 
314                 // add the child to the last partition
315                 getLastPartition(partitions).add(child);
316 
317                 if (childPartitions.size() > 1) {
318                     // more than one partition:
319                     // only keep the first partition elements in the child
320                     // so for all other partitions, remove them from their parents
321 
322                     final List<Element> allChildren = child.children();
323                     final List<Element> firstPartition = childPartitions.get(0);
324 
325                     allChildren.removeAll(firstPartition);
326                     for (final Element removeChild : allChildren) {
327                         removeChild.remove();
328                     }
329 
330                     // add the remaining partitions
331                     for (final List<Element> nextPartition : childPartitions.subList(1, childPartitions.size())) {
332                         partitions.add(nextPartition);
333                     }
334                 }
335             }
336         }
337 
338         return partitions;
339     }
340 
341     /**
342      * Retrieves the last partition (as list of elements) or creates a new one if there was none before.
343      *
344      * @param partitions
345      * @return
346      */
347     private static List<Element> getLastPartition(final List<List<Element>> partitions) {
348         if (partitions.isEmpty()) {
349             final List<Element> newPartition = Lists.newLinkedList();
350             partitions.add(newPartition);
351             return newPartition;
352         } else {
353             return partitions.get(partitions.size() - 1);
354         }
355     }
356 
357     /**
358      * Outputs the list of partition root elements to HTML.
359      *
360      * @param elements
361      * @return
362      */
363     private static String outerHtml(final List<Element> elements) {
364 
365         switch (elements.size()) {
366             case 0:
367                 return "";
368 
369             case 1:
370                 return elements.get(0).outerHtml();
371 
372             default:
373                 // more than one element
374                 // wrap into <div> which we will remove afterwards
375                 final Element root = new Element(Tag.valueOf("div"), "");
376                 for (final Element elem : elements) {
377                     root.appendChild(elem);
378                 }
379 
380                 return root.html();
381         }
382     }
383 
384     /**
385      * Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited
386      * to a certain amount, e.g. to bring just the first of selected elements to the top.
387      *
388      * @param content
389      *            HTML content to reorder
390      * @param selector
391      *            CSS selector for elements to bring to top of the content
392      * @param amount
393      *            Maximum number of elements to reorder
394      * @return HTML content with reordered elements, or the original content if no such elements found.
395      * @since 1.0
396      */
397     public String reorderToTop(final String content, final String selector, final int amount) {
398         return reorderToTop(content, selector, amount, null);
399     }
400 
401     /**
402      * Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited
403      * to a certain amount, e.g. to bring just the first of selected elements to the top.
404      *
405      * @param content
406      *            HTML content to reorder
407      * @param selector
408      *            CSS selector for elements to bring to top of the content
409      * @param amount
410      *            Maximum number of elements to reorder
411      * @param wrapRemaining
412      *            HTML to wrap the remaining (non-reordered) part
413      * @return HTML content with reordered elements, or the original content if no such elements found.
414      * @since 1.0
415      */
416     public String reorderToTop(final String content,
417         final String selector,
418         final int amount,
419         final String wrapRemaining) {
420 
421         // extract the elements and then prepend them to the remaining body
422         final List<Element> extracted = extractElements(content, selector, amount);
423 
424         if (extracted.size() > 1) {
425 
426             final Element body = extracted.get(0);
427 
428             if (wrapRemaining != null) {
429                 wrapInner(body, wrapRemaining);
430             }
431 
432             final List<Element> elements = extracted.subList(1, extracted.size());
433 
434             // now prepend extracted elements to the body (in backwards to preserve original
435             // order)
436             for (int index = elements.size() - 1; index >= 0; index--) {
437                 body.prependChild(elements.get(index));
438             }
439 
440             return body.html();
441         } else {
442             // nothing to reorder
443             return content;
444         }
445     }
446 
447     private static Element wrapInner(final Element element, final String html) {
448 
449         // wrap everything into an additional <div> for wrapping
450         // otherwise there may be problems, e.g. with <body> element
451         final Element topDiv = new Element(Tag.valueOf("div"), "");
452         for (final Element topElem : element.children()) {
453             // add all elements in the body to the `topDiv`
454             topElem.remove();
455             topDiv.appendChild(topElem);
456         }
457 
458         // add topDiv to the body
459         element.appendChild(topDiv);
460 
461         // wrap topDiv
462         topDiv.wrap(html);
463         // now unwrap topDiv - will remove it from the hierarchy
464         topDiv.unwrap();
465 
466         return element;
467     }
468 
469     /**
470      * Extracts elements from the HTML content.
471      *
472      * @param content
473      * @param selector
474      * @param amount
475      * @return the remainder and a list of extracted elements. The main body (remainder after extraction) is always
476      *         returned as the first element of the list.
477      */
478     private List<Element> extractElements(final String content, final String selector, final int amount) {
479 
480         final Element body = parse(content).body();
481 
482         List<Element> elements = body.select(selector);
483         if (elements.size() > 0) {
484 
485             elements = filterParents(elements);
486 
487             if (amount >= 0) {
488                 // limit to the indicated amount
489                 elements = elements.subList(0, Math.min(amount, elements.size()));
490             }
491 
492             // remove all from their parents
493             for (final Element element : elements) {
494                 element.remove();
495             }
496         }
497 
498         final List<Element> results = new ArrayList<>();
499         // first element is the body
500         results.add(body);
501         results.addAll(elements);
502         return results;
503     }
504 
505     /**
506      * Filters the list of elements to only contain parent elements. This is to avoid both parent and child being in the
507      * list of elements.
508      *
509      * @param elements
510      * @return
511      */
512     private static List<Element> filterParents(final List<Element> elements) {
513         final List<Element> filtered = new ArrayList<>();
514         for (final Element element : elements) {
515             // get the intersection of parents and selected elements
516             final List<Element> parentsInter = element.parents();
517             parentsInter.retainAll(elements);
518             if (parentsInter.isEmpty()) {
519                 // no intersection - element's parents are not in the selected list
520                 filtered.add(element);
521             }
522         }
523 
524         return filtered;
525     }
526 
527     /**
528      * Extracts HTML elements from the main HTML content. The result consists of the extracted HTML elements and the
529      * remainder of HTML content, with these elements removed. Can be limited to a certain amount, e.g. to extract just
530      * the first of selected elements.
531      *
532      * @param content
533      *            HTML content to extract elements from
534      * @param selector
535      *            CSS selector for elements to extract
536      * @param amount
537      *            Maximum number of elements to extract
538      * @return HTML content of the extracted elements together with the remainder of the original content. If no
539      *         elements are found, the remainder contains the original content.
540      * @since 1.0
541      */
542     @Nonnull
543     public ExtractResult extract(final String content, final String selector, final int amount) {
544 
545         final List<Element> extracted = extractElements(content, selector, amount);
546 
547         if (extracted.size() > 1) {
548 
549             // first element is the remaining body, the rest are extracted
550             final Element body = extracted.get(0);
551             final List<Element> elements = extracted.subList(1, extracted.size());
552 
553             // convert to HTML
554             final List<String> elementStr = new ArrayList<>();
555             for (final Element el : elements) {
556                 elementStr.add(el.outerHtml());
557             }
558 
559             return new DefaultExtractResult(elementStr, body.html());
560         } else {
561             // nothing to extract
562             return new DefaultExtractResult(Collections.<String> emptyList(), content);
563         }
564     }
565 
566     /**
567      * A container to carry element extraction results. Contains the extracted element HTML code and the remainder of
568      * the body content with elements removed.
569      *
570      * @author Andrius Velykis
571      * @since 1.0
572      */
573     public interface ExtractResult {
574 
575         /**
576          * Retrieves the extracted HTML elements.
577          *
578          * @return List of HTML of extracted elements. Can be empty if no elements found.
579          */
580         List<String> getExtracted();
581 
582         /**
583          * Retrieves the content from which elements were extracted.
584          *
585          * @return The HTML content with extracted elements removed.
586          */
587         String getRemainder();
588     }
589 
590     /**
591      * @author Christophe Friederich
592      */
593     private static final class DefaultExtractResult implements ExtractResult {
594 
595         /** */
596         private final List<String> extracted;
597 
598         /** */
599         private final String remainder;
600 
601         private DefaultExtractResult(final List<String> extracted, final String remainder) {
602             this.extracted = extracted;
603             this.remainder = remainder;
604         }
605 
606         @Override
607         public List<String> getExtracted() {
608             return Collections.unmodifiableList(extracted);
609         }
610 
611         @Override
612         public String getRemainder() {
613             return remainder;
614         }
615     }
616 
617     /**
618      * Sets attribute to the given value on elements in HTML.
619      *
620      * @param content
621      *            HTML content to set attributes on
622      * @param selector
623      *            CSS selector for elements to modify
624      * @param attributeKey
625      *            Attribute name
626      * @param value
627      *            Attribute value
628      * @return HTML content with modified elements. If no elements are found, the original content is returned.
629      * @since 1.0
630      */
631     public String setAttr(final String content, final String selector, final String attributeKey, final String value) {
632 
633         final Element body = parse(content).body();
634 
635         final List<Element> elements = body.select(selector);
636         if (elements.size() > 0) {
637 
638             for (final Element element : elements) {
639                 element.attr(attributeKey, value);
640             }
641 
642             return body.html();
643         } else {
644             // nothing to update
645             return content;
646         }
647     }
648 
649     /**
650      * Parses body fragment to the {@code <body>} element.
651      *
652      * @param content
653      *            body HTML fragment (can not be {@code null}).
654      * @return the {@code body} element of the parsed content
655      */
656     public Document parse(@Nonnull final String content) {
657         final Document doc = Jsoup.parseBodyFragment(content);
658         doc.outputSettings().charset(outputEncoding).prettyPrint(prettyPrint);
659         return doc;
660     }
661 
662     /**
663      * Retrieves attribute value on elements in HTML. Will return all attribute values for the selector, since there can
664      * be more than one element.
665      *
666      * @param content
667      *            HTML content to read attributes from
668      * @param selector
669      *            CSS selector for elements to find
670      * @param attributeKey
671      *            Attribute name
672      * @return Attribute values for all matching elements. If no elements are found, empty list is returned.
673      * @since 1.0
674      */
675     public List<String> getAttr(final String content, final String selector, final String attributeKey) {
676 
677         final Element body = parse(content).body();
678 
679         final List<Element> elements = body.select(selector);
680         final List<String> attrs = new ArrayList<>();
681 
682         for (final Element element : elements) {
683             final String attrValue = element.attr(attributeKey);
684             attrs.add(attrValue);
685         }
686 
687         return attrs;
688     }
689 
690     /**
691      * Adds given class names to the elements in HTML.
692      *
693      * @param content
694      *            HTML content to modify
695      * @param selector
696      *            CSS selector for elements to add classes to
697      * @param classNames
698      *            Names of classes to add to the selected elements
699      * @param amount
700      *            Maximum number of elements to modify
701      * @return HTML content with modified elements. If no elements are found, the original content is returned.
702      * @since 1.0
703      */
704     public String addClass(final String content,
705         final String selector,
706         final List<String> classNames,
707         final int amount) {
708 
709         final Element body = parse(content).body();
710 
711         List<Element> elements = body.select(selector);
712         if (amount >= 0) {
713             // limit to the indicated amount
714             elements = elements.subList(0, Math.min(amount, elements.size()));
715         }
716 
717         if (elements.size() > 0) {
718 
719             for (final Element element : elements) {
720                 for (final String className : classNames) {
721                     element.addClass(className);
722                 }
723             }
724 
725             return body.html();
726         } else {
727             // nothing to update
728             return content;
729         }
730     }
731 
732     /**
733      * Adds given class names to the elements in HTML.
734      *
735      * @param content
736      *            HTML content to modify
737      * @param selector
738      *            CSS selector for elements to add classes to
739      * @param classNames
740      *            Names of classes to add to the selected elements
741      * @return HTML content with modified elements. If no elements are found, the original content is returned.
742      * @since 1.0
743      */
744     public String addClass(final String content, final String selector, final List<String> classNames) {
745         return addClass(content, selector, classNames, -1);
746     }
747 
748     /**
749      * Adds given class to the elements in HTML.
750      *
751      * @param content
752      *            HTML content to modify
753      * @param selector
754      *            CSS selector for elements to add the class to
755      * @param className
756      *            Name of class to add to the selected elements
757      * @return HTML content with modified elements. If no elements are found, the original content is returned.
758      * @since 1.0
759      */
760     public String addClass(final String content, final String selector, final String className) {
761         return addClass(content, selector, Collections.singletonList(className));
762     }
763 
764     /**
765      * Wraps elements in HTML with the given HTML.
766      *
767      * @param content
768      *            HTML content to modify
769      * @param selector
770      *            CSS selector for elements to wrap
771      * @param wrapHtml
772      *            HTML to use for wrapping the selected elements
773      * @param amount
774      *            Maximum number of elements to modify
775      * @return HTML content with modified elements. If no elements are found, the original content is returned.
776      * @since 1.0
777      */
778     public String wrap(final String content, final String selector, final String wrapHtml, final int amount) {
779 
780         final Element body = parse(content).body();
781 
782         List<Element> elements = body.select(selector);
783         if (amount >= 0) {
784             // limit to the indicated amount
785             elements = elements.subList(0, Math.min(amount, elements.size()));
786         }
787 
788         if (elements.size() > 0) {
789 
790             for (final Element element : elements) {
791                 element.wrap(wrapHtml);
792             }
793 
794             return body.html();
795         } else {
796             // nothing to update
797             return content;
798         }
799     }
800 
801     /**
802      * Removes elements from HTML.
803      *
804      * @param content
805      *            HTML content to modify
806      * @param selector
807      *            CSS selector for elements to remove
808      * @return HTML content with removed elements. If no elements are found, the original content is returned.
809      * @since 1.0
810      */
811     public String remove(final String content, final String selector) {
812 
813         final Element body = parse(content).body();
814 
815         final List<Element> elements = body.select(selector);
816         if (elements.size() > 0) {
817             for (final Element element : elements) {
818                 element.remove();
819             }
820 
821             return body.html();
822         } else {
823             // nothing changed
824             return content;
825         }
826     }
827 
828     /**
829      * Replaces elements in HTML.
830      *
831      * @param content
832      *            HTML content to modify
833      * @param selector
834      *            CSS selector for elements to replace
835      * @param replacement
836      *            HTML replacement (must parse to a single element)
837      * @return HTML content with replaced elements. If no elements are found, the original content is returned.
838      * @since 1.0
839      */
840     public String replace(final String content, final String selector, final String replacement) {
841         return replaceAll(content, Collections.singletonMap(selector, replacement));
842     }
843 
844     /**
845      * Replaces elements in HTML.
846      *
847      * @param content
848      *            HTML content to modify
849      * @param replacements
850      *            Map of CSS selectors to their replacement HTML texts. CSS selectors find elements to be replaced with
851      *            the HTML in the mapping. The HTML must parse to a single element.
852      * @return HTML content with replaced elements. If no elements are found, the original content is returned.
853      * @since 1.0
854      */
855     public String replaceAll(final String content, final Map<String, String> replacements) {
856 
857         final Element body = parse(content).body();
858 
859         boolean modified = false;
860         for (final Entry<String, String> replacementEntry : replacements.entrySet()) {
861             final String selector = replacementEntry.getKey();
862             final String replacement = replacementEntry.getValue();
863 
864             final List<Element> elements = body.select(selector);
865             if (elements.size() > 0) {
866 
867                 // take the first child
868                 final Element replacementElem = parse(replacement).body().child(0);
869 
870                 if (replacementElem != null) {
871                     for (final Element element : elements) {
872                         element.replaceWith(replacementElem.clone());
873                     }
874 
875                     modified = true;
876                 }
877             }
878         }
879 
880         if (modified) {
881             return body.html();
882         } else {
883             // nothing changed
884             return content;
885         }
886     }
887 
888     /**
889      * Replaces All elements in HTML corresponding to <code>selector</code> while preserving the content of this
890      * element.
891      *
892      * @param content
893      *            HTML content to modify
894      * @param selector
895      *            CSS selector for elements to replace
896      * @param newElement
897      *            HTML replacement (must parse to a single element)
898      * @return HTML content with replaced elements. If no elements are found, the original content is returned.
899      * @since 2.0
900      */
901     public String replaceWith(final String content, final String selector, final String newElement) {
902 
903         final Element body = parse(content).body();
904 
905         boolean modified = false;
906         final List<Element> elements = body.select(selector);
907         if (elements.size() > 0) {
908 
909             // take the first child
910             final Element replacementElem = parse(newElement).body().child(0);
911 
912             if (replacementElem != null) {
913                 for (final Element element : elements) {
914                     final List<Node> children = element.childNodes();
915                     final Element el = replacementElem.clone();
916                     for (final Node child : children) {
917                         el.appendChild(child.clone());
918                     }
919                     element.replaceWith(el);
920                 }
921 
922                 modified = true;
923             }
924         }
925 
926         if (modified) {
927             return body.html();
928         } else {
929             // nothing changed
930             return content;
931         }
932     }
933 
934     /**
935      * Retrieves text content of the selected elements in HTML. Renders the element's text as it would be displayed on
936      * the web page (including its children).
937      *
938      * @param content
939      *            HTML content with the elements
940      * @param selector
941      *            CSS selector for elements to extract contents
942      * @return A list of element texts as rendered to display. Empty list if no elements are found.
943      * @since 1.0
944      */
945     public List<String> text(@Nullable final String content, @Nonnull final String selector) {
946         if (Strings.isNullOrEmpty(content)) {
947             return emptyList();
948         }
949         final Element body = parse(content).body();
950 
951         final List<Element> elements = body.select(selector);
952         final List<String> texts = new ArrayList<>();
953 
954         for (final Element element : elements) {
955             texts.add(element.text());
956         }
957 
958         return texts;
959     }
960 
961     /**
962      * Transforms the given HTML content by moving anchor ({@code <a name="myheading">}) names to IDs for heading
963      * elements.
964      * <p>
965      * The anchors are used to indicate positions within a HTML page. In HTML5, however, the {@code name} attribute is
966      * no longer supported on {@code <a>}) tag. The positions within pages are indicated using {@code id} attribute
967      * instead, e.g. {@code
968      *
969      * 
970     <h1 id="myheading">}.
971      * </p>
972      * <p>
973      * The method finds anchors inside, immediately before or after the heading tags and uses their name as heading
974      * {@code id} instead. The anchors themselves are removed.
975      * </p>
976      *
977      * @param content
978      *            HTML content to modify
979      * @return HTML content with modified elements. Anchor names are used for adjacent headings, and anchor tags are
980      *         removed. If no elements are found, the original content is returned.
981      * @since 1.0
982      */
983     public String headingAnchorToId(final String content) {
984 
985         final Element body = parse(content).body();
986 
987         // selectors for headings without IDs
988         final List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
989 
990         // selector for anchor with name attribute only
991         final String nameA = "a[name]:not([href])";
992 
993         // select all headings that have inner named anchor
994         final List<Element> headingsInnerA = body
995                 .select(String.join(", ", concat(headNoIds, ":has(" + nameA + ")", true)));
996 
997         boolean modified = false;
998         for (final Element heading : headingsInnerA) {
999             final List<Element> anchors = heading.select(nameA);
1000             // take first
1001             if (!anchors.isEmpty()) {
1002                 anchorToId(heading, anchors.get(0));
1003                 modified = true;
1004             }
1005         }
1006 
1007         // select all headings that have a preceding named anchor
1008         final List<Element> headingsPreA = body.select(String.join(", ", concat(headNoIds, nameA + " + ", false)));
1009 
1010         for (final Element heading : headingsPreA) {
1011             final Element anchor = heading.previousElementSibling();
1012             if (anchor != null) {
1013                 anchorToId(heading, anchor);
1014                 modified = true;
1015             }
1016         }
1017 
1018         // select all headings that are followed by a named anchor
1019         // no selector available for that, so first select the anchors
1020         // then retrieve the headings
1021         final List<Element> anchorsPreH = body.select(String.join(", ", concat(headNoIds, " + " + nameA, true)));
1022 
1023         for (final Element anchor : anchorsPreH) {
1024             final Element heading = anchor.previousElementSibling();
1025             if (heading != null) {
1026                 anchorToId(heading, anchor);
1027                 modified = true;
1028             }
1029         }
1030 
1031         if (modified) {
1032             return body.html();
1033         } else {
1034             // nothing to update
1035             return content;
1036         }
1037     }
1038 
1039     /**
1040      * Moves anchor name to heading id, if one does not exist. Removes the anchor.
1041      *
1042      * @param heading
1043      * @param anchor
1044      */
1045     private static void anchorToId(final Element heading, final Element anchor) {
1046 
1047         if ("a".equals(anchor.tagName()) && heading.id().isEmpty()) {
1048             final String aName = anchor.attr("name");
1049             if (!aName.isEmpty()) {
1050                 // set the anchor name as heading ID
1051                 heading.attr("id", aName);
1052 
1053                 // remove the anchor
1054                 anchor.remove();
1055             }
1056         }
1057     }
1058 
1059     /**
1060      * Utility method to concatenate a String to a list of Strings. The text can be either appended or prepended.
1061      *
1062      * @param elements
1063      *            list of elements to append/prepend the text to
1064      * @param text
1065      *            the given text to append/prepend
1066      * @param append
1067      *            if {@code true}, text will be appended to the elements. If {@code false}, it will be prepended
1068      * @return list of elements with the text appended/prepended
1069      * @since 1.0
1070      */
1071     public static List<String> concat(final List<String> elements, final String text, final boolean append) {
1072         final List<String> concats = new ArrayList<>();
1073 
1074         for (final String element : elements) {
1075             concats.add(append ? element + text : text + element);
1076         }
1077 
1078         return concats;
1079     }
1080 
1081     /**
1082      * Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that do not have one.
1083      * <p>
1084      * IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a heading tag without an
1085      * {@code id} is found, its "slug" is generated automatically based on the heading contents and used as the ID.
1086      * </p>
1087      * <p>
1088      * Note that the algorithm also modifies existing IDs that have symbols not allowed in CSS selectors, e.g. ":", ".",
1089      * etc. The symbols are removed.
1090      * </p>
1091      *
1092      * @param pageType
1093      *            The type of page.
1094      * @param currentPage
1095      *            The name of current page.
1096      * @param content
1097      *            HTML content to modify.
1098      * @param idSeparator
1099      *            the seperator used to slug ID.
1100      * @return Returns a {@link String} representing HTML content with all heading elements having {@code id}
1101      *         attributes. If all headings were with IDs already, the original content is returned.
1102      * @since 1.0
1103      */
1104     public String ensureHeadingIds(final String pageType,
1105         final String currentPage,
1106         final String content,
1107         final String idSeparator) {
1108         final List<String> excludedPages = Arrays.asList("checkstyle-aggregate", "checkstyle");
1109 
1110         final Element body = parse(content).body();
1111 
1112         // exclude pages
1113         if (excludedPages.contains(currentPage)) {
1114             return content;
1115         }
1116 
1117         // first find all existing IDs (to avoid generating duplicates)
1118         final List<Element> idElems = body.select("*[id]");
1119 
1120         final Set<String> ids = new HashSet<>();
1121         boolean modified = false;
1122         for (final Element idElem : idElems) {
1123 
1124             // fix all existing IDs - remove colon and other symbols which mess up jQuery
1125             final String id = idElem.id();
1126             idElem.attr("id", slug(id, idSeparator));
1127             modified = true;
1128 
1129             ids.add(idElem.id());
1130         }
1131 
1132         // create unique id for all heading elements
1133         final List<String> headIds = concat(HEADINGS, "[id]", true);
1134         // select all headings that have an ID
1135         final List<Element> headingIds = body.select(String.join(", ", headIds));
1136 
1137         for (final Element heading : headingIds) {
1138             final String headingText = heading.text();
1139             String headingSlug = slug(headingText, idSeparator);
1140             // also limit slug to 50 symbols
1141             if (headingSlug.length() > SLUG_SIZE) {
1142                 headingSlug = headingSlug.substring(0, SLUG_SIZE);
1143             }
1144             final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug);
1145 
1146             heading.attr("id", headingId);
1147         }
1148 
1149         final List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
1150 
1151         // select all headings that do not have an ID
1152         final List<Element> headingsNoId = body.select(String.join(", ", headNoIds));
1153 
1154         if (!headingsNoId.isEmpty() || modified) {
1155             for (final Element heading : headingsNoId) {
1156 
1157                 final String headingText = heading.text();
1158                 String headingSlug = slug(headingText, idSeparator);
1159                 // also limit slug to 50 symbols
1160                 if (headingSlug.length() > SLUG_SIZE) {
1161                     headingSlug = headingSlug.substring(0, SLUG_SIZE);
1162                 }
1163                 final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug);
1164 
1165                 heading.attr("id", headingId);
1166             }
1167         }
1168 
1169         return body.html();
1170     }
1171 
1172     /**
1173      * Generated a unique ID within the given set of IDs. Appends an incrementing number for duplicates.
1174      *
1175      * @param pageType
1176      *            The type of page.
1177      * @param currentPage
1178      *            Tthe name of current page.
1179      * @param ids
1180      *            The list of ID already existing or used.
1181      * @param idBase
1182      *            The prefix to use.
1183      * @return Returns a new {@link String} representing a new unique ID.
1184      */
1185     private static String generateUniqueId(final String pageType,
1186         final String currentPage,
1187         final Set<String> ids,
1188         final String idBase) {
1189         String id = idBase;
1190         int counter = 1;
1191         while (ids.contains(id)) {
1192             id = idBase + String.valueOf(counter++);
1193         }
1194 
1195         // put the newly generated one into the set
1196         ids.add(id);
1197         if ("frame".equals(pageType)) {
1198             id = currentPage + SEPARATOR_TOC + id;
1199         }
1200         return id;
1201     }
1202 
1203     /**
1204      * Fixes table heads: wraps rows with {@code
1205      *
1206      * 
1207     <th>} (table heading) elements into {@code <thead>} element if they are currently in {@code <tbody>}.
1208      *
1209      * @param content
1210      *            HTML content to modify
1211      * @return HTML content with all table heads fixed. If all heads were correct, the original content is returned.
1212      * @since 1.0
1213      */
1214     public String fixTableHeads(final String content) {
1215 
1216         final Element body = parse(content).body();
1217 
1218         final List<Element> tables = body.select("table");
1219 
1220         for (final Element table : tables) {
1221             // select rows with <th> tags within <tbody>
1222             final List<Element> tableHeadRows = table.select("tbody > tr:has(th)");
1223             // convert only table containing one tr head.
1224             if (tableHeadRows.size() == 1) {
1225 
1226                 for (final Element row : tableHeadRows) {
1227 
1228                     // remove row from its original position
1229                     row.remove();
1230 
1231                     // create table header element with the row
1232                     final Element thead = new Element(Tag.valueOf("thead"), "");
1233                     thead.appendChild(row);
1234                     // add at the beginning of the table
1235                     table.prependChild(thead);
1236                 }
1237             }
1238         }
1239         return body.html();
1240     }
1241 
1242     /** */
1243     private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");
1244 
1245     /** */
1246     private static final Pattern WHITESPACE = Pattern.compile("[\\s]");
1247 
1248     /**
1249      * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs). Uses "-"
1250      * as a whitespace separator.
1251      *
1252      * @param input
1253      *            text to generate the slug from
1254      * @return the slug of the given text that contains alphanumeric symbols and "-" only
1255      * @since 1.0
1256      */
1257     public static String slug(final String input) {
1258         return slug(input, DEFAULT_SLUG_SEPARATOR);
1259     }
1260 
1261     /**
1262      * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs).
1263      *
1264      * @param input
1265      *            text to generate the slug from
1266      * @param separator
1267      *            separator for whitespace replacement
1268      * @return the slug of the given text that contains alphanumeric symbols and separator only
1269      * @since 1.0
1270      * @see <a href=
1271      *      "http://www.codecodex.com/wiki/Generate_a_url_slug">http://www.codecodex.com/wiki/Generate_a_url_slug</a>
1272      */
1273     private static String slug(final String input, final String separator) {
1274         final String nowhitespace = WHITESPACE.matcher(input).replaceAll(separator);
1275         final String normalized = Normalizer.normalize(nowhitespace, Form.NFD);
1276         return NONLATIN.matcher(normalized).replaceAll("").toLowerCase(Locale.ENGLISH);
1277     }
1278 
1279     /**
1280      * Reads all headings in the given HTML content as a hierarchy. Subsequent smaller headings are nested within bigger
1281      * ones, e.g. <code>&lt;h2&gt;</code> is nested under preceding <code>&lt;h1&gt;</code>.
1282      * <p>
1283      * Only headings with IDs are included in the hierarchy. The result elements contain ID and heading text for each
1284      * heading. The hierarchy is useful to generate a Table of Contents for a page.
1285      * </p>
1286      *
1287      * @param content
1288      *            HTML content to extract heading hierarchy from
1289      * @param sections
1290      *            list of all sections
1291      * @return a list of top-level heading items (with id and text). The remaining headings are nested within these
1292      *         top-level items. Empty list if no headings are in the content.
1293      * @since 1.0
1294      */
1295     public List<? extends IdElement> headingTree(final String content, final List<String> sections) {
1296 
1297         final List<String> sectionContents = this.split(content, "hr");
1298         final List<String> headIds = concat(HEADINGS, "[id]:not(.no-anchor)", true);
1299         final List<HeadingItem> headingItems = new ArrayList<>();
1300 
1301         int index = 0;
1302         for (final String sectionContent : sectionContents) {
1303             final String sectionType = index < sections.size() ? sections.get(index++) : "";
1304 
1305             // exclude carousel headings
1306             if ("carousel".equals(sectionType)) {
1307                 continue;
1308             }
1309             final Element body = parse(sectionContent).body();
1310             // select all headings that have an ID
1311             final List<Element> headings = body.select(String.join(", ", headIds));
1312             for (final Element heading : headings) {
1313                 headingItems
1314                         .add(new HeadingItem(heading.id(), heading.nodeName(), heading.text(), headingIndex(heading)));
1315             }
1316         }
1317 
1318         final List<HeadingItem> topHeadings = new ArrayList<>();
1319         final Stack<HeadingItem> parentHeadings = new Stack<>();
1320 
1321         for (final HeadingItem heading : headingItems) {
1322 
1323             while (!parentHeadings.isEmpty() && parentHeadings.peek().headingLevel >= heading.headingLevel) {
1324                 parentHeadings.pop();
1325             }
1326 
1327             if (parentHeadings.isEmpty()) {
1328                 // top level heading - no parents
1329                 topHeadings.add(heading);
1330             } else {
1331                 // add to the children of topmost stack parent
1332                 parentHeadings.peek().children.add(heading);
1333             }
1334 
1335             // push the heading onto stack
1336             parentHeadings.push(heading);
1337         }
1338 
1339         return topHeadings;
1340     }
1341 
1342     /**
1343      * Retrieves numeric index of a heading.
1344      *
1345      * @param element
1346      * @return
1347      */
1348     private static int headingIndex(final Element element) {
1349         final String tagName = element.tagName();
1350         if (tagName.startsWith("h")) {
1351             try {
1352                 return Integer.parseInt(tagName.substring(1));
1353             } catch (final Exception ex) {
1354                 throw new IllegalArgumentException("Must be a header tag: " + tagName, ex);
1355             }
1356         } else {
1357             throw new IllegalArgumentException("Must be a header tag: " + tagName);
1358         }
1359     }
1360 
1361     /**
1362      * @author Christophe Friederich
1363      */
1364     private static final class HeadingItem implements IdElement {
1365 
1366         /** */
1367         private final String id;
1368 
1369         /** */
1370         private final String tagName;
1371 
1372         /** */
1373         private final String text;
1374 
1375         /** */
1376         private final int headingLevel;
1377 
1378         /** */
1379         private final List<HeadingItem> children = new ArrayList<>();
1380 
1381         private HeadingItem(final String id, final String tagName, final String text, final int headingLevel) {
1382             this.id = id;
1383             this.tagName = tagName;
1384             this.text = text;
1385             this.headingLevel = headingLevel;
1386         }
1387 
1388         @Override
1389         public String getId() {
1390             return id;
1391         }
1392 
1393         @Override
1394         public String getTagName() {
1395             return tagName;
1396         }
1397 
1398         @Override
1399         public String getText() {
1400             return text;
1401         }
1402 
1403         @Override
1404         public List<HeadingItem> getItems() {
1405             return Collections.unmodifiableList(children);
1406         }
1407 
1408         @Override
1409         public int getHeadingLevel() {
1410             return headingLevel;
1411         }
1412     }
1413 
1414     /**
1415      * Representation of a HTML element with ID and a text content. Other such elements can be nested within.
1416      *
1417      * @author Andrius Velykis
1418      * @since 1.0
1419      */
1420     public interface IdElement {
1421 
1422         /**
1423          * Retrieves the ID of the HTML element (attribute {@code id}).
1424          *
1425          * @return element {@code id} value
1426          */
1427         String getId();
1428 
1429         /**
1430          * @return Returns the tag name of element.
1431          */
1432         String getTagName();
1433 
1434         /**
1435          * Retrieves the text contents of the HTML element (rendered for display).
1436          *
1437          * @return text contents of the element
1438          */
1439         String getText();
1440 
1441         /**
1442          * @return Returns the level of heading.
1443          */
1444         int getHeadingLevel();
1445 
1446         /**
1447          * Retrieves the children of the HTML element (nested within the element).
1448          *
1449          * @return nested items within the element
1450          */
1451         List<? extends IdElement> getItems();
1452     }
1453 }