View Javadoc
1   /*
2   * Copyright 2012-2025 Christophe Friederich
3   *
4   * Licensed under the Apache License, Version 2.0 (the "License");
5   * you may not use this file except in compliance with the License.
6   * You may obtain a copy of the License at
7   *
8   * http://www.apache.org/licenses/LICENSE-2.0
9   *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16  package org.devacfr.maven.skins.reflow;
17  
18  import static java.util.Collections.emptyList;
19  import static java.util.Objects.requireNonNull;
20  
21  import com.google.common.base.Strings;
22  import com.google.common.collect.Lists;
23  import com.google.common.collect.Sets;
24  import java.text.Normalizer;
25  import java.text.Normalizer.Form;
26  import java.util.ArrayList;
27  import java.util.Arrays;
28  import java.util.Collection;
29  import java.util.Collections;
30  import java.util.HashSet;
31  import java.util.List;
32  import java.util.Locale;
33  import java.util.Map;
34  import java.util.Map.Entry;
35  import java.util.Set;
36  import java.util.Stack;
37  import java.util.regex.Pattern;
38  import javax.annotation.Nonnull;
39  import javax.annotation.Nullable;
40  import org.apache.commons.lang3.builder.ToStringBuilder;
41  import org.apache.velocity.tools.ToolContext;
42  import org.apache.velocity.tools.config.DefaultKey;
43  import org.apache.velocity.tools.generic.SafeConfig;
44  import org.apache.velocity.tools.generic.ValueParser;
45  import org.jsoup.Jsoup;
46  import org.jsoup.internal.StringUtil;
47  import org.jsoup.nodes.Document;
48  import org.jsoup.nodes.Element;
49  import org.jsoup.nodes.Node;
50  import org.jsoup.parser.Tag;
51  import org.slf4j.Logger;
52  import org.slf4j.LoggerFactory;
53  
54  /**
55   * An Apache Velocity tool that provides utility methods to manipulate HTML code using
56   * <a href="http://jsoup.org/">jsoup</a> HTML5 parser.
57   * <p>
58   * The methods utilise <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">CSS selectors</a> to refer to
59   * specific elements for manipulation.
60   * </p>
61   *
62   * @author Andrius Velykis
63   * @author Christophe Friederich
64   * @since 1.0
65   * @see <a href="http://jsoup.org/">jsoup HTML parser</a>
66   * @see <a href= "http://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a>
67   */
68  @DefaultKey("htmlTool")
69  public class HtmlTool extends SafeConfig {
70  
71    /** */
72    private static final Logger LOGGER = LoggerFactory.getLogger(HtmlTool.class);
73  
74    private static final int SLUG_SIZE = 50;
75  
76    /** Default separator using to generate slug heading name. */
77    public static final String DEFAULT_SLUG_SEPARATOR = "-";
78  
79    /** prefix heading id associated to table of contents. */
80    private static final String SEPARATOR_TOC = "_toc_";
81  
82    /** A list of all HTML heading classes (h1-6). */
83    private static final List<String> HEADINGS = Collections
84        .unmodifiableList(Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
85  
86    /** Enum indicating separator handling strategy for document partitioning. */
87    public enum JoinSeparator {
88      /**
89       * Keep separators at the start of partitions. The first partition will not have a separator.
90       */
91      AFTER,
92      /**
93       * Keep separators at the end of partitions. The last partition will not have a separator.
94       */
95      BEFORE,
96      /** Drop separators altogether. */
97      NO
98    }
99  
100   /** */
101   private String outputEncoding = "UTF-8";
102 
103   private boolean prettyPrint = true;
104 
105   /**
106    * {@inheritDoc}
107    *
108    * @see SafeConfig#configure(ValueParser)
109    */
110   @Override
111   protected void configure(final ValueParser values) {
112 
113     // retrieve the Velocity context for output encoding
114     final Object velocityContext = values.get("velocityContext");
115 
116     if (!(velocityContext instanceof ToolContext)) {
117       return;
118     }
119 
120     final ToolContext ctxt = (ToolContext) velocityContext;
121 
122     // get the output encoding
123     final Object outputEncodingObj = ctxt.get("outputEncoding");
124     if (outputEncodingObj instanceof String) {
125       this.outputEncoding = (String) outputEncodingObj;
126     }
127 
128     final Object prettyPrint = ctxt.get("prettyPrint");
129     if (prettyPrint instanceof Boolean) {
130       this.prettyPrint = (Boolean) prettyPrint;
131     }
132   }
133 
134   /**
135    * Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters
136    * (e.g. newline, tab) convert to a simple space
137    *
138    * @param html
139    *          html content to normalise.
140    * @return Returns normalised string.
141    */
142   @Nullable public String normaliseWhitespace(@Nullable final String html) {
143     if (html == null) {
144       return null;
145     }
146     return StringUtil.normaliseWhitespace(html);
147   }
148 
149   /**
150    * Splits the given HTML content into partitions based on the given separator selector. The separators themselves are
151    * dropped from the results.
152    *
153    * @param content
154    *          body HTML content to split (can not be empty or {@code null}).
155    * @param separatorCssSelector
156    *          CSS selector for separators (can not be empty or {@code null}).
157    * @return a list of HTML partitions split on separator locations, but without the separators.
158    * @since 1.0
159    * @see #split(String, String, JoinSeparator)
160    */
161   public List<String> split(@Nonnull final String content, @Nonnull final String separatorCssSelector) {
162     return split(content, separatorCssSelector, JoinSeparator.NO);
163   }
164 
165   /**
166    * Splits the given HTML content into partitions based on the given separator selector. The separators are kept as
167    * first elements of the partitions.
168    * <p>
169    * Note that the first part is removed if the split was successful. This is because the first part does not include
170    * the separator.
171    * </p>
172    *
173    * @param content
174    *          HTML content to split
175    * @param separatorCssSelector
176    *          CSS selector for separators
177    * @return a list of HTML partitions split on separator locations (except the first one), with separators at the
178    *         beginning of each partition
179    * @since 1.0
180    * @see #split(String, String, JoinSeparator)
181    */
182   public List<String> splitOnStarts(final @Nonnull String content, final @Nonnull String separatorCssSelector) {
183 
184     final List<String> result = split(content, separatorCssSelector, JoinSeparator.AFTER);
185 
186     if (result == null || result.size() <= 1) {
187       // no result or just one part - return what we have
188       return result;
189     }
190 
191     // otherwise, drop the first part - the first split will be the first 'start'
192     // e.g. if we split on headings, the first part will contain everything
193     // before the first heading.
194     return result.subList(1, result.size());
195   }
196 
197   /**
198    * Splits the given HTML content into partitions based on the given separator selector. The separators are either
199    * dropped or joined with before/after depending on the indicated separator strategy.
200    *
201    * @param content
202    *          HTML content to split
203    * @param separatorCssSelector
204    *          CSS selector for separators
205    * @param separatorStrategy
206    *          strategy to drop or keep separators, one of "after", "before" or "no"
207    * @return a list of HTML partitions split on separator locations.
208    * @since 1.0
209    * @see #split(String, String, JoinSeparator)
210    */
211   public List<String> split(final @Nonnull String content,
212     final @Nonnull String separatorCssSelector,
213     final String separatorStrategy) {
214 
215     JoinSeparator sepStrategy;
216     if ("before".equals(separatorStrategy)) {
217       sepStrategy = JoinSeparator.BEFORE;
218     } else if ("after".equals(separatorStrategy)) {
219       sepStrategy = JoinSeparator.AFTER;
220     } else {
221       sepStrategy = JoinSeparator.NO;
222     }
223 
224     return split(content, separatorCssSelector, sepStrategy);
225   }
226 
227   /**
228    * Splits the given HTML content into partitions based on the given separator selector.The separators are either
229    * dropped or joined with before/after depending on the indicated separator strategy.
230    * <p>
231    * Note that splitting algorithm tries to resolve nested elements so that returned partitions are self-contained HTML
232    * elements. The nesting is normally contained within the first applicable partition.
233    * </p>
234    *
235    * @param content
236    *          Body HTML content to split
237    * @param separatorCssSelector
238    *          CSS selector for separators
239    * @param separatorStrategy
240    *          strategy to drop or keep separators
241    * @return a list of HTML partitions split on separator locations. If no splitting occurs, returns the original
242    *         content as the single element of the list
243    * @since 1.0
244    */
245   public List<String> split(@Nonnull final String content,
246     @Nonnull final String separatorCssSelector,
247     @Nonnull final JoinSeparator separatorStrategy) {
248 
249     requireNonNull(separatorStrategy);
250     final Element body = parse(content).body();
251 
252     final List<Element> separators = body.select(separatorCssSelector);
253     if (separators.size() > 0) {
254       final List<List<Element>> partitions = split(separators, separatorStrategy, body);
255 
256       final List<String> sectionHtml = new ArrayList<>();
257 
258       for (final List<Element> partition : partitions) {
259         final String html = outerHtml(partition);
260         if (!Strings.isNullOrEmpty(html)) {
261           sectionHtml.add(outerHtml(partition));
262         }
263       }
264 
265       return sectionHtml;
266     } else {
267       // nothing to split
268       return Collections.singletonList(content);
269     }
270   }
271 
272   /**
273    * Recursively splits the {@code parent} element based on the given {@code separators}. If a separator is encountered
274    * in the parent, it is split on that position. The outstanding nested elements go with the first of the partitions in
275    * each case.
276    *
277    * @param separators
278    * @param separatorStrategy
279    * @param parent
280    * @return list of partitions (as lists of root elements for each partition). Partition can be an empty list, e.g. if
281    *         the separator is at the start of the content.
282    */
283   private static List<List<Element>> split(final Collection<Element> separators,
284     final JoinSeparator separatorStrategy,
285     final Element parent) {
286 
287     final List<List<Element>> partitions = Lists.newLinkedList();
288 
289     for (final Element child : parent.children()) {
290 
291       if (separators.contains(child)) {
292         // split here and do not go deeper
293 
294         // first ensure there was a partition before
295         // otherwise the split is not recognised on an outer level
296         getLastPartition(partitions);
297 
298         if (separatorStrategy == JoinSeparator.BEFORE) {
299           // add to the last partition
300           getLastPartition(partitions).add(child);
301         }
302 
303         // add an empty new partition
304         final List<Element> newPartition = Lists.newLinkedList();
305         partitions.add(newPartition);
306 
307         if (separatorStrategy == JoinSeparator.AFTER) {
308           // add to the new partition
309           newPartition.add(child);
310         }
311 
312       } else {
313         // go deeper
314         final List<List<Element>> childPartitions = split(separators, separatorStrategy, child);
315 
316         // add the child to the last partition
317         getLastPartition(partitions).add(child);
318 
319         if (childPartitions.size() > 1) {
320           // more than one partition:
321           // only keep the first partition elements in the child
322           // so for all other partitions, remove them from their parents
323 
324           final List<Element> allChildren = child.children();
325           final List<Element> firstPartition = childPartitions.get(0);
326 
327           allChildren.removeAll(firstPartition);
328           for (final Element removeChild : allChildren) {
329             removeChild.remove();
330           }
331 
332           // add the remaining partitions
333           for (final List<Element> nextPartition : childPartitions.subList(1, childPartitions.size())) {
334             partitions.add(nextPartition);
335           }
336         }
337       }
338     }
339 
340     return partitions;
341   }
342 
343   /**
344    * Retrieves the last partition (as list of elements) or creates a new one if there was none before.
345    *
346    * @param partitions
347    * @return
348    */
349   private static List<Element> getLastPartition(final List<List<Element>> partitions) {
350     if (partitions.isEmpty()) {
351       final List<Element> newPartition = Lists.newLinkedList();
352       partitions.add(newPartition);
353       return newPartition;
354     } else {
355       return partitions.get(partitions.size() - 1);
356     }
357   }
358 
359   /**
360    * Outputs the list of partition root elements to HTML.
361    *
362    * @param elements
363    * @return
364    */
365   private static String outerHtml(final List<Element> elements) {
366 
367     switch (elements.size()) {
368       case 0:
369         return "";
370 
371       case 1:
372         return elements.get(0).outerHtml();
373 
374       default:
375         // more than one element
376         // wrap into <div> which we will remove afterwards
377         final Element root = new Element(Tag.valueOf("div"), "");
378         for (final Element elem : elements) {
379           root.appendChild(elem);
380         }
381 
382         return root.html();
383     }
384   }
385 
386   /**
387    * Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited to
388    * a certain amount, e.g. to bring just the first of selected elements to the top.
389    *
390    * @param content
391    *          HTML content to reorder
392    * @param selector
393    *          CSS selector for elements to bring to top of the content
394    * @param amount
395    *          Maximum number of elements to reorder
396    * @return HTML content with reordered elements, or the original content if no such elements found.
397    * @since 1.0
398    */
399   public String reorderToTop(final String content, final String selector, final int amount) {
400     return reorderToTop(content, selector, amount, null);
401   }
402 
403   /**
404    * Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited to
405    * a certain amount, e.g. to bring just the first of selected elements to the top.
406    *
407    * @param content
408    *          HTML content to reorder
409    * @param selector
410    *          CSS selector for elements to bring to top of the content
411    * @param amount
412    *          Maximum number of elements to reorder
413    * @param wrapRemaining
414    *          HTML to wrap the remaining (non-reordered) part
415    * @return HTML content with reordered elements, or the original content if no such elements found.
416    * @since 1.0
417    */
418   public String reorderToTop(final String content,
419     final String selector,
420     final int amount,
421     final String wrapRemaining) {
422 
423     // extract the elements and then prepend them to the remaining body
424     final List<Element> extracted = extractElements(content, selector, amount);
425 
426     if (extracted.size() > 1) {
427 
428       final Element body = extracted.get(0);
429 
430       if (wrapRemaining != null) {
431         wrapInner(body, wrapRemaining);
432       }
433 
434       final List<Element> elements = extracted.subList(1, extracted.size());
435 
436       // now prepend extracted elements to the body (in backwards to preserve original
437       // order)
438       for (int index = elements.size() - 1; index >= 0; index--) {
439         body.prependChild(elements.get(index));
440       }
441 
442       return body.html();
443     } else {
444       // nothing to reorder
445       return content;
446     }
447   }
448 
449   private static Element wrapInner(final Element element, final String html) {
450 
451     // wrap everything into an additional <div> for wrapping
452     // otherwise there may be problems, e.g. with <body> element
453     final Element topDiv = new Element(Tag.valueOf("div"), "");
454     for (final Element topElem : element.children()) {
455       // add all elements in the body to the `topDiv`
456       topElem.remove();
457       topDiv.appendChild(topElem);
458     }
459 
460     // add topDiv to the body
461     element.appendChild(topDiv);
462 
463     // wrap topDiv
464     topDiv.wrap(html);
465     // now unwrap topDiv - will remove it from the hierarchy
466     topDiv.unwrap();
467 
468     return element;
469   }
470 
471   /**
472    * Extracts elements from the HTML content.
473    *
474    * @param content
475    * @param selector
476    * @param amount
477    * @return the remainder and a list of extracted elements. The main body (remainder after extraction) is always
478    *         returned as the first element of the list.
479    */
480   private List<Element> extractElements(final String content, final String selector, final int amount) {
481 
482     final Element body = parse(content).body();
483 
484     List<Element> elements = body.select(selector);
485     if (elements.size() > 0) {
486 
487       elements = filterParents(elements);
488 
489       if (amount >= 0) {
490         // limit to the indicated amount
491         elements = elements.subList(0, Math.min(amount, elements.size()));
492       }
493 
494       // remove all from their parents
495       for (final Element element : elements) {
496         element.remove();
497       }
498     }
499 
500     final List<Element> results = new ArrayList<>();
501     // first element is the body
502     results.add(body);
503     results.addAll(elements);
504     return results;
505   }
506 
507   /**
508    * Filters the list of elements to only contain parent elements. This is to avoid both parent and child being in the
509    * list of elements.
510    *
511    * @param elements
512    * @return
513    */
514   private static List<Element> filterParents(final List<Element> elements) {
515     final List<Element> filtered = new ArrayList<>();
516     for (final Element element : elements) {
517       // get the intersection of parents and selected elements
518       final List<Element> parentsInter = element.parents().asList();
519       parentsInter.retainAll(elements);
520       if (parentsInter.isEmpty()) {
521         // no intersection - element's parents are not in the selected list
522         filtered.add(element);
523       }
524     }
525 
526     return filtered;
527   }
528 
529   /**
530    * Extracts HTML elements from the main HTML content. The result consists of the extracted HTML elements and the
531    * remainder of HTML content, with these elements removed. Can be limited to a certain amount, e.g. to extract just
532    * the first of selected elements.
533    *
534    * @param content
535    *          HTML content to extract elements from
536    * @param selector
537    *          CSS selector for elements to extract
538    * @param amount
539    *          Maximum number of elements to extract
540    * @return HTML content of the extracted elements together with the remainder of the original content. If no elements
541    *         are found, the remainder contains the original content.
542    * @since 1.0
543    */
544   @Nonnull
545   public ExtractResult extract(final String content, final String selector, final int amount) {
546 
547     final List<Element> extracted = extractElements(content, selector, amount);
548 
549     if (extracted.size() > 1) {
550 
551       // first element is the remaining body, the rest are extracted
552       final Element body = extracted.get(0);
553       final List<Element> elements = extracted.subList(1, extracted.size());
554 
555       // convert to HTML
556       final List<String> elementStr = new ArrayList<>();
557       for (final Element el : elements) {
558         elementStr.add(el.outerHtml());
559       }
560 
561       return new DefaultExtractResult(elementStr, body.html());
562     } else {
563       // nothing to extract
564       return new DefaultExtractResult(Collections.<String> emptyList(), content);
565     }
566   }
567 
568   /**
569    * A container to carry element extraction results. Contains the extracted element HTML code and the remainder of the
570    * body content with elements removed.
571    *
572    * @author Andrius Velykis
573    * @since 1.0
574    */
575   public interface ExtractResult {
576 
577     /**
578      * Retrieves the extracted HTML elements.
579      *
580      * @return List of HTML of extracted elements. Can be empty if no elements found.
581      */
582     List<String> getExtracted();
583 
584     /**
585      * Retrieves the content from which elements were extracted.
586      *
587      * @return The HTML content with extracted elements removed.
588      */
589     String getRemainder();
590   }
591 
592   /**
593    * @author Christophe Friederich
594    */
595   private static final class DefaultExtractResult implements ExtractResult {
596 
597     /** */
598     private final List<String> extracted;
599 
600     /** */
601     private final String remainder;
602 
603     private DefaultExtractResult(final List<String> extracted, final String remainder) {
604       this.extracted = extracted;
605       this.remainder = remainder;
606     }
607 
608     @Override
609     public List<String> getExtracted() {
610       return Collections.unmodifiableList(extracted);
611     }
612 
613     @Override
614     public String getRemainder() {
615       return remainder;
616     }
617   }
618 
619   /**
620    * Sets attribute to the given value on elements in HTML.
621    *
622    * @param content
623    *          HTML content to set attributes on
624    * @param selector
625    *          CSS selector for elements to modify
626    * @param attributeKey
627    *          Attribute name
628    * @param value
629    *          Attribute value
630    * @return HTML content with modified elements. If no elements are found, the original content is returned.
631    * @since 1.0
632    */
633   public String setAttr(final String content, final String selector, final String attributeKey, final String value) {
634 
635     final Element body = parse(content).body();
636 
637     final List<Element> elements = body.select(selector);
638     if (elements.size() > 0) {
639 
640       for (final Element element : elements) {
641         element.attr(attributeKey, value);
642       }
643 
644       return body.html();
645     } else {
646       // nothing to update
647       return content;
648     }
649   }
650 
651   /**
652    * Parses body fragment to the {@code <body>} element.
653    *
654    * @param content
655    *          body HTML fragment (can not be {@code null}).
656    * @return the {@code body} element of the parsed content
657    */
658   public Document parse(@Nonnull final String content) {
659     final Document doc = Jsoup.parseBodyFragment(content);
660     doc.outputSettings().charset(outputEncoding).prettyPrint(prettyPrint);
661     return doc;
662   }
663 
664   /**
665    * Retrieves attribute value on elements in HTML. Will return all attribute values for the selector, since there can
666    * be more than one element.
667    *
668    * @param content
669    *          HTML content to read attributes from
670    * @param selector
671    *          CSS selector for elements to find
672    * @param attributeKey
673    *          Attribute name
674    * @return Attribute values for all matching elements. If no elements are found, empty list is returned.
675    * @since 1.0
676    */
677   public List<String> getAttr(final String content, final String selector, final String attributeKey) {
678 
679     final Element body = parse(content).body();
680 
681     final List<Element> elements = body.select(selector);
682     final List<String> attrs = new ArrayList<>();
683 
684     for (final Element element : elements) {
685       final String attrValue = element.attr(attributeKey);
686       attrs.add(attrValue);
687     }
688 
689     return attrs;
690   }
691 
692   /**
693    * Adds given class names to a base class name.
694    *
695    * @param baseClass
696    *          Base class name
697    * @param additionalClasses
698    *          Additional class names
699    * @return Combined class names
700    */
701   @Nonnull
702   public String addClasses(@Nonnull String baseClass, @Nonnull String additionalClasses) {
703     return addClasses(baseClass, additionalClasses == null ? new String[] {} : additionalClasses.split(" "));
704   }
705 
706   /**
707    * Adds given class names to a base class name.
708    *
709    * @param baseClass
710    *          Base class name
711    * @param additionalClasses
712    *          Additional class names
713    * @return Combined class names
714    */
715   @Nonnull
716   public String addClasses(@Nonnull String baseClass, @Nonnull String... additionalClasses) {
717     StringBuilder sb = new StringBuilder();
718     Set<String> uniqueClasses = Sets.newHashSet();
719     uniqueClasses.addAll(Arrays.asList(baseClass.split(" ")));
720     uniqueClasses.addAll(Arrays.asList(additionalClasses));
721     for (String cl : uniqueClasses) {
722       if (!Strings.isNullOrEmpty(cl)) {
723         if (sb.length() > 0) {
724           sb.append(" ");
725         }
726         sb.append(cl);
727       }
728     }
729     return sb.toString();
730   }
731 
732   /**
733    * Adds given class names to the elements in HTML.
734    *
735    * @param content
736    *          HTML content to modify
737    * @param selector
738    *          CSS selector for elements to add classes to
739    * @param classNames
740    *          Names of classes to add to the selected elements
741    * @param amount
742    *          Maximum number of elements to modify
743    * @return HTML content with modified elements. If no elements are found, the original content is returned.
744    * @since 1.0
745    */
746   public String addClass(final String content, final String selector, final List<String> classNames, final int amount) {
747 
748     final Element body = parse(content).body();
749 
750     List<Element> elements = body.select(selector);
751     if (amount >= 0) {
752       // limit to the indicated amount
753       elements = elements.subList(0, Math.min(amount, elements.size()));
754     }
755 
756     if (elements.size() > 0) {
757 
758       for (final Element element : elements) {
759         for (final String className : classNames) {
760           element.addClass(className);
761         }
762       }
763 
764       return body.html();
765     } else {
766       // nothing to update
767       return content;
768     }
769   }
770 
771   /**
772    * Adds given class names to the elements in HTML.
773    *
774    * @param content
775    *          HTML content to modify
776    * @param selector
777    *          CSS selector for elements to add classes to
778    * @param classNames
779    *          Names of classes to add to the selected elements
780    * @return HTML content with modified elements. If no elements are found, the original content is returned.
781    * @since 1.0
782    */
783   public String addClass(final String content, final String selector, final List<String> classNames) {
784     return addClass(content, selector, classNames, -1);
785   }
786 
787   /**
788    * Adds given class to the elements in HTML.
789    *
790    * @param content
791    *          HTML content to modify
792    * @param selector
793    *          CSS selector for elements to add the class to
794    * @param className
795    *          Name of class to add to the selected elements
796    * @return HTML content with modified elements. If no elements are found, the original content is returned.
797    * @since 1.0
798    */
799   public String addClass(final String content, final String selector, final String className) {
800     return addClass(content, selector, Collections.singletonList(className));
801   }
802 
803   /**
804    * Wraps elements in HTML with the given HTML.
805    *
806    * @param content
807    *          HTML content to modify
808    * @param selector
809    *          CSS selector for elements to wrap
810    * @param wrapHtml
811    *          HTML to use for wrapping the selected elements
812    * @param amount
813    *          Maximum number of elements to modify
814    * @return HTML content with modified elements. If no elements are found, the original content is returned.
815    * @since 1.0
816    */
817   public String wrap(final String content, final String selector, final String wrapHtml, final int amount) {
818 
819     final Element body = parse(content).body();
820 
821     List<Element> elements = body.select(selector);
822     if (amount >= 0) {
823       // limit to the indicated amount
824       elements = elements.subList(0, Math.min(amount, elements.size()));
825     }
826 
827     if (elements.size() > 0) {
828 
829       for (final Element element : elements) {
830         element.wrap(wrapHtml);
831       }
832 
833       return body.html();
834     } else {
835       // nothing to update
836       return content;
837     }
838   }
839 
840   /**
841    * Removes elements from HTML.
842    *
843    * @param content
844    *          HTML content to modify
845    * @param selector
846    *          CSS selector for elements to remove
847    * @return HTML content with removed elements. If no elements are found, the original content is returned.
848    * @since 1.0
849    */
850   public String remove(final String content, final String selector) {
851 
852     final Element body = parse(content).body();
853 
854     final List<Element> elements = body.select(selector);
855     if (elements.size() > 0) {
856       for (final Element element : elements) {
857         element.remove();
858       }
859 
860       return body.html();
861     } else {
862       // nothing changed
863       return content;
864     }
865   }
866 
867   /**
868    * Replaces elements in HTML.
869    *
870    * @param content
871    *          HTML content to modify
872    * @param selector
873    *          CSS selector for elements to replace
874    * @param replacement
875    *          HTML replacement (must parse to a single element)
876    * @return HTML content with replaced elements. If no elements are found, the original content is returned.
877    * @since 1.0
878    */
879   public String replace(final String content, final String selector, final String replacement) {
880     return replaceAll(content, Collections.singletonMap(selector, replacement));
881   }
882 
883   /**
884    * Replaces elements in HTML.
885    *
886    * @param content
887    *          HTML content to modify
888    * @param replacements
889    *          Map of CSS selectors to their replacement HTML texts. CSS selectors find elements to be replaced with the
890    *          HTML in the mapping. The HTML must parse to a single element.
891    * @return HTML content with replaced elements. If no elements are found, the original content is returned.
892    * @since 1.0
893    */
894   public String replaceAll(final String content, final Map<String, String> replacements) {
895 
896     final Element body = parse(content).body();
897 
898     boolean modified = false;
899     for (final Entry<String, String> replacementEntry : replacements.entrySet()) {
900       final String selector = replacementEntry.getKey();
901       final String replacement = replacementEntry.getValue();
902 
903       final List<Element> elements = body.select(selector);
904       if (elements.size() > 0) {
905 
906         // take the first child
907         final Element replacementElem = parse(replacement).body().child(0);
908 
909         if (replacementElem != null) {
910           for (final Element element : elements) {
911             element.replaceWith(replacementElem.clone());
912           }
913 
914           modified = true;
915         }
916       }
917     }
918 
919     if (modified) {
920       return body.html();
921     } else {
922       // nothing changed
923       return content;
924     }
925   }
926 
927   /**
928    * Replaces All elements in HTML corresponding to <code>selector</code> while preserving the content of this element.
929    *
930    * @param content
931    *          HTML content to modify
932    * @param selector
933    *          CSS selector for elements to replace
934    * @param newElement
935    *          HTML replacement (must parse to a single element)
936    * @return HTML content with replaced elements. If no elements are found, the original content is returned.
937    * @since 2.0
938    */
939   public String replaceWith(final String content, final String selector, final String newElement) {
940 
941     final Element body = parse(content).body();
942 
943     boolean modified = false;
944     final List<Element> elements = body.select(selector);
945     if (elements.size() > 0) {
946 
947       // take the first child
948       final Element replacementElem = parse(newElement).body().child(0);
949 
950       if (replacementElem != null) {
951         for (final Element element : elements) {
952           final List<Node> children = element.childNodes();
953           final Element el = replacementElem.clone();
954           for (final Node child : children) {
955             el.appendChild(child.clone());
956           }
957           element.replaceWith(el);
958         }
959 
960         modified = true;
961       }
962     }
963 
964     if (modified) {
965       return body.html();
966     } else {
967       // nothing changed
968       return content;
969     }
970   }
971 
972   /**
973    * Retrieves text content of the selected elements in HTML. Renders the element's text as it would be displayed on the
974    * web page (including its children).
975    *
976    * @param content
977    *          HTML content with the elements
978    * @param selector
979    *          CSS selector for elements to extract contents
980    * @return A list of element texts as rendered to display. Empty list if no elements are found.
981    * @since 1.0
982    */
983   @SuppressWarnings("null")
984   public List<String> text(@Nullable final String content, @Nonnull final String selector) {
985     if (Strings.isNullOrEmpty(content)) {
986       return emptyList();
987     }
988     final Element body = parse(content).body();
989 
990     final List<Element> elements = body.select(selector);
991     final List<String> texts = new ArrayList<>();
992 
993     for (final Element element : elements) {
994       texts.add(element.text());
995     }
996 
997     return texts;
998   }
999 
1000   public String link(ISkinConfig config, String href, String name, String target, String className) {
1001     return link(config, href, name, target, null, null, className);
1002   }
1003 
1004   public String link(ISkinConfig config,
1005     String href,
1006     String name,
1007     String target,
1008     String img,
1009     String icon,
1010     String className) {
1011 
1012     final Document doc = parse("");
1013     String css = Strings.isNullOrEmpty(className) ? "" : className;
1014     if (config.isExternalLink(href)) {
1015       css = "external-link " + className;
1016     }
1017     return JsoupUtils.link(doc, href, name, target, config.relativeLink(img), icon, css).outerHtml();
1018   }
1019 
1020   public String image(ISkinConfig config, String src, String alt, String border, String width, String height) {
1021     final Document doc = parse("");
1022     return JsoupUtils.image(doc, config.relativeLink(src), alt, border, width, height).outerHtml();
1023   }
1024 
1025   /**
1026    * Transforms the given HTML content by moving anchor ({@code <a name="myheading">}) names to IDs for heading
1027    * elements.
1028    * <p>
1029    * The anchors are used to indicate positions within a HTML page. In HTML5, however, the {@code name} attribute is no
1030    * longer supported on {@code <a>}) tag. The positions within pages are indicated using {@code id} attribute instead,
1031    * e.g. {@code
1032    *
1033    *
1034   <h1 id="myheading">}.
1035    * </p>
1036    * <p>
1037    * The method finds anchors inside, immediately before or after the heading tags and uses their name as heading
1038    * {@code id} instead. The anchors themselves are removed.
1039    * </p>
1040    *
1041    * @param content
1042    *          HTML content to modify
1043    * @return HTML content with modified elements. Anchor names are used for adjacent headings, and anchor tags are
1044    *         removed. If no elements are found, the original content is returned.
1045    * @since 1.0
1046    */
1047   public String headingAnchorToId(final String content) {
1048 
1049     final Element body = parse(content).body();
1050 
1051     // selectors for headings without IDs
1052     final List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
1053 
1054     // selector for anchor with name attribute only
1055     final String nameA = "a[name]:not([href])";
1056 
1057     // select all headings that have inner named anchor
1058     final List<Element> headingsInnerA = body.select(String.join(", ", concat(headNoIds, ":has(" + nameA + ")", true)));
1059 
1060     boolean modified = false;
1061     for (final Element heading : headingsInnerA) {
1062       final List<Element> anchors = heading.select(nameA);
1063       // take first
1064       if (!anchors.isEmpty()) {
1065         anchorToId(heading, anchors.get(0));
1066         modified = true;
1067       }
1068     }
1069 
1070     // select all headings that have a preceding named anchor
1071     final List<Element> headingsPreA = body.select(String.join(", ", concat(headNoIds, nameA + " + ", false)));
1072 
1073     for (final Element heading : headingsPreA) {
1074       final Element anchor = heading.previousElementSibling();
1075       if (anchor != null) {
1076         anchorToId(heading, anchor);
1077         modified = true;
1078       }
1079     }
1080 
1081     // select all headings that are followed by a named anchor
1082     // no selector available for that, so first select the anchors
1083     // then retrieve the headings
1084     final List<Element> anchorsPreH = body.select(String.join(", ", concat(headNoIds, " + " + nameA, true)));
1085 
1086     for (final Element anchor : anchorsPreH) {
1087       final Element heading = anchor.previousElementSibling();
1088       if (heading != null) {
1089         anchorToId(heading, anchor);
1090         modified = true;
1091       }
1092     }
1093 
1094     if (modified) {
1095       return body.html();
1096     } else {
1097       // nothing to update
1098       return content;
1099     }
1100   }
1101 
1102   /**
1103    * Moves anchor name to heading id, if one does not exist. Removes the anchor.
1104    *
1105    * @param heading
1106    * @param anchor
1107    */
1108   private static void anchorToId(final Element heading, final Element anchor) {
1109 
1110     if ("a".equals(anchor.tagName()) && heading.id().isEmpty()) {
1111       final String aName = anchor.attr("name");
1112       if (!aName.isEmpty()) {
1113         // set the anchor name as heading ID
1114         heading.attr("id", aName);
1115 
1116         // remove the anchor
1117         anchor.remove();
1118       }
1119     }
1120   }
1121 
1122   /**
1123    * Utility method to concatenate a String to a list of Strings. The text can be either appended or prepended.
1124    *
1125    * @param elements
1126    *          list of elements to append/prepend the text to
1127    * @param text
1128    *          the given text to append/prepend
1129    * @param append
1130    *          if {@code true}, text will be appended to the elements. If {@code false}, it will be prepended
1131    * @return list of elements with the text appended/prepended
1132    * @since 1.0
1133    */
1134   public static List<String> concat(final List<String> elements, final String text, final boolean append) {
1135     final List<String> concats = new ArrayList<>();
1136 
1137     for (final String element : elements) {
1138       concats.add(append ? element + text : text + element);
1139     }
1140 
1141     return concats;
1142   }
1143 
1144   /**
1145    * Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that do not have one.
1146    * <p>
1147    * IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a heading tag without an
1148    * {@code id} is found, its "slug" is generated automatically based on the heading contents and used as the ID.
1149    * </p>
1150    * <p>
1151    * Note that the algorithm also modifies existing IDs that have symbols not allowed in CSS selectors, e.g. ":", ".",
1152    * etc. The symbols are removed.
1153    * </p>
1154    *
1155    * @param pageType
1156    *          The type of page.
1157    * @param currentPage
1158    *          The name of current page.
1159    * @param content
1160    *          HTML content to modify.
1161    * @param idSeparator
1162    *          the seperator used to slug ID.
1163    * @return Returns a {@link String} representing HTML content with all heading elements having {@code id} attributes.
1164    *         If all headings were with IDs already, the original content is returned.
1165    * @since 1.0
1166    */
1167   public String ensureHeadingIds(final String pageType,
1168     final String currentPage,
1169     final String content,
1170     final String idSeparator) {
1171     final List<String> excludedPages = Arrays.asList("checkstyle-aggregate", "checkstyle");
1172 
1173     final Element body = parse(content).body();
1174 
1175     // exclude pages
1176     if (excludedPages.contains(currentPage)) {
1177       return content;
1178     }
1179 
1180     // first find all existing IDs (to avoid generating duplicates)
1181     final List<Element> idElems = body.select("*[id]");
1182 
1183     final Set<String> ids = new HashSet<>();
1184     boolean modified = false;
1185     for (final Element idElem : idElems) {
1186 
1187       // fix all existing IDs - remove colon and other symbols which mess up jQuery
1188       final String id = idElem.id();
1189       idElem.attr("id", slug(id, idSeparator, false));
1190       modified = true;
1191 
1192       ids.add(idElem.id());
1193     }
1194 
1195     // create unique id for all heading elements
1196     final List<String> headIds = concat(HEADINGS, "[id]", true);
1197     // select all headings that have an ID
1198     final List<Element> headingIds = body.select(String.join(", ", headIds));
1199 
1200     for (final Element heading : headingIds) {
1201       final String headingText = heading.text();
1202       String headingSlug = slug(headingText, idSeparator, true);
1203       // also limit slug to 50 symbols
1204       if (headingSlug.length() > SLUG_SIZE) {
1205         headingSlug = headingSlug.substring(0, SLUG_SIZE);
1206       }
1207       final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug);
1208 
1209       heading.attr("id", headingId);
1210     }
1211 
1212     final List<String> headNoIds = concat(HEADINGS, ":not([id], .no-anchor)", true);
1213 
1214     // select all headings that do not have an ID
1215     final List<Element> headingsNoId = body.select(String.join(", ", headNoIds));
1216 
1217     if (!headingsNoId.isEmpty() || modified) {
1218       for (final Element heading : headingsNoId) {
1219 
1220         final String headingText = heading.text();
1221         String headingSlug = slug(headingText, idSeparator, true);
1222         // also limit slug to 50 symbols
1223         if (headingSlug.length() > SLUG_SIZE) {
1224           headingSlug = headingSlug.substring(0, SLUG_SIZE);
1225         }
1226         final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug);
1227 
1228         heading.attr("id", headingId);
1229       }
1230     }
1231 
1232     return body.html();
1233   }
1234 
1235   /**
1236    * Generated a unique ID within the given set of IDs. Appends an incrementing number for duplicates.
1237    *
1238    * @param pageType
1239    *          The type of page.
1240    * @param currentPage
1241    *          Tthe name of current page.
1242    * @param ids
1243    *          The list of ID already existing or used.
1244    * @param idBase
1245    *          The prefix to use.
1246    * @return Returns a new {@link String} representing a new unique ID.
1247    */
1248   private static String generateUniqueId(final String pageType,
1249     final String currentPage,
1250     final Set<String> ids,
1251     final String idBase) {
1252     String id = idBase;
1253     int counter = 1;
1254     while (ids.contains(id)) {
1255       id = idBase + String.valueOf(counter++);
1256     }
1257 
1258     // put the newly generated one into the set
1259     ids.add(id);
1260     if ("frame".equals(pageType)) {
1261       id = currentPage + SEPARATOR_TOC + id;
1262     }
1263     return id;
1264   }
1265 
1266   /**
1267    * Fixes table heads: wraps rows with {@code
1268    *
1269    *
1270   
1271   <th>} (table heading) elements into {@code <thead>} element if they are currently in {@code <tbody>}.
1272    *
1273    * @param content
1274    *          HTML content to modify
1275    * @return HTML content with all table heads fixed. If all heads were correct, the original content is returned.
1276    * @since 1.0
1277    */
1278   public String fixTableHeads(final String content) {
1279 
1280     final Element body = parse(content).body();
1281 
1282     final List<Element> tables = body.select("table");
1283 
1284     for (final Element table : tables) {
1285       // select rows with <th> tags within <tbody>
1286       final List<Element> tableHeadRows = table.select("tbody > tr:has(th)");
1287       // convert only table containing one tr head.
1288       if (tableHeadRows.size() == 1) {
1289 
1290         for (final Element row : tableHeadRows) {
1291 
1292           // remove row from its original position
1293           row.remove();
1294 
1295           // create table header element with the row
1296           final Element thead = new Element(Tag.valueOf("thead"), "");
1297           thead.appendChild(row);
1298           // add at the beginning of the table
1299           table.prependChild(thead);
1300         }
1301       }
1302     }
1303     return body.html();
1304   }
1305 
1306   /** Matches any single character that is neither a regex word character ({@code \w}) nor a hyphen. */
1307   private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");
1308 
1309   /** Matches a single whitespace character ({@code \s}). */
1310   private static final Pattern WHITESPACE = Pattern.compile("[\\s]");
1311 
1312   /**
1313    * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs). Uses "-"
1314    * as a whitespace separator.
1315    *
1316    * @param input
1317    *          text to generate the slug from
1318    * @return the slug of the given text that contains alphanumeric symbols and "-" only
1319    * @since 1.0
1320    */
1321   public static String slug(final String input) {
1322     return slug(input, DEFAULT_SLUG_SEPARATOR, true);
1323   }
1324 
1325   /**
1326    * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs).
1327    *
1328    * @param input
1329    *          text to generate the slug from
1330    * @param separator
1331    *          separator for whitespace replacement
1332    * @return the slug of the given text that contains alphanumeric symbols and separator only
1333    * @since 1.0
1334    * @see <a href=
1335    *      "http://www.codecodex.com/wiki/Generate_a_url_slug">http://www.codecodex.com/wiki/Generate_a_url_slug</a>
1336    */
1337   private static String slug(final String input, final String separator, boolean lowercase) {
1338     final String nowhitespace = WHITESPACE.matcher(input).replaceAll(separator);
1339     final String normalized = Normalizer.normalize(nowhitespace, Form.NFD);
1340     String slug = NONLATIN.matcher(normalized).replaceAll("");
1341     if (lowercase) {
1342       return slug.toLowerCase(Locale.ENGLISH);
1343     } else {
1344       return slug;
1345     }
1346   }
1347 
1348   /**
1349    * Reads all headings in the given HTML content as a hierarchy. Subsequent smaller headings are nested within bigger
1350    * ones, e.g. <code>&lt;h2&gt;</code> is nested under preceding <code>&lt;h1&gt;</code>.
1351    * <p>
1352    * Only headings with IDs are included in the hierarchy. The result elements contain ID and heading text for each
1353    * heading. The hierarchy is useful to generate a Table of Contents for a page.
1354    * </p>
1355    *
1356    * @param content
1357    *          HTML content to extract heading hierarchy from
1358    * @param sections
1359    *          list of all sections
1360    * @return a list of top-level heading items (with id and text). The remaining headings are nested within these
1361    *         top-level items. Empty list if no headings are in the content.
1362    * @since 1.0
1363    */
1364   public List<? extends IdElement> headingTree(final String content, final List<String> sections) {
1365 
1366     final List<String> sectionContents = this.split(content, "hr");
1367     final List<String> headIds = concat(HEADINGS, "[id]:not(.no-anchor)", true);
1368     final List<HeadingItem> headingItems = new ArrayList<>();
1369 
1370     int index = 0;
1371     for (final String sectionContent : sectionContents) {
1372       final String sectionType = index < sections.size() ? sections.get(index++) : "";
1373 
1374       // exclude carousel headings
1375       if ("carousel".equals(sectionType)) {
1376         continue;
1377       }
1378       final Element body = parse(sectionContent).body();
1379       // select all headings that have an ID
1380       final List<Element> headings = body.select(String.join(", ", headIds));
1381       for (final Element heading : headings) {
1382         if (LOGGER.isTraceEnabled()) {
1383           LOGGER.trace("Found heading: {} - {}", heading.id(), heading.text());
1384         }
1385         headingItems.add(new HeadingItem(heading.id(), heading.nodeName(), heading.text(), headingIndex(heading)));
1386       }
1387     }
1388 
1389     final List<HeadingItem> topHeadings = new ArrayList<>();
1390     final Stack<HeadingItem> parentHeadings = new Stack<>();
1391 
1392     for (final HeadingItem heading : headingItems) {
1393 
1394       while (!parentHeadings.isEmpty() && parentHeadings.peek().headingLevel >= heading.headingLevel) {
1395         parentHeadings.pop();
1396       }
1397 
1398       if (parentHeadings.isEmpty()) {
1399         // top level heading - no parents
1400         topHeadings.add(heading);
1401       } else {
1402         // add to the children of topmost stack parent
1403         parentHeadings.peek().children.add(heading);
1404       }
1405 
1406       // push the heading onto stack
1407       parentHeadings.push(heading);
1408     }
1409 
1410     return topHeadings;
1411   }
1412 
1413   /**
1414    * Retrieves numeric index of a heading.
1415    *
1416    * @param element
1417    * @return
1418    */
1419   private static int headingIndex(final Element element) {
1420     final String tagName = element.tagName();
1421     if (tagName.startsWith("h")) {
1422       try {
1423         return Integer.parseInt(tagName.substring(1));
1424       } catch (final Exception ex) {
1425         throw new IllegalArgumentException("Must be a header tag: " + tagName, ex);
1426       }
1427     } else {
1428       throw new IllegalArgumentException("Must be a header tag: " + tagName);
1429     }
1430   }
1431 
1432   /**
1433    * @author Christophe Friederich
1434    */
1435   private static final class HeadingItem implements IdElement {
1436 
1437     /** */
1438     private final String id;
1439 
1440     /** */
1441     private final String tagName;
1442 
1443     /** */
1444     private final String text;
1445 
1446     /** */
1447     private final int headingLevel;
1448 
1449     /** */
1450     private final List<HeadingItem> children = new ArrayList<>();
1451 
1452     private HeadingItem(final String id, final String tagName, final String text, final int headingLevel) {
1453       this.id = id;
1454       this.tagName = tagName;
1455       this.text = text;
1456       this.headingLevel = headingLevel;
1457     }
1458 
1459     @Override
1460     public String getId() {
1461       return id;
1462     }
1463 
1464     @Override
1465     public String getTagName() {
1466       return tagName;
1467     }
1468 
1469     @Override
1470     public String getText() {
1471       return text;
1472     }
1473 
1474     @Override
1475     public List<HeadingItem> getItems() {
1476       return Collections.unmodifiableList(children);
1477     }
1478 
1479     @Override
1480     public int getHeadingLevel() {
1481       return headingLevel;
1482     }
1483 
1484     @Override
1485     public String toString() {
1486       return ToStringBuilder.reflectionToString(this);
1487     }
1488   }
1489 
1490   /**
1491    * Representation of an HTML element that has an ID and a text content. Other such elements can be nested within it.
1492    *
1493    * @author Andrius Velykis
1494    * @since 1.0
1495    */
1496   public interface IdElement {
1497 
1498     /**
1499      * Retrieves the ID of the HTML element (attribute {@code id}).
1500      *
1501      * @return element {@code id} value
1502      */
1503     String getId();
1504 
1505     /**
1506      * @return the tag name of the HTML element.
1507      */
1508     String getTagName();
1509 
1510     /**
1511      * Retrieves the text contents of the HTML element (rendered for display).
1512      *
1513      * @return text contents of the element
1514      */
1515     String getText();
1516 
1517     /**
1518      * @return the heading level of the element; for headings built by this class, the number in the tag name.
1519      */
1520     int getHeadingLevel();
1521 
1522     /**
1523      * Retrieves the children of the HTML element (nested within the element).
1524      *
1525      * @return nested items within the element
1526      */
1527     List<? extends IdElement> getItems();
1528   }
1529 }