1. Project Clover database mar. avr. 16 2024 08:19:06 CEST
  2. Package org.devacfr.maven.skins.reflow

File HtmlTool.java

 

Coverage histogram

../../../../../img/srcFileCovDistChart9.png
10% of files have more coverage

Code metrics

106
335
46
6
1 453
626
106
0,32
7,28
7,67
2,3

Classes

Class
Line #
Actions
HtmlTool 70 322 0% 97 74
0.8408602 84,1%
HtmlTool.JoinSeparator 85 0 - 0 0
-1.0 -
HtmlTool.ExtractResult 573 0 - 0 0
-1.0 -
HtmlTool.DefaultExtractResult 593 4 0% 3 0
1.0 100%
HtmlTool.HeadingItem 1364 9 0% 6 0
1.0 100%
HtmlTool.IdElement 1420 0 - 0 0
-1.0 -
 

Contributing tests

This file is covered by 29 tests.

Source view

1    /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements. See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership. The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License. You may obtain a copy of the License at
9    *
10    * http://www.apache.org/licenses/LICENSE-2.0
11    *
12    * Unless required by applicable law or agreed to in writing,
13    * software distributed under the License is distributed on an
14    * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15    * KIND, either express or implied. See the License for the
16    * specific language governing permissions and limitations
17    * under the License.
18    */
19    package org.devacfr.maven.skins.reflow;
20   
21    import javax.annotation.Nonnull;
22    import javax.annotation.Nullable;
23   
24    import java.text.Normalizer;
25    import java.text.Normalizer.Form;
26    import java.util.ArrayList;
27    import java.util.Arrays;
28    import java.util.Collection;
29    import java.util.Collections;
30    import java.util.HashSet;
31    import java.util.List;
32    import java.util.Locale;
33    import java.util.Map;
34    import java.util.Map.Entry;
35    import java.util.Set;
36    import java.util.Stack;
37    import java.util.regex.Pattern;
38   
39    import com.google.common.base.Strings;
40    import com.google.common.collect.Lists;
41    import org.apache.velocity.tools.ToolContext;
42    import org.apache.velocity.tools.config.DefaultKey;
43    import org.apache.velocity.tools.generic.SafeConfig;
44    import org.apache.velocity.tools.generic.ValueParser;
45    import org.jsoup.Jsoup;
46    import org.jsoup.internal.StringUtil;
47    import org.jsoup.nodes.Document;
48    import org.jsoup.nodes.Element;
49    import org.jsoup.nodes.Node;
50    import org.jsoup.parser.Tag;
51   
52    import static java.util.Collections.emptyList;
53    import static java.util.Objects.requireNonNull;
54   
55    /**
56    * An Apache Velocity tool that provides utility methods to manipulate HTML code using
57    * <a href="http://jsoup.org/">jsoup</a> HTML5 parser.
58    * <p>
59    * The methods utilise <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">CSS selectors</a> to refer to
60    * specific elements for manipulation.
61    * </p>
62    *
63    * @author Andrius Velykis
64    * @author Christophe Friederich
65    * @since 1.0
66    * @see <a href="http://jsoup.org/">jsoup HTML parser</a>
67    * @see <a href= "http://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a>
68    */
69    @DefaultKey("htmlTool")
 
70    public class HtmlTool extends SafeConfig {
71   
/** Size limit used for slugs. NOTE(review): the usage site is outside this excerpt - confirm the exact meaning. */
private static final int SLUG_SIZE = 50;

/** Default separator used to generate a slug heading name. */
public static final String DEFAULT_SLUG_SEPARATOR = "-";

/** Prefix of the heading ids associated with the table of contents. */
private static final String SEPARATOR_TOC = "_toc_";

/** Unmodifiable list of all HTML heading tag names (h1-h6). */
private static final List<String> HEADINGS = Collections
        .unmodifiableList(Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
83   
/**
 * Enum indicating the separator handling strategy for document partitioning, i.e. what happens to the separator
 * elements themselves when content is split.
 */
public enum JoinSeparator {
    /**
     * Keep separators at the start of partitions: a separator is added to the partition that is opened at its
     * position. The first partition will not have a separator.
     */
    AFTER,
    /**
     * Keep separators at the end of partitions: a separator is added to the partition that precedes it. The last
     * partition will not have a separator.
     */
    BEFORE,
    /** Drop separators altogether. */
    NO
}
97   
/** Charset name applied to the output settings of every parsed document; {@code UTF-8} unless configured. */
private String outputEncoding = "UTF-8";

/** Whether parsed documents are pretty-printed when rendered; {@code true} unless configured. */
private boolean prettyPrint = true;
102   
103    /**
104    * {@inheritDoc}
105    *
106    * @see SafeConfig#configure(ValueParser)
107    */
 
108  22 toggle @Override
109    protected void configure(final ValueParser values) {
110   
111    // retrieve the Velocity context for output encoding
112  22 final Object velocityContext = values.get("velocityContext");
113   
114  22 if (!(velocityContext instanceof ToolContext)) {
115  0 return;
116    }
117   
118  22 final ToolContext ctxt = (ToolContext) velocityContext;
119   
120    // get the output encoding
121  22 final Object outputEncodingObj = ctxt.get("outputEncoding");
122  22 if (outputEncodingObj instanceof String) {
123  0 this.outputEncoding = (String) outputEncodingObj;
124    }
125   
126  22 final Object prettyPrint = ctxt.get("prettyPrint");
127  22 if (prettyPrint instanceof Boolean) {
128  0 this.prettyPrint = (Boolean) prettyPrint;
129    }
130    }
131   
132    /**
133    * Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters
134    * (e.g. newline, tab) convert to a simple space
135    *
136    * @param html
137    * html content to normalise.
138    * @return Returns normalised string.
139    */
 
140  0 toggle @Nullable public String normaliseWhitespace(@Nullable final String html) {
141  0 if (Strings.isNullOrEmpty(html)) {
142  0 return null;
143    }
144  0 return StringUtil.normaliseWhitespace(html);
145    }
146   
147    /**
148    * Splits the given HTML content into partitions based on the given separator selector. The separators themselves
149    * are dropped from the results.
150    *
151    * @param content
152    * body HTML content to split (can not be empty or {@code null}).
153    * @param separatorCssSelector
154    * CSS selector for separators (can not be empty or {@code null}).
155    * @return a list of HTML partitions split on separator locations, but without the separators.
156    * @since 1.0
157    * @see #split(String, String, JoinSeparator)
158    */
 
159  4 toggle public List<String> split(@Nonnull final String content, @Nonnull final String separatorCssSelector) {
160  4 return split(content, separatorCssSelector, JoinSeparator.NO);
161    }
162   
163    /**
164    * Splits the given HTML content into partitions based on the given separator selector. The separators are kept as
165    * first elements of the partitions.
166    * <p>
167    * Note that the first part is removed if the split was successful. This is because the first part does not include
168    * the separator.
169    * </p>
170    *
171    * @param content
172    * HTML content to split
173    * @param separatorCssSelector
174    * CSS selector for separators
175    * @return a list of HTML partitions split on separator locations (except the first one), with separators at the
176    * beginning of each partition
177    * @since 1.0
178    * @see #split(String, String, JoinSeparator)
179    */
 
180  1 toggle public List<String> splitOnStarts(final @Nonnull String content, final @Nonnull String separatorCssSelector) {
181   
182  1 final List<String> result = split(content, separatorCssSelector, JoinSeparator.AFTER);
183   
184  1 if (result == null || result.size() <= 1) {
185    // no result or just one part - return what we have
186  0 return result;
187    }
188   
189    // otherwise, drop the first part - the first split will be the first 'start'
190    // e.g. if we split on headings, the first part will contain everything
191    // before the first heading.
192  1 return result.subList(1, result.size());
193    }
194   
195    /**
196    * Splits the given HTML content into partitions based on the given separator selector. The separators are either
197    * dropped or joined with before/after depending on the indicated separator strategy.
198    *
199    * @param content
200    * HTML content to split
201    * @param separatorCssSelector
202    * CSS selector for separators
203    * @param separatorStrategy
204    * strategy to drop or keep separators, one of "after", "before" or "no"
205    * @return a list of HTML partitions split on separator locations.
206    * @since 1.0
207    * @see #split(String, String, JoinSeparator)
208    */
 
209  3 toggle public List<String> split(final @Nonnull String content,
210    final @Nonnull String separatorCssSelector,
211    final String separatorStrategy) {
212   
213  3 JoinSeparator sepStrategy;
214  3 if ("before".equals(separatorStrategy)) {
215  1 sepStrategy = JoinSeparator.BEFORE;
216  2 } else if ("after".equals(separatorStrategy)) {
217  1 sepStrategy = JoinSeparator.AFTER;
218    } else {
219  1 sepStrategy = JoinSeparator.NO;
220    }
221   
222  3 return split(content, separatorCssSelector, sepStrategy);
223    }
224   
225    /**
226    * Splits the given HTML content into partitions based on the given separator selector.The separators are either
227    * dropped or joined with before/after depending on the indicated separator strategy.
228    * <p>
229    * Note that splitting algorithm tries to resolve nested elements so that returned partitions are self-contained
230    * HTML elements. The nesting is normally contained within the first applicable partition.
231    * </p>
232    *
233    * @param content
234    * Body HTML content to split
235    * @param separatorCssSelector
236    * CSS selector for separators
237    * @param separatorStrategy
238    * strategy to drop or keep separators
239    * @return a list of HTML partitions split on separator locations. If no splitting occurs, returns the original
240    * content as the single element of the list
241    * @since 1.0
242    */
 
243  11 toggle public List<String> split(@Nonnull final String content,
244    @Nonnull final String separatorCssSelector,
245    @Nonnull final JoinSeparator separatorStrategy) {
246   
247  11 requireNonNull(separatorStrategy);
248  11 final Element body = parse(content).body();
249   
250  11 final List<Element> separators = body.select(separatorCssSelector);
251  11 if (separators.size() > 0) {
252  9 final List<List<Element>> partitions = split(separators, separatorStrategy, body);
253   
254  9 final List<String> sectionHtml = new ArrayList<>();
255   
256  9 for (final List<Element> partition : partitions) {
257  19 final String html = outerHtml(partition);
258  19 if (!Strings.isNullOrEmpty(html)) {
259  18 sectionHtml.add(outerHtml(partition));
260    }
261    }
262   
263  9 return sectionHtml;
264    } else {
265    // nothing to split
266  2 return Collections.singletonList(content);
267    }
268    }
269   
270    /**
271    * Recursively splits the {@code parent} element based on the given {@code separators}. If a separator is
272    * encountered in the parent, it is split on that position. The outstanding nested elements go with the first of the
273    * partitions in each case.
274    *
275    * @param separators
276    * @param separatorStrategy
277    * @param parent
278    * @return list of partitions (as lists of root elements for each partition). Partition can be an empty list, e.g.
279    * if the separator is at the start of the content.
280    */
 
281  28 toggle private static List<List<Element>> split(final Collection<Element> separators,
282    final JoinSeparator separatorStrategy,
283    final Element parent) {
284   
285  28 final List<List<Element>> partitions = Lists.newLinkedList();
286   
287  28 for (final Element child : parent.children()) {
288   
289  29 if (separators.contains(child)) {
290    // split here and do not go deeper
291   
292    // first ensure there was a partition before
293    // otherwise the split is not recognised on an outer level
294  10 getLastPartition(partitions);
295   
296  10 if (separatorStrategy == JoinSeparator.BEFORE) {
297    // add to the last partition
298  2 getLastPartition(partitions).add(child);
299    }
300   
301    // add an empty new partition
302  10 final List<Element> newPartition = Lists.newLinkedList();
303  10 partitions.add(newPartition);
304   
305  10 if (separatorStrategy == JoinSeparator.AFTER) {
306    // add to the new partition
307  3 newPartition.add(child);
308    }
309   
310    } else {
311    // go deeper
312  19 final List<List<Element>> childPartitions = split(separators, separatorStrategy, child);
313   
314    // add the child to the last partition
315  19 getLastPartition(partitions).add(child);
316   
317  19 if (childPartitions.size() > 1) {
318    // more than one partition:
319    // only keep the first partition elements in the child
320    // so for all other partitions, remove them from their parents
321   
322  2 final List<Element> allChildren = child.children();
323  2 final List<Element> firstPartition = childPartitions.get(0);
324   
325  2 allChildren.removeAll(firstPartition);
326  2 for (final Element removeChild : allChildren) {
327  2 removeChild.remove();
328    }
329   
330    // add the remaining partitions
331  2 for (final List<Element> nextPartition : childPartitions.subList(1, childPartitions.size())) {
332  2 partitions.add(nextPartition);
333    }
334    }
335    }
336    }
337   
338  28 return partitions;
339    }
340   
341    /**
342    * Retrieves the last partition (as list of elements) or creates a new one if there was none before.
343    *
344    * @param partitions
345    * @return
346    */
 
347  31 toggle private static List<Element> getLastPartition(final List<List<Element>> partitions) {
348  31 if (partitions.isEmpty()) {
349  11 final List<Element> newPartition = Lists.newLinkedList();
350  11 partitions.add(newPartition);
351  11 return newPartition;
352    } else {
353  20 return partitions.get(partitions.size() - 1);
354    }
355    }
356   
357    /**
358    * Outputs the list of partition root elements to HTML.
359    *
360    * @param elements
361    * @return
362    */
 
363  37 toggle private static String outerHtml(final List<Element> elements) {
364   
365  37 switch (elements.size()) {
366  1 case 0:
367  1 return "";
368   
369  24 case 1:
370  24 return elements.get(0).outerHtml();
371   
372  12 default:
373    // more than one element
374    // wrap into <div> which we will remove afterwards
375  12 final Element root = new Element(Tag.valueOf("div"), "");
376  12 for (final Element elem : elements) {
377  24 root.appendChild(elem);
378    }
379   
380  12 return root.html();
381    }
382    }
383   
384    /**
385    * Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited
386    * to a certain amount, e.g. to bring just the first of selected elements to the top.
387    *
388    * @param content
389    * HTML content to reorder
390    * @param selector
391    * CSS selector for elements to bring to top of the content
392    * @param amount
393    * Maximum number of elements to reorder
394    * @return HTML content with reordered elements, or the original content if no such elements found.
395    * @since 1.0
396    */
 
397  0 toggle public String reorderToTop(final String content, final String selector, final int amount) {
398  0 return reorderToTop(content, selector, amount, null);
399    }
400   
401    /**
402    * Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited
403    * to a certain amount, e.g. to bring just the first of selected elements to the top.
404    *
405    * @param content
406    * HTML content to reorder
407    * @param selector
408    * CSS selector for elements to bring to top of the content
409    * @param amount
410    * Maximum number of elements to reorder
411    * @param wrapRemaining
412    * HTML to wrap the remaining (non-reordered) part
413    * @return HTML content with reordered elements, or the original content if no such elements found.
414    * @since 1.0
415    */
 
416  1 toggle public String reorderToTop(final String content,
417    final String selector,
418    final int amount,
419    final String wrapRemaining) {
420   
421    // extract the elements and then prepend them to the remaining body
422  1 final List<Element> extracted = extractElements(content, selector, amount);
423   
424  1 if (extracted.size() > 1) {
425   
426  1 final Element body = extracted.get(0);
427   
428  1 if (wrapRemaining != null) {
429  1 wrapInner(body, wrapRemaining);
430    }
431   
432  1 final List<Element> elements = extracted.subList(1, extracted.size());
433   
434    // now prepend extracted elements to the body (in backwards to preserve original
435    // order)
436  2 for (int index = elements.size() - 1; index >= 0; index--) {
437  1 body.prependChild(elements.get(index));
438    }
439   
440  1 return body.html();
441    } else {
442    // nothing to reorder
443  0 return content;
444    }
445    }
446   
 
447  1 toggle private static Element wrapInner(final Element element, final String html) {
448   
449    // wrap everything into an additional <div> for wrapping
450    // otherwise there may be problems, e.g. with <body> element
451  1 final Element topDiv = new Element(Tag.valueOf("div"), "");
452  1 for (final Element topElem : element.children()) {
453    // add all elements in the body to the `topDiv`
454  1 topElem.remove();
455  1 topDiv.appendChild(topElem);
456    }
457   
458    // add topDiv to the body
459  1 element.appendChild(topDiv);
460   
461    // wrap topDiv
462  1 topDiv.wrap(html);
463    // now unwrap topDiv - will remove it from the hierarchy
464  1 topDiv.unwrap();
465   
466  1 return element;
467    }
468   
469    /**
470    * Extracts elements from the HTML content.
471    *
472    * @param content
473    * @param selector
474    * @param amount
475    * @return the remainder and a list of extracted elements. The main body (remainder after extraction) is always
476    * returned as the first element of the list.
477    */
 
478  2 toggle private List<Element> extractElements(final String content, final String selector, final int amount) {
479   
480  2 final Element body = parse(content).body();
481   
482  2 List<Element> elements = body.select(selector);
483  2 if (elements.size() > 0) {
484   
485  2 elements = filterParents(elements);
486   
487  2 if (amount >= 0) {
488    // limit to the indicated amount
489  2 elements = elements.subList(0, Math.min(amount, elements.size()));
490    }
491   
492    // remove all from their parents
493  2 for (final Element element : elements) {
494  4 element.remove();
495    }
496    }
497   
498  2 final List<Element> results = new ArrayList<>();
499    // first element is the body
500  2 results.add(body);
501  2 results.addAll(elements);
502  2 return results;
503    }
504   
505    /**
506    * Filters the list of elements to only contain parent elements. This is to avoid both parent and child being in the
507    * list of elements.
508    *
509    * @param elements
510    * @return
511    */
 
512  2 toggle private static List<Element> filterParents(final List<Element> elements) {
513  2 final List<Element> filtered = new ArrayList<>();
514  2 for (final Element element : elements) {
515    // get the intersection of parents and selected elements
516  6 final List<Element> parentsInter = element.parents();
517  6 parentsInter.retainAll(elements);
518  6 if (parentsInter.isEmpty()) {
519    // no intersection - element's parents are not in the selected list
520  4 filtered.add(element);
521    }
522    }
523   
524  2 return filtered;
525    }
526   
527    /**
528    * Extracts HTML elements from the main HTML content. The result consists of the extracted HTML elements and the
529    * remainder of HTML content, with these elements removed. Can be limited to a certain amount, e.g. to extract just
530    * the first of selected elements.
531    *
532    * @param content
533    * HTML content to extract elements from
534    * @param selector
535    * CSS selector for elements to extract
536    * @param amount
537    * Maximum number of elements to extract
538    * @return HTML content of the extracted elements together with the remainder of the original content. If no
539    * elements are found, the remainder contains the original content.
540    * @since 1.0
541    */
 
542  1 toggle @Nonnull
543    public ExtractResult extract(final String content, final String selector, final int amount) {
544   
545  1 final List<Element> extracted = extractElements(content, selector, amount);
546   
547  1 if (extracted.size() > 1) {
548   
549    // first element is the remaining body, the rest are extracted
550  1 final Element body = extracted.get(0);
551  1 final List<Element> elements = extracted.subList(1, extracted.size());
552   
553    // convert to HTML
554  1 final List<String> elementStr = new ArrayList<>();
555  1 for (final Element el : elements) {
556  3 elementStr.add(el.outerHtml());
557    }
558   
559  1 return new DefaultExtractResult(elementStr, body.html());
560    } else {
561    // nothing to extract
562  0 return new DefaultExtractResult(Collections.<String> emptyList(), content);
563    }
564    }
565   
/**
 * A container to carry element extraction results. Contains the extracted element HTML code and the remainder of
 * the body content with elements removed.
 *
 * @author Andrius Velykis
 * @since 1.0
 */
public interface ExtractResult {

    /**
     * Retrieves the extracted HTML elements.
     *
     * @return List of HTML of extracted elements. Can be empty if no elements found.
     */
    List<String> getExtracted();

    /**
     * Retrieves the content from which elements were extracted.
     *
     * @return The HTML content with extracted elements removed.
     */
    String getRemainder();
}
589   
590    /**
591    * @author Christophe Friederich
592    */
 
593    private static final class DefaultExtractResult implements ExtractResult {
594   
595    /** */
596    private final List<String> extracted;
597   
598    /** */
599    private final String remainder;
600   
 
601  1 toggle private DefaultExtractResult(final List<String> extracted, final String remainder) {
602  1 this.extracted = extracted;
603  1 this.remainder = remainder;
604    }
605   
 
606  1 toggle @Override
607    public List<String> getExtracted() {
608  1 return Collections.unmodifiableList(extracted);
609    }
610   
 
611  1 toggle @Override
612    public String getRemainder() {
613  1 return remainder;
614    }
615    }
616   
617    /**
618    * Sets attribute to the given value on elements in HTML.
619    *
620    * @param content
621    * HTML content to set attributes on
622    * @param selector
623    * CSS selector for elements to modify
624    * @param attributeKey
625    * Attribute name
626    * @param value
627    * Attribute value
628    * @return HTML content with modified elements. If no elements are found, the original content is returned.
629    * @since 1.0
630    */
 
631  6 toggle public String setAttr(final String content, final String selector, final String attributeKey, final String value) {
632   
633  6 final Element body = parse(content).body();
634   
635  6 final List<Element> elements = body.select(selector);
636  6 if (elements.size() > 0) {
637   
638  2 for (final Element element : elements) {
639  2 element.attr(attributeKey, value);
640    }
641   
642  2 return body.html();
643    } else {
644    // nothing to update
645  4 return content;
646    }
647    }
648   
649    /**
650    * Parses body fragment to the {@code <body>} element.
651    *
652    * @param content
653    * body HTML fragment (can not be {@code null}).
654    * @return the {@code body} element of the parsed content
655    */
 
656  74 toggle public Document parse(@Nonnull final String content) {
657  74 final Document doc = Jsoup.parseBodyFragment(content);
658  74 doc.outputSettings().charset(outputEncoding).prettyPrint(prettyPrint);
659  74 return doc;
660    }
661   
662    /**
663    * Retrieves attribute value on elements in HTML. Will return all attribute values for the selector, since there can
664    * be more than one element.
665    *
666    * @param content
667    * HTML content to read attributes from
668    * @param selector
669    * CSS selector for elements to find
670    * @param attributeKey
671    * Attribute name
672    * @return Attribute values for all matching elements. If no elements are found, empty list is returned.
673    * @since 1.0
674    */
 
675  1 toggle public List<String> getAttr(final String content, final String selector, final String attributeKey) {
676   
677  1 final Element body = parse(content).body();
678   
679  1 final List<Element> elements = body.select(selector);
680  1 final List<String> attrs = new ArrayList<>();
681   
682  1 for (final Element element : elements) {
683  1 final String attrValue = element.attr(attributeKey);
684  1 attrs.add(attrValue);
685    }
686   
687  1 return attrs;
688    }
689   
690    /**
691    * Adds given class names to the elements in HTML.
692    *
693    * @param content
694    * HTML content to modify
695    * @param selector
696    * CSS selector for elements to add classes to
697    * @param classNames
698    * Names of classes to add to the selected elements
699    * @param amount
700    * Maximum number of elements to modify
701    * @return HTML content with modified elements. If no elements are found, the original content is returned.
702    * @since 1.0
703    */
 
704  11 toggle public String addClass(final String content,
705    final String selector,
706    final List<String> classNames,
707    final int amount) {
708   
709  11 final Element body = parse(content).body();
710   
711  11 List<Element> elements = body.select(selector);
712  11 if (amount >= 0) {
713    // limit to the indicated amount
714  0 elements = elements.subList(0, Math.min(amount, elements.size()));
715    }
716   
717  11 if (elements.size() > 0) {
718   
719  4 for (final Element element : elements) {
720  12 for (final String className : classNames) {
721  14 element.addClass(className);
722    }
723    }
724   
725  4 return body.html();
726    } else {
727    // nothing to update
728  7 return content;
729    }
730    }
731   
732    /**
733    * Adds given class names to the elements in HTML.
734    *
735    * @param content
736    * HTML content to modify
737    * @param selector
738    * CSS selector for elements to add classes to
739    * @param classNames
740    * Names of classes to add to the selected elements
741    * @return HTML content with modified elements. If no elements are found, the original content is returned.
742    * @since 1.0
743    */
 
744  11 toggle public String addClass(final String content, final String selector, final List<String> classNames) {
745  11 return addClass(content, selector, classNames, -1);
746    }
747   
748    /**
749    * Adds given class to the elements in HTML.
750    *
751    * @param content
752    * HTML content to modify
753    * @param selector
754    * CSS selector for elements to add the class to
755    * @param className
756    * Name of class to add to the selected elements
757    * @return HTML content with modified elements. If no elements are found, the original content is returned.
758    * @since 1.0
759    */
 
760  1 toggle public String addClass(final String content, final String selector, final String className) {
761  1 return addClass(content, selector, Collections.singletonList(className));
762    }
763   
764    /**
765    * Wraps elements in HTML with the given HTML.
766    *
767    * @param content
768    * HTML content to modify
769    * @param selector
770    * CSS selector for elements to wrap
771    * @param wrapHtml
772    * HTML to use for wrapping the selected elements
773    * @param amount
774    * Maximum number of elements to modify
775    * @return HTML content with modified elements. If no elements are found, the original content is returned.
776    * @since 1.0
777    */
 
778  1 toggle public String wrap(final String content, final String selector, final String wrapHtml, final int amount) {
779   
780  1 final Element body = parse(content).body();
781   
782  1 List<Element> elements = body.select(selector);
783  1 if (amount >= 0) {
784    // limit to the indicated amount
785  1 elements = elements.subList(0, Math.min(amount, elements.size()));
786    }
787   
788  1 if (elements.size() > 0) {
789   
790  1 for (final Element element : elements) {
791  1 element.wrap(wrapHtml);
792    }
793   
794  1 return body.html();
795    } else {
796    // nothing to update
797  0 return content;
798    }
799    }
800   
801    /**
802    * Removes elements from HTML.
803    *
804    * @param content
805    * HTML content to modify
806    * @param selector
807    * CSS selector for elements to remove
808    * @return HTML content with removed elements. If no elements are found, the original content is returned.
809    * @since 1.0
810    */
 
811  1 toggle public String remove(final String content, final String selector) {
812   
813  1 final Element body = parse(content).body();
814   
815  1 final List<Element> elements = body.select(selector);
816  1 if (elements.size() > 0) {
817  1 for (final Element element : elements) {
818  1 element.remove();
819    }
820   
821  1 return body.html();
822    } else {
823    // nothing changed
824  0 return content;
825    }
826    }
827   
828    /**
829    * Replaces elements in HTML.
830    *
831    * @param content
832    * HTML content to modify
833    * @param selector
834    * CSS selector for elements to replace
835    * @param replacement
836    * HTML replacement (must parse to a single element)
837    * @return HTML content with replaced elements. If no elements are found, the original content is returned.
838    * @since 1.0
839    */
 
840  1 toggle public String replace(final String content, final String selector, final String replacement) {
841  1 return replaceAll(content, Collections.singletonMap(selector, replacement));
842    }
843   
844    /**
845    * Replaces elements in HTML.
846    *
847    * @param content
848    * HTML content to modify
849    * @param replacements
850    * Map of CSS selectors to their replacement HTML texts. CSS selectors find elements to be replaced with
851    * the HTML in the mapping. The HTML must parse to a single element.
852    * @return HTML content with replaced elements. If no elements are found, the original content is returned.
853    * @since 1.0
854    */
 
855  6 toggle public String replaceAll(final String content, final Map<String, String> replacements) {
856   
857  6 final Element body = parse(content).body();
858   
859  6 boolean modified = false;
860  6 for (final Entry<String, String> replacementEntry : replacements.entrySet()) {
861  46 final String selector = replacementEntry.getKey();
862  46 final String replacement = replacementEntry.getValue();
863   
864  46 final List<Element> elements = body.select(selector);
865  46 if (elements.size() > 0) {
866   
867    // take the first child
868  10 final Element replacementElem = parse(replacement).body().child(0);
869   
870  10 if (replacementElem != null) {
871  10 for (final Element element : elements) {
872  10 element.replaceWith(replacementElem.clone());
873    }
874   
875  10 modified = true;
876    }
877    }
878    }
879   
880  6 if (modified) {
881  2 return body.html();
882    } else {
883    // nothing changed
884  4 return content;
885    }
886    }
887   
888    /**
889    * Replaces All elements in HTML corresponding to <code>selector</code> while preserving the content of this
890    * element.
891    *
892    * @param content
893    * HTML content to modify
894    * @param selector
895    * CSS selector for elements to replace
896    * @param newElement
897    * HTML replacement (must parse to a single element)
898    * @return HTML content with replaced elements. If no elements are found, the original content is returned.
899    * @since 2.0
900    */
 
901  6 toggle public String replaceWith(final String content, final String selector, final String newElement) {
902   
903  6 final Element body = parse(content).body();
904   
905  6 boolean modified = false;
906  6 final List<Element> elements = body.select(selector);
907  6 if (elements.size() > 0) {
908   
909    // take the first child
910  2 final Element replacementElem = parse(newElement).body().child(0);
911   
912  2 if (replacementElem != null) {
913  2 for (final Element element : elements) {
914  2 final List<Node> children = element.childNodes();
915  2 final Element el = replacementElem.clone();
916  2 for (final Node child : children) {
917  2 el.appendChild(child.clone());
918    }
919  2 element.replaceWith(el);
920    }
921   
922  2 modified = true;
923    }
924    }
925   
926  6 if (modified) {
927  2 return body.html();
928    } else {
929    // nothing changed
930  4 return content;
931    }
932    }
933   
934    /**
935    * Retrieves text content of the selected elements in HTML. Renders the element's text as it would be displayed on
936    * the web page (including its children).
937    *
938    * @param content
939    * HTML content with the elements
940    * @param selector
941    * CSS selector for elements to extract contents
942    * @return A list of element texts as rendered to display. Empty list if no elements are found.
943    * @since 1.0
944    */
 
945  1 toggle public List<String> text(@Nullable final String content, @Nonnull final String selector) {
946  1 if (Strings.isNullOrEmpty(content)) {
947  0 return emptyList();
948    }
949  1 final Element body = parse(content).body();
950   
951  1 final List<Element> elements = body.select(selector);
952  1 final List<String> texts = new ArrayList<>();
953   
954  1 for (final Element element : elements) {
955  1 texts.add(element.text());
956    }
957   
958  1 return texts;
959    }
960   
961    /**
962    * Transforms the given HTML content by moving anchor ({@code <a name="myheading">}) names to IDs for heading
963    * elements.
964    * <p>
965    * The anchors are used to indicate positions within a HTML page. In HTML5, however, the {@code name} attribute is
966    * no longer supported on {@code <a>}) tag. The positions within pages are indicated using {@code id} attribute
967    * instead, e.g. {@code
968    *
969    *
970    <h1 id="myheading">}.
971    * </p>
972    * <p>
973    * The method finds anchors inside, immediately before or after the heading tags and uses their name as heading
974    * {@code id} instead. The anchors themselves are removed.
975    * </p>
976    *
977    * @param content
978    * HTML content to modify
979    * @return HTML content with modified elements. Anchor names are used for adjacent headings, and anchor tags are
980    * removed. If no elements are found, the original content is returned.
981    * @since 1.0
982    */
 
983  6 toggle public String headingAnchorToId(final String content) {
984   
985  6 final Element body = parse(content).body();
986   
987    // selectors for headings without IDs
988  6 final List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
989   
990    // selector for anchor with name attribute only
991  6 final String nameA = "a[name]:not([href])";
992   
993    // select all headings that have inner named anchor
994  6 final List<Element> headingsInnerA = body
995    .select(String.join(", ", concat(headNoIds, ":has(" + nameA + ")", true)));
996   
997  6 boolean modified = false;
998  6 for (final Element heading : headingsInnerA) {
999  9 final List<Element> anchors = heading.select(nameA);
1000    // take first
1001  9 if (!anchors.isEmpty()) {
1002  9 anchorToId(heading, anchors.get(0));
1003  9 modified = true;
1004    }
1005    }
1006   
1007    // select all headings that have a preceding named anchor
1008  6 final List<Element> headingsPreA = body.select(String.join(", ", concat(headNoIds, nameA + " + ", false)));
1009   
1010  6 for (final Element heading : headingsPreA) {
1011  0 final Element anchor = heading.previousElementSibling();
1012  0 if (anchor != null) {
1013  0 anchorToId(heading, anchor);
1014  0 modified = true;
1015    }
1016    }
1017   
1018    // select all headings that are followed by a named anchor
1019    // no selector available for that, so first select the anchors
1020    // then retrieve the headings
1021  6 final List<Element> anchorsPreH = body.select(String.join(", ", concat(headNoIds, " + " + nameA, true)));
1022   
1023  6 for (final Element anchor : anchorsPreH) {
1024  0 final Element heading = anchor.previousElementSibling();
1025  0 if (heading != null) {
1026  0 anchorToId(heading, anchor);
1027  0 modified = true;
1028    }
1029    }
1030   
1031  6 if (modified) {
1032  4 return body.html();
1033    } else {
1034    // nothing to update
1035  2 return content;
1036    }
1037    }
1038   
1039    /**
1040    * Moves anchor name to heading id, if one does not exist. Removes the anchor.
1041    *
1042    * @param heading
1043    * @param anchor
1044    */
 
1045  9 toggle private static void anchorToId(final Element heading, final Element anchor) {
1046   
1047  9 if ("a".equals(anchor.tagName()) && heading.id().isEmpty()) {
1048  9 final String aName = anchor.attr("name");
1049  9 if (!aName.isEmpty()) {
1050    // set the anchor name as heading ID
1051  9 heading.attr("id", aName);
1052   
1053    // remove the anchor
1054  9 anchor.remove();
1055    }
1056    }
1057    }
1058   
1059    /**
1060    * Utility method to concatenate a String to a list of Strings. The text can be either appended or prepended.
1061    *
1062    * @param elements
1063    * list of elements to append/prepend the text to
1064    * @param text
1065    * the given text to append/prepend
1066    * @param append
1067    * if {@code true}, text will be appended to the elements. If {@code false}, it will be prepended
1068    * @return list of elements with the text appended/prepended
1069    * @since 1.0
1070    */
 
1071  31 toggle public static List<String> concat(final List<String> elements, final String text, final boolean append) {
1072  31 final List<String> concats = new ArrayList<>();
1073   
1074  31 for (final String element : elements) {
1075  186 concats.add(append ? element + text : text + element);
1076    }
1077   
1078  31 return concats;
1079    }
1080   
1081    /**
1082    * Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that do not have one.
1083    * <p>
1084    * IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a heading tag without an
1085    * {@code id} is found, its "slug" is generated automatically based on the heading contents and used as the ID.
1086    * </p>
1087    * <p>
1088    * Note that the algorithm also modifies existing IDs that have symbols not allowed in CSS selectors, e.g. ":", ".",
1089    * etc. The symbols are removed.
1090    * </p>
1091    *
1092    * @param pageType
1093    * The type of page.
1094    * @param currentPage
1095    * The name of current page.
1096    * @param content
1097    * HTML content to modify.
1098    * @param idSeparator
1099    * the seperator used to slug ID.
1100    * @return Returns a {@link String} representing HTML content with all heading elements having {@code id}
1101    * attributes. If all headings were with IDs already, the original content is returned.
1102    * @since 1.0
1103    */
 
1104  3 toggle public String ensureHeadingIds(final String pageType,
1105    final String currentPage,
1106    final String content,
1107    final String idSeparator) {
1108  3 final List<String> excludedPages = Arrays.asList("checkstyle-aggregate", "checkstyle");
1109   
1110  3 final Element body = parse(content).body();
1111   
1112    // exclude pages
1113  3 if (excludedPages.contains(currentPage)) {
1114  0 return content;
1115    }
1116   
1117    // first find all existing IDs (to avoid generating duplicates)
1118  3 final List<Element> idElems = body.select("*[id]");
1119   
1120  3 final Set<String> ids = new HashSet<>();
1121  3 boolean modified = false;
1122  3 for (final Element idElem : idElems) {
1123   
1124    // fix all existing IDs - remove colon and other symbols which mess up jQuery
1125  0 final String id = idElem.id();
1126  0 idElem.attr("id", slug(id, idSeparator));
1127  0 modified = true;
1128   
1129  0 ids.add(idElem.id());
1130    }
1131   
1132    // create unique id for all heading elements
1133  3 final List<String> headIds = concat(HEADINGS, "[id]", true);
1134    // select all headings that have an ID
1135  3 final List<Element> headingIds = body.select(String.join(", ", headIds));
1136   
1137  3 for (final Element heading : headingIds) {
1138  0 final String headingText = heading.text();
1139  0 String headingSlug = slug(headingText, idSeparator);
1140    // also limit slug to 50 symbols
1141  0 if (headingSlug.length() > SLUG_SIZE) {
1142  0 headingSlug = headingSlug.substring(0, SLUG_SIZE);
1143    }
1144  0 final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug);
1145   
1146  0 heading.attr("id", headingId);
1147    }
1148   
1149  3 final List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
1150   
1151    // select all headings that do not have an ID
1152  3 final List<Element> headingsNoId = body.select(String.join(", ", headNoIds));
1153   
1154  3 if (!headingsNoId.isEmpty() || modified) {
1155  3 for (final Element heading : headingsNoId) {
1156   
1157  9 final String headingText = heading.text();
1158  9 String headingSlug = slug(headingText, idSeparator);
1159    // also limit slug to 50 symbols
1160  9 if (headingSlug.length() > SLUG_SIZE) {
1161  0 headingSlug = headingSlug.substring(0, SLUG_SIZE);
1162    }
1163  9 final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug);
1164   
1165  9 heading.attr("id", headingId);
1166    }
1167    }
1168   
1169  3 return body.html();
1170    }
1171   
1172    /**
1173    * Generated a unique ID within the given set of IDs. Appends an incrementing number for duplicates.
1174    *
1175    * @param pageType
1176    * The type of page.
1177    * @param currentPage
1178    * Tthe name of current page.
1179    * @param ids
1180    * The list of ID already existing or used.
1181    * @param idBase
1182    * The prefix to use.
1183    * @return Returns a new {@link String} representing a new unique ID.
1184    */
 
1185  9 toggle private static String generateUniqueId(final String pageType,
1186    final String currentPage,
1187    final Set<String> ids,
1188    final String idBase) {
1189  9 String id = idBase;
1190  9 int counter = 1;
1191  9 while (ids.contains(id)) {
1192  0 id = idBase + String.valueOf(counter++);
1193    }
1194   
1195    // put the newly generated one into the set
1196  9 ids.add(id);
1197  9 if ("frame".equals(pageType)) {
1198  3 id = currentPage + SEPARATOR_TOC + id;
1199    }
1200  9 return id;
1201    }
1202   
1203    /**
1204    * Fixes table heads: wraps rows with {@code
1205    *
1206    *
1207    <th>} (table heading) elements into {@code <thead>} element if they are currently in {@code <tbody>}.
1208    *
1209    * @param content
1210    * HTML content to modify
1211    * @return HTML content with all table heads fixed. If all heads were correct, the original content is returned.
1212    * @since 1.0
1213    */
 
1214  6 toggle public String fixTableHeads(final String content) {
1215   
1216  6 final Element body = parse(content).body();
1217   
1218  6 final List<Element> tables = body.select("table");
1219   
1220  6 for (final Element table : tables) {
1221    // select rows with <th> tags within <tbody>
1222  7 final List<Element> tableHeadRows = table.select("tbody > tr:has(th)");
1223    // convert only table containing one tr head.
1224  7 if (tableHeadRows.size() == 1) {
1225   
1226  4 for (final Element row : tableHeadRows) {
1227   
1228    // remove row from its original position
1229  4 row.remove();
1230   
1231    // create table header element with the row
1232  4 final Element thead = new Element(Tag.valueOf("thead"), "");
1233  4 thead.appendChild(row);
1234    // add at the beginning of the table
1235  4 table.prependChild(thead);
1236    }
1237    }
1238    }
1239  6 return body.html();
1240    }
1241   
1242    /** */
1243    private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");
1244   
1245    /** */
1246    private static final Pattern WHITESPACE = Pattern.compile("[\\s]");
1247   
1248    /**
1249    * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs). Uses "-"
1250    * as a whitespace separator.
1251    *
1252    * @param input
1253    * text to generate the slug from
1254    * @return the slug of the given text that contains alphanumeric symbols and "-" only
1255    * @since 1.0
1256    */
 
1257  36 toggle public static String slug(final String input) {
1258  36 return slug(input, DEFAULT_SLUG_SEPARATOR);
1259    }
1260   
1261    /**
1262    * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs).
1263    *
1264    * @param input
1265    * text to generate the slug from
1266    * @param separator
1267    * separator for whitespace replacement
1268    * @return the slug of the given text that contains alphanumeric symbols and separator only
1269    * @since 1.0
1270    * @see <a href=
1271    * "http://www.codecodex.com/wiki/Generate_a_url_slug">http://www.codecodex.com/wiki/Generate_a_url_slug</a>
1272    */
 
1273  45 toggle private static String slug(final String input, final String separator) {
1274  45 final String nowhitespace = WHITESPACE.matcher(input).replaceAll(separator);
1275  45 final String normalized = Normalizer.normalize(nowhitespace, Form.NFD);
1276  45 return NONLATIN.matcher(normalized).replaceAll("").toLowerCase(Locale.ENGLISH);
1277    }
1278   
1279    /**
1280    * Reads all headings in the given HTML content as a hierarchy. Subsequent smaller headings are nested within bigger
1281    * ones, e.g. <code>&lt;h2&gt;</code> is nested under preceding <code>&lt;h1&gt;</code>.
1282    * <p>
1283    * Only headings with IDs are included in the hierarchy. The result elements contain ID and heading text for each
1284    * heading. The hierarchy is useful to generate a Table of Contents for a page.
1285    * </p>
1286    *
1287    * @param content
1288    * HTML content to extract heading hierarchy from
1289    * @param sections
1290    * list of all sections
1291    * @return a list of top-level heading items (with id and text). The remaining headings are nested within these
1292    * top-level items. Empty list if no headings are in the content.
1293    * @since 1.0
1294    */
 
1295  1 toggle public List<? extends IdElement> headingTree(final String content, final List<String> sections) {
1296   
1297  1 final List<String> sectionContents = this.split(content, "hr");
1298  1 final List<String> headIds = concat(HEADINGS, "[id]:not(.no-anchor)", true);
1299  1 final List<HeadingItem> headingItems = new ArrayList<>();
1300   
1301  1 int index = 0;
1302  1 for (final String sectionContent : sectionContents) {
1303  1 final String sectionType = index < sections.size() ? sections.get(index++) : "";
1304   
1305    // exclude carousel headings
1306  1 if ("carousel".equals(sectionType)) {
1307  0 continue;
1308    }
1309  1 final Element body = parse(sectionContent).body();
1310    // select all headings that have an ID
1311  1 final List<Element> headings = body.select(String.join(", ", headIds));
1312  1 for (final Element heading : headings) {
1313  3 headingItems
1314    .add(new HeadingItem(heading.id(), heading.nodeName(), heading.text(), headingIndex(heading)));
1315    }
1316    }
1317   
1318  1 final List<HeadingItem> topHeadings = new ArrayList<>();
1319  1 final Stack<HeadingItem> parentHeadings = new Stack<>();
1320   
1321  1 for (final HeadingItem heading : headingItems) {
1322   
1323  4 while (!parentHeadings.isEmpty() && parentHeadings.peek().headingLevel >= heading.headingLevel) {
1324  1 parentHeadings.pop();
1325    }
1326   
1327  3 if (parentHeadings.isEmpty()) {
1328    // top level heading - no parents
1329  1 topHeadings.add(heading);
1330    } else {
1331    // add to the children of topmost stack parent
1332  2 parentHeadings.peek().children.add(heading);
1333    }
1334   
1335    // push the heading onto stack
1336  3 parentHeadings.push(heading);
1337    }
1338   
1339  1 return topHeadings;
1340    }
1341   
1342    /**
1343    * Retrieves numeric index of a heading.
1344    *
1345    * @param element
1346    * @return
1347    */
 
1348  3 toggle private static int headingIndex(final Element element) {
1349  3 final String tagName = element.tagName();
1350  3 if (tagName.startsWith("h")) {
1351  3 try {
1352  3 return Integer.parseInt(tagName.substring(1));
1353    } catch (final Exception ex) {
1354  0 throw new IllegalArgumentException("Must be a header tag: " + tagName, ex);
1355    }
1356    } else {
1357  0 throw new IllegalArgumentException("Must be a header tag: " + tagName);
1358    }
1359    }
1360   
1361    /**
1362    * @author Christophe Friederich
1363    */
 
1364    private static final class HeadingItem implements IdElement {
1365   
1366    /** */
1367    private final String id;
1368   
1369    /** */
1370    private final String tagName;
1371   
1372    /** */
1373    private final String text;
1374   
1375    /** */
1376    private final int headingLevel;
1377   
1378    /** */
1379    private final List<HeadingItem> children = new ArrayList<>();
1380   
 
1381  3 toggle private HeadingItem(final String id, final String tagName, final String text, final int headingLevel) {
1382  3 this.id = id;
1383  3 this.tagName = tagName;
1384  3 this.text = text;
1385  3 this.headingLevel = headingLevel;
1386    }
1387   
 
1388  3 toggle @Override
1389    public String getId() {
1390  3 return id;
1391    }
1392   
 
1393  1 toggle @Override
1394    public String getTagName() {
1395  1 return tagName;
1396    }
1397   
 
1398  1 toggle @Override
1399    public String getText() {
1400  1 return text;
1401    }
1402   
 
1403  2 toggle @Override
1404    public List<HeadingItem> getItems() {
1405  2 return Collections.unmodifiableList(children);
1406    }
1407   
 
1408  3 toggle @Override
1409    public int getHeadingLevel() {
1410  3 return headingLevel;
1411    }
1412    }
1413   
/**
 * Read-only view of a HTML element that has an ID, a tag name, a text content and a heading level. Elements of
 * the same kind can be nested within it, which allows a tree to be described.
 *
 * @author Andrius Velykis
 * @since 1.0
 */
public interface IdElement {

    /**
     * Gets the value of the {@code id} attribute of the HTML element.
     *
     * @return element {@code id} value
     */
    String getId();

    /**
     * Gets the tag name of the HTML element.
     *
     * @return Returns the tag name of element.
     */
    String getTagName();

    /**
     * Gets the level of the heading.
     *
     * @return Returns the level of heading.
     */
    int getHeadingLevel();

    /**
     * Gets the text contents of the HTML element (rendered for display).
     *
     * @return text contents of the element
     */
    String getText();

    /**
     * Gets the elements nested within this HTML element.
     *
     * @return nested items within the element
     */
    List<? extends IdElement> getItems();
}
1453    }