Class |
Line # |
Actions |
|||||
---|---|---|---|---|---|---|---|
HtmlTool | 70 | 322 | 0% | 97 | 74 | ||
HtmlTool.JoinSeparator | 85 | 0 | - | 0 | 0 | ||
HtmlTool.ExtractResult | 573 | 0 | - | 0 | 0 | ||
HtmlTool.DefaultExtractResult | 593 | 4 | 0% | 3 | 0 | ||
HtmlTool.HeadingItem | 1364 | 9 | 0% | 6 | 0 | ||
HtmlTool.IdElement | 1420 | 0 | - | 0 | 0 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one | |
3 | * or more contributor license agreements. See the NOTICE file | |
4 | * distributed with this work for additional information | |
5 | * regarding copyright ownership. The ASF licenses this file | |
6 | * to you under the Apache License, Version 2.0 (the | |
7 | * "License"); you may not use this file except in compliance | |
8 | * with the License. You may obtain a copy of the License at | |
9 | * | |
10 | * http://www.apache.org/licenses/LICENSE-2.0 | |
11 | * | |
12 | * Unless required by applicable law or agreed to in writing, | |
13 | * software distributed under the License is distributed on an | |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
15 | * KIND, either express or implied. See the License for the | |
16 | * specific language governing permissions and limitations | |
17 | * under the License. | |
18 | */ | |
19 | package org.devacfr.maven.skins.reflow; | |
20 | ||
21 | import javax.annotation.Nonnull; | |
22 | import javax.annotation.Nullable; | |
23 | ||
24 | import java.text.Normalizer; | |
25 | import java.text.Normalizer.Form; | |
26 | import java.util.ArrayList; | |
27 | import java.util.Arrays; | |
28 | import java.util.Collection; | |
29 | import java.util.Collections; | |
30 | import java.util.HashSet; | |
31 | import java.util.List; | |
32 | import java.util.Locale; | |
33 | import java.util.Map; | |
34 | import java.util.Map.Entry; | |
35 | import java.util.Set; | |
36 | import java.util.Stack; | |
37 | import java.util.regex.Pattern; | |
38 | ||
39 | import com.google.common.base.Strings; | |
40 | import com.google.common.collect.Lists; | |
41 | import org.apache.velocity.tools.ToolContext; | |
42 | import org.apache.velocity.tools.config.DefaultKey; | |
43 | import org.apache.velocity.tools.generic.SafeConfig; | |
44 | import org.apache.velocity.tools.generic.ValueParser; | |
45 | import org.jsoup.Jsoup; | |
46 | import org.jsoup.internal.StringUtil; | |
47 | import org.jsoup.nodes.Document; | |
48 | import org.jsoup.nodes.Element; | |
49 | import org.jsoup.nodes.Node; | |
50 | import org.jsoup.parser.Tag; | |
51 | ||
52 | import static java.util.Collections.emptyList; | |
53 | import static java.util.Objects.requireNonNull; | |
54 | ||
55 | /** | |
56 | * An Apache Velocity tool that provides utility methods to manipulate HTML code using | |
57 | * <a href="http://jsoup.org/">jsoup</a> HTML5 parser. | |
58 | * <p> | |
59 | * The methods utilise <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">CSS selectors</a> to refer to | |
60 | * specific elements for manipulation. | |
61 | * </p> | |
62 | * | |
63 | * @author Andrius Velykis | |
64 | * @author Christophe Friederich | |
65 | * @since 1.0 | |
66 | * @see <a href="http://jsoup.org/">jsoup HTML parser</a> | |
67 | * @see <a href= "http://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a> | |
68 | */ | |
69 | @DefaultKey("htmlTool") | |
70 | public class HtmlTool extends SafeConfig { | |
71 | ||
72 | private static final int SLUG_SIZE = 50; | |
73 | ||
74 | /** Default separator used to generate the slug of a heading name. */ | |
75 | public static final String DEFAULT_SLUG_SEPARATOR = "-"; | |
76 | ||
77 | /** prefix heading id associated to table of contents. */ | |
78 | private static final String SEPARATOR_TOC = "_toc_"; | |
79 | ||
80 | /** A list of all HTML heading classes (h1-6). */ | |
81 | private static final List<String> HEADINGS = Collections | |
82 | .unmodifiableList(Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6")); | |
83 | ||
    /**
     * Enum indicating the separator handling strategy for document partitioning, i.e. what happens to an element
     * matched as a separator when the content is split around it.
     */
    public enum JoinSeparator {
        /**
         * Keep separators at the start of partitions: each separator opens the partition that follows it. The first
         * partition will not have a separator.
         */
        AFTER,
        /**
         * Keep separators at the end of partitions: each separator closes the partition that precedes it. The last
         * partition will not have a separator.
         */
        BEFORE,
        /** Drop separators altogether. */
        NO
    }
97 | ||
98 | /** */ | |
99 | private String outputEncoding = "UTF-8"; | |
100 | ||
101 | private boolean prettyPrint = true; | |
102 | ||
103 | /** | |
104 | * {@inheritDoc} | |
105 | * | |
106 | * @see SafeConfig#configure(ValueParser) | |
107 | */ | |
108 | 22 | @Override |
109 | protected void configure(final ValueParser values) { | |
110 | ||
111 | // retrieve the Velocity context for output encoding | |
112 | 22 | final Object velocityContext = values.get("velocityContext"); |
113 | ||
114 | 22 | if (!(velocityContext instanceof ToolContext)) { |
115 | 0 | return; |
116 | } | |
117 | ||
118 | 22 | final ToolContext ctxt = (ToolContext) velocityContext; |
119 | ||
120 | // get the output encoding | |
121 | 22 | final Object outputEncodingObj = ctxt.get("outputEncoding"); |
122 | 22 | if (outputEncodingObj instanceof String) { |
123 | 0 | this.outputEncoding = (String) outputEncodingObj; |
124 | } | |
125 | ||
126 | 22 | final Object prettyPrint = ctxt.get("prettyPrint"); |
127 | 22 | if (prettyPrint instanceof Boolean) { |
128 | 0 | this.prettyPrint = (Boolean) prettyPrint; |
129 | } | |
130 | } | |
131 | ||
132 | /** | |
133 | * Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters | |
134 | * (e.g. newline, tab) convert to a simple space | |
135 | * | |
136 | * @param html | |
137 | * html content to normalise. | |
138 | * @return Returns normalised string. | |
139 | */ | |
140 | 0 | @Nullable public String normaliseWhitespace(@Nullable final String html) { |
141 | 0 | if (Strings.isNullOrEmpty(html)) { |
142 | 0 | return null; |
143 | } | |
144 | 0 | return StringUtil.normaliseWhitespace(html); |
145 | } | |
146 | ||
147 | /** | |
148 | * Splits the given HTML content into partitions based on the given separator selector. The separators themselves | |
149 | * are dropped from the results. | |
150 | * | |
151 | * @param content | |
152 | * body HTML content to split (can not be empty or {@code null}). | |
153 | * @param separatorCssSelector | |
154 | * CSS selector for separators (can not be empty or {@code null}). | |
155 | * @return a list of HTML partitions split on separator locations, but without the separators. | |
156 | * @since 1.0 | |
157 | * @see #split(String, String, JoinSeparator) | |
158 | */ | |
159 | 4 | public List<String> split(@Nonnull final String content, @Nonnull final String separatorCssSelector) { |
160 | 4 | return split(content, separatorCssSelector, JoinSeparator.NO); |
161 | } | |
162 | ||
163 | /** | |
164 | * Splits the given HTML content into partitions based on the given separator selector. The separators are kept as | |
165 | * first elements of the partitions. | |
166 | * <p> | |
167 | * Note that the first part is removed if the split was successful. This is because the first part does not include | |
168 | * the separator. | |
169 | * </p> | |
170 | * | |
171 | * @param content | |
172 | * HTML content to split | |
173 | * @param separatorCssSelector | |
174 | * CSS selector for separators | |
175 | * @return a list of HTML partitions split on separator locations (except the first one), with separators at the | |
176 | * beginning of each partition | |
177 | * @since 1.0 | |
178 | * @see #split(String, String, JoinSeparator) | |
179 | */ | |
180 | 1 | public List<String> splitOnStarts(final @Nonnull String content, final @Nonnull String separatorCssSelector) { |
181 | ||
182 | 1 | final List<String> result = split(content, separatorCssSelector, JoinSeparator.AFTER); |
183 | ||
184 | 1 | if (result == null || result.size() <= 1) { |
185 | // no result or just one part - return what we have | |
186 | 0 | return result; |
187 | } | |
188 | ||
189 | // otherwise, drop the first part - the first split will be the first 'start' | |
190 | // e.g. if we split on headings, the first part will contain everything | |
191 | // before the first heading. | |
192 | 1 | return result.subList(1, result.size()); |
193 | } | |
194 | ||
195 | /** | |
196 | * Splits the given HTML content into partitions based on the given separator selector. The separators are either | |
197 | * dropped or joined with before/after depending on the indicated separator strategy. | |
198 | * | |
199 | * @param content | |
200 | * HTML content to split | |
201 | * @param separatorCssSelector | |
202 | * CSS selector for separators | |
203 | * @param separatorStrategy | |
204 | * strategy to drop or keep separators, one of "after", "before" or "no" | |
205 | * @return a list of HTML partitions split on separator locations. | |
206 | * @since 1.0 | |
207 | * @see #split(String, String, JoinSeparator) | |
208 | */ | |
209 | 3 | public List<String> split(final @Nonnull String content, |
210 | final @Nonnull String separatorCssSelector, | |
211 | final String separatorStrategy) { | |
212 | ||
213 | 3 | JoinSeparator sepStrategy; |
214 | 3 | if ("before".equals(separatorStrategy)) { |
215 | 1 | sepStrategy = JoinSeparator.BEFORE; |
216 | 2 | } else if ("after".equals(separatorStrategy)) { |
217 | 1 | sepStrategy = JoinSeparator.AFTER; |
218 | } else { | |
219 | 1 | sepStrategy = JoinSeparator.NO; |
220 | } | |
221 | ||
222 | 3 | return split(content, separatorCssSelector, sepStrategy); |
223 | } | |
224 | ||
225 | /** | |
226 | * Splits the given HTML content into partitions based on the given separator selector.The separators are either | |
227 | * dropped or joined with before/after depending on the indicated separator strategy. | |
228 | * <p> | |
229 | * Note that splitting algorithm tries to resolve nested elements so that returned partitions are self-contained | |
230 | * HTML elements. The nesting is normally contained within the first applicable partition. | |
231 | * </p> | |
232 | * | |
233 | * @param content | |
234 | * Body HTML content to split | |
235 | * @param separatorCssSelector | |
236 | * CSS selector for separators | |
237 | * @param separatorStrategy | |
238 | * strategy to drop or keep separators | |
239 | * @return a list of HTML partitions split on separator locations. If no splitting occurs, returns the original | |
240 | * content as the single element of the list | |
241 | * @since 1.0 | |
242 | */ | |
243 | 11 | public List<String> split(@Nonnull final String content, |
244 | @Nonnull final String separatorCssSelector, | |
245 | @Nonnull final JoinSeparator separatorStrategy) { | |
246 | ||
247 | 11 | requireNonNull(separatorStrategy); |
248 | 11 | final Element body = parse(content).body(); |
249 | ||
250 | 11 | final List<Element> separators = body.select(separatorCssSelector); |
251 | 11 | if (separators.size() > 0) { |
252 | 9 | final List<List<Element>> partitions = split(separators, separatorStrategy, body); |
253 | ||
254 | 9 | final List<String> sectionHtml = new ArrayList<>(); |
255 | ||
256 | 9 | for (final List<Element> partition : partitions) { |
257 | 19 | final String html = outerHtml(partition); |
258 | 19 | if (!Strings.isNullOrEmpty(html)) { |
259 | 18 | sectionHtml.add(outerHtml(partition)); |
260 | } | |
261 | } | |
262 | ||
263 | 9 | return sectionHtml; |
264 | } else { | |
265 | // nothing to split | |
266 | 2 | return Collections.singletonList(content); |
267 | } | |
268 | } | |
269 | ||
270 | /** | |
271 | * Recursively splits the {@code parent} element based on the given {@code separators}. If a separator is | |
272 | * encountered in the parent, it is split on that position. The outstanding nested elements go with the first of the | |
273 | * partitions in each case. | |
274 | * | |
275 | * @param separators | |
276 | * @param separatorStrategy | |
277 | * @param parent | |
278 | * @return list of partitions (as lists of root elements for each partition). Partition can be an empty list, e.g. | |
279 | * if the separator is at the start of the content. | |
280 | */ | |
281 | 28 | private static List<List<Element>> split(final Collection<Element> separators, |
282 | final JoinSeparator separatorStrategy, | |
283 | final Element parent) { | |
284 | ||
285 | 28 | final List<List<Element>> partitions = Lists.newLinkedList(); |
286 | ||
287 | 28 | for (final Element child : parent.children()) { |
288 | ||
289 | 29 | if (separators.contains(child)) { |
290 | // split here and do not go deeper | |
291 | ||
292 | // first ensure there was a partition before | |
293 | // otherwise the split is not recognised on an outer level | |
294 | 10 | getLastPartition(partitions); |
295 | ||
296 | 10 | if (separatorStrategy == JoinSeparator.BEFORE) { |
297 | // add to the last partition | |
298 | 2 | getLastPartition(partitions).add(child); |
299 | } | |
300 | ||
301 | // add an empty new partition | |
302 | 10 | final List<Element> newPartition = Lists.newLinkedList(); |
303 | 10 | partitions.add(newPartition); |
304 | ||
305 | 10 | if (separatorStrategy == JoinSeparator.AFTER) { |
306 | // add to the new partition | |
307 | 3 | newPartition.add(child); |
308 | } | |
309 | ||
310 | } else { | |
311 | // go deeper | |
312 | 19 | final List<List<Element>> childPartitions = split(separators, separatorStrategy, child); |
313 | ||
314 | // add the child to the last partition | |
315 | 19 | getLastPartition(partitions).add(child); |
316 | ||
317 | 19 | if (childPartitions.size() > 1) { |
318 | // more than one partition: | |
319 | // only keep the first partition elements in the child | |
320 | // so for all other partitions, remove them from their parents | |
321 | ||
322 | 2 | final List<Element> allChildren = child.children(); |
323 | 2 | final List<Element> firstPartition = childPartitions.get(0); |
324 | ||
325 | 2 | allChildren.removeAll(firstPartition); |
326 | 2 | for (final Element removeChild : allChildren) { |
327 | 2 | removeChild.remove(); |
328 | } | |
329 | ||
330 | // add the remaining partitions | |
331 | 2 | for (final List<Element> nextPartition : childPartitions.subList(1, childPartitions.size())) { |
332 | 2 | partitions.add(nextPartition); |
333 | } | |
334 | } | |
335 | } | |
336 | } | |
337 | ||
338 | 28 | return partitions; |
339 | } | |
340 | ||
341 | /** | |
342 | * Retrieves the last partition (as list of elements) or creates a new one if there was none before. | |
343 | * | |
344 | * @param partitions | |
345 | * @return | |
346 | */ | |
347 | 31 | private static List<Element> getLastPartition(final List<List<Element>> partitions) { |
348 | 31 | if (partitions.isEmpty()) { |
349 | 11 | final List<Element> newPartition = Lists.newLinkedList(); |
350 | 11 | partitions.add(newPartition); |
351 | 11 | return newPartition; |
352 | } else { | |
353 | 20 | return partitions.get(partitions.size() - 1); |
354 | } | |
355 | } | |
356 | ||
357 | /** | |
358 | * Outputs the list of partition root elements to HTML. | |
359 | * | |
360 | * @param elements | |
361 | * @return | |
362 | */ | |
363 | 37 | private static String outerHtml(final List<Element> elements) { |
364 | ||
365 | 37 | switch (elements.size()) { |
366 | 1 | case 0: |
367 | 1 | return ""; |
368 | ||
369 | 24 | case 1: |
370 | 24 | return elements.get(0).outerHtml(); |
371 | ||
372 | 12 | default: |
373 | // more than one element | |
374 | // wrap into <div> which we will remove afterwards | |
375 | 12 | final Element root = new Element(Tag.valueOf("div"), ""); |
376 | 12 | for (final Element elem : elements) { |
377 | 24 | root.appendChild(elem); |
378 | } | |
379 | ||
380 | 12 | return root.html(); |
381 | } | |
382 | } | |
383 | ||
384 | /** | |
385 | * Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited | |
386 | * to a certain amount, e.g. to bring just the first of selected elements to the top. | |
387 | * | |
388 | * @param content | |
389 | * HTML content to reorder | |
390 | * @param selector | |
391 | * CSS selector for elements to bring to top of the content | |
392 | * @param amount | |
393 | * Maximum number of elements to reorder | |
394 | * @return HTML content with reordered elements, or the original content if no such elements found. | |
395 | * @since 1.0 | |
396 | */ | |
397 | 0 | public String reorderToTop(final String content, final String selector, final int amount) { |
398 | 0 | return reorderToTop(content, selector, amount, null); |
399 | } | |
400 | ||
401 | /** | |
402 | * Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited | |
403 | * to a certain amount, e.g. to bring just the first of selected elements to the top. | |
404 | * | |
405 | * @param content | |
406 | * HTML content to reorder | |
407 | * @param selector | |
408 | * CSS selector for elements to bring to top of the content | |
409 | * @param amount | |
410 | * Maximum number of elements to reorder | |
411 | * @param wrapRemaining | |
412 | * HTML to wrap the remaining (non-reordered) part | |
413 | * @return HTML content with reordered elements, or the original content if no such elements found. | |
414 | * @since 1.0 | |
415 | */ | |
416 | 1 | public String reorderToTop(final String content, |
417 | final String selector, | |
418 | final int amount, | |
419 | final String wrapRemaining) { | |
420 | ||
421 | // extract the elements and then prepend them to the remaining body | |
422 | 1 | final List<Element> extracted = extractElements(content, selector, amount); |
423 | ||
424 | 1 | if (extracted.size() > 1) { |
425 | ||
426 | 1 | final Element body = extracted.get(0); |
427 | ||
428 | 1 | if (wrapRemaining != null) { |
429 | 1 | wrapInner(body, wrapRemaining); |
430 | } | |
431 | ||
432 | 1 | final List<Element> elements = extracted.subList(1, extracted.size()); |
433 | ||
434 | // now prepend extracted elements to the body (in backwards to preserve original | |
435 | // order) | |
436 | 2 | for (int index = elements.size() - 1; index >= 0; index--) { |
437 | 1 | body.prependChild(elements.get(index)); |
438 | } | |
439 | ||
440 | 1 | return body.html(); |
441 | } else { | |
442 | // nothing to reorder | |
443 | 0 | return content; |
444 | } | |
445 | } | |
446 | ||
447 | 1 | private static Element wrapInner(final Element element, final String html) { |
448 | ||
449 | // wrap everything into an additional <div> for wrapping | |
450 | // otherwise there may be problems, e.g. with <body> element | |
451 | 1 | final Element topDiv = new Element(Tag.valueOf("div"), ""); |
452 | 1 | for (final Element topElem : element.children()) { |
453 | // add all elements in the body to the `topDiv` | |
454 | 1 | topElem.remove(); |
455 | 1 | topDiv.appendChild(topElem); |
456 | } | |
457 | ||
458 | // add topDiv to the body | |
459 | 1 | element.appendChild(topDiv); |
460 | ||
461 | // wrap topDiv | |
462 | 1 | topDiv.wrap(html); |
463 | // now unwrap topDiv - will remove it from the hierarchy | |
464 | 1 | topDiv.unwrap(); |
465 | ||
466 | 1 | return element; |
467 | } | |
468 | ||
469 | /** | |
470 | * Extracts elements from the HTML content. | |
471 | * | |
472 | * @param content | |
473 | * @param selector | |
474 | * @param amount | |
475 | * @return the remainder and a list of extracted elements. The main body (remainder after extraction) is always | |
476 | * returned as the first element of the list. | |
477 | */ | |
478 | 2 | private List<Element> extractElements(final String content, final String selector, final int amount) { |
479 | ||
480 | 2 | final Element body = parse(content).body(); |
481 | ||
482 | 2 | List<Element> elements = body.select(selector); |
483 | 2 | if (elements.size() > 0) { |
484 | ||
485 | 2 | elements = filterParents(elements); |
486 | ||
487 | 2 | if (amount >= 0) { |
488 | // limit to the indicated amount | |
489 | 2 | elements = elements.subList(0, Math.min(amount, elements.size())); |
490 | } | |
491 | ||
492 | // remove all from their parents | |
493 | 2 | for (final Element element : elements) { |
494 | 4 | element.remove(); |
495 | } | |
496 | } | |
497 | ||
498 | 2 | final List<Element> results = new ArrayList<>(); |
499 | // first element is the body | |
500 | 2 | results.add(body); |
501 | 2 | results.addAll(elements); |
502 | 2 | return results; |
503 | } | |
504 | ||
505 | /** | |
506 | * Filters the list of elements to only contain parent elements. This is to avoid both parent and child being in the | |
507 | * list of elements. | |
508 | * | |
509 | * @param elements | |
510 | * @return | |
511 | */ | |
512 | 2 | private static List<Element> filterParents(final List<Element> elements) { |
513 | 2 | final List<Element> filtered = new ArrayList<>(); |
514 | 2 | for (final Element element : elements) { |
515 | // get the intersection of parents and selected elements | |
516 | 6 | final List<Element> parentsInter = element.parents(); |
517 | 6 | parentsInter.retainAll(elements); |
518 | 6 | if (parentsInter.isEmpty()) { |
519 | // no intersection - element's parents are not in the selected list | |
520 | 4 | filtered.add(element); |
521 | } | |
522 | } | |
523 | ||
524 | 2 | return filtered; |
525 | } | |
526 | ||
527 | /** | |
528 | * Extracts HTML elements from the main HTML content. The result consists of the extracted HTML elements and the | |
529 | * remainder of HTML content, with these elements removed. Can be limited to a certain amount, e.g. to extract just | |
530 | * the first of selected elements. | |
531 | * | |
532 | * @param content | |
533 | * HTML content to extract elements from | |
534 | * @param selector | |
535 | * CSS selector for elements to extract | |
536 | * @param amount | |
537 | * Maximum number of elements to extract | |
538 | * @return HTML content of the extracted elements together with the remainder of the original content. If no | |
539 | * elements are found, the remainder contains the original content. | |
540 | * @since 1.0 | |
541 | */ | |
542 | 1 | @Nonnull |
543 | public ExtractResult extract(final String content, final String selector, final int amount) { | |
544 | ||
545 | 1 | final List<Element> extracted = extractElements(content, selector, amount); |
546 | ||
547 | 1 | if (extracted.size() > 1) { |
548 | ||
549 | // first element is the remaining body, the rest are extracted | |
550 | 1 | final Element body = extracted.get(0); |
551 | 1 | final List<Element> elements = extracted.subList(1, extracted.size()); |
552 | ||
553 | // convert to HTML | |
554 | 1 | final List<String> elementStr = new ArrayList<>(); |
555 | 1 | for (final Element el : elements) { |
556 | 3 | elementStr.add(el.outerHtml()); |
557 | } | |
558 | ||
559 | 1 | return new DefaultExtractResult(elementStr, body.html()); |
560 | } else { | |
561 | // nothing to extract | |
562 | 0 | return new DefaultExtractResult(Collections.<String> emptyList(), content); |
563 | } | |
564 | } | |
565 | ||
    /**
     * A container to carry element extraction results. Contains the extracted element HTML code and the remainder of
     * the body content with elements removed.
     *
     * @author Andrius Velykis
     * @since 1.0
     */
    public interface ExtractResult {

        /**
         * Retrieves the extracted HTML elements.
         *
         * @return List of HTML of extracted elements, one entry per extracted element. Can be empty if no elements
         *         found.
         */
        List<String> getExtracted();

        /**
         * Retrieves the content from which elements were extracted.
         *
         * @return The HTML content with extracted elements removed.
         */
        String getRemainder();
    }
589 | ||
590 | /** | |
591 | * @author Christophe Friederich | |
592 | */ | |
593 | private static final class DefaultExtractResult implements ExtractResult { | |
594 | ||
595 | /** */ | |
596 | private final List<String> extracted; | |
597 | ||
598 | /** */ | |
599 | private final String remainder; | |
600 | ||
601 | 1 | private DefaultExtractResult(final List<String> extracted, final String remainder) { |
602 | 1 | this.extracted = extracted; |
603 | 1 | this.remainder = remainder; |
604 | } | |
605 | ||
606 | 1 | @Override |
607 | public List<String> getExtracted() { | |
608 | 1 | return Collections.unmodifiableList(extracted); |
609 | } | |
610 | ||
611 | 1 | @Override |
612 | public String getRemainder() { | |
613 | 1 | return remainder; |
614 | } | |
615 | } | |
616 | ||
617 | /** | |
618 | * Sets attribute to the given value on elements in HTML. | |
619 | * | |
620 | * @param content | |
621 | * HTML content to set attributes on | |
622 | * @param selector | |
623 | * CSS selector for elements to modify | |
624 | * @param attributeKey | |
625 | * Attribute name | |
626 | * @param value | |
627 | * Attribute value | |
628 | * @return HTML content with modified elements. If no elements are found, the original content is returned. | |
629 | * @since 1.0 | |
630 | */ | |
631 | 6 | public String setAttr(final String content, final String selector, final String attributeKey, final String value) { |
632 | ||
633 | 6 | final Element body = parse(content).body(); |
634 | ||
635 | 6 | final List<Element> elements = body.select(selector); |
636 | 6 | if (elements.size() > 0) { |
637 | ||
638 | 2 | for (final Element element : elements) { |
639 | 2 | element.attr(attributeKey, value); |
640 | } | |
641 | ||
642 | 2 | return body.html(); |
643 | } else { | |
644 | // nothing to update | |
645 | 4 | return content; |
646 | } | |
647 | } | |
648 | ||
649 | /** | |
650 | * Parses body fragment to the {@code <body>} element. | |
651 | * | |
652 | * @param content | |
653 | * body HTML fragment (can not be {@code null}). | |
654 | * @return the {@code body} element of the parsed content | |
655 | */ | |
656 | 74 | public Document parse(@Nonnull final String content) { |
657 | 74 | final Document doc = Jsoup.parseBodyFragment(content); |
658 | 74 | doc.outputSettings().charset(outputEncoding).prettyPrint(prettyPrint); |
659 | 74 | return doc; |
660 | } | |
661 | ||
662 | /** | |
663 | * Retrieves attribute value on elements in HTML. Will return all attribute values for the selector, since there can | |
664 | * be more than one element. | |
665 | * | |
666 | * @param content | |
667 | * HTML content to read attributes from | |
668 | * @param selector | |
669 | * CSS selector for elements to find | |
670 | * @param attributeKey | |
671 | * Attribute name | |
672 | * @return Attribute values for all matching elements. If no elements are found, empty list is returned. | |
673 | * @since 1.0 | |
674 | */ | |
675 | 1 | public List<String> getAttr(final String content, final String selector, final String attributeKey) { |
676 | ||
677 | 1 | final Element body = parse(content).body(); |
678 | ||
679 | 1 | final List<Element> elements = body.select(selector); |
680 | 1 | final List<String> attrs = new ArrayList<>(); |
681 | ||
682 | 1 | for (final Element element : elements) { |
683 | 1 | final String attrValue = element.attr(attributeKey); |
684 | 1 | attrs.add(attrValue); |
685 | } | |
686 | ||
687 | 1 | return attrs; |
688 | } | |
689 | ||
690 | /** | |
691 | * Adds given class names to the elements in HTML. | |
692 | * | |
693 | * @param content | |
694 | * HTML content to modify | |
695 | * @param selector | |
696 | * CSS selector for elements to add classes to | |
697 | * @param classNames | |
698 | * Names of classes to add to the selected elements | |
699 | * @param amount | |
700 | * Maximum number of elements to modify | |
701 | * @return HTML content with modified elements. If no elements are found, the original content is returned. | |
702 | * @since 1.0 | |
703 | */ | |
704 | 11 | public String addClass(final String content, |
705 | final String selector, | |
706 | final List<String> classNames, | |
707 | final int amount) { | |
708 | ||
709 | 11 | final Element body = parse(content).body(); |
710 | ||
711 | 11 | List<Element> elements = body.select(selector); |
712 | 11 | if (amount >= 0) { |
713 | // limit to the indicated amount | |
714 | 0 | elements = elements.subList(0, Math.min(amount, elements.size())); |
715 | } | |
716 | ||
717 | 11 | if (elements.size() > 0) { |
718 | ||
719 | 4 | for (final Element element : elements) { |
720 | 12 | for (final String className : classNames) { |
721 | 14 | element.addClass(className); |
722 | } | |
723 | } | |
724 | ||
725 | 4 | return body.html(); |
726 | } else { | |
727 | // nothing to update | |
728 | 7 | return content; |
729 | } | |
730 | } | |
731 | ||
732 | /** | |
733 | * Adds given class names to the elements in HTML. | |
734 | * | |
735 | * @param content | |
736 | * HTML content to modify | |
737 | * @param selector | |
738 | * CSS selector for elements to add classes to | |
739 | * @param classNames | |
740 | * Names of classes to add to the selected elements | |
741 | * @return HTML content with modified elements. If no elements are found, the original content is returned. | |
742 | * @since 1.0 | |
743 | */ | |
744 | 11 | public String addClass(final String content, final String selector, final List<String> classNames) { |
745 | 11 | return addClass(content, selector, classNames, -1); |
746 | } | |
747 | ||
748 | /** | |
749 | * Adds given class to the elements in HTML. | |
750 | * | |
751 | * @param content | |
752 | * HTML content to modify | |
753 | * @param selector | |
754 | * CSS selector for elements to add the class to | |
755 | * @param className | |
756 | * Name of class to add to the selected elements | |
757 | * @return HTML content with modified elements. If no elements are found, the original content is returned. | |
758 | * @since 1.0 | |
759 | */ | |
760 | 1 | public String addClass(final String content, final String selector, final String className) { |
761 | 1 | return addClass(content, selector, Collections.singletonList(className)); |
762 | } | |
763 | ||
764 | /** | |
765 | * Wraps elements in HTML with the given HTML. | |
766 | * | |
767 | * @param content | |
768 | * HTML content to modify | |
769 | * @param selector | |
770 | * CSS selector for elements to wrap | |
771 | * @param wrapHtml | |
772 | * HTML to use for wrapping the selected elements | |
773 | * @param amount | |
774 | * Maximum number of elements to modify | |
775 | * @return HTML content with modified elements. If no elements are found, the original content is returned. | |
776 | * @since 1.0 | |
777 | */ | |
778 | 1 | public String wrap(final String content, final String selector, final String wrapHtml, final int amount) { |
779 | ||
780 | 1 | final Element body = parse(content).body(); |
781 | ||
782 | 1 | List<Element> elements = body.select(selector); |
783 | 1 | if (amount >= 0) { |
784 | // limit to the indicated amount | |
785 | 1 | elements = elements.subList(0, Math.min(amount, elements.size())); |
786 | } | |
787 | ||
788 | 1 | if (elements.size() > 0) { |
789 | ||
790 | 1 | for (final Element element : elements) { |
791 | 1 | element.wrap(wrapHtml); |
792 | } | |
793 | ||
794 | 1 | return body.html(); |
795 | } else { | |
796 | // nothing to update | |
797 | 0 | return content; |
798 | } | |
799 | } | |
800 | ||
801 | /** | |
802 | * Removes elements from HTML. | |
803 | * | |
804 | * @param content | |
805 | * HTML content to modify | |
806 | * @param selector | |
807 | * CSS selector for elements to remove | |
808 | * @return HTML content with removed elements. If no elements are found, the original content is returned. | |
809 | * @since 1.0 | |
810 | */ | |
811 | 1 | public String remove(final String content, final String selector) { |
812 | ||
813 | 1 | final Element body = parse(content).body(); |
814 | ||
815 | 1 | final List<Element> elements = body.select(selector); |
816 | 1 | if (elements.size() > 0) { |
817 | 1 | for (final Element element : elements) { |
818 | 1 | element.remove(); |
819 | } | |
820 | ||
821 | 1 | return body.html(); |
822 | } else { | |
823 | // nothing changed | |
824 | 0 | return content; |
825 | } | |
826 | } | |
827 | ||
828 | /** | |
829 | * Replaces elements in HTML. | |
830 | * | |
831 | * @param content | |
832 | * HTML content to modify | |
833 | * @param selector | |
834 | * CSS selector for elements to replace | |
835 | * @param replacement | |
836 | * HTML replacement (must parse to a single element) | |
837 | * @return HTML content with replaced elements. If no elements are found, the original content is returned. | |
838 | * @since 1.0 | |
839 | */ | |
840 | 1 | public String replace(final String content, final String selector, final String replacement) { |
841 | 1 | return replaceAll(content, Collections.singletonMap(selector, replacement)); |
842 | } | |
843 | ||
844 | /** | |
845 | * Replaces elements in HTML. | |
846 | * | |
847 | * @param content | |
848 | * HTML content to modify | |
849 | * @param replacements | |
850 | * Map of CSS selectors to their replacement HTML texts. CSS selectors find elements to be replaced with | |
851 | * the HTML in the mapping. The HTML must parse to a single element. | |
852 | * @return HTML content with replaced elements. If no elements are found, the original content is returned. | |
853 | * @since 1.0 | |
854 | */ | |
855 | 6 | public String replaceAll(final String content, final Map<String, String> replacements) { |
856 | ||
857 | 6 | final Element body = parse(content).body(); |
858 | ||
859 | 6 | boolean modified = false; |
860 | 6 | for (final Entry<String, String> replacementEntry : replacements.entrySet()) { |
861 | 46 | final String selector = replacementEntry.getKey(); |
862 | 46 | final String replacement = replacementEntry.getValue(); |
863 | ||
864 | 46 | final List<Element> elements = body.select(selector); |
865 | 46 | if (elements.size() > 0) { |
866 | ||
867 | // take the first child | |
868 | 10 | final Element replacementElem = parse(replacement).body().child(0); |
869 | ||
870 | 10 | if (replacementElem != null) { |
871 | 10 | for (final Element element : elements) { |
872 | 10 | element.replaceWith(replacementElem.clone()); |
873 | } | |
874 | ||
875 | 10 | modified = true; |
876 | } | |
877 | } | |
878 | } | |
879 | ||
880 | 6 | if (modified) { |
881 | 2 | return body.html(); |
882 | } else { | |
883 | // nothing changed | |
884 | 4 | return content; |
885 | } | |
886 | } | |
887 | ||
888 | /** | |
889 | * Replaces All elements in HTML corresponding to <code>selector</code> while preserving the content of this | |
890 | * element. | |
891 | * | |
892 | * @param content | |
893 | * HTML content to modify | |
894 | * @param selector | |
895 | * CSS selector for elements to replace | |
896 | * @param newElement | |
897 | * HTML replacement (must parse to a single element) | |
898 | * @return HTML content with replaced elements. If no elements are found, the original content is returned. | |
899 | * @since 2.0 | |
900 | */ | |
901 | 6 | public String replaceWith(final String content, final String selector, final String newElement) { |
902 | ||
903 | 6 | final Element body = parse(content).body(); |
904 | ||
905 | 6 | boolean modified = false; |
906 | 6 | final List<Element> elements = body.select(selector); |
907 | 6 | if (elements.size() > 0) { |
908 | ||
909 | // take the first child | |
910 | 2 | final Element replacementElem = parse(newElement).body().child(0); |
911 | ||
912 | 2 | if (replacementElem != null) { |
913 | 2 | for (final Element element : elements) { |
914 | 2 | final List<Node> children = element.childNodes(); |
915 | 2 | final Element el = replacementElem.clone(); |
916 | 2 | for (final Node child : children) { |
917 | 2 | el.appendChild(child.clone()); |
918 | } | |
919 | 2 | element.replaceWith(el); |
920 | } | |
921 | ||
922 | 2 | modified = true; |
923 | } | |
924 | } | |
925 | ||
926 | 6 | if (modified) { |
927 | 2 | return body.html(); |
928 | } else { | |
929 | // nothing changed | |
930 | 4 | return content; |
931 | } | |
932 | } | |
933 | ||
934 | /** | |
935 | * Retrieves text content of the selected elements in HTML. Renders the element's text as it would be displayed on | |
936 | * the web page (including its children). | |
937 | * | |
938 | * @param content | |
939 | * HTML content with the elements | |
940 | * @param selector | |
941 | * CSS selector for elements to extract contents | |
942 | * @return A list of element texts as rendered to display. Empty list if no elements are found. | |
943 | * @since 1.0 | |
944 | */ | |
945 | 1 | public List<String> text(@Nullable final String content, @Nonnull final String selector) { |
946 | 1 | if (Strings.isNullOrEmpty(content)) { |
947 | 0 | return emptyList(); |
948 | } | |
949 | 1 | final Element body = parse(content).body(); |
950 | ||
951 | 1 | final List<Element> elements = body.select(selector); |
952 | 1 | final List<String> texts = new ArrayList<>(); |
953 | ||
954 | 1 | for (final Element element : elements) { |
955 | 1 | texts.add(element.text()); |
956 | } | |
957 | ||
958 | 1 | return texts; |
959 | } | |
960 | ||
961 | /** | |
962 | * Transforms the given HTML content by moving anchor ({@code <a name="myheading">}) names to IDs for heading | |
963 | * elements. | |
964 | * <p> | |
965 | * The anchors are used to indicate positions within a HTML page. In HTML5, however, the {@code name} attribute is | |
966 | * no longer supported on {@code <a>}) tag. The positions within pages are indicated using {@code id} attribute | |
967 | * instead, e.g. {@code | |
968 | * | |
969 | * | |
970 | <h1 id="myheading">}. | |
971 | * </p> | |
972 | * <p> | |
973 | * The method finds anchors inside, immediately before or after the heading tags and uses their name as heading | |
974 | * {@code id} instead. The anchors themselves are removed. | |
975 | * </p> | |
976 | * | |
977 | * @param content | |
978 | * HTML content to modify | |
979 | * @return HTML content with modified elements. Anchor names are used for adjacent headings, and anchor tags are | |
980 | * removed. If no elements are found, the original content is returned. | |
981 | * @since 1.0 | |
982 | */ | |
983 | 6 | public String headingAnchorToId(final String content) { |
984 | ||
985 | 6 | final Element body = parse(content).body(); |
986 | ||
987 | // selectors for headings without IDs | |
988 | 6 | final List<String> headNoIds = concat(HEADINGS, ":not([id])", true); |
989 | ||
990 | // selector for anchor with name attribute only | |
991 | 6 | final String nameA = "a[name]:not([href])"; |
992 | ||
993 | // select all headings that have inner named anchor | |
994 | 6 | final List<Element> headingsInnerA = body |
995 | .select(String.join(", ", concat(headNoIds, ":has(" + nameA + ")", true))); | |
996 | ||
997 | 6 | boolean modified = false; |
998 | 6 | for (final Element heading : headingsInnerA) { |
999 | 9 | final List<Element> anchors = heading.select(nameA); |
1000 | // take first | |
1001 | 9 | if (!anchors.isEmpty()) { |
1002 | 9 | anchorToId(heading, anchors.get(0)); |
1003 | 9 | modified = true; |
1004 | } | |
1005 | } | |
1006 | ||
1007 | // select all headings that have a preceding named anchor | |
1008 | 6 | final List<Element> headingsPreA = body.select(String.join(", ", concat(headNoIds, nameA + " + ", false))); |
1009 | ||
1010 | 6 | for (final Element heading : headingsPreA) { |
1011 | 0 | final Element anchor = heading.previousElementSibling(); |
1012 | 0 | if (anchor != null) { |
1013 | 0 | anchorToId(heading, anchor); |
1014 | 0 | modified = true; |
1015 | } | |
1016 | } | |
1017 | ||
1018 | // select all headings that are followed by a named anchor | |
1019 | // no selector available for that, so first select the anchors | |
1020 | // then retrieve the headings | |
1021 | 6 | final List<Element> anchorsPreH = body.select(String.join(", ", concat(headNoIds, " + " + nameA, true))); |
1022 | ||
1023 | 6 | for (final Element anchor : anchorsPreH) { |
1024 | 0 | final Element heading = anchor.previousElementSibling(); |
1025 | 0 | if (heading != null) { |
1026 | 0 | anchorToId(heading, anchor); |
1027 | 0 | modified = true; |
1028 | } | |
1029 | } | |
1030 | ||
1031 | 6 | if (modified) { |
1032 | 4 | return body.html(); |
1033 | } else { | |
1034 | // nothing to update | |
1035 | 2 | return content; |
1036 | } | |
1037 | } | |
1038 | ||
1039 | /** | |
1040 | * Moves anchor name to heading id, if one does not exist. Removes the anchor. | |
1041 | * | |
1042 | * @param heading | |
1043 | * @param anchor | |
1044 | */ | |
1045 | 9 | private static void anchorToId(final Element heading, final Element anchor) { |
1046 | ||
1047 | 9 | if ("a".equals(anchor.tagName()) && heading.id().isEmpty()) { |
1048 | 9 | final String aName = anchor.attr("name"); |
1049 | 9 | if (!aName.isEmpty()) { |
1050 | // set the anchor name as heading ID | |
1051 | 9 | heading.attr("id", aName); |
1052 | ||
1053 | // remove the anchor | |
1054 | 9 | anchor.remove(); |
1055 | } | |
1056 | } | |
1057 | } | |
1058 | ||
1059 | /** | |
1060 | * Utility method to concatenate a String to a list of Strings. The text can be either appended or prepended. | |
1061 | * | |
1062 | * @param elements | |
1063 | * list of elements to append/prepend the text to | |
1064 | * @param text | |
1065 | * the given text to append/prepend | |
1066 | * @param append | |
1067 | * if {@code true}, text will be appended to the elements. If {@code false}, it will be prepended | |
1068 | * @return list of elements with the text appended/prepended | |
1069 | * @since 1.0 | |
1070 | */ | |
1071 | 31 | public static List<String> concat(final List<String> elements, final String text, final boolean append) { |
1072 | 31 | final List<String> concats = new ArrayList<>(); |
1073 | ||
1074 | 31 | for (final String element : elements) { |
1075 | 186 | concats.add(append ? element + text : text + element); |
1076 | } | |
1077 | ||
1078 | 31 | return concats; |
1079 | } | |
1080 | ||
1081 | /** | |
1082 | * Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that do not have one. | |
1083 | * <p> | |
1084 | * IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a heading tag without an | |
1085 | * {@code id} is found, its "slug" is generated automatically based on the heading contents and used as the ID. | |
1086 | * </p> | |
1087 | * <p> | |
1088 | * Note that the algorithm also modifies existing IDs that have symbols not allowed in CSS selectors, e.g. ":", ".", | |
1089 | * etc. The symbols are removed. | |
1090 | * </p> | |
1091 | * | |
1092 | * @param pageType | |
1093 | * The type of page. | |
1094 | * @param currentPage | |
1095 | * The name of current page. | |
1096 | * @param content | |
1097 | * HTML content to modify. | |
1098 | * @param idSeparator | |
1099 | * the seperator used to slug ID. | |
1100 | * @return Returns a {@link String} representing HTML content with all heading elements having {@code id} | |
1101 | * attributes. If all headings were with IDs already, the original content is returned. | |
1102 | * @since 1.0 | |
1103 | */ | |
1104 | 3 | public String ensureHeadingIds(final String pageType, |
1105 | final String currentPage, | |
1106 | final String content, | |
1107 | final String idSeparator) { | |
1108 | 3 | final List<String> excludedPages = Arrays.asList("checkstyle-aggregate", "checkstyle"); |
1109 | ||
1110 | 3 | final Element body = parse(content).body(); |
1111 | ||
1112 | // exclude pages | |
1113 | 3 | if (excludedPages.contains(currentPage)) { |
1114 | 0 | return content; |
1115 | } | |
1116 | ||
1117 | // first find all existing IDs (to avoid generating duplicates) | |
1118 | 3 | final List<Element> idElems = body.select("*[id]"); |
1119 | ||
1120 | 3 | final Set<String> ids = new HashSet<>(); |
1121 | 3 | boolean modified = false; |
1122 | 3 | for (final Element idElem : idElems) { |
1123 | ||
1124 | // fix all existing IDs - remove colon and other symbols which mess up jQuery | |
1125 | 0 | final String id = idElem.id(); |
1126 | 0 | idElem.attr("id", slug(id, idSeparator)); |
1127 | 0 | modified = true; |
1128 | ||
1129 | 0 | ids.add(idElem.id()); |
1130 | } | |
1131 | ||
1132 | // create unique id for all heading elements | |
1133 | 3 | final List<String> headIds = concat(HEADINGS, "[id]", true); |
1134 | // select all headings that have an ID | |
1135 | 3 | final List<Element> headingIds = body.select(String.join(", ", headIds)); |
1136 | ||
1137 | 3 | for (final Element heading : headingIds) { |
1138 | 0 | final String headingText = heading.text(); |
1139 | 0 | String headingSlug = slug(headingText, idSeparator); |
1140 | // also limit slug to 50 symbols | |
1141 | 0 | if (headingSlug.length() > SLUG_SIZE) { |
1142 | 0 | headingSlug = headingSlug.substring(0, SLUG_SIZE); |
1143 | } | |
1144 | 0 | final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug); |
1145 | ||
1146 | 0 | heading.attr("id", headingId); |
1147 | } | |
1148 | ||
1149 | 3 | final List<String> headNoIds = concat(HEADINGS, ":not([id])", true); |
1150 | ||
1151 | // select all headings that do not have an ID | |
1152 | 3 | final List<Element> headingsNoId = body.select(String.join(", ", headNoIds)); |
1153 | ||
1154 | 3 | if (!headingsNoId.isEmpty() || modified) { |
1155 | 3 | for (final Element heading : headingsNoId) { |
1156 | ||
1157 | 9 | final String headingText = heading.text(); |
1158 | 9 | String headingSlug = slug(headingText, idSeparator); |
1159 | // also limit slug to 50 symbols | |
1160 | 9 | if (headingSlug.length() > SLUG_SIZE) { |
1161 | 0 | headingSlug = headingSlug.substring(0, SLUG_SIZE); |
1162 | } | |
1163 | 9 | final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug); |
1164 | ||
1165 | 9 | heading.attr("id", headingId); |
1166 | } | |
1167 | } | |
1168 | ||
1169 | 3 | return body.html(); |
1170 | } | |
1171 | ||
1172 | /** | |
1173 | * Generated a unique ID within the given set of IDs. Appends an incrementing number for duplicates. | |
1174 | * | |
1175 | * @param pageType | |
1176 | * The type of page. | |
1177 | * @param currentPage | |
1178 | * Tthe name of current page. | |
1179 | * @param ids | |
1180 | * The list of ID already existing or used. | |
1181 | * @param idBase | |
1182 | * The prefix to use. | |
1183 | * @return Returns a new {@link String} representing a new unique ID. | |
1184 | */ | |
1185 | 9 | private static String generateUniqueId(final String pageType, |
1186 | final String currentPage, | |
1187 | final Set<String> ids, | |
1188 | final String idBase) { | |
1189 | 9 | String id = idBase; |
1190 | 9 | int counter = 1; |
1191 | 9 | while (ids.contains(id)) { |
1192 | 0 | id = idBase + String.valueOf(counter++); |
1193 | } | |
1194 | ||
1195 | // put the newly generated one into the set | |
1196 | 9 | ids.add(id); |
1197 | 9 | if ("frame".equals(pageType)) { |
1198 | 3 | id = currentPage + SEPARATOR_TOC + id; |
1199 | } | |
1200 | 9 | return id; |
1201 | } | |
1202 | ||
1203 | /** | |
1204 | * Fixes table heads: wraps rows with {@code | |
1205 | * | |
1206 | * | |
1207 | <th>} (table heading) elements into {@code <thead>} element if they are currently in {@code <tbody>}. | |
1208 | * | |
1209 | * @param content | |
1210 | * HTML content to modify | |
1211 | * @return HTML content with all table heads fixed. If all heads were correct, the original content is returned. | |
1212 | * @since 1.0 | |
1213 | */ | |
1214 | 6 | public String fixTableHeads(final String content) { |
1215 | ||
1216 | 6 | final Element body = parse(content).body(); |
1217 | ||
1218 | 6 | final List<Element> tables = body.select("table"); |
1219 | ||
1220 | 6 | for (final Element table : tables) { |
1221 | // select rows with <th> tags within <tbody> | |
1222 | 7 | final List<Element> tableHeadRows = table.select("tbody > tr:has(th)"); |
1223 | // convert only table containing one tr head. | |
1224 | 7 | if (tableHeadRows.size() == 1) { |
1225 | ||
1226 | 4 | for (final Element row : tableHeadRows) { |
1227 | ||
1228 | // remove row from its original position | |
1229 | 4 | row.remove(); |
1230 | ||
1231 | // create table header element with the row | |
1232 | 4 | final Element thead = new Element(Tag.valueOf("thead"), ""); |
1233 | 4 | thead.appendChild(row); |
1234 | // add at the beginning of the table | |
1235 | 4 | table.prependChild(thead); |
1236 | } | |
1237 | } | |
1238 | } | |
1239 | 6 | return body.html(); |
1240 | } | |
1241 | ||
1242 | /** */ | |
1243 | private static final Pattern NONLATIN = Pattern.compile("[^\\w-]"); | |
1244 | ||
1245 | /** */ | |
1246 | private static final Pattern WHITESPACE = Pattern.compile("[\\s]"); | |
1247 | ||
1248 | /** | |
1249 | * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs). Uses "-" | |
1250 | * as a whitespace separator. | |
1251 | * | |
1252 | * @param input | |
1253 | * text to generate the slug from | |
1254 | * @return the slug of the given text that contains alphanumeric symbols and "-" only | |
1255 | * @since 1.0 | |
1256 | */ | |
1257 | 36 | public static String slug(final String input) { |
1258 | 36 | return slug(input, DEFAULT_SLUG_SEPARATOR); |
1259 | } | |
1260 | ||
1261 | /** | |
1262 | * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs). | |
1263 | * | |
1264 | * @param input | |
1265 | * text to generate the slug from | |
1266 | * @param separator | |
1267 | * separator for whitespace replacement | |
1268 | * @return the slug of the given text that contains alphanumeric symbols and separator only | |
1269 | * @since 1.0 | |
1270 | * @see <a href= | |
1271 | * "http://www.codecodex.com/wiki/Generate_a_url_slug">http://www.codecodex.com/wiki/Generate_a_url_slug</a> | |
1272 | */ | |
1273 | 45 | private static String slug(final String input, final String separator) { |
1274 | 45 | final String nowhitespace = WHITESPACE.matcher(input).replaceAll(separator); |
1275 | 45 | final String normalized = Normalizer.normalize(nowhitespace, Form.NFD); |
1276 | 45 | return NONLATIN.matcher(normalized).replaceAll("").toLowerCase(Locale.ENGLISH); |
1277 | } | |
1278 | ||
1279 | /** | |
1280 | * Reads all headings in the given HTML content as a hierarchy. Subsequent smaller headings are nested within bigger | |
1281 | * ones, e.g. <code><h2></code> is nested under preceding <code><h1></code>. | |
1282 | * <p> | |
1283 | * Only headings with IDs are included in the hierarchy. The result elements contain ID and heading text for each | |
1284 | * heading. The hierarchy is useful to generate a Table of Contents for a page. | |
1285 | * </p> | |
1286 | * | |
1287 | * @param content | |
1288 | * HTML content to extract heading hierarchy from | |
1289 | * @param sections | |
1290 | * list of all sections | |
1291 | * @return a list of top-level heading items (with id and text). The remaining headings are nested within these | |
1292 | * top-level items. Empty list if no headings are in the content. | |
1293 | * @since 1.0 | |
1294 | */ | |
1295 | 1 | public List<? extends IdElement> headingTree(final String content, final List<String> sections) { |
1296 | ||
1297 | 1 | final List<String> sectionContents = this.split(content, "hr"); |
1298 | 1 | final List<String> headIds = concat(HEADINGS, "[id]:not(.no-anchor)", true); |
1299 | 1 | final List<HeadingItem> headingItems = new ArrayList<>(); |
1300 | ||
1301 | 1 | int index = 0; |
1302 | 1 | for (final String sectionContent : sectionContents) { |
1303 | 1 | final String sectionType = index < sections.size() ? sections.get(index++) : ""; |
1304 | ||
1305 | // exclude carousel headings | |
1306 | 1 | if ("carousel".equals(sectionType)) { |
1307 | 0 | continue; |
1308 | } | |
1309 | 1 | final Element body = parse(sectionContent).body(); |
1310 | // select all headings that have an ID | |
1311 | 1 | final List<Element> headings = body.select(String.join(", ", headIds)); |
1312 | 1 | for (final Element heading : headings) { |
1313 | 3 | headingItems |
1314 | .add(new HeadingItem(heading.id(), heading.nodeName(), heading.text(), headingIndex(heading))); | |
1315 | } | |
1316 | } | |
1317 | ||
1318 | 1 | final List<HeadingItem> topHeadings = new ArrayList<>(); |
1319 | 1 | final Stack<HeadingItem> parentHeadings = new Stack<>(); |
1320 | ||
1321 | 1 | for (final HeadingItem heading : headingItems) { |
1322 | ||
1323 | 4 | while (!parentHeadings.isEmpty() && parentHeadings.peek().headingLevel >= heading.headingLevel) { |
1324 | 1 | parentHeadings.pop(); |
1325 | } | |
1326 | ||
1327 | 3 | if (parentHeadings.isEmpty()) { |
1328 | // top level heading - no parents | |
1329 | 1 | topHeadings.add(heading); |
1330 | } else { | |
1331 | // add to the children of topmost stack parent | |
1332 | 2 | parentHeadings.peek().children.add(heading); |
1333 | } | |
1334 | ||
1335 | // push the heading onto stack | |
1336 | 3 | parentHeadings.push(heading); |
1337 | } | |
1338 | ||
1339 | 1 | return topHeadings; |
1340 | } | |
1341 | ||
1342 | /** | |
1343 | * Retrieves numeric index of a heading. | |
1344 | * | |
1345 | * @param element | |
1346 | * @return | |
1347 | */ | |
1348 | 3 | private static int headingIndex(final Element element) { |
1349 | 3 | final String tagName = element.tagName(); |
1350 | 3 | if (tagName.startsWith("h")) { |
1351 | 3 | try { |
1352 | 3 | return Integer.parseInt(tagName.substring(1)); |
1353 | } catch (final Exception ex) { | |
1354 | 0 | throw new IllegalArgumentException("Must be a header tag: " + tagName, ex); |
1355 | } | |
1356 | } else { | |
1357 | 0 | throw new IllegalArgumentException("Must be a header tag: " + tagName); |
1358 | } | |
1359 | } | |
1360 | ||
1361 | /** | |
1362 | * @author Christophe Friederich | |
1363 | */ | |
1364 | private static final class HeadingItem implements IdElement { | |
1365 | ||
1366 | /** */ | |
1367 | private final String id; | |
1368 | ||
1369 | /** */ | |
1370 | private final String tagName; | |
1371 | ||
1372 | /** */ | |
1373 | private final String text; | |
1374 | ||
1375 | /** */ | |
1376 | private final int headingLevel; | |
1377 | ||
1378 | /** */ | |
1379 | private final List<HeadingItem> children = new ArrayList<>(); | |
1380 | ||
1381 | 3 | private HeadingItem(final String id, final String tagName, final String text, final int headingLevel) { |
1382 | 3 | this.id = id; |
1383 | 3 | this.tagName = tagName; |
1384 | 3 | this.text = text; |
1385 | 3 | this.headingLevel = headingLevel; |
1386 | } | |
1387 | ||
1388 | 3 | @Override |
1389 | public String getId() { | |
1390 | 3 | return id; |
1391 | } | |
1392 | ||
1393 | 1 | @Override |
1394 | public String getTagName() { | |
1395 | 1 | return tagName; |
1396 | } | |
1397 | ||
1398 | 1 | @Override |
1399 | public String getText() { | |
1400 | 1 | return text; |
1401 | } | |
1402 | ||
1403 | 2 | @Override |
1404 | public List<HeadingItem> getItems() { | |
1405 | 2 | return Collections.unmodifiableList(children); |
1406 | } | |
1407 | ||
1408 | 3 | @Override |
1409 | public int getHeadingLevel() { | |
1410 | 3 | return headingLevel; |
1411 | } | |
1412 | } | |
1413 | ||
/**
 * Representation of a HTML element with an ID and a text content. Other such elements can be nested within,
 * which makes it suitable to describe a hierarchy of headings.
 *
 * @author Andrius Velykis
 * @since 1.0
 */
public interface IdElement {

    /**
     * Retrieves the ID of the HTML element (attribute {@code id}).
     *
     * @return element {@code id} value
     */
    String getId();

    /**
     * Retrieves the tag name of the HTML element.
     *
     * @return Returns the tag name of element.
     */
    String getTagName();

    /**
     * Retrieves the text contents of the HTML element (rendered for display).
     *
     * @return text contents of the element
     */
    String getText();

    /**
     * Retrieves the level of the heading.
     *
     * @return Returns the level of heading.
     */
    int getHeadingLevel();

    /**
     * Retrieves the children of the HTML element (nested within the element).
     *
     * @return nested items within the element
     */
    List<? extends IdElement> getItems();
}
1453 | } |