File HtmlTool.java

Branches:

106

Statements:

335

Methods:

Classes:

LOC:

1 453

NCLOC:

626

Total complexity:

106

Complexity density:

0,32

Statements/Method:

7,28

Methods/Class:

7,67

Average method complexity:

2,3

Classes

Class	Line #	Total Statements	% Filtered	Complexity	Uncovered Elements	TOTAL Coverage
HtmlTool	70	322	0%	97	74	0.840860284,1%
HtmlTool.JoinSeparator	85	0	-	0	0	-1.0 -
HtmlTool.ExtractResult	573	0	-	0	0	-1.0 -
HtmlTool.DefaultExtractResult	593	4	0%	3	0	1.0100%
HtmlTool.HeadingItem	1364	9	0%	6	0	1.0100%
HtmlTool.IdElement	1420	0	-	0	0	-1.0 -

Class HtmlTool

Class HtmlTool	Line # 70	Total Statements 322	% Filtered 0%	Complexity 97	Uncovered Elements 74	TOTAL Coverage 0.840860284,1%
configure(ValueParser) : void configure(ValueParser) : void	108108	10.010	0.00%	4.04	6.06	0.625 0.62562,5%
normaliseWhitespace(String) : String normaliseWhitespace(String) : String	140140	3.03	0.00%	2.02	5.05	0.0 0.00%
split(String,String) : List<String> split(String,String) : List<String>	159159	1.01	0.00%	1.01	0.00	1.0 1.0100%
splitOnStarts(String,String) : List<String> splitOnStarts(String,String) : List<String>	180180	4.04	0.00%	3.03	2.02	0.6666667 0.666666766,7%
split(String,String,String) : List<String> split(String,String,String) : List<String>	209209	7.07	0.00%	3.03	0.00	1.0 1.0100%
split(String,String,JoinSeparator) : List<String> split(String,String,JoinSeparator) : List<String>	243243	12.012	0.00%	3.03	0.00	1.0 1.0100%
split(Collection<Element>,JoinSeparator,Element) : List<List<Element>> split(Collection<Element>,JoinSeparator,Element) : List<List<Element>>	281281	21.021	0.00%	5.05	0.00	1.0 1.0100%
getLastPartition(List<List<Element>>) : List<Element> getLastPartition(List<List<Element>>) : List<Element>	347347	5.05	0.00%	2.02	0.00	1.0 1.0100%
outerHtml(List<Element>) : String outerHtml(List<Element>) : String	363363	10.010	0.00%	3.03	0.00	1.0 1.0100%
reorderToTop(String,String,int) : String reorderToTop(String,String,int) : String	397397	1.01	0.00%	1.01	1.01	0.0 0.00%
reorderToTop(String,String,int,String) : String reorderToTop(String,String,int,String) : String	416416	10.010	0.00%	4.04	3.03	0.8125 0.812581,2%
wrapInner(Element,String) : Element wrapInner(Element,String) : Element	447447	8.08	0.00%	1.01	0.00	1.0 1.0100%
extractElements(String,String,int) : List<Element> extractElements(String,String,int) : List<Element>	478478	12.012	0.00%	3.03	2.02	0.875 0.87587,5%
filterParents(List<Element>) : List<Element> filterParents(List<Element>) : List<Element>	512512	7.07	0.00%	2.02	0.00	1.0 1.0100%
extract(String,String,int) : ExtractResult extract(String,String,int) : ExtractResult	542542	9.09	0.00%	2.02	2.02	0.8181818 0.818181881,8%
setAttr(String,String,String,String) : String setAttr(String,String,String,String) : String	631631	7.07	0.00%	2.02	0.00	1.0 1.0100%
parse(String) : Document parse(String) : Document	656656	3.03	0.00%	1.01	0.00	1.0 1.0100%
getAttr(String,String,String) : List<String> getAttr(String,String,String) : List<String>	675675	7.07	0.00%	1.01	0.00	1.0 1.0100%
addClass(String,String,List<String>,int) : String addClass(String,String,List<String>,int) : String	704704	10.010	0.00%	3.03	2.02	0.85714287 0.8571428785,7%
addClass(String,String,List<String>) : String addClass(String,String,List<String>) : String	744744	1.01	0.00%	1.01	0.00	1.0 1.0100%
addClass(String,String,String) : String addClass(String,String,String) : String	760760	1.01	0.00%	1.01	0.00	1.0 1.0100%
wrap(String,String,String,int) : String wrap(String,String,String,int) : String	778778	9.09	0.00%	3.03	3.03	0.7692308 0.769230876,9%
remove(String,String) : String remove(String,String) : String	811811	7.07	0.00%	2.02	2.02	0.7777778 0.777777877,8%
replace(String,String,String) : String replace(String,String,String) : String	840840	1.01	0.00%	1.01	0.00	1.0 1.0100%
replaceAll(String,Map<String, String>) : String replaceAll(String,Map<String, String>) : String	855855	15.015	0.00%	4.04	1.01	0.95238096 0.9523809695,2%
replaceWith(String,String,String) : String replaceWith(String,String,String) : String	901901	16.016	0.00%	4.04	1.01	0.95454544 0.9545454495,5%
text(String,String) : List<String> text(String,String) : List<String>	945945	8.08	0.00%	2.02	2.02	0.8 0.880%
headingAnchorToId(String) : String headingAnchorToId(String) : String	983983	25.025	0.00%	5.05	13.013	0.6060606 0.606060660,6%
anchorToId(Element,Element) : void anchorToId(Element,Element) : void	10451045	5.05	0.00%	4.04	2.02	0.7777778 0.777777877,8%
concat(List<String>,String,boolean) : List<String> concat(List<String>,String,boolean) : List<String>	10711071	4.04	0.00%	2.02	0.00	1.0 1.0100%
ensureHeadingIds(String,String,String,String) : String ensureHeadingIds(String,String,String,String) : String	11041104	32.032	0.00%	6.06	17.017	0.575 0.57557,5%
generateUniqueId(String,String,Set<String>,String) : String generateUniqueId(String,String,Set<String>,String) : String	11851185	8.08	0.00%	3.03	2.02	0.8333333 0.833333383,3%
fixTableHeads(String) : String fixTableHeads(String) : String	12141214	11.011	0.00%	2.02	0.00	1.0 1.0100%
slug(String) : String slug(String) : String	12571257	1.01	0.00%	1.01	0.00	1.0 1.0100%
slug(String,String) : String slug(String,String) : String	12731273	3.03	0.00%	1.01	0.00	1.0 1.0100%
headingTree(String,List<String>) : List<? extends IdElement> headingTree(String,List<String>) : List<? extends IdElement>	12951295	22.022	0.00%	6.06	3.03	0.9 0.990%
headingIndex(Element) : int headingIndex(Element) : int	13481348	6.06	0.00%	3.03	3.03	0.625 0.62562,5%

Class HtmlTool.JoinSeparator

Class HtmlTool.JoinSeparator	Line # 85	Total Statements 0	% Filtered -	Complexity 0	Uncovered Elements 0	TOTAL Coverage -1.0 -

Class HtmlTool.ExtractResult

Class HtmlTool.ExtractResult	Line # 573	Total Statements 0	% Filtered -	Complexity 0	Uncovered Elements 0	TOTAL Coverage -1.0 -

Class HtmlTool.DefaultExtractResult

Class HtmlTool.DefaultExtractResult	Line # 593	Total Statements 4	% Filtered 0%	Complexity 3	TOTAL Coverage 1.0100%
DefaultExtractResult(List<String>,String) DefaultExtractResult(List<String>,String)	601601	2.02	0.00%	1.01	1.0 1.0100%
getExtracted() : List<String> getExtracted() : List<String>	606606	1.01	0.00%	1.01	1.0 1.0100%
getRemainder() : String getRemainder() : String	611611	1.01	0.00%	1.01	1.0 1.0100%

Class HtmlTool.HeadingItem

Class HtmlTool.HeadingItem	Line # 1364	Total Statements 9	% Filtered 0%	Complexity 6	TOTAL Coverage 1.0100%
HeadingItem(String,String,String,int) HeadingItem(String,String,String,int)	13811381	4.04	0.00%	1.01	1.0 1.0100%
getId() : String getId() : String	13881388	1.01	0.00%	1.01	1.0 1.0100%
getTagName() : String getTagName() : String	13931393	1.01	0.00%	1.01	1.0 1.0100%
getText() : String getText() : String	13981398	1.01	0.00%	1.01	1.0 1.0100%
getItems() : List<HeadingItem> getItems() : List<HeadingItem>	14031403	1.01	0.00%	1.01	1.0 1.0100%
getHeadingLevel() : int getHeadingLevel() : int	14081408	1.01	0.00%	1.01	1.0 1.0100%

Class HtmlTool.IdElement

Class HtmlTool.IdElement	Line # 1420	Total Statements 0	% Filtered -	Complexity 0	Uncovered Elements 0	TOTAL Coverage -1.0 -

Contributing tests

This file is covered by 29 tests.

Contributing tests

Test contribution	Test	Result
0.21560575	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldHeadingTreeorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldHeadingTree	1PASS
0.1889117	org.devacfr.maven.skins.reflow.context.ContextTest.shouldApplyBootstrapCssorg.devacfr.maven.skins.reflow.context.ContextTest.shouldApplyBootstrapCss	1PASS
0.18480493	org.devacfr.maven.skins.reflow.context.ContextTest.shouldReplaceTTTagorg.devacfr.maven.skins.reflow.context.ContextTest.shouldReplaceTTTag	1PASS
0.16837782	org.devacfr.maven.skins.reflow.context.ContextTest.shouldAddLighboxAttributeorg.devacfr.maven.skins.reflow.context.ContextTest.shouldAddLighboxAttribute	1PASS
0.16016427	org.devacfr.maven.skins.reflow.context.ContextTest.shouldReplaceIconsorg.devacfr.maven.skins.reflow.context.ContextTest.shouldReplaceIcons	1PASS
0.137577	org.devacfr.maven.skins.reflow.context.ContextTest.shouldNotChangeCodePartorg.devacfr.maven.skins.reflow.context.ContextTest.shouldNotChangeCodePart	1PASS
0.13552362	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitRecursivelyorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitRecursively	1PASS
0.12731007	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitJoinSeparatorAfterorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitJoinSeparatorAfter	1PASS
0.123203285	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitJoinSeparatorBeforeorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitJoinSeparatorBefore	1PASS
0.1211499	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitOnStartsorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitOnStarts	1PASS
0.114989735	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitJoinSeparatorNoorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitJoinSeparatorNo	1PASS
0.10677618	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldReorderToTopOneSectionorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldReorderToTopOneSection	1PASS
0.1026694	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitBodyFragmentorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldSplitBodyFragment	1PASS
0.09856263	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldEnsureHeadingIdsForFrameorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldEnsureHeadingIdsForFrame	1PASS
0.09650924	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldEnsureHeadingIdsorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldEnsureHeadingIds	1PASS
0.09445585	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldExtractorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldExtract	1PASS
0.078028746	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldHeadingAnchorToIdorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldHeadingAnchorToId	1PASS
0.049281314	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldReplaceorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldReplace	1PASS
0.047227927	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldReplaceWithorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldReplaceWith	1PASS
0.039014373	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldAddClassorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldAddClass	1PASS
0.036960986	org.devacfr.maven.skins.reflow.HtmlToolTest.fixTableHeadsWithTagListReportOuputorg.devacfr.maven.skins.reflow.HtmlToolTest.fixTableHeadsWithTagListReportOuput	1PASS
0.030800821	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldWrapElementorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldWrapElement	1PASS
0.026694044	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldExtractTextorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldExtractText	1PASS
0.026694044	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldNotSplitorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldNotSplit	1PASS
0.024640657	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldRemoveElementorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldRemoveElement	1PASS
0.024640657	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldSetAttributeorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldSetAttribute	1PASS
0.024640657	org.devacfr.maven.skins.reflow.HtmlToolTest.shouldgetAttributeorg.devacfr.maven.skins.reflow.HtmlToolTest.shouldgetAttribute	1PASS
0.0123203285	org.devacfr.maven.skins.reflow.SkinConfigToolTest.testSlugFilenameorg.devacfr.maven.skins.reflow.SkinConfigToolTest.testSlugFilename	1PASS
0.0123203285	org.devacfr.maven.skins.reflow.context.FrameContextTest.shouldBuildFrameContextorg.devacfr.maven.skins.reflow.context.FrameContextTest.shouldBuildFrameContext	1PASS

Source view

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.devacfr.maven.skins.reflow;

import javax.annotation.Nonnull;

import javax.annotation.Nullable;

import java.text.Normalizer;

import java.text.Normalizer.Form;

import java.util.ArrayList;

import java.util.Arrays;

import java.util.Collection;

import java.util.Collections;

import java.util.HashSet;

import java.util.List;

import java.util.Locale;

import java.util.Map;

import java.util.Map.Entry;

import java.util.Set;

import java.util.Stack;

import java.util.regex.Pattern;

import com.google.common.base.Strings;

import com.google.common.collect.Lists;

import org.apache.velocity.tools.ToolContext;

import org.apache.velocity.tools.config.DefaultKey;

import org.apache.velocity.tools.generic.SafeConfig;

import org.apache.velocity.tools.generic.ValueParser;

import org.jsoup.Jsoup;

import org.jsoup.internal.StringUtil;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.nodes.Node;

import org.jsoup.parser.Tag;

import static java.util.Collections.emptyList;

import static java.util.Objects.requireNonNull;

/**
 * An Apache Velocity tool that provides utility methods to manipulate HTML code using the
 * <a href="http://jsoup.org/">jsoup</a> HTML5 parser.
 * <p>
 * The methods utilise <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">CSS selectors</a> to refer to
 * specific elements for manipulation.
 * </p>
 *
 * @author Andrius Velykis
 * @author Christophe Friederich
 * @since 1.0
 * @see <a href="http://jsoup.org/">jsoup HTML parser</a>
 * @see <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a>
 */

@DefaultKey("htmlTool")

public class HtmlTool extends SafeConfig {

/** NOTE(review): presumably the maximum length of a generated slug; its use is outside this excerpt - confirm. */
private static final int SLUG_SIZE = 50;

/** Default separator used to generate slug heading names. */

public static final String DEFAULT_SLUG_SEPARATOR = "-";

/** Prefix of the heading ids associated with the table of contents. */

private static final String SEPARATOR_TOC = "_toc_";

/** An unmodifiable list of all HTML heading tag names (h1-h6). */

private static final List<String> HEADINGS = Collections

.unmodifiableList(Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));

/** Enum indicating separator handling strategy for document partitioning. */
public enum JoinSeparator {

    /**
     * Keep separators at the start of partitions. The first partition will not have a separator.
     */
    AFTER,

    /**
     * Keep separators at the end of partitions. The last partition will not have a separator.
     */
    BEFORE,

    /** Drop separators altogether. */
    NO
}

/** */

private String outputEncoding = "UTF-8";

100

101

private boolean prettyPrint = true;

/**

* {@inheritDoc}

* @see SafeConfig#configure(ValueParser)

107

108

@Override

109

protected void configure(final ValueParser values) {

110

111

// retrieve the Velocity context for output encoding

112

final Object velocityContext = values.get("velocityContext");

113

114

if (!(velocityContext instanceof ToolContext)) {

return;

}

final ToolContext ctxt = (ToolContext) velocityContext;

119

120

// get the output encoding

121

final Object outputEncodingObj = ctxt.get("outputEncoding");

122

if (outputEncodingObj instanceof String) {

123

this.outputEncoding = (String) outputEncodingObj;

124

}

125

126

final Object prettyPrint = ctxt.get("prettyPrint");

127

if (prettyPrint instanceof Boolean) {

128

this.prettyPrint = (Boolean) prettyPrint;

}

}

/**

* Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters

134

* (e.g. newline, tab) convert to a simple space

135

136

* @param html

137

* html content to normalise.

138

* @return Returns normalised string.

139

140

@Nullable public String normaliseWhitespace(@Nullable final String html) {

141

if (Strings.isNullOrEmpty(html)) {

142

return null;

143

}

144

return StringUtil.normaliseWhitespace(html);

}

/**

* Splits the given HTML content into partitions based on the given separator selector. The separators themselves

149

* are dropped from the results.

150

151

* @param content

152

* body HTML content to split (can not be empty or {@code null}).

153

* @param separatorCssSelector

154

* CSS selector for separators (can not be empty or {@code null}).

155

* @return a list of HTML partitions split on separator locations, but without the separators.

156

* @since 1.0

157

* @see #split(String, String, JoinSeparator)

158

159

public List<String> split(@Nonnull final String content, @Nonnull final String separatorCssSelector) {

160

return split(content, separatorCssSelector, JoinSeparator.NO);

}

/**

* Splits the given HTML content into partitions based on the given separator selector. The separators are kept as

165

* first elements of the partitions.

166

* <p>

167

* Note that the first part is removed if the split was successful. This is because the first part does not include

* the separator.

* </p>

* @param content

* HTML content to split

173

* @param separatorCssSelector

174

* CSS selector for separators

175

* @return a list of HTML partitions split on separator locations (except the first one), with separators at the

176

* beginning of each partition

177

* @since 1.0

178

* @see #split(String, String, JoinSeparator)

179

180

public List<String> splitOnStarts(final @Nonnull String content, final @Nonnull String separatorCssSelector) {

181

182

final List<String> result = split(content, separatorCssSelector, JoinSeparator.AFTER);

183

184

if (result == null || result.size() <= 1) {

185

// no result or just one part - return what we have

return result;

}

// otherwise, drop the first part - the first split will be the first 'start'

190

// e.g. if we split on headings, the first part will contain everything

191

// before the first heading.

192

return result.subList(1, result.size());

}

/**

* Splits the given HTML content into partitions based on the given separator selector. The separators are either

197

* dropped or joined with before/after depending on the indicated separator strategy.

198

199

* @param content

200

* HTML content to split

201

* @param separatorCssSelector

202

* CSS selector for separators

203

* @param separatorStrategy

204

* strategy to drop or keep separators, one of "after", "before" or "no"

205

* @return a list of HTML partitions split on separator locations.

206

* @since 1.0

207

* @see #split(String, String, JoinSeparator)

208

209

public List<String> split(final @Nonnull String content,

210

final @Nonnull String separatorCssSelector,

211

final String separatorStrategy) {

212

213

JoinSeparator sepStrategy;

214

if ("before".equals(separatorStrategy)) {

215

sepStrategy = JoinSeparator.BEFORE;

216

} else if ("after".equals(separatorStrategy)) {

217

sepStrategy = JoinSeparator.AFTER;

218

} else {

219

sepStrategy = JoinSeparator.NO;

220

}

221

222

return split(content, separatorCssSelector, sepStrategy);

}

/**

* Splits the given HTML content into partitions based on the given separator selector.The separators are either

227

* dropped or joined with before/after depending on the indicated separator strategy.

228

* <p>

229

* Note that splitting algorithm tries to resolve nested elements so that returned partitions are self-contained

230

* HTML elements. The nesting is normally contained within the first applicable partition.

* </p>

* @param content

* Body HTML content to split

235

* @param separatorCssSelector

236

* CSS selector for separators

237

* @param separatorStrategy

238

* strategy to drop or keep separators

239

* @return a list of HTML partitions split on separator locations. If no splitting occurs, returns the original

240

* content as the single element of the list

241

* @since 1.0

242

243

public List<String> split(@Nonnull final String content,

244

@Nonnull final String separatorCssSelector,

245

@Nonnull final JoinSeparator separatorStrategy) {

246

247

requireNonNull(separatorStrategy);

248

final Element body = parse(content).body();

249

250

final List<Element> separators = body.select(separatorCssSelector);

251

if (separators.size() > 0) {

252

final List<List<Element>> partitions = split(separators, separatorStrategy, body);

253

254

final List<String> sectionHtml = new ArrayList<>();

255

256

for (final List<Element> partition : partitions) {

257

final String html = outerHtml(partition);

258

if (!Strings.isNullOrEmpty(html)) {

259

sectionHtml.add(outerHtml(partition));

}

}

return sectionHtml;

} else {

// nothing to split

return Collections.singletonList(content);

}

}

/**

* Recursively splits the {@code parent} element based on the given {@code separators}. If a separator is

272

* encountered in the parent, it is split on that position. The outstanding nested elements go with the first of the

273

* partitions in each case.

274

275

* @param separators

276

* @param separatorStrategy

277

* @param parent

278

* @return list of partitions (as lists of root elements for each partition). Partition can be an empty list, e.g.

279

* if the separator is at the start of the content.

280

281

private static List<List<Element>> split(final Collection<Element> separators,

282

final JoinSeparator separatorStrategy,

283

final Element parent) {

284

285

final List<List<Element>> partitions = Lists.newLinkedList();

286

287

for (final Element child : parent.children()) {

288

289

if (separators.contains(child)) {

290

// split here and do not go deeper

291

292

// first ensure there was a partition before

293

// otherwise the split is not recognised on an outer level

294

getLastPartition(partitions);

295

296

if (separatorStrategy == JoinSeparator.BEFORE) {

297

// add to the last partition

298

getLastPartition(partitions).add(child);

299

}

300

301

// add an empty new partition

302

final List<Element> newPartition = Lists.newLinkedList();

303

partitions.add(newPartition);

304

305

if (separatorStrategy == JoinSeparator.AFTER) {

306

// add to the new partition

307

newPartition.add(child);

}

} else {

// go deeper

final List<List<Element>> childPartitions = split(separators, separatorStrategy, child);

313

314

// add the child to the last partition

315

getLastPartition(partitions).add(child);

316

317

if (childPartitions.size() > 1) {

318

// more than one partition:

319

// only keep the first partition elements in the child

320

// so for all other partitions, remove them from their parents

321

322

final List<Element> allChildren = child.children();

323

final List<Element> firstPartition = childPartitions.get(0);

324

325

allChildren.removeAll(firstPartition);

326

for (final Element removeChild : allChildren) {

327

removeChild.remove();

328

}

329

330

// add the remaining partitions

331

for (final List<Element> nextPartition : childPartitions.subList(1, childPartitions.size())) {

332

partitions.add(nextPartition);

}

}

}

}

return partitions;

}

/**

* Retrieves the last partition (as list of elements) or creates a new one if there was none before.

* @param partitions

* @return

private static List<Element> getLastPartition(final List<List<Element>> partitions) {

348

if (partitions.isEmpty()) {

349

final List<Element> newPartition = Lists.newLinkedList();

350

partitions.add(newPartition);

351

return newPartition;

352

} else {

353

return partitions.get(partitions.size() - 1);

}

}

/**

* Outputs the list of partition root elements to HTML.

* @param elements

* @return

private static String outerHtml(final List<Element> elements) {

364

365

switch (elements.size()) {

case 0:

return "";

case 1:

return elements.get(0).outerHtml();

371

372

default:

373

// more than one element

374

// wrap into <div> which we will remove afterwards

375

final Element root = new Element(Tag.valueOf("div"), "");

376

for (final Element elem : elements) {

377

root.appendChild(elem);

}

return root.html();

}

}

/**

* Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited

386

* to a certain amount, e.g. to bring just the first of selected elements to the top.

387

388

* @param content

389

* HTML content to reorder

390

* @param selector

391

* CSS selector for elements to bring to top of the content

392

* @param amount

393

* Maximum number of elements to reorder

394

* @return HTML content with reordered elements, or the original content if no such elements found.

395

* @since 1.0

396

397

public String reorderToTop(final String content, final String selector, final int amount) {

398

return reorderToTop(content, selector, amount, null);

}

/**

* Reorders elements in HTML content so that selected elements are found at the top of the content. Can be limited

403

* to a certain amount, e.g. to bring just the first of selected elements to the top.

404

405

* @param content

406

* HTML content to reorder

407

* @param selector

408

* CSS selector for elements to bring to top of the content

409

* @param amount

410

* Maximum number of elements to reorder

411

* @param wrapRemaining

412

* HTML to wrap the remaining (non-reordered) part

413

* @return HTML content with reordered elements, or the original content if no such elements found.

414

* @since 1.0

415

416

public String reorderToTop(final String content,

417

final String selector,

418

final int amount,

419

final String wrapRemaining) {

420

421

// extract the elements and then prepend them to the remaining body

422

final List<Element> extracted = extractElements(content, selector, amount);

423

424

if (extracted.size() > 1) {

425

426

final Element body = extracted.get(0);

427

428

if (wrapRemaining != null) {

429

wrapInner(body, wrapRemaining);

430

}

431

432

final List<Element> elements = extracted.subList(1, extracted.size());

433

434

// now prepend extracted elements to the body (in backwards to preserve original

435

// order)

436

for (int index = elements.size() - 1; index >= 0; index--) {

437

body.prependChild(elements.get(index));

}

return body.html();

} else {

// nothing to reorder

return content;

}

}

private static Element wrapInner(final Element element, final String html) {

448

449

// wrap everything into an additional <div> for wrapping

450

// otherwise there may be problems, e.g. with <body> element

451

final Element topDiv = new Element(Tag.valueOf("div"), "");

452

for (final Element topElem : element.children()) {

453

// add all elements in the body to the `topDiv`

454

topElem.remove();

455

topDiv.appendChild(topElem);

456

}

457

458

// add topDiv to the body

459

element.appendChild(topDiv);

// wrap topDiv

topDiv.wrap(html);

// now unwrap topDiv - will remove it from the hierarchy

topDiv.unwrap();

return element;

}

/**

* Extracts elements from the HTML content.

* @param content

* @param selector

* @param amount

* @return the remainder and a list of extracted elements. The main body (remainder after extraction) is always

476

* returned as the first element of the list.

477

478

private List<Element> extractElements(final String content, final String selector, final int amount) {

479

480

final Element body = parse(content).body();

481

482

List<Element> elements = body.select(selector);

483

if (elements.size() > 0) {

484

485

elements = filterParents(elements);

486

487

if (amount >= 0) {

488

// limit to the indicated amount

489

elements = elements.subList(0, Math.min(amount, elements.size()));

490

}

491

492

// remove all from their parents

493

for (final Element element : elements) {

element.remove();

}

}

final List<Element> results = new ArrayList<>();

499

// first element is the body

500

results.add(body);

501

results.addAll(elements);

return results;

}

/**

* Filters the list of elements to only contain parent elements. This is to avoid both parent and child being in the

* list of elements.

* @param elements

* @return

private static List<Element> filterParents(final List<Element> elements) {

513

final List<Element> filtered = new ArrayList<>();

514

for (final Element element : elements) {

515

// get the intersection of parents and selected elements

516

final List<Element> parentsInter = element.parents();

517

parentsInter.retainAll(elements);

518

if (parentsInter.isEmpty()) {

519

// no intersection - element's parents are not in the selected list

520

filtered.add(element);

}

}

return filtered;

}

/**

* Extracts HTML elements from the main HTML content. The result consists of the extracted HTML elements and the

529

* remainder of HTML content, with these elements removed. Can be limited to a certain amount, e.g. to extract just

530

* the first of selected elements.

531

532

* @param content

533

* HTML content to extract elements from

534

* @param selector

535

* CSS selector for elements to extract

536

* @param amount

537

* Maximum number of elements to extract

538

* @return HTML content of the extracted elements together with the remainder of the original content. If no

539

* elements are found, the remainder contains the original content.

* @since 1.0

@Nonnull

public ExtractResult extract(final String content, final String selector, final int amount) {

544

545

final List<Element> extracted = extractElements(content, selector, amount);

546

547

if (extracted.size() > 1) {

548

549

// first element is the remaining body, the rest are extracted

550

final Element body = extracted.get(0);

551

final List<Element> elements = extracted.subList(1, extracted.size());

552

553

// convert to HTML

554

final List<String> elementStr = new ArrayList<>();

555

for (final Element el : elements) {

556

elementStr.add(el.outerHtml());

557

}

558

559

return new DefaultExtractResult(elementStr, body.html());

560

} else {

561

// nothing to extract

562

return new DefaultExtractResult(Collections.<String> emptyList(), content);

}

}

/**
 * A container to carry element extraction results. Contains the extracted element HTML code and the remainder of
 * the body content with elements removed.
 *
 * @author Andrius Velykis
 * @since 1.0
 */
public interface ExtractResult {

    /**
     * Retrieves the extracted HTML elements.
     *
     * @return List of HTML of extracted elements. Can be empty if no elements found.
     */
    List<String> getExtracted();

    /**
     * Retrieves the content from which elements were extracted.
     *
     * @return The HTML content with extracted elements removed.
     */
    String getRemainder();
}

/**

* @author Christophe Friederich

592

593

private static final class DefaultExtractResult implements ExtractResult {

594

595

/** */

596

private final List<String> extracted;

597

598

/** */

599

private final String remainder;

600

601

private DefaultExtractResult(final List<String> extracted, final String remainder) {

602

this.extracted = extracted;

603

this.remainder = remainder;

}

@Override

public List<String> getExtracted() {

608

return Collections.unmodifiableList(extracted);

}

@Override

public String getRemainder() {

return remainder;

}

}

/**

* Sets attribute to the given value on elements in HTML.

619

620

* @param content

621

* HTML content to set attributes on

622

* @param selector

623

* CSS selector for elements to modify

624

* @param attributeKey

* Attribute name

* @param value

* Attribute value

* @return HTML content with modified elements. If no elements are found, the original content is returned.

629

* @since 1.0

630

631

public String setAttr(final String content, final String selector, final String attributeKey, final String value) {

632

633

final Element body = parse(content).body();

634

635

final List<Element> elements = body.select(selector);

636

if (elements.size() > 0) {

637

638

for (final Element element : elements) {

639

element.attr(attributeKey, value);

}

return body.html();

} else {

// nothing to update

return content;

}

}

/**

* Parses body fragment to the {@code <body>} element.

651

652

* @param content

653

* body HTML fragment (can not be {@code null}).

654

* @return the {@code body} element of the parsed content

655

656

public Document parse(@Nonnull final String content) {

657

final Document doc = Jsoup.parseBodyFragment(content);

658

doc.outputSettings().charset(outputEncoding).prettyPrint(prettyPrint);

return doc;

}

/**

* Retrieves attribute value on elements in HTML. Will return all attribute values for the selector, since there can

664

* be more than one element.

665

666

* @param content

667

* HTML content to read attributes from

668

* @param selector

669

* CSS selector for elements to find

670

* @param attributeKey

671

* Attribute name

672

* @return Attribute values for all matching elements. If no elements are found, empty list is returned.

673

* @since 1.0

674

675

public List<String> getAttr(final String content, final String selector, final String attributeKey) {

676

677

final Element body = parse(content).body();

678

679

final List<Element> elements = body.select(selector);

680

final List<String> attrs = new ArrayList<>();

681

682

for (final Element element : elements) {

683

final String attrValue = element.attr(attributeKey);

684

attrs.add(attrValue);

}

return attrs;

}

/**

* Adds given class names to the elements in HTML.

692

693

* @param content

694

* HTML content to modify

695

* @param selector

696

* CSS selector for elements to add classes to

697

* @param classNames

698

* Names of classes to add to the selected elements

699

* @param amount

700

* Maximum number of elements to modify

701

* @return HTML content with modified elements. If no elements are found, the original content is returned.

702

* @since 1.0

703

704

public String addClass(final String content,

705

final String selector,

706

final List<String> classNames,

707

final int amount) {

708

709

final Element body = parse(content).body();

710

711

List<Element> elements = body.select(selector);

712

if (amount >= 0) {

713

// limit to the indicated amount

714

elements = elements.subList(0, Math.min(amount, elements.size()));

715

}

716

717

if (elements.size() > 0) {

718

719

for (final Element element : elements) {

720

for (final String className : classNames) {

721

element.addClass(className);

}

}

return body.html();

} else {

// nothing to update

return content;

}

}

/**

* Adds given class names to the elements in HTML.

734

735

* @param content

736

* HTML content to modify

737

* @param selector

738

* CSS selector for elements to add classes to

739

* @param classNames

740

* Names of classes to add to the selected elements

741

* @return HTML content with modified elements. If no elements are found, the original content is returned.

742

* @since 1.0

743

744

public String addClass(final String content, final String selector, final List<String> classNames) {

745

return addClass(content, selector, classNames, -1);

}

/**

* Adds given class to the elements in HTML.

750

751

* @param content

752

* HTML content to modify

753

* @param selector

754

* CSS selector for elements to add the class to

755

* @param className

756

* Name of class to add to the selected elements

757

* @return HTML content with modified elements. If no elements are found, the original content is returned.

758

* @since 1.0

759

760

public String addClass(final String content, final String selector, final String className) {

761

return addClass(content, selector, Collections.singletonList(className));

}

/**

* Wraps elements in HTML with the given HTML.

766

767

* @param content

768

* HTML content to modify

769

* @param selector

770

* CSS selector for elements to wrap

771

* @param wrapHtml

772

* HTML to use for wrapping the selected elements

773

* @param amount

774

* Maximum number of elements to modify

775

* @return HTML content with modified elements. If no elements are found, the original content is returned.

776

* @since 1.0

777

778

public String wrap(final String content, final String selector, final String wrapHtml, final int amount) {

779

780

final Element body = parse(content).body();

781

782

List<Element> elements = body.select(selector);

783

if (amount >= 0) {

784

// limit to the indicated amount

785

elements = elements.subList(0, Math.min(amount, elements.size()));

786

}

787

788

if (elements.size() > 0) {

789

790

for (final Element element : elements) {

791

element.wrap(wrapHtml);

}

return body.html();

} else {

// nothing to update

return content;

}

}

/**

* Removes elements from HTML.

803

804

* @param content

805

* HTML content to modify

806

* @param selector

807

* CSS selector for elements to remove

808

* @return HTML content with removed elements. If no elements are found, the original content is returned.

809

* @since 1.0

810

811

public String remove(final String content, final String selector) {

812

813

final Element body = parse(content).body();

814

815

final List<Element> elements = body.select(selector);

816

if (elements.size() > 0) {

817

for (final Element element : elements) {

element.remove();

}

return body.html();

} else {

// nothing changed

return content;

}

}

/**

* Replaces elements in HTML.

830

831

* @param content

832

* HTML content to modify

833

* @param selector

834

* CSS selector for elements to replace

835

* @param replacement

836

* HTML replacement (must parse to a single element)

837

* @return HTML content with replaced elements. If no elements are found, the original content is returned.

838

* @since 1.0

839

840

public String replace(final String content, final String selector, final String replacement) {

841

return replaceAll(content, Collections.singletonMap(selector, replacement));

}

/**

* Replaces elements in HTML.

846

847

* @param content

848

* HTML content to modify

849

* @param replacements

850

* Map of CSS selectors to their replacement HTML texts. CSS selectors find elements to be replaced with

851

* the HTML in the mapping. The HTML must parse to a single element.

852

* @return HTML content with replaced elements. If no elements are found, the original content is returned.

853

* @since 1.0

854

855

public String replaceAll(final String content, final Map<String, String> replacements) {

856

857

final Element body = parse(content).body();

858

859

boolean modified = false;

860

for (final Entry<String, String> replacementEntry : replacements.entrySet()) {

861

final String selector = replacementEntry.getKey();

862

final String replacement = replacementEntry.getValue();

863

864

final List<Element> elements = body.select(selector);

865

if (elements.size() > 0) {

866

867

// take the first child

868

final Element replacementElem = parse(replacement).body().child(0);

869

870

if (replacementElem != null) {

871

for (final Element element : elements) {

872

element.replaceWith(replacementElem.clone());

}

modified = true;

}

}

}

if (modified) {

return body.html();

} else {

// nothing changed

return content;

}

}

/**

* Replaces All elements in HTML corresponding to <code>selector</code> while preserving the content of this

* element.

* @param content

* HTML content to modify

894

* @param selector

895

* CSS selector for elements to replace

896

* @param newElement

897

* HTML replacement (must parse to a single element)

898

* @return HTML content with replaced elements. If no elements are found, the original content is returned.

899

* @since 2.0

900

901

public String replaceWith(final String content, final String selector, final String newElement) {

902

903

final Element body = parse(content).body();

904

905

boolean modified = false;

906

final List<Element> elements = body.select(selector);

907

if (elements.size() > 0) {

908

909

// take the first child

910

final Element replacementElem = parse(newElement).body().child(0);

911

912

if (replacementElem != null) {

913

for (final Element element : elements) {

914

final List<Node> children = element.childNodes();

915

final Element el = replacementElem.clone();

916

for (final Node child : children) {

917

el.appendChild(child.clone());

918

}

919

element.replaceWith(el);

}

modified = true;

}

}

if (modified) {

return body.html();

} else {

// nothing changed

return content;

}

}

/**

* Retrieves text content of the selected elements in HTML. Renders the element's text as it would be displayed on

936

* the web page (including its children).

937

938

* @param content

939

* HTML content with the elements

940

* @param selector

941

* CSS selector for elements to extract contents

942

* @return A list of element texts as rendered to display. Empty list if no elements are found.

943

* @since 1.0

944

945

public List<String> text(@Nullable final String content, @Nonnull final String selector) {

946

if (Strings.isNullOrEmpty(content)) {

947

return emptyList();

948

}

949

final Element body = parse(content).body();

950

951

final List<Element> elements = body.select(selector);

952

final List<String> texts = new ArrayList<>();

953

954

for (final Element element : elements) {

955

texts.add(element.text());

}

return texts;

}

/**

* Transforms the given HTML content by moving anchor ({@code <a name="myheading">}) names to IDs for heading

963

* elements.

964

* <p>

965

* The anchors are used to indicate positions within a HTML page. In HTML5, however, the {@code name} attribute is

966

* no longer supported on {@code <a>}) tag. The positions within pages are indicated using {@code id} attribute

967

* instead, e.g. {@code

968

969

970

<h1 id="myheading">}.

971

* </p>

972

* <p>

973

* The method finds anchors inside, immediately before or after the heading tags and uses their name as heading

974

* {@code id} instead. The anchors themselves are removed.

* </p>

* @param content

* HTML content to modify

979

* @return HTML content with modified elements. Anchor names are used for adjacent headings, and anchor tags are

980

* removed. If no elements are found, the original content is returned.

981

* @since 1.0

982

983

public String headingAnchorToId(final String content) {

984

985

final Element body = parse(content).body();

986

987

// selectors for headings without IDs

988

final List<String> headNoIds = concat(HEADINGS, ":not([id])", true);

989

990

// selector for anchor with name attribute only

991

final String nameA = "a[name]:not([href])";

992

993

// select all headings that have inner named anchor

994

final List<Element> headingsInnerA = body

995

.select(String.join(", ", concat(headNoIds, ":has(" + nameA + ")", true)));

996

997

boolean modified = false;

998

for (final Element heading : headingsInnerA) {

999

final List<Element> anchors = heading.select(nameA);

1000

// take first

1001

if (!anchors.isEmpty()) {

1002

anchorToId(heading, anchors.get(0));

modified = true;

}

}

// select all headings that have a preceding named anchor

1008

final List<Element> headingsPreA = body.select(String.join(", ", concat(headNoIds, nameA + " + ", false)));

1009

1010

for (final Element heading : headingsPreA) {

1011

final Element anchor = heading.previousElementSibling();

1012

if (anchor != null) {

1013

anchorToId(heading, anchor);

modified = true;

}

}

// select all headings that are followed by a named anchor

1019

// no selector available for that, so first select the anchors

1020

// then retrieve the headings

1021

final List<Element> anchorsPreH = body.select(String.join(", ", concat(headNoIds, " + " + nameA, true)));

1022

1023

for (final Element anchor : anchorsPreH) {

1024

final Element heading = anchor.previousElementSibling();

1025

if (heading != null) {

1026

anchorToId(heading, anchor);

modified = true;

}

}

if (modified) {

return body.html();

} else {

// nothing to update

return content;

}

}

/**

* Moves anchor name to heading id, if one does not exist. Removes the anchor.

* @param heading

* @param anchor

private static void anchorToId(final Element heading, final Element anchor) {

1046

1047

if ("a".equals(anchor.tagName()) && heading.id().isEmpty()) {

1048

final String aName = anchor.attr("name");

1049

if (!aName.isEmpty()) {

1050

// set the anchor name as heading ID

1051

heading.attr("id", aName);

// remove the anchor

anchor.remove();

}

}

}

/**

* Utility method to concatenate a String to a list of Strings. The text can be either appended or prepended.

1061

1062

* @param elements

1063

* list of elements to append/prepend the text to

1064

* @param text

1065

* the given text to append/prepend

1066

* @param append

1067

* if {@code true}, text will be appended to the elements. If {@code false}, it will be prepended

1068

* @return list of elements with the text appended/prepended

1069

* @since 1.0

1070

1071

public static List<String> concat(final List<String> elements, final String text, final boolean append) {

1072

final List<String> concats = new ArrayList<>();

1073

1074

for (final String element : elements) {

1075

186

concats.add(append ? element + text : text + element);

}

return concats;

}

/**

* Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that do not have one.

1083

* <p>

1084

* IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a heading tag without an

1085

* {@code id} is found, its "slug" is generated automatically based on the heading contents and used as the ID.

1086

* </p>

1087

* <p>

1088

* Note that the algorithm also modifies existing IDs that have symbols not allowed in CSS selectors, e.g. ":", ".",

1089

* etc. The symbols are removed.

* </p>

* @param pageType

* The type of page.

* @param currentPage

* The name of current page.

1096

* @param content

1097

* HTML content to modify.

1098

* @param idSeparator

1099

* the seperator used to slug ID.

1100

* @return Returns a {@link String} representing HTML content with all heading elements having {@code id}

1101

* attributes. If all headings were with IDs already, the original content is returned.

1102

* @since 1.0

1103

1104

public String ensureHeadingIds(final String pageType,

1105

final String currentPage,

1106

final String content,

1107

final String idSeparator) {

1108

final List<String> excludedPages = Arrays.asList("checkstyle-aggregate", "checkstyle");

1109

1110

final Element body = parse(content).body();

1111

1112

// exclude pages

1113

if (excludedPages.contains(currentPage)) {

return content;

}

// first find all existing IDs (to avoid generating duplicates)

1118

final List<Element> idElems = body.select("*[id]");

1119

1120

final Set<String> ids = new HashSet<>();

1121

boolean modified = false;

1122

for (final Element idElem : idElems) {

1123

1124

// fix all existing IDs - remove colon and other symbols which mess up jQuery

1125

final String id = idElem.id();

1126

idElem.attr("id", slug(id, idSeparator));

1127

modified = true;

1128

1129

ids.add(idElem.id());

1130

}

1131

1132

// create unique id for all heading elements

1133

final List<String> headIds = concat(HEADINGS, "[id]", true);

1134

// select all headings that have an ID

1135

final List<Element> headingIds = body.select(String.join(", ", headIds));

1136

1137

for (final Element heading : headingIds) {

1138

final String headingText = heading.text();

1139

String headingSlug = slug(headingText, idSeparator);

1140

// also limit slug to 50 symbols

1141

if (headingSlug.length() > SLUG_SIZE) {

1142

headingSlug = headingSlug.substring(0, SLUG_SIZE);

1143

}

1144

final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug);

1145

1146

heading.attr("id", headingId);

1147

}

1148

1149

final List<String> headNoIds = concat(HEADINGS, ":not([id])", true);

1150

1151

// select all headings that do not have an ID

1152

final List<Element> headingsNoId = body.select(String.join(", ", headNoIds));

1153

1154

if (!headingsNoId.isEmpty() || modified) {

1155

for (final Element heading : headingsNoId) {

1156

1157

final String headingText = heading.text();

1158

String headingSlug = slug(headingText, idSeparator);

1159

// also limit slug to 50 symbols

1160

if (headingSlug.length() > SLUG_SIZE) {

1161

headingSlug = headingSlug.substring(0, SLUG_SIZE);

1162

}

1163

final String headingId = generateUniqueId(pageType, currentPage, ids, headingSlug);

1164

1165

heading.attr("id", headingId);

}

}

return body.html();

}

/**

* Generated a unique ID within the given set of IDs. Appends an incrementing number for duplicates.

* @param pageType

* The type of page.

* @param currentPage

* Tthe name of current page.

1179

* @param ids

1180

* The list of ID already existing or used.

1181

* @param idBase

1182

* The prefix to use.

1183

* @return Returns a new {@link String} representing a new unique ID.

1184

1185

private static String generateUniqueId(final String pageType,

1186

final String currentPage,

1187

final Set<String> ids,

1188

final String idBase) {

1189

String id = idBase;

1190

int counter = 1;

1191

while (ids.contains(id)) {

1192

id = idBase + String.valueOf(counter++);

1193

}

1194

1195

// put the newly generated one into the set

1196

ids.add(id);

1197

if ("frame".equals(pageType)) {

1198

id = currentPage + SEPARATOR_TOC + id;

}

return id;

}

/**

* Fixes table heads: wraps rows with {@code

1205

1206

1207

<th>} (table heading) elements into {@code <thead>} element if they are currently in {@code <tbody>}.

1208

1209

* @param content

1210

* HTML content to modify

1211

* @return HTML content with all table heads fixed. If all heads were correct, the original content is returned.

1212

* @since 1.0

1213

1214

public String fixTableHeads(final String content) {

1215

1216

final Element body = parse(content).body();

1217

1218

final List<Element> tables = body.select("table");

1219

1220

for (final Element table : tables) {

1221

// select rows with <th> tags within <tbody>

1222

final List<Element> tableHeadRows = table.select("tbody > tr:has(th)");

1223

// convert only table containing one tr head.

1224

if (tableHeadRows.size() == 1) {

1225

1226

for (final Element row : tableHeadRows) {

1227

1228

// remove row from its original position

1229

row.remove();

1230

1231

// create table header element with the row

1232

final Element thead = new Element(Tag.valueOf("thead"), "");

1233

thead.appendChild(row);

1234

// add at the beginning of the table

1235

table.prependChild(thead);

}

}

}

return body.html();

}

/** */

private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");

1244

1245

/** */

1246

private static final Pattern WHITESPACE = Pattern.compile("[\\s]");

1247

1248

/**

1249

* Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs). Uses "-"

1250

* as a whitespace separator.

1251

1252

* @param input

1253

* text to generate the slug from

1254

* @return the slug of the given text that contains alphanumeric symbols and "-" only

1255

* @since 1.0

1256

1257

public static String slug(final String input) {

1258

return slug(input, DEFAULT_SLUG_SEPARATOR);

}

/**

* Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to use in URLs).

1263

1264

* @param input

1265

* text to generate the slug from

1266

* @param separator

1267

* separator for whitespace replacement

1268

* @return the slug of the given text that contains alphanumeric symbols and separator only

1269

* @since 1.0

1270

* @see <a href=

1271

* "http://www.codecodex.com/wiki/Generate_a_url_slug">http://www.codecodex.com/wiki/Generate_a_url_slug</a>

1272

1273

private static String slug(final String input, final String separator) {

1274

final String nowhitespace = WHITESPACE.matcher(input).replaceAll(separator);

1275

final String normalized = Normalizer.normalize(nowhitespace, Form.NFD);

1276

return NONLATIN.matcher(normalized).replaceAll("").toLowerCase(Locale.ENGLISH);

}

/**

* Reads all headings in the given HTML content as a hierarchy. Subsequent smaller headings are nested within bigger

1281

* ones, e.g. <code><h2></code> is nested under preceding <code><h1></code>.

1282

* <p>

1283

* Only headings with IDs are included in the hierarchy. The result elements contain ID and heading text for each

1284

* heading. The hierarchy is useful to generate a Table of Contents for a page.

* </p>

* @param content

* HTML content to extract heading hierarchy from

1289

* @param sections

1290

* list of all sections

1291

* @return a list of top-level heading items (with id and text). The remaining headings are nested within these

1292

* top-level items. Empty list if no headings are in the content.

1293

* @since 1.0

1294

1295

public List<? extends IdElement> headingTree(final String content, final List<String> sections) {

1296

1297

final List<String> sectionContents = this.split(content, "hr");

1298

final List<String> headIds = concat(HEADINGS, "[id]:not(.no-anchor)", true);

1299

final List<HeadingItem> headingItems = new ArrayList<>();

1300

1301

int index = 0;

1302

for (final String sectionContent : sectionContents) {

1303

final String sectionType = index < sections.size() ? sections.get(index++) : "";

1304

1305

// exclude carousel headings

1306

if ("carousel".equals(sectionType)) {

1307

continue;

1308

}

1309

final Element body = parse(sectionContent).body();

1310

// select all headings that have an ID

1311

final List<Element> headings = body.select(String.join(", ", headIds));

1312

for (final Element heading : headings) {

1313

headingItems

1314

.add(new HeadingItem(heading.id(), heading.nodeName(), heading.text(), headingIndex(heading)));

}

}

final List<HeadingItem> topHeadings = new ArrayList<>();

1319

final Stack<HeadingItem> parentHeadings = new Stack<>();

1320

1321

for (final HeadingItem heading : headingItems) {

1322

1323

while (!parentHeadings.isEmpty() && parentHeadings.peek().headingLevel >= heading.headingLevel) {

1324

parentHeadings.pop();

1325

}

1326

1327

if (parentHeadings.isEmpty()) {

1328

// top level heading - no parents

1329

topHeadings.add(heading);

1330

} else {

1331

// add to the children of topmost stack parent

1332

parentHeadings.peek().children.add(heading);

1333

}

1334

1335

// push the heading onto stack

1336

parentHeadings.push(heading);

}

return topHeadings;

}

/**

* Retrieves numeric index of a heading.

* @param element

* @return

private static int headingIndex(final Element element) {

1349

final String tagName = element.tagName();

1350

if (tagName.startsWith("h")) {

1351

try {

1352

return Integer.parseInt(tagName.substring(1));

1353

} catch (final Exception ex) {

1354

throw new IllegalArgumentException("Must be a header tag: " + tagName, ex);

1355

}

1356

} else {

1357

throw new IllegalArgumentException("Must be a header tag: " + tagName);

}

}

/**

* @author Christophe Friederich

1363

1364

private static final class HeadingItem implements IdElement {

1365

1366

/** */

1367

private final String id;

1368

1369

/** */

1370

private final String tagName;

1371

1372

/** */

1373

private final String text;

1374

1375

/** */

1376

private final int headingLevel;

1377

1378

/** */

1379

private final List<HeadingItem> children = new ArrayList<>();

1380

1381

private HeadingItem(final String id, final String tagName, final String text, final int headingLevel) {

1382

this.id = id;

1383

this.tagName = tagName;

1384

this.text = text;

1385

this.headingLevel = headingLevel;

}

@Override

public String getId() {

return id;

}

@Override

public String getTagName() {

return tagName;

}

@Override

public String getText() {

return text;

}

@Override

public List<HeadingItem> getItems() {

1405

return Collections.unmodifiableList(children);

}

@Override

public int getHeadingLevel() {

return headingLevel;

}

}

/**
 * Representation of a HTML element with an ID and a text content. Other such elements can be nested within.
 *
 * @author Andrius Velykis
 * @since 1.0
 */
public interface IdElement {

    /**
     * Retrieves the ID of the HTML element (attribute {@code id}).
     *
     * @return element {@code id} value
     */
    String getId();

    /**
     * Retrieves the tag name of the HTML element.
     *
     * @return Returns the tag name of element.
     */
    String getTagName();

    /**
     * Retrieves the text contents of the HTML element (rendered for display).
     *
     * @return text contents of the element
     */
    String getText();

    /**
     * Retrieves the heading level of the HTML element.
     *
     * @return Returns the level of heading.
     */
    int getHeadingLevel();

    /**
     * Retrieves the children of the HTML element (nested within the element).
     *
     * @return nested items within the element
     */
    List<? extends IdElement> getItems();
}

1453

}

Reflow Maven Skin Parent 2.4.0-beta3-SNAPSHOT

File HtmlTool.java

Coverage histogram

Code metrics

Classes

Class HtmlTool

Class HtmlTool.JoinSeparator

Class HtmlTool.ExtractResult

Class HtmlTool.DefaultExtractResult

Class HtmlTool.HeadingItem

Class HtmlTool.IdElement

Contributing tests

Contributing tests

Source view