View Javadoc
1   package org.sentrysoftware.maven.skin;
2   
3   /*-
4    * ╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲
5    * Sentry Maven Skin Tools
6    * ჻჻჻჻჻჻
7    * Copyright 2017 - 2024 Sentry Software
8    * ჻჻჻჻჻჻
9    * Licensed under the Apache License, Version 2.0 (the "License");
10   * you may not use this file except in compliance with the License.
11   * You may obtain a copy of the License at
12   *
13   *      http://www.apache.org/licenses/LICENSE-2.0
14   *
15   * Unless required by applicable law or agreed to in writing, software
16   * distributed under the License is distributed on an "AS IS" BASIS,
17   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18   * See the License for the specific language governing permissions and
19   * limitations under the License.
20   * ╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱
21   */
22  
import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.velocity.tools.ToolContext;
import org.apache.velocity.tools.config.DefaultKey;
import org.apache.velocity.tools.generic.SafeConfig;
import org.apache.velocity.tools.generic.ValueParser;
import org.jsoup.Jsoup;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
42  
43  /**
44   * An Apache Velocity tool that provides utility methods to manipulate HTML code using
45   * <a href="http://jsoup.org/">jsoup</a> HTML5 parser.
46   * <p>
47   * The methods utilise <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">CSS
48   * selectors</a> to refer to specific elements for manipulation.
49   * </p>
50   *
51   * @author Bertrand Martin (originally inspired by Andrius Velykis)
52   * @since 1.0
53   * @see <a href="http://jsoup.org/">jsoup HTML parser</a>
54   * @see <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a>
55   */
56  @DefaultKey("htmlTool")
57  public class HtmlTool extends SafeConfig {
58  
	/**
	 * Tag names of the six HTML heading levels ({@code h1} to {@code h6}).
	 * <p>
	 * The list is unmodifiable; it is used as the base for building CSS selectors
	 * that target headings.
	 * </p>
	 */
	private static final List<String> HEADINGS = Collections.unmodifiableList(
			Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
62  
63  
	/**
	 * Name of the character set applied to the output settings of parsed documents.
	 * Defaults to {@code "UTF-8"}; replaced by the {@code outputEncoding} value of the
	 * Velocity tool context when one is available at configuration time.
	 */
	private String outputEncoding = "UTF-8";
65  
	/**
	 * Creates a new instance.
	 * <p>
	 * No initialization is performed here: the output encoding keeps its default value
	 * until {@link #configure(ValueParser)} is invoked.
	 * </p>
	 */
	public HtmlTool() {
		/* Do nothing */
	}
72  
73  	/**
74  	 * {@inheritDoc}
75  	 *
76  	 * @see SafeConfig#configure(ValueParser)
77  	 */
78  	@Override
79  	protected void configure(final ValueParser values) {
80  
81  		// retrieve the Velocity context for output encoding
82  		Object velocityContext = values.get("velocityContext");
83  
84  		if (!(velocityContext instanceof ToolContext)) {
85  			return;
86  		}
87  
88  		ToolContext ctxt = (ToolContext) velocityContext;
89  
90  		// get the output encoding
91  		Object outputEncodingObj = ctxt.get("outputEncoding");
92  		if (outputEncodingObj instanceof String) {
93  			this.outputEncoding = (String) outputEncodingObj;
94  		}
95  	}
96  
97  	/**
98  	 * Sets attribute to the given value on elements in HTML.
99  	 *
100 	 * @param body
101 	 *            HTML content to set attributes on
102 	 * @param selector
103 	 *            CSS selector for elements to modify
104 	 * @param attributeKey
105 	 *            Attribute name
106 	 * @param value
107 	 *            Attribute value
108 	 * @return HTML content with modified elements. If no elements are found, the original content
109 	 *         is returned.
110 	 * @since 1.0
111 	 */
112 	public Element setAttr(final Element body, final String selector, final String attributeKey, final String value) {
113 
114 		List<Element> elements = body.select(selector);
115 
116 		for (Element element : elements) {
117 			element.attr(attributeKey, value);
118 		}
119 
120 		return body;
121 
122 	}
123 
124 	/**
125 	 * Parses HTML fragment
126 	 *
127 	 * @param content HTML fragment to parse
128 	 * @return Element of the specified HTML fragment
129 	 */
130 	public Element parseContent(final String content) {
131 		Document doc = Jsoup.parseBodyFragment(content);
132 		doc.outputSettings().charset(outputEncoding);
133 		return doc.body();
134 	}
135 
136 	/**
137 	 * Retrieves attribute value on elements in HTML. Will return all attribute values for the
138 	 * selector, since there can be more than one element.
139 	 *
140 	 * @param body
141 	 *            HTML content to read attributes from
142 	 * @param selector
143 	 *            CSS selector for elements to find
144 	 * @param attributeKey
145 	 *            Attribute name
146 	 * @return Attribute values for all matching elements. If no elements are found, empty list is
147 	 *         returned.
148 	 * @since 1.0
149 	 */
150 	public List<String> getAttr(final Element body, final String selector, final String attributeKey) {
151 
152 		List<Element> elements = body.select(selector);
153 		List<String> attrs = new ArrayList<String>();
154 
155 		for (Element element : elements) {
156 			String attrValue = element.attr(attributeKey);
157 			attrs.add(attrValue);
158 		}
159 
160 		return attrs;
161 	}
162 
163 	/**
164 	 * Adds given class names to the elements in HTML.
165 	 *
166 	 * @param body
167 	 *            HTML content to modify
168 	 * @param selector
169 	 *            CSS selector for elements to add classes to
170 	 * @param classNames
171 	 *            Names of classes to add to the selected elements
172 	 * @param amount
173 	 *            Maximum number of elements to modify
174 	 * @return HTML content with modified elements. If no elements are found, the original content
175 	 *         is returned.
176 	 * @since 1.0
177 	 */
178 	public Element addClass(final Element body, final String selector, final List<String> classNames, final int amount) {
179 
180 		List<Element> elements = body.select(selector);
181 		if (amount >= 0) {
182 			// limit to the indicated amount
183 			elements = elements.subList(0, Math.min(amount, elements.size()));
184 		}
185 
186 		for (Element element : elements) {
187 			for (String className : classNames) {
188 				element.addClass(className);
189 			}
190 		}
191 
192 		return body;
193 
194 	}
195 
196 	/**
197 	 * Adds given class names to the elements in HTML.
198 	 *
199 	 * @param body
200 	 *            HTML content to modify
201 	 * @param selector
202 	 *            CSS selector for elements to add classes to
203 	 * @param classNames
204 	 *            Names of classes to add to the selected elements
205 	 * @return HTML content with modified elements. If no elements are found, the original content
206 	 *         is returned.
207 	 * @since 1.0
208 	 */
209 	public Element addClass(final Element body, final String selector, final List<String> classNames) {
210 		return addClass(body, selector, classNames, -1);
211 	}
212 
213 	/**
214 	 * Adds given class to the elements in HTML.
215 	 *
216 	 * @param body
217 	 *            HTML content to modify
218 	 * @param selector
219 	 *            CSS selector for elements to add the class to
220 	 * @param className
221 	 *            Name of class to add to the selected elements
222 	 * @return HTML content with modified elements. If no elements are found, the original content
223 	 *         is returned.
224 	 * @since 1.0
225 	 */
226 	public Element addClass(final Element body, final String selector, final String className) {
227 		return addClass(body, selector, Collections.singletonList(className));
228 	}
229 
230 	/**
231 	 * Wraps elements in HTML with the given HTML.
232 	 *
233 	 * @param body
234 	 *            HTML content to modify
235 	 * @param selector
236 	 *            CSS selector for elements to wrap
237 	 * @param wrapHtml
238 	 *            HTML to use for wrapping the selected elements
239 	 * @param amount
240 	 *            Maximum number of elements to modify
241 	 * @return HTML content with modified elements. If no elements are found, the original content
242 	 *         is returned.
243 	 * @since 1.0
244 	 */
245 	public Element wrap(final Element body, final String selector, final String wrapHtml, final int amount) {
246 
247 		List<Element> elements = body.select(selector);
248 		if (amount >= 0) {
249 			// limit to the indicated amount
250 			elements = elements.subList(0, Math.min(amount, elements.size()));
251 		}
252 
253 		for (Element element : elements) {
254 			element.wrap(wrapHtml);
255 		}
256 
257 		return body;
258 
259 	}
260 
261 	/**
262 	 * Append HTML elements to specified elements in the given HTML.
263 	 *
264 	 * @param body
265 	 *            HTML content to modify
266 	 * @param selector
267 	 *            CSS selector for elements that will get the appendice
268 	 * @param appendHtml
269 	 *            HTML to append to the selected elements
270 	 * @param amount
271 	 *            Maximum number of elements to modify
272 	 * @return HTML content with modified elements. If no elements are found, the original content
273 	 *         is returned.
274 	 */
275 	public Element append(final Element body, final String selector, final String appendHtml, final int amount) {
276 
277 		List<Element> elements = body.select(selector);
278 		if (amount >= 0) {
279 			// limit to the indicated amount
280 			elements = elements.subList(0, Math.min(amount, elements.size()));
281 		}
282 
283 		for (Element element : elements) {
284 			element.append(appendHtml);
285 		}
286 
287 		return body;
288 
289 	}
290 
291 
292 	/**
293 	 * Prepend HTML elements to specified elements in the given HTML.
294 	 *
295 	 * @param body
296 	 *            HTML content to modify
297 	 * @param selector
298 	 *            CSS selector for elements that will get the "pre-pendice"
299 	 * @param prependHtml
300 	 *            HTML to prepend to the selected elements
301 	 * @param amount
302 	 *            Maximum number of elements to modify
303 	 * @return HTML content with modified elements. If no elements are found, the original content
304 	 *         is returned.
305 	 */
306 	public Element prepend(final Element body, final String selector, final String prependHtml, final int amount) {
307 
308 		List<Element> elements = body.select(selector);
309 		if (amount >= 0) {
310 			// limit to the indicated amount
311 			elements = elements.subList(0, Math.min(amount, elements.size()));
312 		}
313 
314 		for (Element element : elements) {
315 			element.prepend(prependHtml);
316 		}
317 
318 		return body;
319 
320 	}
321 
322 
323 	/**
324 	 * Removes elements from HTML.
325 	 *
326 	 * @param body
327 	 *            HTML content to modify
328 	 * @param selector
329 	 *            CSS selector for elements to remove
330 	 * @return HTML content with removed elements. If no elements are found, the original content is
331 	 *         returned.
332 	 * @since 1.0
333 	 */
334 	public Element remove(final Element body, final String selector) {
335 
336 		List<Element> elements = body.select(selector);
337 
338 		for (Element element : elements) {
339 			element.remove();
340 		}
341 
342 		return body;
343 
344 	}
345 
346 	/**
347 	 * Replace selected elements with specified elements in the given HTML.
348 	 *
349 	 * @param body
350 	 *            HTML content to modify
351 	 * @param selector
352 	 *            CSS selector for elements that will get the appendice
353 	 * @param replaceHtml
354 	 *            HTML to append to the selected elements
355 	 * @param amount
356 	 *            Maximum number of elements to modify (-1 for unlimited)
357 	 * @return HTML content with modified elements. If no elements are found, the original content
358 	 *         is returned.
359 	 */
360 	public Element replace(final Element body, final String selector, final String replaceHtml, final int amount) {
361 
362 		List<Element> elements = body.select(selector);
363 		if (amount >= 0) {
364 			// limit to the indicated amount
365 			elements = elements.subList(0, Math.min(amount, elements.size()));
366 		}
367 
368 		for (Element element : elements) {
369 			element.before(replaceHtml).remove();
370 		}
371 
372 		return body;
373 
374 	}
375 
376 
377 	/**
378 	 * Retrieves text content of the selected elements in HTML. Renders the element's text as it
379 	 * would be displayed on the web page (including its children).
380 	 *
381 	 * @param body
382 	 *            HTML content with the elements
383 	 * @param selector
384 	 *            CSS selector for elements to extract contents
385 	 * @return A list of element texts as rendered to display. Empty list if no elements are found.
386 	 * @since 1.0
387 	 */
388 	public List<String> text(final Element body, final String selector) {
389 
390 		List<Element> elements = body.select(selector);
391 		List<String> texts = new ArrayList<String>();
392 
393 		for (Element element : elements) {
394 			texts.add(element.text());
395 		}
396 
397 		return texts;
398 	}
399 
400 	/**
401 	 * Transforms the given HTML content by moving anchor ({@code <a name="myheading">}) names to
402 	 * IDs for heading elements.
403 	 * <p>
404 	 * The anchors are used to indicate positions within a HTML page. In HTML5, however, the
405 	 * {@code name} attribute is no longer supported on {@code <a>}) tag. The positions within pages
406 	 * are indicated using {@code id} attribute instead, e.g. {@code <h1 id="myheading">}.
407 	 * </p>
408 	 * <p>
409 	 * The method finds anchors inside, immediately before or after the heading tags and uses their
410 	 * name as heading {@code id} instead. The anchors themselves are removed.
411 	 * </p>
412 	 *
413 	 * @param body
414 	 *            HTML content to modify
415 	 * @return HTML content with modified elements. Anchor names are used for adjacent headings, and
416 	 *         anchor tags are removed. If no elements are found, the original content is returned.
417 	 * @since 1.0
418 	 */
419 	public Element headingAnchorToId(final Element body) {
420 
421 		// selectors for headings without IDs
422 		List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
423 
424 		// selector for anchor with name attribute only
425 		String nameA = "a[name]:not([href])";
426 
427 		// select all headings that have inner named anchor
428 		List<Element> headingsInnerA = body.select(StringUtil.join(
429 				concat(headNoIds, ":has(" + nameA + ")", true), ", "));
430 
431 		for (Element heading : headingsInnerA) {
432 			List<Element> anchors = heading.select(nameA);
433 			// take first
434 			if (!anchors.isEmpty()) {
435 				anchorToId(heading, anchors.get(0));
436 			}
437 		}
438 
439 		// select all headings that have a preceding named anchor
440 		List<Element> headingsPreA = body.select(StringUtil.join(
441 				concat(headNoIds, nameA + " + ", false), ", "));
442 
443 		for (Element heading : headingsPreA) {
444 			Element anchor = heading.previousElementSibling();
445 			if (anchor != null) {
446 				anchorToId(heading, anchor);
447 			}
448 		}
449 
450 		// select all headings that are followed by a named anchor
451 		// no selector available for that, so first select the anchors
452 		// then retrieve the headings
453 		List<Element> anchorsPreH = body.select(StringUtil.join(
454 				concat(headNoIds, " + " + nameA, true), ", "));
455 
456 		for (Element anchor : anchorsPreH) {
457 			Element heading = anchor.previousElementSibling();
458 			if (heading != null) {
459 				anchorToId(heading, anchor);
460 			}
461 		}
462 
463 		return body;
464 	}
465 
466 	/**
467 	 * Moves anchor name to heading id, if one does not exist. Removes the anchor.
468 	 *
469 	 * @param heading
470 	 * @param anchor
471 	 */
472 	private static void anchorToId(final Element heading, final Element anchor) {
473 
474 		if ("a".equals(anchor.tagName()) && heading.id().isEmpty()) {
475 			String aName = anchor.attr("name");
476 			if (!aName.isEmpty()) {
477 				// set the anchor name as heading ID
478 				heading.attr("id", aName);
479 
480 				// remove the anchor
481 				anchor.remove();
482 			}
483 		}
484 	}
485 
486 
487 	/**
488 	 * Utility method to concatenate a String to a list of Strings. The text can be either appended
489 	 * or prepended.
490 	 *
491 	 * @param elements
492 	 *            list of elements to append/prepend the text to
493 	 * @param text
494 	 *            the given text to append/prepend
495 	 * @param append
496 	 *            if {@code true}, text will be appended to the elements. If {@code false}, it will
497 	 *            be prepended
498 	 * @return list of elements with the text appended/prepended
499 	 * @since 1.0
500 	 */
501 	public static List<String> concat(final List<String> elements, final String text, final boolean append) {
502 		List<String> concats = new ArrayList<String>();
503 
504 		for (String element : elements) {
505 			concats.add(append ? element + text : text + element);
506 		}
507 
508 		return concats;
509 	}
510 
511 
512 	/**
513 	 * Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that
514 	 * do not have one.
515 	 * <p>
516 	 * IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a
517 	 * heading tag without an {@code id} is found, its "slug" is generated automatically based on
518 	 * the heading contents and used as the ID.
519 	 * </p>
520 	 *
521 	 * @param body
522 	 *            HTML content to modify
523 	 * @return HTML content with all heading elements having {@code id} attributes. If all headings
524 	 *         were with IDs already, the original content is returned.
525 	 * @since 1.0
526 	 */
527 	public Element ensureHeadingIds(final Element body) {
528 
529 		// Find all existing IDs (to avoid generating duplicates)
530 		Map<String, Integer> ids = new HashMap<String, Integer>();
531 		List<Element> idElems = body.select("*[id]");
532 		for (Element idElem : idElems) {
533 			ids.put(idElem.id(), 0);
534 		}
535 
536 		// select all headings that do not have an ID
537 		List<Element> headingsNoId = body.select("h1:not([id]), h2:not([id]), h3:not([id]), h4:not([id]), h5:not([id]), h6:not([id])");
538 
539 		for (Element heading : headingsNoId) {
540 
541 			// Take the text content of the title
542 			String headingText = heading.text();
543 
544 			// Create an ID out of it (trim all unwanted chars)
545 			String headingSlug = slug(headingText);
546 			if (headingSlug.length() > 50) {
547 				headingSlug = headingSlug.substring(0, 50);
548 			}
549 
550 			// If the ID already exists, add an increasing number to it
551 			int slugNumber = ids.merge(headingSlug, 1, (oldValue, newValue) -> oldValue + 1);
552 
553 			// Set the ID attribute with slug_number
554 			if (slugNumber > 1) {
555 				headingSlug = headingSlug + "_" + slugNumber;
556 			}
557 			heading.attr("id", headingSlug);
558 		}
559 
560 		return body;
561 
562 	}
563 
564 	/**
565 	 * Transforms the given HTML content to replace IDs that have symbols not allowed in CSS
566 	 * selectors, e.g. ":", ".", etc. The symbols are removed.
567 	 * <p>
568 	 * Naturally, the references to these IDs (in {@code <a href="#my_id">}) are also modified.
569 	 * </p>
570 	 *
571 	 * @param body
572 	 *            HTML content to modify
573 	 * @return HTML content fixed IDs.
574 	 * @since 1.0
575 	 */
576 	public Element fixIds(final Element body) {
577 
578 		// Find all IDs and remove unsupported characters
579 		List<Element> idElems = body.select("*[id]");
580 		for (Element idElem : idElems) {
581 
582 			String id = idElem.id();
583 			String newId = slug(id);
584 			if (!id.equals(newId)) {
585 				idElem.attr("id", newId);
586 			}
587 		}
588 
589 		// Then find all <a href="#..."> instances and update their values accordingly
590 		List<Element> aElems = body.select("a[href^=#]");
591 		for (Element aElem : aElems) {
592 			// fix all existing IDs - remove colon and other symbols which mess up jQuery
593 			String href = aElem.attr("href");
594 			String newHref = "#" + slug(href.substring(1));
595 			if (!href.equals(newHref)) {
596 				aElem.attr("href", newHref);
597 			}
598 		}
599 
600 		// Return result
601 		return body;
602 	}
603 
604 	/**
605 	 * Fixes table heads: wraps rows with {@code <th>} (table heading) elements into {@code <thead>}
606 	 * element if they are currently in {@code <tbody>}.
607 	 *
608 	 * @param body
609 	 *            HTML content to modify
610 	 * @return HTML content with all table heads fixed. If all heads were correct, the original
611 	 *         content is returned.
612 	 * @since 1.0
613 	 */
614 	public Element fixTableHeads(final Element body) {
615 
616 		// select rows with <th> tags within <tbody>
617 		List<Element> tableHeadRows = body.select("table > tbody > tr:has(th)");
618 
619 		for (Element row : tableHeadRows) {
620 
621 			// get the row's table
622 			Element table = row.parent().parent();
623 
624 			// remove row from its original position
625 			row.remove();
626 
627 			// create table header element with the row
628 			Element thead = new Element(Tag.valueOf("thead"), "");
629 			thead.appendChild(row);
630 			// add at the beginning of the table
631 			table.prependChild(thead);
632 		}
633 
634 		return body;
635 	}
636 
637 
638 	/**
639 	 * Regex that matches with all non-latin chars... and dash
640 	 */
641 	private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");
642 
643 	/**
644 	 * Regex that matches with all white spaces and other common word separators
645 	 */
646 	private static final Pattern WORD_SEPARATORS = Pattern.compile("[\\s_'()\\[\\]{}/\\|+=*,;:\\.]+");
647 
648 	/**
649 	 * Regex that matches with leading and trailing dashes
650 	 */
651 	private static final Pattern LEADING_TRAILING_DASHES = Pattern.compile("^-+|-+$");
652 
653 	/**
654 	 * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to
655 	 * use in URLs). Uses "-" as a word separator.
656 	 *
657 	 * @param input The string (free) to be transformed into a valid element ID
658 	 * @return the proper slug
659 	 */
660 	public static String slug(final String input) {
661 		String normalized = Normalizer.normalize(input, Form.NFD);
662 		String nowhitespace = WORD_SEPARATORS.matcher(normalized).replaceAll("-");
663 		String noSpecialChars = NONLATIN.matcher(nowhitespace).replaceAll("");
664 		return LEADING_TRAILING_DASHES.matcher(noSpecialChars).replaceAll("").toLowerCase();
665 	}
666 
667 	/**
668 	 * Replace all <code>&lt;a href="//..."&gt;</code> links with protocol-relative URLs with
669 	 * proper HTTPS URLs
670 	 *
671 	 * @param body
672 	 *            HTML content to modify
673 	 * @return HTML content fixed linkss
674 	 */
675 	public Element fixProtocolRelativeUrls(final Element body) {
676 
677 		// Find all links with HREF that starts with //
678 		// (i.e. protocol-relative)
679 		List<Element> aElems = body.select("*[href^=//]");
680 
681 		// Nothing? Exit immediately
682 		if (aElems.isEmpty()) {
683 			return body;
684 		}
685 
686 		for (Element aElem : aElems) {
687 
688 			// Prepend "https:" in front of each protocol-relative link
689 			String href = aElem.attr("href");
690 			aElem.attr("href", "https:" + href);
691 
692 		}
693 
694 		// Return result
695 		return body;
696 	}
697 
698 
699 }