Sanitize XML String Part 2 Tag(s): XML
In the first part of this HowTo, we removed the characters that are not allowed in XML from a given String. But sometimes we don't want to remove them; we would rather make them safe in an XML context where possible.
The idea is to replace problematic characters with XML entities, which are a way to encode a character so that it becomes acceptable. A simple first approach chains string replacements:
public static String encodeXML2(String input) { input.replaceAll("&", "&") .replaceAll("<", "<") .replaceAll(">", ">") .replaceAll("'", "'") .replaceAll("\"", """); return input; }
Here is a better solution:
public class Test {

    /**
     * Encodes a character sequence so that it can be embedded safely in XML
     * element content or in an attribute value.
     *
     * <ul>
     *   <li>{@code & < > ' "} are replaced by their predefined entities.</li>
     *   <li>Tab, line feed and carriage return are written as numeric character
     *       references ({@code &#9; &#10; &#13;}).</li>
     *   <li>Other control characters below 0x20, unpaired surrogates, U+FFFE and
     *       U+FFFF are dropped from the output.</li>
     *   <li>The remaining ASCII characters are copied unchanged.</li>
     *   <li>Every other character is written as a hexadecimal character
     *       reference ({@code &#x...;}); a valid surrogate pair yields a single
     *       reference for the code point it represents.</li>
     * </ul>
     *
     * @param s the text to encode; must not be {@code null}
     * @return the encoded text
     * @throws NullPointerException if {@code s} is {@code null}
     */
    // inspired by https://stackoverflow.com/a/48588062/25122
    public static String encodeXML(CharSequence s) {
        int len = s.length();
        StringBuilder sb = new StringBuilder(len + 16);
        for (int i = 0; i < len; i++) {
            // Combines a high surrogate with the following char only when that
            // char really is a low surrogate; otherwise the lone surrogate is
            // returned as is and gets rejected further down.
            int c = Character.codePointAt(s, i);
            i += Character.charCount(c) - 1;

            if (c < 0x80) { // ASCII range: test most common case first
                if (c < 0x20 && c != '\t' && c != '\r' && c != '\n') {
                    // Illegal XML character, even encoded: skipped.
                    // To substitute instead, append "&#xfffd;" here.
                    continue;
                }
                switch (c) {
                    case '&':  sb.append("&amp;");  break;
                    case '>':  sb.append("&gt;");   break;
                    case '<':  sb.append("&lt;");   break;
                    case '\'': sb.append("&apos;"); break; // needed inside '...' attribute values
                    case '"':  sb.append("&quot;"); break; // needed inside "..." attribute values
                    case '\n': sb.append("&#10;");  break;
                    case '\r': sb.append("&#13;");  break;
                    case '\t': sb.append("&#9;");   break;
                    default:   sb.append((char) c);
                }
            } else if ((c >= 0xd800 && c <= 0xdfff) || c == 0xfffe || c == 0xffff) {
                // Illegal XML character, even encoded: skipped.
                // To substitute instead, append "&#xfffd;" here.
            } else {
                sb.append("&#x").append(Integer.toHexString(c)).append(';');
            }
        }
        return sb.toString();
    }

    public static void main(String[] args) throws Exception {
        String s = "Réal & HowTo < test >";
        System.out.println(s + " --> " + encodeXML(s));
        /*
         * output
         *
         * Réal & HowTo < test > --> R&#xe9;al &amp; HowTo &lt; test &gt;
         */
    }
}
import org.apache.commons.text.StringEscapeUtils; public class TestXml { public static void main(String args[]) throws Exception { String s = "Réal & HowTo < test >"; System.out.println(s + " --> " + StringEscapeUtils.escapeXml11(s)); // or escapeXml10() /* output * * Réal & HowTo < test > --> Réal & HowTo < test > */ } }
<!-- Maven dependency for Apache Commons Text, which provides org.apache.commons.text.StringEscapeUtils -->
<dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-text</artifactId> <version>1.6</version> </dependency>