Share this page 

Sanitize XML String Part 2 Tag(s): XML


In the first part of this HowTo, we removed the characters that are not allowed in XML from a given String. But sometimes we don't want to remove them; instead, we want to make them safe in an XML context where possible.

The idea is to replace problematic characters with XML entities, which are a way to encode a character so that it is acceptable in XML.

/**
 * Escapes the five characters that have predefined XML entities
 * ({@code &}, {@code <}, {@code >}, {@code '} and {@code "}).
 *
 * <p>Characters that are illegal in XML (for example most control characters)
 * are NOT handled here; they are copied to the result unchanged.
 *
 * @param input the text to escape; must not be {@code null}
 * @return a new string with the five special characters replaced by their entities
 * @throws NullPointerException if {@code input} is {@code null}
 */
public static String encodeXML2(String input) {
   // Strings are immutable: the result of the chain must be returned, not discarded.
   // '&' has to be replaced first, otherwise the '&' of the entities produced by
   // the later replacements would be escaped a second time.
   // replace() does a literal replacement; no regular expression is needed here.
   return input.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace("'", "&apos;")
        .replace("\"", "&quot;");
}
The above solution works for simple cases, but repeated use of the replaceAll() method is not optimal, and it does not handle illegal characters at all.

Here is a better solution:

public class Test {

  // inspired by https://stackoverflow.com/a/48588062/25122
  /**
   * Encodes a character sequence so that it can be embedded safely in XML text
   * or attribute values.
   *
   * <ul>
   *   <li>{@code & < > ' "} become their predefined entities.</li>
   *   <li>Tab, line feed and carriage return become numeric character references.</li>
   *   <li>Other ASCII control characters (below 0x20) are dropped.</li>
   *   <li>Unpaired surrogates, U+FFFE and U+FFFF are dropped.</li>
   *   <li>Every other non-ASCII code point becomes a hexadecimal character reference;
   *       a valid surrogate pair is emitted as a single reference.</li>
   * </ul>
   *
   * @param s the text to encode; must not be {@code null}
   * @return the encoded text, containing ASCII characters only
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static String encodeXML(CharSequence s) {
      int len = s.length();
      StringBuilder sb = new StringBuilder(len + 16);
      for (int i = 0; i < len; i++) {
          char ch = s.charAt(i);
          int c = ch;
          // UTF-16 decode: combine only a *valid* high/low surrogate pair.
          // An unpaired surrogate keeps its raw value and is rejected below,
          // and the character that follows it is not consumed.
          if (Character.isHighSurrogate(ch) && i + 1 < len
                  && Character.isLowSurrogate(s.charAt(i + 1))) {
              c = Character.toCodePoint(ch, s.charAt(++i));
          }
          if (c < 0x80) {      // ASCII range: test most common case first
              if (c < 0x20 && c != '\t' && c != '\r' && c != '\n') {
                  // Illegal XML character, even encoded. Skip or substitute
                  // with sb.append("&#xfffd;"); here they are skipped.
                  continue;
              }
              switch (c) {
                  case '&':  sb.append("&amp;"); break;
                  case '>':  sb.append("&gt;"); break;
                  case '<':  sb.append("&lt;"); break;
                  case '\'': sb.append("&apos;"); break;  // possible problem with XML attribute
                  case '"':  sb.append("&quot;"); break;  // possible problem with XML attribute
                  case '\n': sb.append("&#10;"); break;
                  case '\r': sb.append("&#13;"); break;
                  case '\t': sb.append("&#9;"); break;
                  default:   sb.append((char) c);
              }
          }
          else if ((c >= 0xd800 && c <= 0xdfff) || c == 0xfffe || c == 0xffff) {
              // Illegal XML character, even encoded (unpaired surrogate or
              // non-character). Skip or substitute with sb.append("&#xfffd;");
              // here they are skipped.
          }
          else {
              sb.append("&#x");
              sb.append(Integer.toHexString(c));
              sb.append(';');
          }
      }
      return sb.toString();
  }

  /** Small demonstration of {@link #encodeXML(CharSequence)}. */
  public static void main(String[] args) {
    // \u00e9 is 'e' with an acute accent; it is written as an escape so that this
    // source compiles identically whatever the file encoding is.
    String s = "R\u00e9al & HowTo < test >";

    System.out.println(s + " --> " + encodeXML(s));

    /*
     * output (the first word is the input with its accented 'e')
     *
     * R\u00e9al & HowTo < test > --> R&#xe9;al &amp; HowTo &lt; test &gt;
     */

  }
}
Or you can use Apache Commons Text.
import org.apache.commons.text.StringEscapeUtils;

public class TestXml {
  /**
   * Demonstrates XML escaping with Apache Commons Text: prints the sample
   * string followed by its escaped form.
   */
  public static void main(String args[]) throws Exception {
    final String raw = "Réal & HowTo < test >";
    final String escaped = StringEscapeUtils.escapeXml11(raw);  // or escapeXml10()
    System.out.println(raw + " --> " + escaped);
    /*
     * Output: escapeXml11() escapes the markup characters but, per the
     * Commons Text documentation, leaves characters above 0x7f untouched:
     *
     * Réal & HowTo < test > --> Réal &amp; HowTo &lt; test &gt;
     */
  }
}
Maven dependency (pom.xml):
 <dependency>
   <groupId>org.apache.commons</groupId>
   <artifactId>commons-text</artifactId>
   <version>1.6</version>
 </dependency>