Unescape HTML special characters from a String

Unescape HTML special characters from a String — Tag(s): String/Number

Using HashMap

import java.util.*;

/**
 * Utility for converting a small, fixed set of named HTML entities
 * (for example {@code &lt;} or {@code &eacute;}) back into the characters
 * they stand for. The lookup table is a {@code HashMap}.
 *
 * <p>Only the entities listed in the static initializer are recognised;
 * anything else, including numeric references such as {@code &#233;}, is
 * left untouched.
 */
public class StringUtils {

  /** Static helpers only; not meant to be instantiated. */
  private StringUtils() {}

  /**
   * Entity text (leading {@code &} and trailing {@code ;} included) mapped to
   * its replacement. Filled once in the static initializer and only read
   * afterwards.
   *
   * <p>Replacement characters are written as Unicode escapes so that the
   * compiled values do not depend on the encoding javac assumes for this file.
   */
  private static final Map<String, String> htmlEntities = new HashMap<String, String>();
  static {
    htmlEntities.put("&lt;", "<");
    htmlEntities.put("&gt;", ">");
    htmlEntities.put("&amp;", "&");
    htmlEntities.put("&quot;", "\"");
    htmlEntities.put("&agrave;", "\u00e0");
    htmlEntities.put("&Agrave;", "\u00c0");
    htmlEntities.put("&acirc;", "\u00e2");
    htmlEntities.put("&auml;", "\u00e4");
    htmlEntities.put("&Auml;", "\u00c4");
    htmlEntities.put("&Acirc;", "\u00c2");
    htmlEntities.put("&aring;", "\u00e5");
    htmlEntities.put("&Aring;", "\u00c5");
    htmlEntities.put("&aelig;", "\u00e6");
    htmlEntities.put("&AElig;", "\u00c6");
    htmlEntities.put("&ccedil;", "\u00e7");
    htmlEntities.put("&Ccedil;", "\u00c7");
    htmlEntities.put("&eacute;", "\u00e9");
    htmlEntities.put("&Eacute;", "\u00c9");
    htmlEntities.put("&egrave;", "\u00e8");
    htmlEntities.put("&Egrave;", "\u00c8");
    htmlEntities.put("&ecirc;", "\u00ea");
    htmlEntities.put("&Ecirc;", "\u00ca");
    htmlEntities.put("&euml;", "\u00eb");
    htmlEntities.put("&Euml;", "\u00cb");
    htmlEntities.put("&iuml;", "\u00ef");
    htmlEntities.put("&Iuml;", "\u00cf");
    htmlEntities.put("&ocirc;", "\u00f4");
    htmlEntities.put("&Ocirc;", "\u00d4");
    htmlEntities.put("&ouml;", "\u00f6");
    htmlEntities.put("&Ouml;", "\u00d6");
    htmlEntities.put("&oslash;", "\u00f8");
    htmlEntities.put("&Oslash;", "\u00d8");
    htmlEntities.put("&szlig;", "\u00df");
    htmlEntities.put("&ugrave;", "\u00f9");
    htmlEntities.put("&Ugrave;", "\u00d9");
    htmlEntities.put("&ucirc;", "\u00fb");
    htmlEntities.put("&Ucirc;", "\u00db");
    htmlEntities.put("&uuml;", "\u00fc");
    htmlEntities.put("&Uuml;", "\u00dc");
    // NOTE(review): &nbsp; is mapped to an ordinary space here, not to the
    // no-break space U+00A0. Kept as-is; confirm this is the intended result.
    htmlEntities.put("&nbsp;", " ");
    htmlEntities.put("&copy;", "\u00a9");
    htmlEntities.put("&reg;", "\u00ae");
    // U+20AC is the euro sign. (U+20A0, used previously, is a different
    // character: the obsolete "euro-currency" sign.)
    htmlEntities.put("&euro;", "\u20ac");
  }

  /**
   * Replaces every known entity in {@code source} with its character.
   *
   * <p>The input is scanned once from left to right and the result is built
   * in a {@code StringBuilder}. Replacement text is never scanned again, so
   * {@code "&amp;lt;"} becomes {@code "&lt;"} and not {@code "<"}. An
   * {@code &...;} span that is not in the table is copied through unchanged
   * and scanning resumes at the character after its {@code &}.
   *
   * <p>History: the first version of this method was recursive and could
   * overflow the stack on large inputs; M. McNeely Jr. contributed the
   * loop-based approach this implementation follows.
   *
   * @param source text that may contain HTML entities; may be {@code null}
   * @return the unescaped text, {@code source} itself when it contains no
   *         {@code &}, or {@code null} when {@code source} is {@code null}
   */
  public static final String unescapeHTML(String source) {
      if (source == null) {
        return null;
      }
      int amp = source.indexOf('&');
      if (amp < 0) {
        return source; // nothing that could start an entity
      }

      StringBuilder out = new StringBuilder(source.length());
      int copied = 0; // source[0, copied) has already been written to out
      while (amp > -1) {
        int semi = source.indexOf(';', amp);
        if (semi < 0) {
          break; // no terminator remains, so no later '&' can form an entity
        }
        String value = htmlEntities.get(source.substring(amp, semi + 1));
        if (value != null) {
          out.append(source, copied, amp).append(value);
          copied = semi + 1;
          // Continue after the entity so the replacement is not re-examined.
          amp = source.indexOf('&', copied);
        } else {
          // Not an entity we know: try the next '&', which may lie before semi.
          amp = source.indexOf('&', amp + 1);
        }
      }
      return out.append(source, copied, source.length()).toString();
  }

  /**
   * Small demonstration: prints a sample string before and after unescaping.
   *
   * @param args ignored
   * @throws Exception if the "Cp850" charset is not supported by the runtime
   */
  public static void main(String args[]) throws Exception {
      // to see accented character to the console (Windows DOS Shell)
      java.io.PrintStream ps = new java.io.PrintStream(System.out, true, "Cp850");
      String test = "&copy; 2007  R&eacute;al Gagnon &lt;www.rgagnon.com&gt;";
      ps.println(test + "\n-->\n" + unescapeHTML(test));

      /*
         output (Windows DOS Shell), non-ASCII characters shown as escapes:
         &copy; 2007  R&eacute;al Gagnon &lt;www.rgagnon.com&gt;
         -->
         \u00a9 2007  R\u00e9al Gagnon <www.rgagnon.com>
      */
  }
}

Using Array

/**
 * Utility for converting a small, fixed set of named HTML entities
 * (for example {@code &lt;} or {@code &eacute;}) back into the characters
 * they stand for. The lookup table is a two-column array searched linearly.
 *
 * <p>Only the entities listed in {@code htmlEscape} are recognised; anything
 * else, including numeric references such as {@code &#233;}, is left
 * untouched.
 */
public class StringUtils {

  /** Static helpers only; not meant to be instantiated. */
  private StringUtils() {}

  /**
   * Rows of {@code { entity, replacement }}. The entity text includes the
   * leading {@code &} and the trailing {@code ;}.
   *
   * <p>Replacement characters are written as Unicode escapes so that the
   * compiled values do not depend on the encoding javac assumes for this file.
   */
  private static final String [][] htmlEscape =
     {{  "&lt;"     , "<" } ,       {  "&gt;"     , ">" } ,
      {  "&amp;"    , "&" } ,       {  "&quot;"   , "\"" } ,
      {  "&agrave;" , "\u00e0" } ,  {  "&Agrave;" , "\u00c0" } ,
      {  "&acirc;"  , "\u00e2" } ,  {  "&auml;"   , "\u00e4" } ,
      {  "&Auml;"   , "\u00c4" } ,  {  "&Acirc;"  , "\u00c2" } ,
      {  "&aring;"  , "\u00e5" } ,  {  "&Aring;"  , "\u00c5" } ,
      {  "&aelig;"  , "\u00e6" } ,  {  "&AElig;"  , "\u00c6" } ,
      {  "&ccedil;" , "\u00e7" } ,  {  "&Ccedil;" , "\u00c7" } ,
      {  "&eacute;" , "\u00e9" } ,  {  "&Eacute;" , "\u00c9" } ,
      {  "&egrave;" , "\u00e8" } ,  {  "&Egrave;" , "\u00c8" } ,
      {  "&ecirc;"  , "\u00ea" } ,  {  "&Ecirc;"  , "\u00ca" } ,
      {  "&euml;"   , "\u00eb" } ,  {  "&Euml;"   , "\u00cb" } ,
      {  "&iuml;"   , "\u00ef" } ,  {  "&Iuml;"   , "\u00cf" } ,
      {  "&ocirc;"  , "\u00f4" } ,  {  "&Ocirc;"  , "\u00d4" } ,
      {  "&ouml;"   , "\u00f6" } ,  {  "&Ouml;"   , "\u00d6" } ,
      {  "&oslash;" , "\u00f8" } ,  {  "&Oslash;" , "\u00d8" } ,
      {  "&szlig;"  , "\u00df" } ,  {  "&ugrave;" , "\u00f9" } ,
      {  "&Ugrave;" , "\u00d9" } ,  {  "&ucirc;"  , "\u00fb" } ,
      {  "&Ucirc;"  , "\u00db" } ,  {  "&uuml;"   , "\u00fc" } ,
      // NOTE(review): &nbsp; is mapped to an ordinary space here, not to the
      // no-break space U+00A0. Kept as-is; confirm this is the intended result.
      {  "&Uuml;"   , "\u00dc" } ,  {  "&nbsp;"   , " " } ,
      {  "&copy;"   , "\u00a9" } ,
      {  "&reg;"    , "\u00ae" } ,
      // U+20AC is the euro sign. (U+20A0, used previously, is a different
      // character: the obsolete "euro-currency" sign.)
      {  "&euro;"   , "\u20ac" }
     };

  /**
   * Replaces every known entity found at or after index {@code start} with
   * its character; text before {@code start} is copied through unchanged.
   *
   * <p>The input is scanned once from left to right, without recursion, and
   * the result is built in a {@code StringBuilder}. Replacement text is never
   * scanned again, so {@code "&amp;lt;"} becomes {@code "&lt;"} and not
   * {@code "<"}. An {@code &...;} span that is not in the table is copied
   * through unchanged and scanning resumes at the character after its
   * {@code &}, so a stray ampersand does not stop later entities from being
   * converted (the case originally reported by Pieter Hertogh).
   *
   * @param s     text that may contain HTML entities; may be {@code null}
   * @param start index to begin searching from; a negative value behaves
   *              like 0 and a value past the end finds nothing, as with
   *              {@link String#indexOf(int, int)}
   * @return the unescaped text, {@code s} itself when no {@code &} occurs at
   *         or after {@code start}, or {@code null} when {@code s} is
   *         {@code null}
   */
  public static final String unescapeHTML(String s, int start){
     if (s == null) {
        return null;
     }
     int amp = s.indexOf('&', start);
     if (amp < 0) {
        return s; // nothing that could start an entity
     }

     StringBuilder out = new StringBuilder(s.length());
     int copied = 0; // s[0, copied) has already been written to out
     while (amp > -1) {
        int semi = s.indexOf(';', amp);
        if (semi < 0) {
           break; // no terminator remains, so no later '&' can form an entity
        }
        String replacement = lookup(s.substring(amp, semi + 1));
        if (replacement != null) {
           out.append(s, copied, amp).append(replacement);
           copied = semi + 1;
           // Continue after the entity so the replacement is not re-examined.
           amp = s.indexOf('&', copied);
        } else {
           // Not an entity we know: try the next '&', which may lie before semi.
           amp = s.indexOf('&', amp + 1);
        }
     }
     return out.append(s, copied, s.length()).toString();
  }

  /**
   * Linear search of {@code htmlEscape} for an exact entity match.
   *
   * @param entity candidate text, from {@code &} through {@code ;} inclusive
   * @return the replacement string, or {@code null} when there is no such row
   */
  private static String lookup(String entity) {
     for (int k = 0; k < htmlEscape.length; k++) {
        if (htmlEscape[k][0].equals(entity)) {
           return htmlEscape[k][1];
        }
     }
     return null;
  }

  /**
   * Small demonstration: prints a sample string before and after unescaping.
   *
   * @param args ignored
   * @throws Exception if the "Cp850" charset is not supported by the runtime
   */
  public static void main(String args[]) throws Exception {
      // to see accented character to the console
      java.io.PrintStream ps = new java.io.PrintStream(System.out, true, "Cp850");
      String test = "&copy; 2000  R&eacute;al Gagnon &lt;www.rgagnon.com&gt;";
      ps.println(test + "\n-->\n" + unescapeHTML(test, 0));

      /*
         output, non-ASCII characters shown as escapes:
         &copy; 2000  R&eacute;al Gagnon &lt;www.rgagnon.com&gt;
         -->
         \u00a9 2000  R\u00e9al Gagnon <www.rgagnon.com>
      */
  }
}

This HowTo deals with only a small subset of the available HTML entities. For the complete list, see the Wikipedia article "List of XML and HTML character entity references": http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Character_entities_in_HTML

Send comments, questions or suggestions to howto@rgagnon.com

Unescape HTML special characters from a String — Tag(s): String/Number

About cookies on this site

Using HashMap

Using Array