Unescape HTML special characters from a String
Tag(s): String/Number
Using HashMap
import java.util.*;

/**
 * String helpers: converts a small set of named HTML entities (for example
 * {@code &lt;}, {@code &eacute;}, {@code &copy;}) back to the characters they stand for.
 *
 * <p>This variant keeps the entity table in a {@link HashMap}. Only the entities listed in
 * the static initializer are recognized; anything else is left in the text untouched.
 */
public class StringUtils {

  private StringUtils() {}

  /** Maps an entity, including its leading '&' and trailing ';', to its replacement text. */
  private static final Map<String, String> htmlEntities = new HashMap<String, String>();

  static {
    // Replacement characters are written as Unicode escapes so that the table does not
    // depend on the encoding used to compile this file.
    htmlEntities.put("&lt;", "<");
    htmlEntities.put("&gt;", ">");
    htmlEntities.put("&amp;", "&");
    htmlEntities.put("&quot;", "\"");
    htmlEntities.put("&agrave;", "\u00e0");
    htmlEntities.put("&Agrave;", "\u00c0");
    htmlEntities.put("&acirc;", "\u00e2");
    htmlEntities.put("&auml;", "\u00e4");
    htmlEntities.put("&Auml;", "\u00c4");
    htmlEntities.put("&Acirc;", "\u00c2");
    htmlEntities.put("&aring;", "\u00e5");
    htmlEntities.put("&Aring;", "\u00c5");
    htmlEntities.put("&aelig;", "\u00e6");
    htmlEntities.put("&AElig;", "\u00c6");
    htmlEntities.put("&ccedil;", "\u00e7");
    htmlEntities.put("&Ccedil;", "\u00c7");
    htmlEntities.put("&eacute;", "\u00e9");
    htmlEntities.put("&Eacute;", "\u00c9");
    htmlEntities.put("&egrave;", "\u00e8");
    htmlEntities.put("&Egrave;", "\u00c8");
    htmlEntities.put("&ecirc;", "\u00ea");
    htmlEntities.put("&Ecirc;", "\u00ca");
    htmlEntities.put("&euml;", "\u00eb");
    htmlEntities.put("&Euml;", "\u00cb");
    htmlEntities.put("&iuml;", "\u00ef");
    htmlEntities.put("&Iuml;", "\u00cf");
    htmlEntities.put("&ocirc;", "\u00f4");
    htmlEntities.put("&Ocirc;", "\u00d4");
    htmlEntities.put("&ouml;", "\u00f6");
    htmlEntities.put("&Ouml;", "\u00d6");
    htmlEntities.put("&oslash;", "\u00f8");
    htmlEntities.put("&Oslash;", "\u00d8");
    htmlEntities.put("&szlig;", "\u00df");
    htmlEntities.put("&ugrave;", "\u00f9");
    htmlEntities.put("&Ugrave;", "\u00d9");
    htmlEntities.put("&ucirc;", "\u00fb");
    htmlEntities.put("&Ucirc;", "\u00db");
    htmlEntities.put("&uuml;", "\u00fc");
    htmlEntities.put("&Uuml;", "\u00dc");
    htmlEntities.put("&nbsp;", " ");
    htmlEntities.put("&copy;", "\u00a9");
    htmlEntities.put("&reg;", "\u00ae");
    // U+20AC is the euro sign; U+20A0 (used previously) is the obsolete euro-currency sign.
    htmlEntities.put("&euro;", "\u20ac");
  }

  /**
   * Replaces every known HTML entity in {@code source} with its character.
   *
   * <p>History: the first version was recursive and could overflow the stack on big strings;
   * M. McNeely Jr. contributed a loop-based version. This implementation is also iterative
   * and makes a single left-to-right pass, so text produced by a replacement is never
   * scanned again: {@code &amp;lt;} becomes {@code &lt;}, not {@code <}.
   *
   * @param source the text to unescape; may be {@code null}
   * @return the unescaped text, {@code source} itself when it contains no '&', or
   *     {@code null} when {@code source} is {@code null}
   */
  public static final String unescapeHTML(String source) {
    if (source == null) {
      return null;
    }
    int amp = source.indexOf('&');
    if (amp < 0) {
      return source;
    }

    StringBuilder out = new StringBuilder(source.length());
    int copied = 0; // everything before this index has already been written to out
    while (amp >= 0) {
      int semi = source.indexOf(';', amp + 1);
      if (semi < 0) {
        break; // no ';' left, so no further entity can be terminated
      }
      String value = htmlEntities.get(source.substring(amp, semi + 1));
      if (value != null) {
        out.append(source, copied, amp).append(value);
        copied = semi + 1;
        amp = source.indexOf('&', copied);
      } else {
        // Not a known entity (e.g. a bare '&'): leave it alone and try the next '&'.
        amp = source.indexOf('&', amp + 1);
      }
    }
    out.append(source, copied, source.length());
    return out.toString();
  }

  public static void main(String[] args) throws Exception {
    // to see accented characters on the console (Windows DOS Shell)
    java.io.PrintStream ps = new java.io.PrintStream(System.out, true, "Cp850");
    String test = "&copy; 2007 R&eacute;al Gagnon &lt;www.rgagnon.com&gt;";
    ps.println(test + "\n-->\n" + unescapeHTML(test));
    /*
     * Output (Windows DOS Shell): the escaped line, then "-->", then the same line with
     * the copyright sign, the e-acute and the angle brackets shown as real characters.
     */
  }
}
Using Array
/**
 * String helpers: converts a small set of named HTML entities (for example
 * {@code &lt;}, {@code &eacute;}, {@code &copy;}) back to the characters they stand for.
 *
 * <p>This variant keeps the entity table in a two-dimensional array that is searched
 * linearly. Only the entities listed in the table are recognized; anything else is left in
 * the text untouched.
 */
public class StringUtils {

  private StringUtils() {}

  /**
   * Entity table: element 0 of each row is the entity (with its leading '&' and trailing
   * ';'), element 1 is the replacement text. Replacement characters are written as Unicode
   * escapes so that the table does not depend on the encoding used to compile this file.
   */
  private static final String[][] htmlEscape = {
    {"&lt;", "<"},
    {"&gt;", ">"},
    {"&amp;", "&"},
    {"&quot;", "\""},
    {"&agrave;", "\u00e0"},
    {"&Agrave;", "\u00c0"},
    {"&acirc;", "\u00e2"},
    {"&auml;", "\u00e4"},
    {"&Auml;", "\u00c4"},
    {"&Acirc;", "\u00c2"},
    {"&aring;", "\u00e5"},
    {"&Aring;", "\u00c5"},
    {"&aelig;", "\u00e6"},
    {"&AElig;", "\u00c6"},
    {"&ccedil;", "\u00e7"},
    {"&Ccedil;", "\u00c7"},
    {"&eacute;", "\u00e9"},
    {"&Eacute;", "\u00c9"},
    {"&egrave;", "\u00e8"},
    {"&Egrave;", "\u00c8"},
    {"&ecirc;", "\u00ea"},
    {"&Ecirc;", "\u00ca"},
    {"&euml;", "\u00eb"},
    {"&Euml;", "\u00cb"},
    {"&iuml;", "\u00ef"},
    {"&Iuml;", "\u00cf"},
    {"&ocirc;", "\u00f4"},
    {"&Ocirc;", "\u00d4"},
    {"&ouml;", "\u00f6"},
    {"&Ouml;", "\u00d6"},
    {"&oslash;", "\u00f8"},
    {"&Oslash;", "\u00d8"},
    {"&szlig;", "\u00df"},
    {"&ugrave;", "\u00f9"},
    {"&Ugrave;", "\u00d9"},
    {"&ucirc;", "\u00fb"},
    {"&Ucirc;", "\u00db"},
    {"&uuml;", "\u00fc"},
    {"&Uuml;", "\u00dc"},
    {"&nbsp;", " "},
    {"&copy;", "\u00a9"},
    {"&reg;", "\u00ae"},
    // U+20AC is the euro sign; U+20A0 (used previously) is the obsolete euro-currency sign.
    {"&euro;", "\u20ac"}
  };

  /**
   * Replaces every known HTML entity found at or after index {@code start} in {@code s}.
   *
   * <p>The scan is iterative (no recursion, so large inputs cannot overflow the stack) and
   * makes a single left-to-right pass. Text produced by a replacement is never scanned
   * again, so {@code &amp;lt;} becomes {@code &lt;}, not {@code <}. Thanks to Pieter Hertogh
   * for reporting the original '&' handling bug. An unknown entity or a bare '&' is
   * skipped and the scan continues with the next '&'.
   *
   * @param s the text to unescape; may be {@code null}
   * @param start index at which the search for '&' begins; text before it is copied as is
   * @return the unescaped text, {@code s} itself when no '&' is found from {@code start}, or
   *     {@code null} when {@code s} is {@code null}
   */
  public static final String unescapeHTML(String s, int start) {
    if (s == null) {
      return null;
    }
    int amp = s.indexOf('&', start);
    if (amp < 0) {
      return s;
    }

    StringBuilder out = new StringBuilder(s.length());
    int copied = 0; // everything before this index has already been written to out
    while (amp >= 0) {
      int semi = s.indexOf(';', amp + 1);
      if (semi < 0) {
        break; // no ';' left, so no further entity can be terminated
      }
      String replacement = lookup(s.substring(amp, semi + 1));
      if (replacement != null) {
        out.append(s, copied, amp).append(replacement);
        copied = semi + 1;
        amp = s.indexOf('&', copied);
      } else {
        amp = s.indexOf('&', amp + 1);
      }
    }
    out.append(s, copied, s.length());
    return out.toString();
  }

  /** Returns the replacement for {@code entity} from the table, or null when it is unknown. */
  private static String lookup(String entity) {
    for (int k = 0; k < htmlEscape.length; k++) {
      if (htmlEscape[k][0].equals(entity)) {
        return htmlEscape[k][1];
      }
    }
    return null;
  }

  public static void main(String[] args) throws Exception {
    // to see accented characters on the console
    java.io.PrintStream ps = new java.io.PrintStream(System.out, true, "Cp850");
    String test = "&copy; 2000 R&eacute;al Gagnon &lt;www.rgagnon.com&gt;";
    ps.println(test + "\n-->\n" + unescapeHTML(test, 0));
    /*
     * Output: the escaped line, then "-->", then the same line with the copyright sign,
     * the e-acute and the angle brackets shown as real characters.
     */
  }
}
This HowTo deals with only a small subset of the available HTML entities. For the complete list, see the Wikipedia article: http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Character_entities_in_HTML.