Unescape HTML special characters from a String (Tags: String, Number)
Using HashMap
import java.util.*;
/**
 * String helpers for decoding a small, fixed subset of named HTML entities
 * (for example {@code &lt;}, {@code &amp;}, {@code &eacute;}) back into the
 * characters they stand for. This variant keeps the entity table in a map.
 *
 * <p>Only the entities listed in the table below are recognised; anything
 * else (unknown names, numeric references, a bare ampersand) is copied to the
 * output unchanged.
 */
public class StringUtils {
    private StringUtils() {}

    /**
     * Entity text (including the leading ampersand and trailing semicolon)
     * mapped to its replacement. Non-ASCII replacements are written as
     * Unicode escapes so the table does not depend on the encoding used to
     * compile this file.
     */
    private static final Map<String, String> HTML_ENTITIES = new HashMap<String, String>();

    static {
        HTML_ENTITIES.put("&lt;", "<");          HTML_ENTITIES.put("&gt;", ">");
        HTML_ENTITIES.put("&amp;", "&");         HTML_ENTITIES.put("&quot;", "\"");
        HTML_ENTITIES.put("&agrave;", "\u00e0"); HTML_ENTITIES.put("&Agrave;", "\u00c0");
        HTML_ENTITIES.put("&acirc;", "\u00e2");  HTML_ENTITIES.put("&auml;", "\u00e4");
        HTML_ENTITIES.put("&Auml;", "\u00c4");   HTML_ENTITIES.put("&Acirc;", "\u00c2");
        HTML_ENTITIES.put("&aring;", "\u00e5");  HTML_ENTITIES.put("&Aring;", "\u00c5");
        HTML_ENTITIES.put("&aelig;", "\u00e6");  HTML_ENTITIES.put("&AElig;", "\u00c6");
        HTML_ENTITIES.put("&ccedil;", "\u00e7"); HTML_ENTITIES.put("&Ccedil;", "\u00c7");
        HTML_ENTITIES.put("&eacute;", "\u00e9"); HTML_ENTITIES.put("&Eacute;", "\u00c9");
        HTML_ENTITIES.put("&egrave;", "\u00e8"); HTML_ENTITIES.put("&Egrave;", "\u00c8");
        HTML_ENTITIES.put("&ecirc;", "\u00ea");  HTML_ENTITIES.put("&Ecirc;", "\u00ca");
        HTML_ENTITIES.put("&euml;", "\u00eb");   HTML_ENTITIES.put("&Euml;", "\u00cb");
        HTML_ENTITIES.put("&iuml;", "\u00ef");   HTML_ENTITIES.put("&Iuml;", "\u00cf");
        HTML_ENTITIES.put("&ocirc;", "\u00f4");  HTML_ENTITIES.put("&Ocirc;", "\u00d4");
        HTML_ENTITIES.put("&ouml;", "\u00f6");   HTML_ENTITIES.put("&Ouml;", "\u00d6");
        HTML_ENTITIES.put("&oslash;", "\u00f8"); HTML_ENTITIES.put("&Oslash;", "\u00d8");
        HTML_ENTITIES.put("&szlig;", "\u00df");  HTML_ENTITIES.put("&ugrave;", "\u00f9");
        HTML_ENTITIES.put("&Ugrave;", "\u00d9"); HTML_ENTITIES.put("&ucirc;", "\u00fb");
        HTML_ENTITIES.put("&Ucirc;", "\u00db");  HTML_ENTITIES.put("&uuml;", "\u00fc");
        // &nbsp; is deliberately flattened to an ordinary space, as in the
        // original table; a strict decoder would produce U+00A0 instead.
        HTML_ENTITIES.put("&Uuml;", "\u00dc");   HTML_ENTITIES.put("&nbsp;", " ");
        HTML_ENTITIES.put("&copy;", "\u00a9");
        HTML_ENTITIES.put("&reg;", "\u00ae");
        // U+20AC is the euro sign (U+20A0 is the unrelated euro-currency sign).
        HTML_ENTITIES.put("&euro;", "\u20ac");
    }

    /*
     * History: the first version of unescapeHTML was recursive and could
     * overflow the stack on large inputs. M. McNeely Jr. contributed a
     * do...while() based version which is more robust -- thanks to him!
     * The method below keeps that iterative approach.
     */

    /**
     * Replaces every known HTML entity in {@code source} with its character.
     *
     * <p>The input is scanned once from left to right and replacement text is
     * never rescanned, so {@code "&amp;lt;"} becomes {@code "&lt;"} rather
     * than being decoded a second time. An ampersand that does not start a
     * known entity is left as is, and scanning resumes at the next ampersand.
     *
     * @param source the text to decode; may be {@code null}
     * @return the decoded text, {@code source} itself when it contains no
     *         ampersand, or {@code null} when {@code source} is {@code null}
     */
    public static final String unescapeHTML(String source) {
        if (source == null) {
            return null;
        }
        int amp = source.indexOf('&');
        if (amp < 0) {
            return source; // nothing that could be an entity
        }

        StringBuilder out = new StringBuilder(source.length());
        int copied = 0; // everything before this index is already in 'out'
        while (amp >= 0) {
            int semi = source.indexOf(';', amp);
            if (semi < 0) {
                break; // no terminator left, so no further entity is possible
            }
            String value = HTML_ENTITIES.get(source.substring(amp, semi + 1));
            if (value != null) {
                out.append(source, copied, amp).append(value);
                copied = semi + 1;
                amp = source.indexOf('&', copied);
            } else {
                // Not an entity we know: retry from the next ampersand, which
                // may sit between this one and the semicolon ("&&lt;").
                amp = source.indexOf('&', amp + 1);
            }
        }
        return out.append(source, copied, source.length()).toString();
    }

    /**
     * Small demo: prints an escaped sample followed by its decoded form.
     *
     * @param args ignored
     * @throws Exception if the "Cp850" charset is not supported
     */
    public static void main(String args[]) throws Exception {
        // to see accented character to the console (Windows DOS Shell)
        java.io.PrintStream ps = new java.io.PrintStream(System.out, true, "Cp850");
        String test = "&copy; 2007 R&eacute;al Gagnon &lt;www.rgagnon.com&gt;";
        ps.println(test + "\n-->\n" + unescapeHTML(test));
        /*
         * Expected output (Windows DOS Shell):
         *   &copy; 2007 R&eacute;al Gagnon &lt;www.rgagnon.com&gt;
         *   -->
         *   (c) 2007 Real Gagnon <www.rgagnon.com>
         * where "(c)" and the first "e" of "Real" stand for the actual
         * copyright sign and e-acute (spelled in ASCII here so this file
         * compiles under any source encoding).
         */
    }
}
Using Array
/**
 * String helpers for decoding a small, fixed subset of named HTML entities
 * (for example {@code &lt;}, {@code &amp;}, {@code &eacute;}) back into the
 * characters they stand for. This variant keeps the entity table in a
 * two-column array and looks entities up with a linear scan.
 *
 * <p>Only the entities listed in the table below are recognised; anything
 * else is copied to the output unchanged.
 */
public class StringUtils {
    private StringUtils() {}

    /**
     * Rows of {entity, replacement}. Non-ASCII replacements are written as
     * Unicode escapes so the table does not depend on the encoding used to
     * compile this file.
     */
    private static final String[][] htmlEscape =
        {{ "&lt;"     , "<"      } , { "&gt;"     , ">"      } ,
         { "&amp;"    , "&"      } , { "&quot;"   , "\""     } ,
         { "&agrave;" , "\u00e0" } , { "&Agrave;" , "\u00c0" } ,
         { "&acirc;"  , "\u00e2" } , { "&auml;"   , "\u00e4" } ,
         { "&Auml;"   , "\u00c4" } , { "&Acirc;"  , "\u00c2" } ,
         { "&aring;"  , "\u00e5" } , { "&Aring;"  , "\u00c5" } ,
         { "&aelig;"  , "\u00e6" } , { "&AElig;"  , "\u00c6" } ,
         { "&ccedil;" , "\u00e7" } , { "&Ccedil;" , "\u00c7" } ,
         { "&eacute;" , "\u00e9" } , { "&Eacute;" , "\u00c9" } ,
         { "&egrave;" , "\u00e8" } , { "&Egrave;" , "\u00c8" } ,
         { "&ecirc;"  , "\u00ea" } , { "&Ecirc;"  , "\u00ca" } ,
         { "&euml;"   , "\u00eb" } , { "&Euml;"   , "\u00cb" } ,
         { "&iuml;"   , "\u00ef" } , { "&Iuml;"   , "\u00cf" } ,
         { "&ocirc;"  , "\u00f4" } , { "&Ocirc;"  , "\u00d4" } ,
         { "&ouml;"   , "\u00f6" } , { "&Ouml;"   , "\u00d6" } ,
         { "&oslash;" , "\u00f8" } , { "&Oslash;" , "\u00d8" } ,
         { "&szlig;"  , "\u00df" } , { "&ugrave;" , "\u00f9" } ,
         { "&Ugrave;" , "\u00d9" } , { "&ucirc;"  , "\u00fb" } ,
         { "&Ucirc;"  , "\u00db" } , { "&uuml;"   , "\u00fc" } ,
         // &nbsp; is deliberately flattened to an ordinary space, as in the
         // original table; a strict decoder would produce U+00A0 instead.
         { "&Uuml;"   , "\u00dc" } , { "&nbsp;"   , " "      } ,
         { "&copy;"   , "\u00a9" } ,
         { "&reg;"    , "\u00ae" } ,
         // U+20AC is the euro sign (U+20A0 is the unrelated euro-currency sign).
         { "&euro;"   , "\u20ac" }
        };

    /**
     * Replaces every known HTML entity found at or after index {@code start}
     * with its character; text before {@code start} is returned untouched.
     *
     * <p>The input is scanned once from left to right. Replacement text is
     * never rescanned, so {@code "&amp;lt;"} becomes {@code "&lt;"} and is
     * not decoded a second time (the ampersand case reported by Pieter
     * Hertogh). An unknown entity is skipped and scanning continues with the
     * next ampersand. The scan is iterative, so very long inputs cannot
     * overflow the call stack.
     *
     * @param s     the text to decode; may be {@code null}
     * @param start index at which to begin looking for entities; values
     *              outside the string are handled as {@code String.indexOf}
     *              handles them (negative acts as 0, too large finds nothing)
     * @return the decoded text, {@code s} itself when no ampersand is found
     *         from {@code start} on, or {@code null} when {@code s} is
     *         {@code null}
     */
    public static final String unescapeHTML(String s, int start) {
        if (s == null) {
            return null;
        }
        int amp = s.indexOf('&', start);
        if (amp < 0) {
            return s;
        }

        StringBuilder out = new StringBuilder(s.length());
        int copied = 0; // everything before this index is already in 'out'
        while (amp >= 0) {
            int semi = s.indexOf(';', amp);
            if (semi < 0) {
                break; // no terminator left, so no further entity is possible
            }
            String replacement = lookup(s.substring(amp, semi + 1));
            if (replacement == null) {
                // Unknown entity: keep it and retry from the next ampersand.
                amp = s.indexOf('&', amp + 1);
            } else {
                out.append(s, copied, amp).append(replacement);
                copied = semi + 1;
                amp = s.indexOf('&', copied);
            }
        }
        return out.append(s, copied, s.length()).toString();
    }

    /** Returns the replacement for {@code entity}, or null if it is not in the table. */
    private static String lookup(String entity) {
        for (int k = 0; k < htmlEscape.length; k++) {
            if (htmlEscape[k][0].equals(entity)) {
                return htmlEscape[k][1];
            }
        }
        return null;
    }

    /**
     * Small demo: prints an escaped sample followed by its decoded form.
     *
     * @param args ignored
     * @throws Exception if the "Cp850" charset is not supported
     */
    public static void main(String args[]) throws Exception {
        // to see accented character to the console
        java.io.PrintStream ps = new java.io.PrintStream(System.out, true, "Cp850");
        String test = "&copy; 2000 R&eacute;al Gagnon &lt;www.rgagnon.com&gt;";
        ps.println(test + "\n-->\n" + unescapeHTML(test, 0));
        /*
         * Expected output:
         *   &copy; 2000 R&eacute;al Gagnon &lt;www.rgagnon.com&gt;
         *   -->
         *   (c) 2000 Real Gagnon <www.rgagnon.com>
         * where "(c)" and the first "e" of "Real" stand for the actual
         * copyright sign and e-acute (spelled in ASCII here so this file
         * compiles under any source encoding).
         */
    }
}
This HowTo deals only with a small subset of the available HTML entities. For a complete list, see this Wikipedia article: http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Character_entities_in_HTML
Send comments, questions or suggestions to howto@rgagnon.com