Remove XML tags from a string to keep only text Tag(s): XML
First we define an XSLT template.
[onlytext.xsl]
<?xml version="1.0" encoding="UTF-8"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:fo="http://www.w3.org/1999/XSL/Format"> <xsl:output method="text" indent="no"/> <xsl:template match="//text()[normalize-space(.) = '']"> <xsl:text>
</xsl:text> </xsl:template> </xsl:stylesheet>
[howto.xml]
<?xml version="1.0"?> <howto> <topic id="1"> <title>Java</title> <url>http://www.rgagnon.com/topics/java-io.html</url> </topic> <topic id="2"> <title>XML</title> <url>http://www.rgagnon.com/topics/java-xml.html</url> </topic> <topic id="3"> <title>Javascript</title> <url>http://www.rgagnon.com/topics/js-language.html</url> </topic> <topic id="4"> <title>VBScript</title> <url>http://www.rgagnon.com/topics/wsh-vbs.html</url> </topic> </howto>
And the Java code to apply the template to the XML:
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.w3c.dom.Document;

/**
 * Applies an XSLT stylesheet to an XML file and writes the result to a file.
 *
 * <p>Used with a stylesheet whose output method is {@code text}, this strips
 * the markup from the XML document and keeps only its text content.
 */
public class XMLUtils {

  /** Stylesheet path used when no first argument is given. */
  private static final String DEFAULT_STYLESHEET = "/temp/onlytext.xsl";

  /** XML input path used when no second argument is given. */
  private static final String DEFAULT_XML_SOURCE = "/temp/howto.xml";

  /** Output path used when no third argument is given. */
  private static final String DEFAULT_TXT_OUTPUT = "/temp/howto.txt";

  /**
   * Runs the transformation and prints {@code Done.} when it has finished.
   *
   * @param args optional paths, in order: stylesheet, XML source, output file;
   *     any path that is not supplied falls back to its default under
   *     {@code /temp}
   * @throws Exception if the stylesheet or the XML cannot be read or parsed,
   *     if the transformation fails, or if the output file cannot be written
   */
  public static void main(String[] args) throws Exception {
    File stylesheet = new File(argOrDefault(args, 0, DEFAULT_STYLESHEET));
    File xmlSource = new File(argOrDefault(args, 1, DEFAULT_XML_SOURCE));
    File txtOutput = new File(argOrDefault(args, 2, DEFAULT_TXT_OUTPUT));

    applyStylesheet(stylesheet, xmlSource, txtOutput);
    System.out.println("Done.");
  }

  /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
  private static String argOrDefault(String[] args, int index, String fallback) {
    return (args != null && args.length > index) ? args[index] : fallback;
  }

  /** Parses {@code xmlSource}, transforms it with {@code stylesheet}, writes to {@code txtOutput}. */
  private static void applyStylesheet(File stylesheet, File xmlSource, File txtOutput)
      throws Exception {
    Transformer transformer =
        TransformerFactory.newInstance().newTransformer(new StreamSource(stylesheet));

    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // XSLT needs namespace information; a DOM built by a parser that is not
    // namespace-aware may make the transformation fail.
    factory.setNamespaceAware(true);
    DocumentBuilder builder = factory.newDocumentBuilder();
    Document document = builder.parse(xmlSource);

    // Open the output explicitly so it is closed here, whatever the outcome,
    // instead of relying on the transformer to close a file it opened itself.
    // To print to the console instead, pass new StreamResult(System.out).
    try (OutputStream out = new FileOutputStream(txtOutput)) {
      transformer.transform(new DOMSource(document), new StreamResult(out));
    }
  }
}
The result, [howto.txt], contains only the text:
Java http://www.rgagnon.com/topics/java-io.html XML http://www.rgagnon.com/topics/java-xml.html Javascript http://www.rgagnon.com/topics/js-language.html VBScript http://www.rgagnon.com/topics/wsh-vbs.html
See also : Remove HTML tags from a file to extract only the TEXT.