[MPLUGIN-96] Handle character encoding properly in makeHtmlValid()

git-svn-id: https://svn.apache.org/repos/asf/maven/plugin-tools/trunk@643558 13f79535-47bb-0310-9956-ffa450edef68
master
Benjamin Bentmann 2008-04-01 20:42:11 +00:00
parent 6e025f5245
commit 6ebf227d09
2 changed files with 36 additions and 10 deletions

View File

@ -19,7 +19,10 @@ package org.apache.maven.tools.plugin.util;
* under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLClassLoader;
@ -38,10 +41,9 @@ import org.apache.maven.reporting.MavenReport;
import org.codehaus.plexus.component.repository.ComponentDependency;
import org.codehaus.plexus.util.DirectoryScanner;
import org.codehaus.plexus.util.FileUtils;
import org.codehaus.plexus.util.StringInputStream;
import org.codehaus.plexus.util.StringOutputStream;
import org.codehaus.plexus.util.StringUtils;
import org.codehaus.plexus.util.xml.XMLWriter;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;
/**
@ -241,29 +243,42 @@ public final class PluginUtils
return "";
}
StringOutputStream out = new StringOutputStream();
String commentCleaned = decodeJavadocTags( description );
// Using jTidy to clean comment
Tidy tidy = new Tidy();
tidy.setDocType( "loose" );
tidy.setXHTML( true );
tidy.setXmlOut( true );
tidy.setCharEncoding( Configuration.UTF8 );
tidy.setMakeClean( true );
tidy.setNumEntities( true );
tidy.setQuoteNbsp( false );
tidy.setQuiet( true );
tidy.setShowWarnings( false );
tidy.parse( new StringInputStream( decodeJavadocTags( description ) ), out );
try
{
ByteArrayOutputStream out = new ByteArrayOutputStream( commentCleaned.length() + 256 );
tidy.parse( new ByteArrayInputStream( commentCleaned.getBytes( "UTF-8" ) ), out );
commentCleaned = out.toString("UTF-8");
}
catch ( UnsupportedEncodingException e )
{
// cannot happen as every JVM must support UTF-8, see also class javadoc for java.nio.charset.Charset
}
// strip the header/body stuff
String LS = System.getProperty( "line.separator" );
String commentCleaned = out.toString();
if ( StringUtils.isEmpty( commentCleaned ) )
{
return "";
}
// strip the header/body stuff
String LS = System.getProperty( "line.separator" );
int startPos = commentCleaned.indexOf( "<body>" + LS ) + 6 + LS.length();
int endPos = commentCleaned.indexOf( LS + "</body>" );
commentCleaned = commentCleaned.substring( startPos, endPos );
return commentCleaned.substring( startPos, endPos );
return commentCleaned;
}
/**

View File

@ -130,8 +130,19 @@ public class PluginUtilsTest
// wrong HTML
javadoc = "Generates <i>something</i> <b> for the project.";
assertEquals( "Generates <i>something</i> <b> for the project.</b>", PluginUtils
.makeHtmlValid( javadoc ) );
assertEquals( "Generates <i>something</i> <b> for the project.</b>", PluginUtils.makeHtmlValid( javadoc ) );
// special characters
javadoc = "& &amp; < > \u00A0";
assertEquals( "&amp; &amp; &lt; &gt; \u00A0", PluginUtils.makeHtmlValid( javadoc ) );
// non ASCII characters
javadoc = "\u00E4 \u00F6 \u00FC \u00DF";
assertEquals( javadoc, PluginUtils.makeHtmlValid( javadoc ) );
// non Latin1 characters
javadoc = "\u0130 \u03A3 \u05D0 \u06DE";
assertEquals( javadoc, PluginUtils.makeHtmlValid( javadoc ) );
}
public void testDecodeJavadocTags()